Skip to content

Instantly share code, notes, and snippets.

@alpha-beta-soup
Last active April 29, 2025 23:52
Show Gist options
  • Save alpha-beta-soup/b18a8ff2c869b17bfc3f839bfe11c3f5 to your computer and use it in GitHub Desktop.
Save alpha-beta-soup/b18a8ff2c869b17bfc3f839bfe11c3f5 to your computer and use it in GitHub Desktop.
Stratified random sampling of GPKG files for use in MapAccuracy, with optional grouping division
import numpy as np
from pathlib import Path
import sys
import pandas as pd
import geopandas as gpd
def stratified_sample(
    df: pd.DataFrame,
    groupby_column: str,
    sampling_rate: float = 0.01,
    random_state=None,
) -> pd.DataFrame:
    """Draw an equal-allocation stratified random sample of the rows of *df*.

    The overall target size is ``floor(len(df) * sampling_rate)`` rows. That
    target is divided evenly between the distinct values of *groupby_column*,
    with a minimum of one row per class. A class with fewer rows than its
    share contributes all of its rows, so the result can be smaller (or, for
    very small rates, larger) than the overall target.

    Args:
        df: Table to sample from. It is not modified.
        groupby_column: Name of the column holding the class (stratum).
        sampling_rate: Fraction of rows to aim for, in the interval (0, 1].
        random_state: Optional seed (int), ``numpy.random.Generator`` or
            ``numpy.random.RandomState`` for a reproducible sample. ``None``
            (the default) keeps the sample non-deterministic.

    Returns:
        A new frame with the sampled rows, ordered class by class. Rows whose
        class is missing (NaN) are not sampled, because ``groupby`` drops
        missing keys by default.

    Raises:
        ValueError: If *sampling_rate* is outside (0, 1].
        KeyError: If *groupby_column* is not a column of *df*.
    """
    # Explicit raises instead of ``assert``: asserts vanish under ``python -O``.
    if not 0.0 < sampling_rate <= 1.0:
        raise ValueError(
            f'sampling_rate must be in the interval (0, 1], got {sampling_rate!r}'
        )
    if groupby_column not in df.columns:
        raise KeyError(f'{groupby_column!r} is not a column of the input frame')

    # Nothing to sample; also avoids dividing by zero classes below.
    if df.empty:
        return df.copy()

    num_rows = int(df.shape[0] * sampling_rate)
    num_classes = df[groupby_column].nunique(dropna=False)
    num_rows_per_class = max(1, num_rows // num_classes)

    # One shared generator so that each class gets an independent draw while
    # the whole sample remains reproducible from a single seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    # Iterating over the groups (rather than ``groupby.apply``) always hands
    # us complete sub-frames, so the class column is kept in the result.
    parts = [
        group.sample(n=min(len(group), num_rows_per_class), random_state=rng)
        for _, group in df.groupby(groupby_column)
    ]
    if not parts:
        # Every class value was missing, so there were no groups to sample.
        return df.iloc[:0].copy()
    return pd.concat(parts)
def map_accuracy_prep(df: pd.DataFrame, mapped_cls_col: str) -> pd.DataFrame:
    """Return a copy of *df* with the attribute columns MapAccuracy works on.

    Args:
        df: Sampled features. It is not modified.
        mapped_cls_col: Name of the column holding the class each feature was
            mapped as.

    Returns:
        A copy of *df* with four extra columns:

        * ``mapped``  - the mapped class, as a string.
        * ``truth``   - initially equal to ``mapped``, but MapAccuracy will
          re-write this attribute if the user changes it to reflect an error
          in the mapping.
        * ``checked`` - ``int8`` column initialised to -1.
        * ``comment`` - object column initialised to ``None``.
    """
    df = df.copy()
    df['mapped'] = df[mapped_cls_col].astype(str)
    df['truth'] = df['mapped']
    # Both helper Series are built on df.index: column assignment aligns by
    # label, so a Series with a default RangeIndex would come out as NaN on
    # any frame whose index is not 0..n-1 (e.g. after sampling).
    df['checked'] = pd.Series(-1, index=df.index, dtype='int8')
    df['comment'] = pd.Series([None] * len(df), index=df.index, dtype='object')
    return df
def sample_group(df: pd.DataFrame, n_groups: int) -> pd.DataFrame:
    """Assign every row to one of *n_groups* groups in round-robin order.

    Row ``i`` (by position) receives group id ``i % n_groups``, so group
    sizes differ by at most one row.

    Args:
        df: Table to label. It is not modified.
        n_groups: Number of groups to deal the rows into; must be at least 1.

    Returns:
        A copy of *df* with an integer ``sample_group`` column. The column is
        ``int8`` whenever the ids fit (``n_groups <= 128``); a wider signed
        integer type is used otherwise so that ids never wrap around.

    Raises:
        ValueError: If *n_groups* is smaller than 1.
    """
    if n_groups < 1:
        raise ValueError(f'n_groups must be a positive integer, got {n_groups!r}')

    df = df.copy()
    # Smallest signed integer type that can hold the largest id, n_groups - 1.
    id_dtype = next(
        (
            candidate
            for candidate in (np.int8, np.int16, np.int32)
            if n_groups - 1 <= np.iinfo(candidate).max
        ),
        np.int64,
    )
    df['sample_group'] = (np.arange(len(df)) % n_groups).astype(id_dtype)
    return df
def main(argv=None) -> int:
    """Command-line entry point: sample a vector file for use in MapAccuracy.

    Reads the input file, explodes it with ``GeoDataFrame.explode``, draws a
    stratified sample per class, adds the MapAccuracy attribute columns and
    the ``sample_group`` column, then writes the result to the output file.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The process exit status (0 on success).
    """
    # Imported here so the block does not depend on the file-level imports.
    import argparse

    parser = argparse.ArgumentParser(
        description='Stratified random sampling of a vector file for use in MapAccuracy.'
    )
    parser.add_argument('input', type=Path, help='file to sample from, e.g. input.gpkg')
    parser.add_argument('output', type=Path, help='file to write the sample to, e.g. output.gpkg')
    parser.add_argument('groupby_column', help='column holding the mapped class')
    parser.add_argument('sampling_rate', type=float, help='fraction of rows to sample, in (0, 1]')
    parser.add_argument(
        'n_groups',
        type=int,
        nargs='?',
        default=1,
        help='number of sample groups to deal the sampled rows into (default: 1)',
    )
    args = parser.parse_args(argv)

    df = gpd.read_file(args.input)
    # NOTE(review): presumably splits multi-part geometries so that every
    # sampled feature is a single part - confirm against the geopandas docs.
    df = df.explode(column=None, ignore_index=False, index_parts=False)
    df = stratified_sample(df, args.groupby_column, args.sampling_rate)
    df = map_accuracy_prep(df, args.groupby_column)
    df = sample_group(df, args.n_groups)
    df.to_file(args.output)
    return 0


if __name__ == '__main__':
    # python stratified_sample.py input output groupby_column sampling_rate [n_groups]
    # python stratified_sample.py input.gpkg output.gpkg lu_coden 0.005 2
    sys.exit(main())
@alpha-beta-soup
Copy link
Author

Sample .met file for MapAccuracy:

#@ CLASS Yes
#@ CLASS No
#@ CLASS Maybe

#@ SHORTCUT . Yes
#@ SHORTCUT , No
#@ SHORTCUT M Maybe

The filename should be the same as that of the output file. If splitting the file into groups on the basis of n_groups, there will need to be a .met file for each.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment