Last active
April 29, 2025 23:52
-
-
Save alpha-beta-soup/b18a8ff2c869b17bfc3f839bfe11c3f5 to your computer and use it in GitHub Desktop.
Stratified random sampling of GPKG files for use in MapAccuracy, with optional grouping division
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from pathlib import Path | |
import sys | |
import pandas as pd | |
import geopandas as gpd | |
def stratified_sample(df: pd.DataFrame, groupby_column: str, sampling_rate: float = 0.01, random_state: "int | None" = None) -> pd.DataFrame:
    """Draw an (approximately) equal-sized random sample from every class.

    The overall target size is ``len(df) * sampling_rate`` rows (rounded
    down), split evenly across the distinct values of ``groupby_column``.
    Every class contributes at least one row, and never more rows than it
    actually has, so the result can be larger or smaller than the target.

    Args:
        df: Table to sample from. It is not modified.
        groupby_column: Name of the column holding the class of each row.
        sampling_rate: Fraction of rows to aim for, in the range (0, 1].
        random_state: Optional seed for reproducible sampling. ``None``
            (the default) gives a different sample on every call.

    Returns:
        A new frame holding the sampled rows, ordered by class, keeping the
        original index labels and all original columns.

    Raises:
        ValueError: If ``sampling_rate`` is outside (0, 1] or
            ``groupby_column`` is not a column of ``df``.
    """
    # Explicit raises rather than ``assert``: asserts vanish under ``python -O``.
    if not 0.0 < sampling_rate <= 1.0:
        raise ValueError(f"sampling_rate must be in (0, 1], got {sampling_rate!r}")
    if groupby_column not in df.columns:
        raise ValueError(f"column {groupby_column!r} not found in DataFrame")

    # ``nunique`` ignores missing values, matching ``groupby`` which drops
    # rows whose key is NaN (such rows are therefore never sampled).
    num_classes = df[groupby_column].nunique()
    if num_classes == 0:
        # Nothing to sample from (empty frame or only missing classes).
        return df.iloc[0:0].copy()

    num_rows = int(len(df) * sampling_rate)
    num_rows_per_class = max(1, num_rows // num_classes)

    # One shared generator so that a single seed makes the whole sample
    # reproducible without every class drawing from the same stream position.
    rng = np.random.RandomState(random_state)

    # Sample each group explicitly instead of ``groupby.apply``: the groups
    # yielded by iteration always keep the grouping column.
    samples = [
        group.sample(n=min(len(group), num_rows_per_class), random_state=rng)
        for _, group in df.groupby(groupby_column)
    ]
    return pd.concat(samples)
def map_accuracy_prep(df: pd.DataFrame, mapped_cls_col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with the extra attribute columns added for MapAccuracy.

    Args:
        df: Table of sampled features. It is not modified.
        mapped_cls_col: Name of the column holding the class each feature
            was mapped as.

    Returns:
        A copy of ``df`` with four added (or overwritten) columns:
        ``mapped``, ``truth``, ``checked`` and ``comment``.
    """
    df = df.copy()
    # Class the feature was mapped as, stored as text.
    df['mapped'] = df[mapped_cls_col].astype(str)
    # Starts out identical to 'mapped', but MapAccuracy will re-write this
    # attribute if the user changes it to reflect an error in the mapping.
    df['truth'] = df['mapped']
    df['checked'] = pd.Series(-1, index=df.index, dtype='int8')
    # Build on df's own index: a default 0..n-1 index would be aligned by
    # label on assignment and leave NaN in rows whose label has no match.
    df['comment'] = pd.Series([None] * len(df), index=df.index, dtype='object')
    return df
def sample_group(df: pd.DataFrame, n_groups: int) -> pd.DataFrame:
    """Return a copy of ``df`` with rows dealt round-robin into ``n_groups`` groups.

    Rows are numbered by position (not by index label) and row ``i`` is put
    in group ``i % n_groups``; the group id is stored in a new
    ``sample_group`` column.

    Args:
        df: Table to split into groups. It is not modified.
        n_groups: Number of groups to deal the rows into; must be >= 1.

    Returns:
        A copy of ``df`` with an integer ``sample_group`` column.

    Raises:
        ValueError: If ``n_groups`` is smaller than 1.
    """
    if n_groups < 1:
        raise ValueError(f"n_groups must be at least 1, got {n_groups!r}")

    # Use the narrowest signed integer type that can hold the largest group
    # id; int8 alone would wrap ids >= 128 around to negative numbers.
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        if n_groups - 1 <= np.iinfo(candidate).max:
            group_dtype = candidate
            break
    else:
        raise ValueError(f"n_groups is too large: {n_groups!r}")

    df = df.copy()
    df['sample_group'] = (np.arange(len(df)) % n_groups).astype(group_dtype)
    return df
if __name__ == '__main__':
    # Usage:
    #   python stratified_sample.py input output groupby_column sampling_rate n_groups
    # Example:
    #   python stratified_sample.py input.gpkg output.gpkg lu_coden 0.005 2
    if len(sys.argv) < 6:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit(f'usage: python {sys.argv[0]} input output groupby_column sampling_rate n_groups')

    # Named *_path so the builtin input() is not shadowed.
    input_path, output_path = Path(sys.argv[1]), Path(sys.argv[2])
    groupby_column = str(sys.argv[3])
    sampling_rate = float(sys.argv[4])
    n_groups = int(sys.argv[5])

    df = gpd.read_file(input_path)
    # Explode the geometries before sampling so each exploded row is sampled
    # on its own; rows keep the index label of the feature they came from.
    df = df.explode(column=None, ignore_index=False, index_parts=False)
    df = stratified_sample(df, groupby_column, sampling_rate)
    df = map_accuracy_prep(df, groupby_column)
    df = sample_group(df, n_groups)
    df.to_file(output_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample .met file for MapAccuracy:
The filename should be the same as that of the output file. If splitting the file into groups on the basis of `n_groups`, there will need to be a .met file for each group.