Skip to content

Instantly share code, notes, and snippets.

@alpha-beta-soup
Last active May 19, 2025 00:41
Show Gist options
  • Save alpha-beta-soup/b18a8ff2c869b17bfc3f839bfe11c3f5 to your computer and use it in GitHub Desktop.
Save alpha-beta-soup/b18a8ff2c869b17bfc3f839bfe11c3f5 to your computer and use it in GitHub Desktop.
Stratified random sampling of GPKG files for use in MapAccuracy, with optional grouping division and bias
import numpy as np
from pathlib import Path
import sys
import pandas as pd
import geopandas as gpd
def stratified_sample_validated(
df: gpd.GeoDataFrame,
groupby_column: str,
bias_column: str = None,
random_state: int = 1,
fractional_power: float = None,
confidence_z: float = 1.96, # 95% confidence
margin_error: float = 0.05,
p_est: float = 0.5,
min_samples: int = 30 # minimum per class for meaningful validation
) -> gpd.GeoDataFrame:
df = df.copy()
assert groupby_column in df.columns
samples = []
for class_val in df[groupby_column].unique():
group_df = df[df[groupby_column] == class_val]
N = len(group_df)
if N == 0:
continue
# Initial sample size without finite population correction
n = (confidence_z**2 * p_est * (1 - p_est)) / (margin_error**2)
# Apply finite population correction
n = n / (1 + ((n - 1) / N))
n = int(np.ceil(min(max(n, min_samples), N))) # Clip to [min_samples, N]
if bias_column == "area":
weights = group_df.geometry.area
elif bias_column is not None:
weights = group_df[bias_column]
else:
weights = None
if weights is not None:
if fractional_power == 0:
weights = np.log1p(weights)
elif fractional_power is not None:
weights = weights ** fractional_power
sampled = group_df.sample(n=n, weights=weights, random_state=random_state)
samples.append(sampled)
print(class_val, len(sampled))
return pd.concat(samples, ignore_index=True)
def map_accuracy_prep(df: gpd.GeoDataFrame, mapped_cls_col: str) -> gpd.GeoDataFrame:
df = df.copy()
df['mapped'] = df[mapped_cls_col].astype(str) # Contains class the feature was mapped as
df['truth'] = df['mapped'] # Should initially contain the same as 'mapped' by MapAccuracy will re-write this attribute if the user changes it to reflect an error in the mapping
df['checked'] = pd.Series(-1, index=df.index, dtype='int8')
df['comment'] = pd.Series([None] * len(df), dtype='object')
return df
def sample_group(df: gpd.GeoDataFrame, n_groups: int) -> gpd.GeoDataFrame:
df = df.copy()
df['sample_group'] = np.arange(len(df)) % n_groups
df['sample_group'] = df['sample_group'].astype('int8') # small int type
return df
if __name__ == '__main__':
# python stratifed_sample.py input output groupby_column n_groups bias_column random_state fractional_power confidence_z margin_error min_samples
# python stratified_sample.py input.gpkg output.gpkg lu_coden n_groups area 1 0.25 1.96 0.1 30
# ^ area-weighted sample, with fourth root transform (0.25); 95% confidence interval, 10% margin of error, and a minimum sample size of 30 in each class
input_path = Path(sys.argv[1])
output_path = Path(sys.argv[2])
groupby_column = sys.argv[3]
n_groups = int(sys.argv[4])
bias_column = sys.argv[5]
random_state = int(sys.argv[6])
fractional_power = float(sys.argv[7])
confidence_z = float(sys.argv[8])
margin_error = float(sys.argv[9])
min_samples = int(sys.argv[10])
assert fractional_power < 1, "Fractional power must be < 1"
df = gpd.read_file(input_path)
df = df.explode(column=None, ignore_index=False, index_parts=False)
df = stratified_sample_validated(
df,
groupby_column=groupby_column,
bias_column=bias_column,
random_state=random_state,
fractional_power=fractional_power,
confidence_z=confidence_z,
margin_error=margin_error,
min_samples=min_samples
)
df = map_accuracy_prep(df, groupby_column)
df = sample_group(df, n_groups)
df.to_file(output_path)
sys.exit(0)
@alpha-beta-soup
Copy link
Author

Adjusted the method to allow the specification of a confidence interval (z), a margin of error, and a minimum sample size for each class for explicit statistical validity.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment