Last active
May 19, 2025 00:41
-
-
Save alpha-beta-soup/b18a8ff2c869b17bfc3f839bfe11c3f5 to your computer and use it in GitHub Desktop.
Stratified random sampling of GPKG files for use in MapAccuracy, with optional grouping division and bias
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from pathlib import Path | |
import sys | |
import pandas as pd | |
import geopandas as gpd | |
def stratified_sample_validated( | |
df: gpd.GeoDataFrame, | |
groupby_column: str, | |
bias_column: str = None, | |
random_state: int = 1, | |
fractional_power: float = None, | |
confidence_z: float = 1.96, # 95% confidence | |
margin_error: float = 0.05, | |
p_est: float = 0.5, | |
min_samples: int = 30 # minimum per class for meaningful validation | |
) -> gpd.GeoDataFrame: | |
df = df.copy() | |
assert groupby_column in df.columns | |
samples = [] | |
for class_val in df[groupby_column].unique(): | |
group_df = df[df[groupby_column] == class_val] | |
N = len(group_df) | |
if N == 0: | |
continue | |
# Initial sample size without finite population correction | |
n = (confidence_z**2 * p_est * (1 - p_est)) / (margin_error**2) | |
# Apply finite population correction | |
n = n / (1 + ((n - 1) / N)) | |
n = int(np.ceil(min(max(n, min_samples), N))) # Clip to [min_samples, N] | |
if bias_column == "area": | |
weights = group_df.geometry.area | |
elif bias_column is not None: | |
weights = group_df[bias_column] | |
else: | |
weights = None | |
if weights is not None: | |
if fractional_power == 0: | |
weights = np.log1p(weights) | |
elif fractional_power is not None: | |
weights = weights ** fractional_power | |
sampled = group_df.sample(n=n, weights=weights, random_state=random_state) | |
samples.append(sampled) | |
print(class_val, len(sampled)) | |
return pd.concat(samples, ignore_index=True) | |
def map_accuracy_prep(df: gpd.GeoDataFrame, mapped_cls_col: str) -> gpd.GeoDataFrame: | |
df = df.copy() | |
df['mapped'] = df[mapped_cls_col].astype(str) # Contains class the feature was mapped as | |
df['truth'] = df['mapped'] # Should initially contain the same as 'mapped' by MapAccuracy will re-write this attribute if the user changes it to reflect an error in the mapping | |
df['checked'] = pd.Series(-1, index=df.index, dtype='int8') | |
df['comment'] = pd.Series([None] * len(df), dtype='object') | |
return df | |
def sample_group(df: gpd.GeoDataFrame, n_groups: int) -> gpd.GeoDataFrame: | |
df = df.copy() | |
df['sample_group'] = np.arange(len(df)) % n_groups | |
df['sample_group'] = df['sample_group'].astype('int8') # small int type | |
return df | |
if __name__ == '__main__': | |
# python stratifed_sample.py input output groupby_column n_groups bias_column random_state fractional_power confidence_z margin_error min_samples | |
# python stratified_sample.py input.gpkg output.gpkg lu_coden n_groups area 1 0.25 1.96 0.1 30 | |
# ^ area-weighted sample, with fourth root transform (0.25); 95% confidence interval, 10% margin of error, and a minimum sample size of 30 in each class | |
input_path = Path(sys.argv[1]) | |
output_path = Path(sys.argv[2]) | |
groupby_column = sys.argv[3] | |
n_groups = int(sys.argv[4]) | |
bias_column = sys.argv[5] | |
random_state = int(sys.argv[6]) | |
fractional_power = float(sys.argv[7]) | |
confidence_z = float(sys.argv[8]) | |
margin_error = float(sys.argv[9]) | |
min_samples = int(sys.argv[10]) | |
assert fractional_power < 1, "Fractional power must be < 1" | |
df = gpd.read_file(input_path) | |
df = df.explode(column=None, ignore_index=False, index_parts=False) | |
df = stratified_sample_validated( | |
df, | |
groupby_column=groupby_column, | |
bias_column=bias_column, | |
random_state=random_state, | |
fractional_power=fractional_power, | |
confidence_z=confidence_z, | |
margin_error=margin_error, | |
min_samples=min_samples | |
) | |
df = map_accuracy_prep(df, groupby_column) | |
df = sample_group(df, n_groups) | |
df.to_file(output_path) | |
sys.exit(0) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Adjusted the method to allow the specification of a confidence interval (z), a margin of error, and a minimum sample size for each class for explicit statistical validity.