alpha-beta-soup · May 19, 2025 00:41 · alpha-beta-soup · May 19, 2025
diff --git a/stratified_sample.py b/stratified_sample.py
 import numpy as np
 from pathlib import Path
 import sys

 import pandas as pd
 import geopandas as gpd

 def stratified_sample_validated(
    df: gpd.GeoDataFrame,
    groupby_column: str,
    bias_column: str = None,
    random_state: int = 1,
    fractional_power: float = None,
    confidence_z: float = 1.96,  # 95% confidence
    margin_error: float = 0.05,
    p_est: float = 0.5,
    min_samples: int = 30  # minimum per class for meaningful validation
 ) -> gpd.GeoDataFrame:
    df = df.copy()

    assert groupby_column in df.columns

    samples = []

    for class_val in df[groupby_column].unique():
        group_df = df[df[groupby_column] == class_val]
        N = len(group_df)

        if N == 0:
            continue

        # Initial sample size without finite population correction
        n = (confidence_z**2 * p_est * (1 - p_est)) / (margin_error**2)
        # Apply finite population correction
        n = n / (1 + ((n - 1) / N))
        n = int(np.ceil(min(max(n, min_samples), N)))  # Clip to [min_samples, N]

        if bias_column == "area":
            weights = group_df.geometry.area
        elif bias_column is not None:
            weights = group_df[bias_column]
        else:
            weights = None

        if weights is not None:
            if fractional_power == 0:
                weights = np.log1p(weights)
            elif fractional_power is not None:
                weights = weights ** fractional_power

        sampled = group_df.sample(n=n, weights=weights, random_state=random_state)
        samples.append(sampled)
        print(class_val, len(sampled))

    return pd.concat(samples, ignore_index=True)

    
 def map_accuracy_prep(df: gpd.GeoDataFrame, mapped_cls_col: str) -> gpd.GeoDataFrame:
    df = df.copy()
    df['mapped'] = df[mapped_cls_col].astype(str) # Contains class the feature was mapped as
    df['truth'] = df['mapped'] # Should initially contain the same as 'mapped' by MapAccuracy will re-write this attribute if the user changes it to reflect an error in the mapping
    df['checked'] = pd.Series(-1, index=df.index, dtype='int8')
    df['comment'] = pd.Series([None] * len(df), dtype='object')
    return df

 def sample_group(df: gpd.GeoDataFrame, n_groups: int) -> gpd.GeoDataFrame:
    df = df.copy()
    df['sample_group'] = np.arange(len(df)) % n_groups
    df['sample_group'] = df['sample_group'].astype('int8')  # small int type
    return df
    
 if __name__ == '__main__':
    # python stratifed_sample.py input output groupby_column n_groups bias_column random_state fractional_power confidence_z margin_error min_samples
    # python stratified_sample.py input.gpkg output.gpkg lu_coden n_groups area 1 0.25 1.96 0.1 30
    # ^ area-weighted sample, with fourth root transform (0.25); 95% confidence interval, 10% margin of error, and a minimum sample size of 30 in each class
    input_path = Path(sys.argv[1])
    output_path = Path(sys.argv[2])
    groupby_column = sys.argv[3]
    n_groups = int(sys.argv[4])
    bias_column = sys.argv[5]
    random_state = int(sys.argv[6])
    fractional_power = float(sys.argv[7])
    confidence_z = float(sys.argv[8])
    margin_error = float(sys.argv[9])
    min_samples = int(sys.argv[10])

    assert fractional_power < 1, "Fractional power must be < 1"

    df = gpd.read_file(input_path)
    df = df.explode(column=None, ignore_index=False, index_parts=False)

    df = stratified_sample_validated(
        df,
        groupby_column=groupby_column,
        bias_column=bias_column,
        random_state=random_state,
        fractional_power=fractional_power,
        confidence_z=confidence_z,
        margin_error=margin_error,
        min_samples=min_samples
    )

    df = map_accuracy_prep(df, groupby_column)
    df = sample_group(df, n_groups)
    df.to_file(output_path)
    sys.exit(0)
	import numpy as np
	from pathlib import Path
	import sys

	import pandas as pd
	import geopandas as gpd

	def stratified_sample_validated(
	df: gpd.GeoDataFrame,
	groupby_column: str,
	bias_column: str = None,
	random_state: int = 1,
	fractional_power: float = None,
	confidence_z: float = 1.96, # 95% confidence
	margin_error: float = 0.05,
	p_est: float = 0.5,
	min_samples: int = 30 # minimum per class for meaningful validation
	) -> gpd.GeoDataFrame:
	df = df.copy()

	assert groupby_column in df.columns

	samples = []

	for class_val in df[groupby_column].unique():
	group_df = df[df[groupby_column] == class_val]
	N = len(group_df)

	if N == 0:
	continue

	# Initial sample size without finite population correction
	n = (confidence_z*2 p_est * (1 - p_est)) / (margin_error**2)
	# Apply finite population correction
	n = n / (1 + ((n - 1) / N))
	n = int(np.ceil(min(max(n, min_samples), N))) # Clip to [min_samples, N]

	if bias_column == "area":
	weights = group_df.geometry.area
	elif bias_column is not None:
	weights = group_df[bias_column]
	else:
	weights = None

	if weights is not None:
	if fractional_power == 0:
	weights = np.log1p(weights)
	elif fractional_power is not None:
	weights = weights ** fractional_power

	sampled = group_df.sample(n=n, weights=weights, random_state=random_state)
	samples.append(sampled)
	print(class_val, len(sampled))

	return pd.concat(samples, ignore_index=True)


	def map_accuracy_prep(df: gpd.GeoDataFrame, mapped_cls_col: str) -> gpd.GeoDataFrame:
	df = df.copy()
	df['mapped'] = df[mapped_cls_col].astype(str) # Contains class the feature was mapped as
	df['truth'] = df['mapped'] # Should initially contain the same as 'mapped' by MapAccuracy will re-write this attribute if the user changes it to reflect an error in the mapping
	df['checked'] = pd.Series(-1, index=df.index, dtype='int8')
	df['comment'] = pd.Series([None] * len(df), dtype='object')
	return df

	def sample_group(df: gpd.GeoDataFrame, n_groups: int) -> gpd.GeoDataFrame:
	df = df.copy()
	df['sample_group'] = np.arange(len(df)) % n_groups
	df['sample_group'] = df['sample_group'].astype('int8') # small int type
	return df

	if __name__ == '__main__':
	# python stratifed_sample.py input output groupby_column n_groups bias_column random_state fractional_power confidence_z margin_error min_samples
	# python stratified_sample.py input.gpkg output.gpkg lu_coden n_groups area 1 0.25 1.96 0.1 30
	# ^ area-weighted sample, with fourth root transform (0.25); 95% confidence interval, 10% margin of error, and a minimum sample size of 30 in each class
	input_path = Path(sys.argv[1])
	output_path = Path(sys.argv[2])
	groupby_column = sys.argv[3]
	n_groups = int(sys.argv[4])
	bias_column = sys.argv[5]
	random_state = int(sys.argv[6])
	fractional_power = float(sys.argv[7])
	confidence_z = float(sys.argv[8])
	margin_error = float(sys.argv[9])
	min_samples = int(sys.argv[10])

	assert fractional_power < 1, "Fractional power must be < 1"

	df = gpd.read_file(input_path)
	df = df.explode(column=None, ignore_index=False, index_parts=False)

	df = stratified_sample_validated(
	df,
	groupby_column=groupby_column,
	bias_column=bias_column,
	random_state=random_state,
	fractional_power=fractional_power,
	confidence_z=confidence_z,
	margin_error=margin_error,
	min_samples=min_samples
	)

	df = map_accuracy_prep(df, groupby_column)
	df = sample_group(df, n_groups)
	df.to_file(output_path)
	sys.exit(0)