alpha-beta-soup · April 29, 2025 23:52 · alpha-beta-soup · Apr 29, 2025
diff --git a/stratified_sample.py b/stratified_sample.py
 import numpy as np
 from pathlib import Path
 import sys

 import pandas as pd
 import geopandas as gpd

 def stratified_sample(
        df: pd.DataFrame,
        groupby_column: str,
        sampling_rate: float = 0.01,
        random_state=None,
 ) -> pd.DataFrame:
    """Draw a class-balanced random sample of rows from ``df``.

    The overall budget is ``floor(len(df) * sampling_rate)`` rows, split
    evenly across the distinct values of ``groupby_column``. Every class
    contributes at least one row and never more rows than it contains.

    Args:
        df: Table to sample from; it is not modified.
        groupby_column: Name of the column holding the class labels.
        sampling_rate: Fraction of rows to aim for, in ``(0, 1]``.
        random_state: Optional integer seed (or any other value accepted
            by ``DataFrame.sample``) for reproducible draws. ``None``
            keeps the default, unseeded behaviour.

    Returns:
        A new frame holding the sampled rows, ordered by class, with the
        original index labels and every column of ``df`` (including
        ``groupby_column``).

    Raises:
        ValueError: If ``sampling_rate`` is outside ``(0, 1]`` or
            ``groupby_column`` is not a column of ``df``.
    """
    # Explicit raises rather than ``assert`` so the checks survive ``python -O``.
    if not 0.0 < sampling_rate <= 1.0:
        raise ValueError(f"sampling_rate must be in (0, 1], got {sampling_rate!r}")
    if groupby_column not in df.columns:
        raise ValueError(f"column {groupby_column!r} not found in df")

    # Count classes the way groupby sees them: missing labels are dropped.
    num_classes = df[groupby_column].nunique()
    if num_classes == 0:
        # Nothing to group; return an empty frame instead of dividing by zero.
        return df.iloc[0:0].copy()

    num_rows = int(len(df) * sampling_rate)
    num_rows_per_class = max(1, num_rows // num_classes)

    # Turn an integer seed into one shared RandomState so successive groups
    # get different draws while the overall result stays reproducible.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Sample each group explicitly instead of via ``groupby.apply``: pandas 2.2
    # deprecated applying over the grouping column and newer releases drop it
    # from the result, which would lose ``groupby_column``.
    samples = [
        group.sample(n=min(len(group), num_rows_per_class), random_state=random_state)
        for _, group in df.groupby(groupby_column)
    ]
    return pd.concat(samples)
    
 def map_accuracy_prep(df: pd.DataFrame, mapped_cls_col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with the review columns added.

    Columns written on the copy:
        mapped: The value of ``mapped_cls_col`` cast to ``str`` (the class
            the feature was mapped as).
        truth: Starts out equal to ``mapped``; MapAccuracy rewrites it when
            the user records a mapping error.
        checked: ``int8`` column initialised to ``-1``
            (presumably "not reviewed yet" -- confirm against MapAccuracy).
        comment: ``object`` column initialised to ``None``.

    Args:
        df: Input table; it is not modified.
        mapped_cls_col: Name of the column holding the mapped class.

    Returns:
        The augmented copy.
    """
    df = df.copy()
    df['mapped'] = df[mapped_cls_col].astype(str)
    df['truth'] = df['mapped']
    df['checked'] = pd.Series(-1, index=df.index, dtype='int8')
    # Build on df.index like 'checked' above: a default RangeIndex would be
    # aligned by label on assignment and leave NaN instead of None for any
    # row whose label is not in 0..len(df)-1.
    df['comment'] = pd.Series([None] * len(df), index=df.index, dtype='object')
    return df

 def sample_group(df: pd.DataFrame, n_groups: int) -> pd.DataFrame:
    """Return a copy of ``df`` with a round-robin ``sample_group`` column.

    Rows are numbered by position and labelled ``0, 1, ..., n_groups - 1,
    0, 1, ...`` so the groups differ in size by at most one row.

    Args:
        df: Input table; it is not modified.
        n_groups: Number of groups to deal the rows into; must be >= 1.

    Returns:
        The copy with an integer ``sample_group`` column. The dtype is
        ``int8`` whenever the labels fit (``n_groups <= 128``) and the
        next wider signed integer otherwise.

    Raises:
        ValueError: If ``n_groups`` is smaller than 1.
    """
    if n_groups < 1:
        raise ValueError(f"n_groups must be >= 1, got {n_groups!r}")
    df = df.copy()
    # Smallest signed type able to hold -n_groups also holds n_groups - 1;
    # a fixed int8 would silently wrap to negative labels past 128 groups.
    label_dtype = np.min_scalar_type(-int(n_groups))
    df['sample_group'] = (np.arange(len(df)) % n_groups).astype(label_dtype)
    return df
    
 if __name__ == '__main__':
    # Usage:   python stratified_sample.py input output groupby_column sampling_rate n_groups
    # Example: python stratified_sample.py input.gpkg output.gpkg lu_coden 0.005 2
    if len(sys.argv) < 6:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit('usage: python stratified_sample.py input output groupby_column sampling_rate n_groups')
    # Named *_path so the builtin input() is not shadowed.
    input_path, output_path = Path(sys.argv[1]), Path(sys.argv[2])
    groupby_column = str(sys.argv[3])
    sampling_rate = float(sys.argv[4])
    n_groups = int(sys.argv[5])
    df = gpd.read_file(input_path)
    # Explode before sampling so each exploded row is sampled on its own;
    # the original index labels are kept.
    df = df.explode(column=None, ignore_index=False, index_parts=False)
    df = stratified_sample(df, groupby_column, sampling_rate)
    df = map_accuracy_prep(df, groupby_column)
    df = sample_group(df, n_groups)
    df.to_file(output_path)
    sys.exit(0)
	import numpy as np
	from pathlib import Path
	import sys

	import pandas as pd
	import geopandas as gpd

	def stratified_sample(
	        df: pd.DataFrame,
	        groupby_column: str,
	        sampling_rate: float = 0.01,
	        random_state=None,
	) -> pd.DataFrame:
	    """Draw a class-balanced random sample of rows from ``df``.

	    The overall budget is ``floor(len(df) * sampling_rate)`` rows, split
	    evenly across the distinct values of ``groupby_column``. Every class
	    contributes at least one row and never more rows than it contains.

	    Args:
	        df: Table to sample from; it is not modified.
	        groupby_column: Name of the column holding the class labels.
	        sampling_rate: Fraction of rows to aim for, in ``(0, 1]``.
	        random_state: Optional integer seed (or any other value accepted
	            by ``DataFrame.sample``) for reproducible draws. ``None``
	            keeps the default, unseeded behaviour.

	    Returns:
	        A new frame holding the sampled rows, ordered by class, with the
	        original index labels and every column of ``df`` (including
	        ``groupby_column``).

	    Raises:
	        ValueError: If ``sampling_rate`` is outside ``(0, 1]`` or
	            ``groupby_column`` is not a column of ``df``.
	    """
	    # Explicit raises rather than ``assert`` so the checks survive ``python -O``.
	    if not 0.0 < sampling_rate <= 1.0:
	        raise ValueError(f"sampling_rate must be in (0, 1], got {sampling_rate!r}")
	    if groupby_column not in df.columns:
	        raise ValueError(f"column {groupby_column!r} not found in df")

	    # Count classes the way groupby sees them: missing labels are dropped.
	    num_classes = df[groupby_column].nunique()
	    if num_classes == 0:
	        # Nothing to group; return an empty frame instead of dividing by zero.
	        return df.iloc[0:0].copy()

	    num_rows = int(len(df) * sampling_rate)
	    num_rows_per_class = max(1, num_rows // num_classes)

	    # Turn an integer seed into one shared RandomState so successive groups
	    # get different draws while the overall result stays reproducible.
	    if isinstance(random_state, (int, np.integer)):
	        random_state = np.random.RandomState(random_state)

	    # Sample each group explicitly instead of via ``groupby.apply``: pandas 2.2
	    # deprecated applying over the grouping column and newer releases drop it
	    # from the result, which would lose ``groupby_column``.
	    samples = [
	        group.sample(n=min(len(group), num_rows_per_class), random_state=random_state)
	        for _, group in df.groupby(groupby_column)
	    ]
	    return pd.concat(samples)

	def map_accuracy_prep(df: pd.DataFrame, mapped_cls_col: str) -> pd.DataFrame:
	    """Return a copy of ``df`` with the review columns added.

	    Columns written on the copy:
	        mapped: The value of ``mapped_cls_col`` cast to ``str`` (the class
	            the feature was mapped as).
	        truth: Starts out equal to ``mapped``; MapAccuracy rewrites it when
	            the user records a mapping error.
	        checked: ``int8`` column initialised to ``-1``
	            (presumably "not reviewed yet" -- confirm against MapAccuracy).
	        comment: ``object`` column initialised to ``None``.

	    Args:
	        df: Input table; it is not modified.
	        mapped_cls_col: Name of the column holding the mapped class.

	    Returns:
	        The augmented copy.
	    """
	    df = df.copy()
	    df['mapped'] = df[mapped_cls_col].astype(str)
	    df['truth'] = df['mapped']
	    df['checked'] = pd.Series(-1, index=df.index, dtype='int8')
	    # Build on df.index like 'checked' above: a default RangeIndex would be
	    # aligned by label on assignment and leave NaN instead of None for any
	    # row whose label is not in 0..len(df)-1.
	    df['comment'] = pd.Series([None] * len(df), index=df.index, dtype='object')
	    return df

	def sample_group(df: pd.DataFrame, n_groups: int) -> pd.DataFrame:
	    """Return a copy of ``df`` with a round-robin ``sample_group`` column.

	    Rows are numbered by position and labelled ``0, 1, ..., n_groups - 1,
	    0, 1, ...`` so the groups differ in size by at most one row.

	    Args:
	        df: Input table; it is not modified.
	        n_groups: Number of groups to deal the rows into; must be >= 1.

	    Returns:
	        The copy with an integer ``sample_group`` column. The dtype is
	        ``int8`` whenever the labels fit (``n_groups <= 128``) and the
	        next wider signed integer otherwise.

	    Raises:
	        ValueError: If ``n_groups`` is smaller than 1.
	    """
	    if n_groups < 1:
	        raise ValueError(f"n_groups must be >= 1, got {n_groups!r}")
	    df = df.copy()
	    # Smallest signed type able to hold -n_groups also holds n_groups - 1;
	    # a fixed int8 would silently wrap to negative labels past 128 groups.
	    label_dtype = np.min_scalar_type(-int(n_groups))
	    df['sample_group'] = (np.arange(len(df)) % n_groups).astype(label_dtype)
	    return df

	if __name__ == '__main__':
	    # Usage:   python stratified_sample.py input output groupby_column sampling_rate n_groups
	    # Example: python stratified_sample.py input.gpkg output.gpkg lu_coden 0.005 2
	    if len(sys.argv) < 6:
	        # Fail with a readable message instead of an IndexError traceback.
	        sys.exit('usage: python stratified_sample.py input output groupby_column sampling_rate n_groups')
	    # Named *_path so the builtin input() is not shadowed.
	    input_path, output_path = Path(sys.argv[1]), Path(sys.argv[2])
	    groupby_column = str(sys.argv[3])
	    sampling_rate = float(sys.argv[4])
	    n_groups = int(sys.argv[5])
	    df = gpd.read_file(input_path)
	    # Explode before sampling so each exploded row is sampled on its own;
	    # the original index labels are kept.
	    df = df.explode(column=None, ignore_index=False, index_parts=False)
	    df = stratified_sample(df, groupby_column, sampling_rate)
	    df = map_accuracy_prep(df, groupby_column)
	    df = sample_group(df, n_groups)
	    df.to_file(output_path)
	    sys.exit(0)