Last active
December 13, 2023 13:11
-
-
Save dhaninugraha/9656d653cff02667f6e407637474e4fb to your computer and use it in GitHub Desktop.
python - reservoir sampling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import time | |
mode = "percentage" | |
shuffle = True | |
sampling_percentage = 20 | |
sampling_count = 1000 | |
start_data_gen = time.time() | |
data = [i for i in xrange(20000, 15000001)] # 20.000 - 15.000.000 | |
end_data_gen = time.time() | |
print "data generated in {:.1f} seconds".format(end_data_gen - start_data_gen) | |
if shuffle: | |
print "shuffling the data..." | |
random.shuffle(data) | |
if mode == "percentage": | |
no_of_samples = int(float(sampling_percentage) / float(100) * float(len(data))) | |
elif mode == "count": | |
no_of_samples = sampling_count | |
sample = [] | |
sample_replace_idx = [j for j in xrange(no_of_samples)] | |
amount_groups = { \ | |
"tens_of_thou": {"lower": 10000, "upper": 99999}, \ | |
"hundreds_of_thou": {"lower": 100000, "upper": 999999}, \ | |
"millions": {"lower": 1000000, "upper": 9999999}, \ | |
"tens_of_millions": {"lower": 10000000, "upper": 99999999} | |
} | |
amount_group_order = ["tens_of_thou", "hundreds_of_thou", "millions", "tens_of_millions"] | |
count_in_amount_group = {k: 0 for k, v in amount_groups.iteritems()} | |
print "# of data: {}\nbegin sampling...".format(len(data)) | |
start_sampling = time.time() | |
for idx, d in enumerate(data): | |
if len(sample) < no_of_samples: | |
sample.append(d) | |
else: | |
prob = float(no_of_samples) / float(idx + 1) | |
if random.random() < prob: | |
replace_at = random.choice(sample_replace_idx) | |
sample[replace_at] = d | |
end_sampling = time.time() | |
print "finished sampling in {:.1f} seconds; # of samples: {} ({:.3f}% of total data)".format(end_sampling - start_sampling, len(sample), (float(len(sample)) / float(len(data)) * 100.0)) | |
for amount in sample: | |
for amount_group, bounds in amount_groups.iteritems(): | |
if bounds["lower"] <= amount <= bounds["upper"]: | |
count_in_amount_group[amount_group] += 1 | |
break | |
print "count of samples in each amount group:" | |
for i, amount_group in enumerate(amount_group_order): | |
print " {:>18} => {:>10} ({:.3f}% of sample)".format(\ | |
amount_group, \ | |
count_in_amount_group[amount_group], \ | |
(float(count_in_amount_group[amount_group]) / float(len(sample)) * 100.0) \ | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment