Skip to content

Instantly share code, notes, and snippets.

@dhaninugraha
Last active December 13, 2023 13:11
Show Gist options
  • Save dhaninugraha/9656d653cff02667f6e407637474e4fb to your computer and use it in GitHub Desktop.
Save dhaninugraha/9656d653cff02667f6e407637474e4fb to your computer and use it in GitHub Desktop.
python - reservoir sampling
import random
import time
mode = "percentage"
shuffle = True
sampling_percentage = 20
sampling_count = 1000
start_data_gen = time.time()
data = [i for i in xrange(20000, 15000001)] # 20.000 - 15.000.000
end_data_gen = time.time()
print "data generated in {:.1f} seconds".format(end_data_gen - start_data_gen)
if shuffle:
print "shuffling the data..."
random.shuffle(data)
if mode == "percentage":
no_of_samples = int(float(sampling_percentage) / float(100) * float(len(data)))
elif mode == "count":
no_of_samples = sampling_count
sample = []
sample_replace_idx = [j for j in xrange(no_of_samples)]
amount_groups = { \
"tens_of_thou": {"lower": 10000, "upper": 99999}, \
"hundreds_of_thou": {"lower": 100000, "upper": 999999}, \
"millions": {"lower": 1000000, "upper": 9999999}, \
"tens_of_millions": {"lower": 10000000, "upper": 99999999}
}
amount_group_order = ["tens_of_thou", "hundreds_of_thou", "millions", "tens_of_millions"]
count_in_amount_group = {k: 0 for k, v in amount_groups.iteritems()}
print "# of data: {}\nbegin sampling...".format(len(data))
start_sampling = time.time()
for idx, d in enumerate(data):
if len(sample) < no_of_samples:
sample.append(d)
else:
prob = float(no_of_samples) / float(idx + 1)
if random.random() < prob:
replace_at = random.choice(sample_replace_idx)
sample[replace_at] = d
end_sampling = time.time()
print "finished sampling in {:.1f} seconds; # of samples: {} ({:.3f}% of total data)".format(end_sampling - start_sampling, len(sample), (float(len(sample)) / float(len(data)) * 100.0))
for amount in sample:
for amount_group, bounds in amount_groups.iteritems():
if bounds["lower"] <= amount <= bounds["upper"]:
count_in_amount_group[amount_group] += 1
break
print "count of samples in each amount group:"
for i, amount_group in enumerate(amount_group_order):
print " {:>18} => {:>10} ({:.3f}% of sample)".format(\
amount_group, \
count_in_amount_group[amount_group], \
(float(count_in_amount_group[amount_group]) / float(len(sample)) * 100.0) \
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment