dhaninugraha · December 13, 2023 13:11
diff --git a/app.py b/app.py
 import random
 import time

 mode = "percentage"
 shuffle = True
 sampling_percentage = 20
 sampling_count = 1000

 start_data_gen = time.time()
 data = [i for i in xrange(20000, 15000001)] # 20.000 - 15.000.000
 end_data_gen = time.time()
 print "data generated in {:.1f} seconds".format(end_data_gen - start_data_gen)

 if shuffle:
 	print "shuffling the data..."
 	random.shuffle(data)

 if mode == "percentage":
 	no_of_samples = int(float(sampling_percentage) / float(100) * float(len(data)))
 elif mode == "count":
 	no_of_samples = sampling_count

 sample = []
 sample_replace_idx = [j for j in xrange(no_of_samples)]

 amount_groups = { \
 	"tens_of_thou": {"lower": 10000, "upper": 99999}, \
 	"hundreds_of_thou": {"lower": 100000, "upper": 999999}, \
 	"millions": {"lower": 1000000, "upper": 9999999}, \
 	"tens_of_millions": {"lower": 10000000, "upper": 99999999}
 	}
 amount_group_order = ["tens_of_thou", "hundreds_of_thou", "millions", "tens_of_millions"]
 count_in_amount_group = {k: 0 for k, v in amount_groups.iteritems()}

 print "# of data: {}\nbegin sampling...".format(len(data))
 start_sampling = time.time()
 for idx, d in enumerate(data):
 	if len(sample) < no_of_samples:
 		sample.append(d)
 	else:
 		prob = float(no_of_samples) / float(idx + 1)
 		if random.random() < prob:
 			replace_at = random.choice(sample_replace_idx)
 			sample[replace_at] = d
 end_sampling = time.time()
 print "finished sampling in {:.1f} seconds; # of samples: {} ({:.3f}% of total data)".format(end_sampling - start_sampling, len(sample), (float(len(sample)) / float(len(data)) * 100.0))

 for amount in sample:
 	for amount_group, bounds in amount_groups.iteritems():
 		if bounds["lower"] <= amount <= bounds["upper"]:
 			count_in_amount_group[amount_group] += 1
 			break

 print "count of samples in each amount group:"
 for i, amount_group in enumerate(amount_group_order):
 	print "  {:>18} => {:>10} ({:.3f}% of sample)".format(\
 		amount_group, \
 		count_in_amount_group[amount_group], \
 		(float(count_in_amount_group[amount_group]) / float(len(sample)) * 100.0) \
 		)
	import random
	import time

	mode = "percentage"
	shuffle = True
	sampling_percentage = 20
	sampling_count = 1000

	start_data_gen = time.time()
	data = [i for i in xrange(20000, 15000001)] # 20.000 - 15.000.000
	end_data_gen = time.time()
	print "data generated in {:.1f} seconds".format(end_data_gen - start_data_gen)

	if shuffle:
	print "shuffling the data..."
	random.shuffle(data)

	if mode == "percentage":
	no_of_samples = int(float(sampling_percentage) / float(100) * float(len(data)))
	elif mode == "count":
	no_of_samples = sampling_count

	sample = []
	sample_replace_idx = [j for j in xrange(no_of_samples)]

	amount_groups = { \
	"tens_of_thou": {"lower": 10000, "upper": 99999}, \
	"hundreds_of_thou": {"lower": 100000, "upper": 999999}, \
	"millions": {"lower": 1000000, "upper": 9999999}, \
	"tens_of_millions": {"lower": 10000000, "upper": 99999999}
	}
	amount_group_order = ["tens_of_thou", "hundreds_of_thou", "millions", "tens_of_millions"]
	count_in_amount_group = {k: 0 for k, v in amount_groups.iteritems()}

	print "# of data: {}\nbegin sampling...".format(len(data))
	start_sampling = time.time()
	for idx, d in enumerate(data):
	if len(sample) < no_of_samples:
	sample.append(d)
	else:
	prob = float(no_of_samples) / float(idx + 1)
	if random.random() < prob:
	replace_at = random.choice(sample_replace_idx)
	sample[replace_at] = d
	end_sampling = time.time()
	print "finished sampling in {:.1f} seconds; # of samples: {} ({:.3f}% of total data)".format(end_sampling - start_sampling, len(sample), (float(len(sample)) / float(len(data)) * 100.0))

	for amount in sample:
	for amount_group, bounds in amount_groups.iteritems():
	if bounds["lower"] <= amount <= bounds["upper"]:
	count_in_amount_group[amount_group] += 1
	break

	print "count of samples in each amount group:"
	for i, amount_group in enumerate(amount_group_order):
	print " {:>18} => {:>10} ({:.3f}% of sample)".format(\
	amount_group, \
	count_in_amount_group[amount_group], \
	(float(count_in_amount_group[amount_group]) / float(len(sample)) * 100.0) \
	)