Created March 14, 2013 16:08
a hack i've been considering for compressing log files: sort them first.
time gzip -c <orig >orig.gz
real	0m0.311s
user	0m0.289s
sys	0m0.005s
time bzip2 -c <orig >orig.bz2
real	0m3.358s
user	0m2.981s
sys	0m0.367s
time sort orig | gzip -c >orig.sorted.gz
real	0m2.358s
user	0m2.345s
sys	0m0.047s
time sort orig | bzip2 -c >orig.sorted.bz2
real	0m6.839s
user	0m6.823s
sys	0m0.046s
du -sh orig*
4.3M	orig
1.4M	orig.gz
1010K	orig.bz2
667K	orig.sorted.gz
596K	orig.sorted.bz2
# to reconstruct the file, we need a permutation
# that tells us how to get back the unsorted log file.
# in this case, there aren't many lines, but even so,
# indexing 201100 lines takes 18 bits (~2.25 bytes) per line,
# about 450K, which eats up most of the benefit!
wc -l orig
201100 orig
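To make the permutation concrete, here's one way to carry it along with plain `nl`/`sort`/`cut`/`paste` (a sketch; the intermediate file names are made up, and `orig` is the log file from above):

```shell
# tag each line with its original line number (nl uses a tab separator)
nl -ba -w1 orig > tagged
# sort by content (field 2 onward); column 1 is now the permutation
sort -t "$(printf '\t')" -k2 tagged > sorted_tagged
cut -f1  sorted_tagged > perm          # the per-line index we'd have to store
cut -f2- sorted_tagged > orig.sorted   # the part that actually compresses well
# reconstruct: reattach the indices, sort numerically on them, drop them
paste perm orig.sorted | sort -n -k1,1 | cut -f2- > reconstructed
cmp orig reconstructed && echo "roundtrip ok"
```

`perm` here is one decimal number per line, so it's even bigger than a packed binary index; it's only meant to show where the reconstruction cost comes from.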
# how about a bigger file:
wc -l big
48244770 big
# 2^26 = 67,108,864 > 48,244,770
# 26 * 48244770 / 8 bytes = ~150M
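A quick sanity check on that arithmetic (POSIX awk only, nothing from the gist assumed) — it comes out close to 150M in binary units:

```shell
# ceil(log2(n)) bits per line index, and the total size of the permutation
awk 'BEGIN {
  n = 48244770                          # lines in big
  bits = 1; while (2^bits < n) bits++   # smallest b with 2^b >= n
  printf "%d bits/line, %.0f MiB total\n", bits, bits * n / 8 / 1048576
}'
# -> 26 bits/line, 150 MiB total
```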
time sort -o big.sorted -S 64M big
# note that sorting takes by far the longest,
# except for maybe bzip2 -9
8.7G	big
817M	big.bz2
1.1G	big.gz
1.1G	big.gz9
8.7G	big.sorted
722M	big.sorted.bz2
911M	big.sorted.gz
894M	big.sorted.gz9
# big.sorted.gz + ~150M index = ~1.1G == big.gz
# ... tl;dr we didn't save any space and we wasted a lot of time
# TODO look at better ways of compressing similar lines:
# e.g. cluster the input lines, then delta-encode each
# cluster's elements against its centroid
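A cheap stand-in for the centroid idea is front coding: delta-encode each sorted line against the *previous* line (not a centroid) by storing only the shared-prefix length and the differing suffix. A sketch in awk, reusing `orig` from above (the output file name is made up):

```shell
# replace each sorted line with: <shared-prefix-length> TAB <suffix vs. previous line>
sort orig | awk '{
  p = 0
  while (p < length(prev) && p < length($0) &&
         substr(prev, p + 1, 1) == substr($0, p + 1, 1)) p++
  printf "%d\t%s\n", p, substr($0, p + 1)
  prev = $0
}' | gzip -c > orig.sorted.fc.gz
```

Decoding walks the stream keeping the previous decoded line and splicing prefix + suffix back together; the permutation problem from above still applies unchanged.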