Last active
December 3, 2017 19:19
-
-
Save s-j/2303b1233b8a932f6c3803c60ce7f372 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import net.agkn.hll.*; | |
import com.clearspring.analytics.stream.cardinality.*; | |
/** | |
* A brief comparison of the cardinality estimator implementations | |
* from Clearspring and Agregated Knowledge. | |
**/ | |
public final class CardinalityBench { | |
private CardinalityBench() { } | |
private static boolean enablePrint = false; | |
private static int[] testBaseline(List<String> lines) { | |
if (enablePrint) { | |
println(lines.size() + " lines"); | |
} | |
long start = System.currentTimeMillis(); | |
Set<String> unique = Sets.newHashSet(); | |
for (String line : lines) { | |
unique.add(line); | |
} | |
long stop = System.currentTimeMillis(); | |
if (enablePrint) { | |
println(unique.size() + " keys (" + (stop - start) + "ms)"); | |
} | |
start = System.currentTimeMillis(); | |
Set<Long> uniqueHashes = Sets.newHashSet(); | |
for (String line : lines) { | |
// Using the 32bit hash function provided with the implementation, | |
// performs better then murmur3 from Guava (both time and precision). | |
uniqueHashes.add(MurmurHash.hash64(line)); | |
} | |
stop = System.currentTimeMillis(); | |
if (enablePrint) { | |
println(uniqueHashes.size() + " hashed keys (" + (stop - start) + "ms)"); | |
} | |
return new int[] {unique.size(), uniqueHashes.size()}; | |
} | |
private static int[] getCardinalityAndSize( | |
ICardinality estimator, List<String> lines) throws IOException { | |
long start = System.currentTimeMillis(); | |
for (String line : lines) { | |
estimator.offer(line); | |
} | |
long stop = System.currentTimeMillis(); | |
int bytes = estimator.getBytes().length; | |
return new int[]{(int) estimator.cardinality(), bytes, (int) (stop - start)}; | |
} | |
private static void printHeader() { | |
if (enablePrint) { | |
println(String.format("%s\t%s\t%s\t%s", "name", "error", "size", "time")); | |
} | |
} | |
private static void print(String name, int res, int base, int size, int time) { | |
if (enablePrint) { | |
println(String.format("%s\t%.4f\t%sB\t%sms", | |
name, ((double) (res - base) / base), size, time)); | |
} | |
} | |
// Test linear counting tuned to support 10 mil unique items with 1% error. | |
private static void testLinearCounting(List<String> lines, int baseline) | |
throws IOException { | |
int[] result = getCardinalityAndSize( | |
new LinearCounting.Builder().withError(0.01, 10000000).build(), lines); | |
print("lincnt", result[0], baseline, result[1], result[2]); | |
} | |
// Tune the following three methods to 1% relative standard error. | |
// Estimation is based on the assumption m = (beta / error) ^ 2 | |
// Values for beta are taken from the Flajolet's 2007 paper | |
private static void testLogLog(List<String> lines, int baseline) | |
throws IOException { | |
double rse = 0.01; | |
double beta = 1.3; | |
// Use 1.3 instead of 1.04 since the implementation | |
// does not throw the 30% of smallest values | |
int log2m = (int) (Math.log(Math.pow((beta / rse), 2)) / Math.log(2)); | |
int[] result = getCardinalityAndSize(new LogLog(log2m), lines); | |
print("ll__" + log2m, result[0], baseline, result[1], result[2]); | |
} | |
private static void testHyperLogLog(List<String> lines, int baseline) | |
throws IOException { | |
double rsd = 0.01; | |
double beta = 1.106; | |
int log2m = (int) (Math.log(Math.pow(beta / rsd, 2)) / Math.log(2)); | |
int[] result = getCardinalityAndSize(new HyperLogLog(log2m), lines); | |
print("hll_" + log2m, result[0], baseline, result[1], result[2]); | |
} | |
private static void testHyperLogLogPlus(List<String> lines, int baseline) | |
throws IOException { | |
double rsd = 0.01; | |
double beta = 1.04; | |
int log2m = (int) (Math.log(Math.pow(beta / rsd, 2)) / Math.log(2)); | |
int[] result = getCardinalityAndSize(new HyperLogLogPlus(log2m, 32), lines); | |
print("hlp" + log2m, result[0], baseline, result[1], result[2]); | |
} | |
private static void testHyperLogLogPlus2(List<String> lines, int baseline) | |
throws IOException { | |
for (int log2m = 10; log2m <= 14; log2m++) { | |
int[] result = getCardinalityAndSize(new HyperLogLogPlus(log2m, 32), lines); | |
print("hlp" + log2m, result[0], baseline, result[1], result[2]); | |
} | |
} | |
private static void testHLL(List<String> lines, int baseline, | |
int log2m, int r, int ethr, HLLType type) throws IOException { | |
final HLL hll = new HLL(log2m, r, ethr, false, type); | |
long start = System.currentTimeMillis(); | |
for (String line : lines) { | |
hll.addRaw(MurmurHash.hash64(line)); | |
} | |
long stop = System.currentTimeMillis(); | |
int cnt = (int) hll.cardinality(); | |
int size = hll.toBytes().length; | |
print("hlx_" + log2m + "_" + r + "_" + ethr + "_" + type, | |
cnt, baseline, size, (int)(stop - start)); | |
} | |
private static void testHLL(List<String> lines, int baseline) throws IOException { | |
for (int log2m = 10; log2m <= 14; log2m++) { | |
int r = 5; | |
testHLL(lines, baseline, log2m, r, -1, HLLType.FULL); | |
testHLL(lines, baseline, log2m, r, -1, HLLType.SPARSE); | |
} | |
} | |
private static void test(String type | |
throws IOException, CardinalityMergeException { | |
println("Testing " + type); | |
List<String> lines = Files.readLines( | |
new File("/home/simonj/testdata/" + type + ".txt"), Charsets.UTF_8); | |
int baseline = testBaseline(lines)[0]; | |
println(); | |
printHeader(); | |
testLinearCounting(lines, baseline); | |
testLogLog(lines, baseline); | |
testHyperLogLog(lines, baseline); | |
testHyperLogLogPlus(lines, baseline); | |
println(); | |
testHyperLogLogPlus2(lines, baseline); | |
println(); | |
testHLL(lines, baseline); | |
println(); | |
//test64(lines); | |
} | |
public static void main(String[] args) | |
throws IOException,CardinalityMergeException { | |
for (int i = 9; i < 10; i++) { | |
if (i < 9) { | |
print(i + ","); | |
} else { | |
enablePrint = true; | |
} | |
test("datasetA"); | |
test("datasetB"); | |
} | |
} | |
// Produces the same result as the original! | |
public static int getHLLP(List<String> lines) | |
throws IOException, CardinalityMergeException { | |
HyperLogLogPlus[] estimators = new HyperLogLogPlus[1]; | |
for (int i = 0; i < estimators.length; i++) { | |
estimators[i] = new HyperLogLogPlus(13); | |
} | |
for (String line : lines) { | |
long value = Hashing.murmur3_128().hashUnencodedChars(line).asLong(); | |
estimators[(int) (value & (estimators.length - 1))].offer(line); | |
} | |
return (int) estimators[0].merge(estimators).cardinality(); | |
} | |
// Produces the same result as the original! | |
public static int getHLLP64(List<String> lines) | |
throws IOException, CardinalityMergeException { | |
HyperLogLogPlus[] estimators = new HyperLogLogPlus[64]; | |
for (int i = 0; i < estimators.length; i++) { | |
estimators[i] = new HyperLogLogPlus(13); | |
} | |
for (String line : lines) { | |
long value = Hashing.murmur3_128().hashUnencodedChars(line).asLong(); | |
estimators[(int) (value & (estimators.length - 1))].offer(line); | |
} | |
return (int) estimators[0].merge(estimators).cardinality(); | |
} | |
// Produces the same result as the original! | |
public static int getHLLP64v2(List<String> lines) | |
throws IOException, CardinalityMergeException { | |
HyperLogLogPlus[] estimators = new HyperLogLogPlus[64]; | |
for (int i = 0; i < estimators.length; i++) { | |
estimators[i] = new HyperLogLogPlus(13); | |
} | |
for (String line : lines) { | |
long value = Hashing.murmur3_128().hashUnencodedChars(line).asLong(); | |
estimators[(int)(Math.random() * estimators.length)].offer(line); | |
estimators[13].offer(line); | |
} | |
return (int) estimators[0].merge(estimators).cardinality(); | |
} | |
private static void test64(List<String> lines) | |
throws IOException, CardinalityMergeException { | |
println(getHLLP(lines)); | |
println(getHLLP64(lines)); | |
println(getHLLP64v2(lines)); | |
} | |
private static<T> void println(T arg) { | |
System.out.println(arg); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment