Last active
May 12, 2021 14:23
-
-
Save phstudy/545b43bc3b944ecd06685238913f3c5e to your computer and use it in GitHub Desktop.
hive with data-sketches
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://datasketches.github.io/docs/Theta/ThetaHiveUDFs.html | |
# sudo -u hive hadoop fs -mkdir /user/hive/auxjars | |
# sudo -u hive hadoop fs -copyFromLocal datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar /user/hive/auxjars/ | |
CREATE FUNCTION data2sketch as 'com.yahoo.sketches.hive.theta.DataToSketchUDAF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar'; | |
CREATE FUNCTION unionSketches as 'com.yahoo.sketches.hive.theta.UnionSketchUDAF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar'; | |
CREATE FUNCTION estimate as 'com.yahoo.sketches.hive.theta.EstimateSketchUDF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar'; | |
CREATE FUNCTION union2 as 'com.yahoo.sketches.hive.theta.UnionSketchUDF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar'; | |
CREATE FUNCTION intersect2 as 'com.yahoo.sketches.hive.theta.IntersectSketchUDF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar'; | |
CREATE FUNCTION anotb as 'com.yahoo.sketches.hive.theta.ExcludeSketchUDF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar'; | |
# Building sketches, merging sketches and getting estimates | |
create temporary table theta_input (id int, category char(1)); | |
insert into table theta_input values | |
(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), | |
(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b'); | |
create temporary table sketch_intermediate (category char(1), sketch binary); | |
insert into sketch_intermediate select category, data2sketch(id) from theta_input group by category; | |
select category, estimate(sketch) from sketch_intermediate; | |
select estimate(unionSketches(sketch)) from sketch_intermediate; | |
# Set operations | |
create temporary table sketch_input (id1 int, id2 int); | |
insert into table sketch_input values | |
(1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20); | |
create temporary table sketch_intermediate2 (sketch1 binary, sketch2 binary); | |
insert into sketch_intermediate2 select data2sketch(id1), data2sketch(id2) from sketch_input; | |
select | |
estimate(sketch1), | |
estimate(sketch2), | |
estimate(union2(sketch1, sketch2)), | |
estimate((intersect2(sketch1, sketch2))), | |
estimate(anotb(sketch1, sketch2)), | |
estimate(anotb(sketch2, sketch1)) | |
from sketch_intermediate2; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment