Last active
September 25, 2019 09:25
-
-
Save agaszmurlo/995ba53a9fd874495d3635a88db0475f to your computer and use it in GitHub Desktop.
oap varia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
unset SPARK_HOME | |
cd /data/local/opt/spark-2.4.3-bin-hadoop2.7 | |
./bin/spark-shell -v --master yarn-client --num-executors 20 --driver-memory 2g --executor-memory 2g \ | |
--conf spark.hadoop.yarn.timeline-service.enabled=false \ | |
--conf spark.hadoop.hive.metastore.uris=thrift://cdh01.cl.ii.pw.edu.pl:9083 \ | |
--conf spark.hadoop.yarn.timeline-service.enabled=false \ | |
--conf spark.driver.extraJavaOptions=-Dhdp.version=3.1.0.0-78 \ | |
--conf spark.yarn.am.extraJavaOptions=-Dhdp.version=3.1.0.0-78 \ | |
--conf spark.hadoop.metastore.catalog.default=hive | |
sql ("use igap_dev") | |
sql( | |
s""" | |
| CREATE TABLE etl_genotypes_ac_oap ( | |
| `project` string, | |
| `chr` string, | |
| `pos` int, | |
| `posend` int, | |
| `ref` string, | |
| `alt` string, | |
| `gt` string, | |
| `ad` int, | |
| `dp` int, | |
| `gq` int, | |
| `pl` int, | |
| `sample_id` string, | |
| `al_cnt` int) | |
| USING oap | |
| OPTIONS (path 'hdfs:///oap-data-dir/') | |
""".stripMargin) | |
sql ("insert overwrite table etl_genotypes_ac_oap select * from etg_genotypes_ac_ext") | |
sql( | |
s""" | |
| CREATE TEMPORARY TABLE etl_genotypes_ac_oap_af ( | |
| `chr` string, | |
| `pos` int, | |
| `posend` int, | |
| `ref` string, | |
| `alt` string, | |
| `af` double) | |
| USING oap | |
| OPTIONS (path 'hdfs:///oap-data-dir/af') | |
""".stripMargin) | |
sql ( | |
s""" | |
|INSERT OVERWRITE TABLE etl_genotypes_ac_oap_af | |
| SELECT chr, pos, posend, ref, alt, sum(al_cnt)/(SELECT count (distinct sample_id) from etg_genotypes_ac_ext) as af | |
| FROM etg_genotypes_ac_ext | |
| GROUP BY chr, pos, posend, ref, alt | |
""".stripMargin) | |
// sql( | |
// s""" | |
// | SELECT chr, pos, posend, ref, alt, sum(al_cnt)/(SELECT count (distinct sample_id) from etg_genotypes_ac_ext) as af | |
// | FROM etg_genotypes_ac_ext | |
// | GROUP BY chr, pos, posend, ref, alt | |
// """) | |
sql("select * from etl_genotypes_ac_oap_af").show | |
sql("create oindex index1 on etl_genotypes_ac_oap_af(chr)") | |
sql("show oindex from etl_genotypes_ac_oap_af") | |
spark.time(sql ("select sum(pos) from etl_genotypes_ac_oap_af where chr='1' OR chr='5'").show) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment