Last active
May 20, 2019 13:49
-
-
Save agaszmurlo/ae0dc1d629bd34fbb95ddef6d8a88aab to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Start an interactive Spark 2.4.0 shell on YARN in client deploy mode with the
# SeQuiLa package (org.biodatageeks:bdg-sequila) resolved from the two Nexus
# repositories listed under --repositories.
#   - spark.sql.catalogImplementation=in-memory : use the in-memory catalog
#   - spark.jars.ivy                            : Ivy cache/repository directory
#   - spark.hadoop.yarn.timeline-service.enabled=false : turn off the YARN
#     timeline-service setting for this session
cd /data/local/opt/spark-2.4.0-bin-hadoop2.7/bin
./spark-shell -v \
  --master=yarn \
  --deploy-mode=client \
  --num-executors=60 \
  --executor-memory=4g \
  --driver-memory=12g \
  --conf spark.sql.catalogImplementation=in-memory \
  --conf spark.jars.ivy=/data/local/cache/ivy2/repository \
  --conf spark.hadoop.yarn.timeline-service.enabled=false \
  --repositories http://zsibio.ii.pw.edu.pl/nexus/repository/maven-releases/,http://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/ \
  --packages org.biodatageeks:bdg-sequila_2.11:0.5.5-spark-2.4.2-SNAPSHOT
// ---------------------------------------------------------------------------
// spark-shell session: compute read coverage over BAM files with SeQuiLa.
// Paste into the shell started by the spark-shell command above; `sc` and
// `spark` are the SparkContext / SparkSession the shell provides.
// ---------------------------------------------------------------------------

// Keep the console readable: only WARN and above.
sc.setLogLevel("WARN")

import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}

// Wrap the shell's SparkSession and register SeQuiLa on it; the bdg_coverage
// table function used below is presumably made available by this call.
val ss = SequilaSession(spark)
SequilaRegister.register(ss)

// BAM reader settings: enable the GKL inflater, disable the spark-bam reader.
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")

// Expose every BAM file matching the glob as the SQL table `qreads`.
ss.sql("""
CREATE TABLE IF NOT EXISTS qreads
USING org.biodatageeks.datasources.BAM.BAMDataSource
OPTIONS(path '/igap/dev/rel5-guppy*.bam')""")
// /data/granges/nanopore/guppy.bam

// Coverage for sample 'rel5-guppy-0' in 'blocks' mode (rows carry start and end).
val cov = ss.sql("SELECT contigName, start, end, coverage FROM bdg_coverage('qreads','rel5-guppy-0', 'blocks')")
// count forces the whole computation; spark.time prints the elapsed time.
spark.time{cov.count}
//spark.time{cov.write.mode("overwrite").option("delimiter", "\t").csv("/igap/dev/guppy_cov.bed")}
//spark.time{cov.coalesce(1).write.mode("overwrite").option("delimiter", "\t").csv("/data/granges/nanopore/guppy_cov.bed")}

// Turn on ShowAllPositions before the per-base query (presumably makes the
// result include positions that would otherwise be omitted - confirm in the
// SeQuiLa docs). The setting must go through `ss`: no value named `session`
// exists in this script, so the former `session.sqlContext.setConf(...)` line
// failed with "not found: value session" and has been removed.
ss.sqlContext.setConf(BDGInternalParams.ShowAllPositions, "true")

// Per-base coverage ('bases' mode: one start position per row).
// NOTE(review): the sample id here is 'NA12878', while the table is built from
// rel5-guppy*.bam and the query above uses 'rel5-guppy-0' - confirm that
// 'NA12878' is really the intended sample.
val cov_all = ss.sql("SELECT contigName, start, coverage FROM bdg_coverage('qreads','NA12878', 'bases')")
// coalesce(1) yields a single tab-delimited output part file.
spark.time{cov_all.coalesce(1).write.mode("overwrite").option("delimiter", "\t").csv("/data/granges/nanopore/guppy_cov_all.bed")}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment