Last active
May 7, 2019 21:14
-
-
Save agaszmurlo/1fb02444af41f39975cf82555bc24cf2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Launch command for this benchmark. Run it from a terminal, not inside the Scala REPL;
// it is kept as a comment so that the rest of this file is valid Scala.
// `$cores` is a shell variable and must be set to the number of local cores to use.
//
// spark-shell -v --master=local[$cores] --driver-memory=12g --conf "spark.sql.catalogImplementation=in-memory" --packages org.biodatageeks:bdg-sequila_2.11:0.5.3-spark-2.4.0-SNAPSHOT --repositories http://repo.hortonworks.com/content/repositories/releases/,http://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/
import org.apache.spark.sql.SequilaSession | |
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams} | |
// Wrap the shell-provided SparkSession in a SequilaSession and register SeQuiLa on it.
// NOTE(review): registration is presumably what makes `bdg_coverage` available — confirm.
val ss = SequilaSession(spark)
SequilaRegister.register(ss)

// BAM reader settings used for this benchmark: GKL inflate on, SparkBAM off.
Seq(
  "spark.biodatageeks.bam.useGKLInflate" -> "true",
  "spark.biodatageeks.bam.useSparkBAM" -> "false"
).foreach { case (key, value) => ss.sqlContext.setConf(key, value) }
/*
 * Create the two BAM-backed tables queried below: `reads_exome` and `reads_wgs`.
 * `IF NOT EXISTS` means an already existing table is left as it is.
 * NOTE(review): the WGS BAM path also lives under /data/granges/exome/ — confirm that is intended.
 */
Seq(
  """
CREATE TABLE IF NOT EXISTS reads_exome
USING org.biodatageeks.datasources.BAM.BAMDataSource
OPTIONS(path '/data/granges/exome/NA12878.proper.wes.bam')""",
  """
CREATE TABLE IF NOT EXISTS reads_wgs
USING org.biodatageeks.datasources.BAM.BAMDataSource
OPTIONS(path '/data/granges/exome/NA12878.proper.wgs.bam')"""
).foreach(ddl => ss.sql(ddl))
/*
 * 'bases' mode: time a full count of the coverage result for each table.
 * Repeat for cores = 1 ONLY, because this mode is compared against samtools only.
 */
spark.time {
  ss.sql("SELECT * FROM bdg_coverage('reads_exome','NA12878', 'bases')").count()
}
spark.time {
  ss.sql("SELECT * FROM bdg_coverage('reads_wgs','NA12878', 'bases')").count()
}
/*
 * 'blocks' mode: time computing the coverage and storing it as tab-delimited (BED-like) output.
 * Repeat for cores = 1, 5, 10.
 *
 * Each table is written to its own output path. Previously both runs wrote to
 * /data/granges/exome/coverage.bed with mode("overwrite"), so the WGS run silently
 * replaced the exome result.
 *
 * coalesce(1) reduces the result to a single partition before writing. Note that
 * `csv(path)` creates a directory at `path` containing the part file(s), not a single
 * plain file named coverage.bed.
 */
spark.time {
  ss.sql("SELECT * FROM bdg_coverage('reads_exome','NA12878', 'blocks')")
    .coalesce(1)
    .write.mode("overwrite").option("delimiter", "\t").csv("/data/granges/exome/coverage.bed")
}
spark.time {
  ss.sql("SELECT * FROM bdg_coverage('reads_wgs','NA12878', 'blocks')")
    .coalesce(1)
    .write.mode("overwrite").option("delimiter", "\t").csv("/data/granges/exome/coverage_wgs.bed")
}
/*** NANOPORE **/
// Nanopore coverage benchmark. The imports and session setup are repeated here, so this
// section can presumably be pasted into a fresh spark-shell on its own — TODO confirm.
sc.setLogLevel("WARN")
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
// Set on `ss`: the original line used `session`, which is not defined anywhere in this
// script, so the statement failed and the setting was never applied.
ss.sqlContext.setConf(BDGInternalParams.ShowAllPositions, "true")
//ss.sqlContext.setConf("spark.biodatageeks.readAligment.method", "disq")
// Create the BAM-backed table `qreads` over the nanopore file guppy.bam.
// `IF NOT EXISTS` means an already existing `qreads` table is left as it is.
ss.sql(
  """
    |CREATE TABLE IF NOT EXISTS qreads
    |USING org.biodatageeks.datasources.BAM.BAMDataSource
    |OPTIONS(path '/data/granges/nanopore/guppy.bam')""".stripMargin
)
// ss.sql(""" | |
// CREATE TABLE IF NOT EXISTS qreads | |
// USING org.biodatageeks.datasources.BAM.BAMDataSource | |
// OPTIONS(path 'file://data/work/nanopore_bam/NA12878.chr21.quality.bam')""") | |
//ss.sql("select * from qreads limit 10").show | |
//spark.time{ ss.sql(s"SELECT contigName, start, coverage FROM bdg_coverage('qreads','NA12878', 'bases')").count } | |
// 'bases'-mode coverage over `qreads`, restricted to the three columns that are written out.
val cov = ss.sql("SELECT contigName, start, coverage FROM bdg_coverage('qreads','NA12878', 'bases')")

// Time the write of the result as tab-delimited output through a single partition.
// An existing output at the target path is overwritten.
spark.time {
  cov
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("delimiter", "\t")
    .csv("/data/granges/nanopore/guppy_cov.bed")
}
mwiewior
commented
Apr 13, 2019
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment