@mocksu
Created August 1, 2023 15:23
Python script which has thread-safety issues
#!/usr/bin/env python
import os
import sys
import pandas as pd
import numpy as np
import argparse
from argparse import RawTextHelpFormatter
import util
import re
import glob
import causal
import threading
desc = "This script runs coloc only (no locuszoom, LDMAP, 2smr)"
epi = "Author: Mousheng Xu; Date: July 31, 2023"
parser = argparse.ArgumentParser(prog='~',
                                 description=desc,
                                 epilog=epi,
                                 formatter_class=RawTextHelpFormatter)
parser.add_argument("--name1", help="the name of the dataset1. Default is <dataset1>")
parser.add_argument("--name2", help="the name of the dataset2. Default is <dataset2>")
parser.add_argument("-p", "--pval", type=float, help="the p-value threshold for trait1. Default=%(default)s", default=5e-8)
parser.add_argument("-s", "--script", help="only generate R scripts without running them", action="store_true")
parser.add_argument("-R", "--region", help="region of interest, e.g. '12:111741666-112741866'")
parser.add_argument("-d", "--debug", action="store_true", help="debug mode")
parser.add_argument("-t", "--top", action="store_true", help="Generate an html ancor ID and a href to go back to top. This only makes sense when the html file generated by this script is used in a bigger scope like by 'mrsum2clc.py' which has specified <a href='#top'>")
parser.add_argument("-O", "--no_override", action="store_true", help="If the found studies or output files exist for a pair, do not refind the studies or rerun the analysis.")
parser.add_argument("--tmp", type=str, help="the tmp filder, default is %(default)s", default="/tmp3/clc")
parser.add_argument("dataset1", type=str, help="the input dataset1 file") # positional
parser.add_argument("dataset2", type=str, help="the input dataset2 file") # positional
parser.add_argument("ostem", type=str, help="output file prefix") # positional
args = parser.parse_args()
ds1 = args.dataset1
ds2 = args.dataset2
region = args.region
chr, minpos, maxpos = re.split(r"\D+", region)
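# e.g. re.split(r"\D+", "12:111741666-112741866") -> ['12', '111741666', '112741866']
# note: if --region is omitted, region is None and the split above raises a TypeError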
ostem = args.ostem
odir, ost, osuf = util.getDirStemSuffix(ostem + ".txt")
tmpdir = args.tmp
if not os.path.isdir(tmpdir):
    os.makedirs(tmpdir)
tostem = f"{tmpdir}/{ost}"
### global vars
tmpfiles = []
regfile = ostem + ".reg" # to save the region information
tmpfiles.append(regfile)
lock1 = threading.Lock()
lock2 = threading.Lock()
lock3 = threading.Lock()
lock4 = threading.Lock()
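# NOTE (editor): in this file lock1 guards the locuszoom R run (clc1pair), lock2 the
# ds1f cache writes, and lock3 the bgzip/tabix region setup; lock4 is defined but
# never acquired. Shared globals such as tmpfiles and the single regfile path are
# mutated without any lock, which is likely where the advertised thread-safety
# issues live. A minimal sketch of one possible fix (hypothetical helper, not used
# by the original code): serialize registration of temporary files behind a lock.
tmpfiles_lock = threading.Lock()
def addTmpFile(path):
    # list.append is atomic in CPython, but taking the lock also covers any future
    # read-modify-write on tmpfiles (e.g. deduplication before cleanup)
    with tmpfiles_lock:
        tmpfiles.append(path)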
sumcsv = ostem + ".clc.summary.csv"
if args.no_override:
    if os.path.isfile(sumcsv):
        print(f"Result file {sumcsv} exists, NOT re-running")
        exit(0)
    else:
        print(f"Result file {sumcsv} does NOT exist, RE-RUNNING")
if args.name1:
    name1 = args.name1
else:
    if os.path.isfile(ds1):
        d1, stm1, suf1 = util.getDirStemSuffix(ds1)
        name1 = stm1 + "." + suf1
    else:
        name1 = ds1
if args.name2:
    name2 = args.name2
else:
    if os.path.isfile(ds2):
        d2, stm2, suf2 = util.getDirStemSuffix(ds2)
        name2 = stm2 + "." + suf2
    else:
        name2 = ds2
# strip the ' from name1 & name2 (a quote would break the generated R code)
name1 = name1.replace("'", "")
name2 = name2.replace("'", "")
def writeNoResults(t1, t2, n1, n2, note, outf):
    with open(outf, "w+") as f:
        f.write("\t".join(["trait1id", "trait2id", "trait1", "trait2", "nsnps", "H0", "H1", "H2", "H3", "H4", "reg", "minp2", "NOTE", "htmlid"]) + "\n")
        f.write("\t".join([t1, t2, n1, n2, "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", note, "NA"]) + "\n")
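# Example (illustrative arguments): writeNoResults("gwas1.tsv", "ukb-b-14521",
#     "trait1", "trait2", "WARNING: empty input", "out.clc.summary.csv")
# writes the header row plus a single all-NA row that carries the NOTE.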
def getLocalDSRCode(fname, type, reg, dsn: "dataset number: 1 or 2"):
    print(f"getLocalDSRCode: fname = {fname}, reg = {reg}, dsn = {dsn}")
    fdir, fst, fsuf = util.getDirStemSuffix(fname)
    fstem = fst + "." + fsuf
    traitname = ""
    if dsn == "1":
        dfname = "df1"
        dsname = "ds1"
        if args.name1:
            traitname = args.name1
    elif dsn == "2":
        dfname = "df2"
        dsname = "ds2"
        if args.name2:
            traitname = args.name2
    else:
        print("Unknown dsn: " + str(dsn))
        sys.exit(1)
    if reg:  # region specified
        # get the minchr, minpos, maxpos
        minchr, minrange = reg.split(":")
        minpos, maxpos = minrange.split("-")
        # use tabix to get the regional data, otherwise reading the whole input might take a very long time
        #print(f"checking the file {fname} for pos and chr")
        if os.path.getsize(fname) == 0:
            print(f"ERROR 0: {fname} is empty")
            exit(1)
        df = pd.read_csv(fname, sep="\t", header=0, comment='#', nrows=2)
        if df.empty:
            print(f'ERROR 1: {fname} has no data row')
            exit(1)
        print(f"2 read {fname} to df, hope it has 'pos' & 'chr' columns")
        pos = str(df.columns.get_loc("pos") + 1)  # 1-based index
        chr = str(df.columns.get_loc("chr") + 1)
        lock3.acquire()
        print(f"3 read {fname} to df, it does have 'pos' & 'chr' columns")
        regname = f"{tostem}.reg.{dsn}"
        if not os.path.isfile(regname):
            util.run("echo '" + "\t".join(list(df.columns)) + "' > " + regname)
        del df
        # bgzip and tabix-index fname if it's not bgzipped yet
        if not fname.endswith(".gz"):
            fgz = fname + ".gz"
            util.run(f"bgzip -f {fname}")
            util.run(f"tabixFiles.py -s -o {fgz}")
        else:  # use it as is
            fgz = fname
        lock3.release()
        print(f"fname = {fname}, fgz = {fgz}")
        if dsn == "1":
            rcode = f"""
minchr = '{minchr}'
minpos = '{minpos}'
maxpos = '{maxpos}'
reg = '{reg}'
"""
        else:  # dsn == 2, no need to specify (minchr, minpos, maxpos, reg)
            rcode = ""
        rcode += f"""
tbxcmd = paste0('tabix -S 1 -f -b {pos} -e {pos} -s {chr} ', '{fgz} ', reg, ' >> {regname}') # tricky, don't modify here without careful thoughts
print(tbxcmd)
system(tbxcmd)
"""
        tmpfiles.append(regname)
        rcode += dfname + """ = read.table('""" + regname + """', sep=' ', header=T, comment='#')"""
    else:  # no region specified, get it from the highest peak
        rcode = dfname + """ = read.table('""" + fname + """', sep=' ', header=T, comment='#')"""
        if dsn == "1":
            rcode += """
minrow = subset(""" + dfname + """, pval == min(pval))
minchr = {chr}
minpos = {maxpos}
minpval = minrow[1, 'pval', 1]
if(minpos < 0) {
minpos = 0
}
maxpos = {maxpos}
reg = paste0(minchr, ":", minpos, "-", maxpos) # for plotlzm.py usage
"""
        else:  # dsn == 2, reg should be defined already
            rcode += ""  # no-op
    ### get the local dataset
    rcode += """
""" + dfname + """ = subset(""" + dfname + """, chr==minchr & pos > minpos & pos < maxpos)
# remove duplicated SNPs, keep the one with smallest p-value
library(dplyr)
""" + dfname + """ = """ + dfname + """ %>%
group_by(SNP) %>%
slice_min(n = 1, order_by = pval) %>%
ungroup
""" + dsname + """ = """ + dfname + """[1]
if("id" %in% colnames(""" + dfname + """)) {
""" + dsname + """$id = """ + dfname + """$id[1]
} else {
""" + dsname + """$id = '""" + fstem + """'
}
"""
    if traitname:
        traitstr = f'{dsname}$trait = "{traitname}"'
    else:  # trait name is not specified
        traitstr = f"""if("Phenotype" %in% colnames({dfname})) {{
{dsname}$trait = {dfname}$Phenotype[1]
}} else {{
{dsname}$trait = '{fstem}'
}}
"""
    rcode += traitstr + """
""" + dsname + """$N = """ + dfname + """$samplesize # sample size not needed for (at least cc) outcome
""" + dsname + """$beta = """ + dfname + """$beta
""" + dsname + """$varbeta = """ + dfname + """$se ** 2 # "varbeta is simply the square of the standard error of beta. you don't need the SD." (https://github.com/chr1swallace/coloc/issues/108)
""" + dsname + """$pval = """ + dfname + """$pval # needed for locuszoom
""" + dsname + """$ea = """ + dfname + """$effect_allele # needed for locuszoom
""" + dsname + """$oa = """ + dfname + """$other_allele # needed for locuszoom
""" + dsname + """$se = """ + dfname + """$se # needed for locuszoom
""" + dsname + """$type = '""" + type + """'
""" + dsname + """$MAF = """ + dfname + """$eaf
""" + dsname + """$snp = """ + dfname + """$SNP
""" + dsname + """$position = """ + dfname + """$pos
# remove duplicated SNPs, keep the one with smallest p-value. This is still necessary because ds1 does not remove duplicated SNPs with the same p-value
""" + dsname + """ = """ + dsname + """[!duplicated(""" + dsname + """$snp),]
### remove the column "SNP" if it exists
if("SNP" %in% colnames(""" + dsname + """)) {
""" + dsname + """ = select(""" + dsname + """, -SNP)
}
# remove rows with varbeta = 0
""" + dsname + """ = subset(""" + dsname + """, varbeta != 0)
# remove rows with MAF = 0 or 1
""" + dsname + """ = subset(""" + dsname + """, MAF != 0 & MAF != 1)
# make sure N is numeric
""" + dsname + """$N = as.numeric(""" + dsname + """$N)
"""
    return rcode
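# Example (hypothetical path): for a local tab-separated GWAS file,
#   rc1 = getLocalDSRCode("/data/gwas1.tsv", "quant", "12:111741666-112741866", "1")
# returns an R snippet that tabix-slices the region into df1 and builds ds1 from it;
# clc1pair() embeds that snippet in the generated .lz.R script.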
def getRemoteDSRCode(id, type, reg: "genomic region, e.g. '1:234-456'", dsn: "dataset number: 1 or 2"):
    dfname = "df" + str(dsn)
    dsname = "ds" + str(dsn)
    idhash = util.getHash(id, "getRemoteDSRCode", 12)
    if dsn == "1":
        rcode = """tops = tophits('""" + id + """')
# n p chr beta position se id rsid ea nea eaf trait
minchr = tops[1, 'chr', 1]
minpos = {minpos}
maxpos = {maxpos}
minpval = tops[1, 'p', 1]
if(minpos < 0) {
minpos = 0
}
reg = paste0(minchr, ':', minpos, '-', maxpos)
reg
"""
elif dsn == "2":
rcode = """reg = paste0(minchr, ':', minpos, '-', maxpos)\n"""
else:
print("dsn has to be either '1' or '2'")
rcode += dfname + """=ieugwasr::associations(reg, '""" + id + """')
### the MAF & sample size fixup are coded in "causal.fixRemoteDF"
### replace part of the following code in the future. leave it for now
# If all MAFs are missing, run the "getSNPAF" system script to get MAF
if(all(is.na(""" + dfname + """$eaf))) {
# Check if the file with the MAF data exists
regn = gsub(":", "_", reg)
diskfile = paste0('""" + tmpdir + """/""" + idhash + """', '.', regn, '.withMAF.csv')
if(file.exists(diskfile)) {
# If the file exists, load it into a data frame called `df`
""" + dfname + """ = read.table(diskfile, sep="\t", comment='#', header=T)
} else {
# If the file doesn't exist, save the data frame `df` to disk
disksave = paste0(diskfile, ".save")
write.table(""" + dfname + """, file = disksave, sep="\t", row.names=F, quote=F)
# Run "getSNPAF.py" to get the MAF
diskmaf = paste0(diskfile, ".maf")
cmd = paste0('getSNPAF.py ', disksave, ' chr rsid ', diskmaf)
print(cmd)
system(cmd)
# remove 'eaf'
disknoeaf = paste0(diskfile, ".noeaf")
cmd1 = paste0('rmColumns.pl ', diskmaf, ' eaf ', disknoeaf)
print(cmd1)
system(cmd1) # remove the existing eaf column
# rename MAF to eaf
cmd2 = paste0('renameFields.pl ', disknoeaf, ' MAF eaf ', diskfile)
print(cmd2)
system(cmd2) # rename MAF to eaf
# remove the tmp files
#cmd3 = paste0("rm -rf ", disksave, " ", diskmaf, " ", disknoeaf)
#print(cmd3)
#system(cmd3)
# Load the MAF added file back into the data frame `df`
""" + dfname + """ <- read.table(diskfile, header = TRUE, sep = '\t')
}
}
### read sample size from a local file if it does not exist
if(all(is.na(""" + dfname + """$n))) {
libfile = "/home/moxu/softspace/d2m/2smr/data/list/2smr.outcomes.samplesize.csv"
ssfile = read.table(libfile, sep="\t", header=T, comment='#')
idrow = subset(ssfile, id == '""" + id + """')
if(dim(idrow)[1] == 1) {
""" + dfname + """$n = idrow[1,3]
} # else don't specify
}
# else leave as is
# Remove SNPs without MAF, and SNPs without N
""" + dfname + """ <- """ + dfname + """[!is.na(""" + dfname + """$eaf), ]
""" + dfname + """ <- """ + dfname + """[!is.na(""" + dfname + """$n), ]
""" + dfname + """ <- """ + dfname + """[""" + dfname + """$eaf != 0 & """ + dfname + """$eaf != 1, ]
""" + dsname + """=""" + dfname + """[1]
""" + dsname + """$id = '""" + id + """'
""" + dsname + """$trait = """ + dfname + """$trait[1]
""" + dsname + """$N = """ + dfname + """$n
""" + dsname + """$beta = """ + dfname + """$beta
""" + dsname + """$varbeta = """ + dfname + """$se ** 2 # "varbeta is simply the square of the standard error of beta. you don't need the SD."
""" + dsname + """$pval = """ + dfname + """$p # needed for locuszoom
""" + dsname + """$ea = """ + dfname + """$ea # needed for locuszoom
""" + dsname + """$oa = """ + dfname + """$nea # needed for locuszoom
""" + dsname + """$se = """ + dfname + """$se # needed for locuszoom
""" + dsname + """$type = '""" + type + """'
""" + dsname + """$MAF = """ + dfname + """$eaf
""" + dsname + """$snp = """ + dfname + """$rsid
""" + dsname + """$position = """ + dfname + """$position
# remove duplicated SNPs, keep the one with smallest p-value. This is still necessary because ds1 does not remove duplicated SNPs with the same p-value
""" + dsname + """ = """ + dsname + """[!duplicated(""" + dsname + """$snp),]
# remove rows with varbeta = 0
""" + dsname + """ = subset(""" + dsname + """, varbeta != 0)
# remove rows with MAF = 0 or 1
""" + dsname + """ = subset(""" + dsname + """, MAF != 0 & MAF != 1)
# make sure N is numeric
""" + dsname + """$N = as.numeric(""" + dsname + """$N)
"""
    return rcode
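# Example (id taken from the sample row further below): for a remote OpenGWAS study,
#   rc2 = getRemoteDSRCode("ukb-b-14521", "cc", "12:111741666-112741866", "2")
# emits R that fetches the regional associations via ieugwasr::associations(),
# backfills missing MAF / sample size, and builds ds2 for coloc.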
def clc1pair(ds1f, trait1, ds2f, trait2, cost):
    print(f"clc1pair: ds1f = {ds1f}, trait1 = {trait1}, ds2f = {ds2f}, trait2 = {trait2}, cost = " + cost)
    rstclc = cost + ".summary.csv"
    rstmf = cost + ".merged.summary.csv"
    # Define the column names to check
    columns_to_check = {"beta", "eaf", "samplesize"}
    # Check if any of the column names are missing from df1 or df2
    if os.path.isfile(ds1f):
        if os.path.getsize(ds1f) == 0:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} is empty", ostem + ".clc.summary.csv")
            exit(1)
        df1 = pd.read_csv(ds1f, sep="\t", header=0, comment='#')
        if df1.empty:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} has no data row", ostem + ".clc.summary.csv")
            exit(1)
        if not columns_to_check.issubset(df1.columns):
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} missing one of {columns_to_check}", ostem + ".clc.summary.csv")
            exit(1)
    if os.path.isfile(ds2f):
        if os.path.getsize(ds2f) == 0:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} is empty", ostem + ".clc.summary.csv")
            exit(1)
        df2 = pd.read_csv(ds2f, sep="\t", header=0, comment='#')
        if df2.empty:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} has no data row", ostem + ".clc.summary.csv")
            exit(1)
        if not columns_to_check.issubset(df2.columns):
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} missing one of 'beta', 'eaf', or 'samplesize'", ostem + ".clc.summary.csv")
            exit(1)
    if os.path.isfile(ds1f):
        print(f'0 rc1 getLocalDSRCode for {ds1f}, args.region = {args.region}')
        rc1 = getLocalDSRCode(ds1f, "quant", args.region, "1")
        print(f'1 rc1 getLocalDSRCode for {ds1f}, args.region = {args.region}')
    else:
        print('rc1 getRemoteDSRCode')
        rc1 = getRemoteDSRCode(ds1f, "quant", args.region, "1")
    # if args.region:
    #     ds2reg = args.region
    # else:
    #     ds2reg = 'paste0(minchr, ":", minpos, "-", maxpos)'
    ds2reg = args.region
    print(f'ds2reg = {ds2reg}')
    if os.path.isfile(ds2f):
        print('rc2 getLocalDSRCode')
        rc2 = getLocalDSRCode(ds2f, "cc", ds2reg, "2")
    else:
        print('rc2 getRemoteDSRCode')
        rc2 = getRemoteDSRCode(ds2f, "cc", ds2reg, "2")
    rs4lz = open(cost + ".lz.R", "w+")
    rs4lz.write("""
library(coloc)
library(ieugwasr)
""" + rc1 + """
reg
write(reg, file='""" + regfile + """')
""" + rc2 + """
trait1id = ds1$id[1]
trait2id = ds2$id[1]
trait1 = ds1$trait[1]
trait2 = ds2$trait[1]
ds1$chromosome = minchr
ds2$chromosome = minchr
ds1 = ds1[order(ds1$position),] # locuszoom requires sorted coords
ds2 = ds2[order(ds2$position),]
""")
    cstem = cost + ".compare"  # compare result file prefix
    scstem = cost + ".compare.sigpval"  # compare (significant p-values) result file prefix
    rs4lz.write("""
# write datasets to files
regstr = gsub(":", "_", reg) # replace ':' with '_'
lzf1 = paste0('""" + ds1f + """', '.', regstr, '.lz.csv')
lzf2 = paste0('""" + ds2f + """', '.', regstr, '.lz.csv')
regf1 = paste0('""" + ds1f + """', '.', regstr, '.reg.csv')
regf2 = paste0('""" + ds2f + """', '.', regstr, '.reg.csv')
#if(!file.exists(lzf1)) {
write.table(ds1, lzf1, sep="\t", row.names=F, quote=F)
#}
#if(!file.exists(lzf2)) {
write.table(ds2, lzf2, sep="\t", row.names=F, quote=F)
#}
#if(!file.exists(regf1)) {
write.table(df1, regf1, sep="\t", row.names=F, quote=F) # for IV identification
#}
#if(!file.exists(regf2)) {
write.table(df2, regf2, sep="\t", row.names=F, quote=F) # for IV identification
#}
""")
    rs4lz.close()
    tmpfiles.append(regfile)
    ### run the R script to generate the lz1 & lz2 files
    with lock1:
        print(f'Running the lz R script {rs4lz.name}')
        util.run("R --no-save < " + rs4lz.name)
    if not os.path.isfile(regfile):
        print(f"ERROR 2: mr2clc.py: regfile {regfile} does not exist.")
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 3: mr2clc.py: regfile {regfile} does not exist.", rstmf)
        return
    with open(regfile, "r") as f:
        reg = f.read().strip()
    #print("reg = " + reg)
    rchr, rstart, rend = re.match(r'(\d+):(\d+)-(\d+)', reg).groups()
    regstr = reg.replace(":", "_")
    lz1 = ds1f + "." + regstr + ".lz.csv"
    lz2 = ds2f + "." + regstr + ".lz.csv"
    df1reg = ds1f + "." + regstr + ".reg.csv"  # generated by the R script above
    df2reg = ds2f + "." + regstr + ".reg.csv"  # generated by the R script above
    if not os.path.isfile(df1reg):
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 4: mr2clc.py: df1reg {df1reg} does not exist.", rstmf)
        return
    if not os.path.isfile(df2reg):
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 4: mr2clc.py: df2reg {df2reg} does not exist.", rstmf)
        return
    tmpfiles.append(df1reg)
    tmpfiles.append(df2reg)
    rs4clc = open(cost + ".clc.R", "w+")
    rs4clc.write(f"""
library(coloc)
library(ieugwasr)
ds1 = read.table('{lz1}', sep="\t", header=T, quote="")
ds2 = read.table('{lz2}', sep="\t", header=T, quote="")
trait1id = ds1$id[1]
trait2id = ds2$id[1]
trait1 = ds1$trait[1]
trait2 = ds2$trait[1]
reg = '{reg}'
# remove rows with varbeta = 0
ds1 = subset(ds1, varbeta != 0)
ds2 = subset(ds2, varbeta != 0)
# remove rows with MAF = 0 or 1
ds1 = subset(ds1, MAF != 0 & MAF != 1)
ds2 = subset(ds2, MAF != 0 & MAF != 1)
if(nrow(ds1) == 0 | nrow(ds2) == 0) {{
if(nrow(ds1) == 0) {{
NOTE = "trait1 has no valid data"
trait1id = '{lz1}'
trait1 = '{name1}'
}} else {{
NOTE = "trait2 has no valid data"
trait2id = '{lz2}'
trait2 = '{name2}'
}}
nsnps = "NA"
H0 = "NA"
H1 = "NA"
H2 = "NA"
H3 = "NA"
H4 = "NA"
minp2 = "NA"
dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
quit()
}}
# make sure N is numeric
ds1$N = as.numeric(ds1$N)
ds2$N = as.numeric(ds2$N)
minrow = subset(ds1, pval == min(pval))
minchr = {chr}
minpos = {minpos}
maxpos = {maxpos}
minpval = minrow[1, 'pval', 1]
if(minpos < 0) {{
minpos = 0
}}
### if p-value is too big, don't do coloc
if(minpval > {args.pval}) {{
NOTE = paste0("trait1 pval too big: ", minpval)
nsnps = "NA"
H0 = "NA"
H1 = "NA"
H2 = "NA"
H3 = "NA"
H4 = "NA"
minp2 = "NA"
dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
quit()
}}
### draw regional manhattan plot with selected snps highlighted
ds1$chromosome = as.numeric(ds1$chromosome)
ds2$chromosome = as.numeric(ds2$chromosome)
minp2 = min(ds2$pval)
tc = tryCatch({{
res <- coloc.abf(dataset1=ds1, dataset2=ds2)
res
### save major results to a file
nsnps = res$summary[1]
H0 = signif(res$summary[2], digits=3)
H1 = signif(res$summary[3], digits=3)
H2 = signif(res$summary[4], digits=3)
H3 = signif(res$summary[5], digits=3)
H4 = signif(res$summary[6], digits=3)
NOTE = ""
dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
print("writing coloc results dfres to {cost}.summary.csv")
write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
}},
error=function(err) {{
print(err)
NOTE = paste0("ERROR - ", conditionMessage(err))
nsnps = "NA"
H0 = "NA"
H1 = "NA"
H2 = "NA"
H3 = "NA"
H4 = "NA"
dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
}},
finally={{ # tryCatch's finally takes an expression, not a function; a function(f) here would never run
print("Done")
}})
""")
    rs4clc.close()
    if args.script:
        print(f'NOT running coloc R script: {rs4clc.name}')
        exit(2)
    else:
        print(f'Running coloc R script: {rs4clc.name}')
        rshit = "/tmp/t.rshit.out"  # note: assigned but never used below
        with open(rs4clc.name, "r") as f:
            print("r code:")
            print(f.read())
        util.run(f"R --no-save < {rs4clc.name}")
    tmpfiles.append(rs4clc.name)
    ### construct the html report
    # write all the results
    if not os.path.isfile(rstclc):
        print(f"WARNING: coloc summary file {rstclc} does NOT exist. Check next ...")
        writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: coloc summary file {rstclc} does NOT exist. Check next ...", rstmf)
        return
    dfclc = pd.read_csv(rstclc, sep="\t", header=0, comment='#')
    print(f"rstclc = {rstclc}")
    trait1id = dfclc.at[0, "trait1id"]
    trait2id = dfclc.at[0, "trait2id"]
    hfile = open(cost + ".summary.html", "w+")
    # html anchor id, embedded in the html file so the file can be identified later
    htmlid = util.getHash(hfile.name, "mr2clc", 8)
    rstm = open(rstmf, "w+")  # clc + 2smr merged results
    rstm.write("\t".join(list(dfclc.columns) + ["htmlid"]) + "\n")
    rstm.write("\t".join([str(x) for x in list(dfclc.iloc[0, :]) + [htmlid]]) + "\n")
    rstm.close()
    # rename the result file
    util.run("mv %s %s" % (rstm.name, rstclc))
    rplot = os.path.abspath(cost + ".png")
    hfile.write(f"""<html>
<head><title>{name1} ~ {name2}</title>
<style>
table {{
page-break-inside: avoid;
}}
</style>
</head>
<body>
""")
    # trait1id trait2id trait1 trait2 nsnps H0 H1 H2 H3 H4 reg NOTE 2smr_pval_min bbR
    # prot-c-5102_55_3 ukb-b-14521 MICB Illnesses of father: Lung cancer 2510 1.41e-34 1.66e-06 5.48e-29 0.644 0.356 6:30465047-32465047 nan 0.435 -0.6587235634261688
    dfsum = pd.read_csv(rstclc, sep="\t", header=0, comment='#')
    # print the stats
    nsnps = dfsum.iloc[0].loc["nsnps"]
    h0 = dfsum.iloc[0].loc["H0"]
    h1 = dfsum.iloc[0].loc["H1"]
    h2 = dfsum.iloc[0].loc["H2"]
    h3 = dfsum.iloc[0].loc["H3"]
    h4 = dfsum.iloc[0].loc["H4"]
    note = str(dfsum.iloc[0].loc["NOTE"])
    reg = dfsum.iloc[0].loc["reg"]
    #print("reg = " + str(reg))
    reg = reg.replace(":", "_")
    #print("updated reg to " + str(reg))
print(f'ds1 = {ds1}, ds2 = {ds2}')
# partition the ds1 by study 'id'
df1s = pd.read_csv(ds1, sep="\t", comment='#', header=0)
df1ids = list(set(df1s["id"]))
#print("Dataset1 has %d unique IDs" %(len(df1ids)))
csvs = []
htms = []
for i in range(len(df1ids)):
    id = df1ids[i]
    #print("processing id " + str(id))
    df1 = df1s[df1s["id"] == id]
    pheno = df1["Phenotype"].iloc[0]
    # make the ds1f file name shorter
    if os.path.isfile(id):
        idir, istem, isuf = util.getDirStemSuffix(id)
        if len(istem) > 10:  # a file name that is too long can cause problems on unix
            istem = istem[0:3] + util.getHash(istem, "mr2clc", 4) + istem[-4:-1]  # first 3 chars + 4 hash chars + 3 trailing chars -> length-10 istem
    else:
        istem = id
    ds1f = f"{tmpdir}/{istem}.lz.csv"
    lock2.acquire()
    if os.path.isfile(ds1f):
        if os.path.getsize(ds1f) == 0:
            df1.to_csv(ds1f, sep="\t", index=False)
        # else it exists and has content; don't save it again
    else:  # file does not exist
        df1.to_csv(ds1f, sep="\t", index=False)
    lock2.release()
    ostem1 = tostem + "." + util.getHash(istem, "ostem1", 8) + util.getHash(ds2, "ostem1", 8)
    print(f"ds1f = {ds1f}, pheno = {pheno}, name2 = {name2}, ostem1 = {ostem1}")
    clc1pair(ds1f, pheno, ds2, name2, ostem1)
    csvf = ostem1 + ".summary.csv"
    htmf = ostem1 + ".summary.html"
    csvs.append(csvf)
    htms.append(htmf)
    tmpfiles.append(csvf)
    tmpfiles.append(htmf)
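# NOTE (editor): clc1pair() calls exit(1)/exit(2) on several error paths, so one bad
# (study, dataset2) pair aborts the whole loop here rather than being skipped.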
### summarize all results
# summarize the ".summary.csv"
util.run("cat " + " ".join(csvs) + " > " + sumcsv)
util.run("rmDuplicatedRows.pl " + sumcsv)
# summarize the ".html"
sumhtm = ostem + ".clc.summary.html"
util.run("cat " + " ".join(htms) + " > " + sumhtm)
#if not args.debug:
#    for tf in tmpfiles:
#        util.run("rm -rf " + tf)
print(f'clc.py {ds1} {ds2} {ostem} is DONE')
with open(sumcsv, "r") as f:
    print(f'the summary file {sumcsv} content is {f.read()}')