Python script with thread-safety issues
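The core hazard this script carries: every call to clc1pair funnels its region through one shared file path (regfile = ostem + ".reg"), written by an R subprocess and read back afterwards, so two concurrent calls can interleave the write and the read. A minimal, self-contained sketch of that write-then-read race (hypothetical names, not code from the script):

import threading

SHARED = "/tmp/shared.reg"  # one path shared by all workers, like the script's regfile

def worker(region):
    # another thread may overwrite SHARED between the write and the read,
    # so 'got' can belong to a different worker
    with open(SHARED, "w") as f:
        f.write(region)
    with open(SHARED) as f:
        got = f.read()
    if got != region:
        print(f"race: wrote {region}, read back {got}")

threads = [threading.Thread(target=worker, args=(r,)) for r in ["1:1-2", "2:3-4"]]
for t in threads:
    t.start()
for t in threads:
    t.join()

A per-call file name, or holding one lock across both the write and the read-back, removes the race; the script's lock1 covers only the write side.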
#!/usr/bin/env python
import os
import sys
import pandas as pd
import numpy as np
import argparse
from argparse import RawTextHelpFormatter
import util
import re
import glob
import causal
import threading
# NOTE: numpy, glob, and causal are imported but never used in this file
desc = "This script runs coloc only (no locuszoom, LDMAP, 2smr)"
epi = "Author: Mousheng Xu; Date: July 31, 2023"
parser = argparse.ArgumentParser(prog='~',
                                 description=desc,
                                 epilog=epi,
                                 formatter_class=RawTextHelpFormatter)
parser.add_argument("--name1", help="the name of dataset1. Default is <dataset1>")
parser.add_argument("--name2", help="the name of dataset2. Default is <dataset2>")
parser.add_argument("-p", "--pval", type=float, help="the p-value threshold for trait1. Default=%(default)s", default=5e-8)
parser.add_argument("-s", "--script", help="only generate the R scripts without running them", action="store_true")
parser.add_argument("-R", "--region", help="region of interest, e.g. '12:111741666-112741866'")
parser.add_argument("-d", "--debug", action="store_true", help="debug mode")
parser.add_argument("-t", "--top", action="store_true", help="Generate an html anchor ID and an href back to the top. This only makes sense when the html file generated by this script is embedded in a bigger page, e.g. by 'mrsum2clc.py', which specifies <a href='#top'>")
parser.add_argument("-O", "--no_override", action="store_true", help="If the found studies or output files already exist for a pair, do not re-find the studies or rerun the analysis.")
parser.add_argument("--tmp", type=str, help="the tmp folder. Default=%(default)s", default="/tmp3/clc")
parser.add_argument("dataset1", type=str, help="the input dataset1 file")  # positional
parser.add_argument("dataset2", type=str, help="the input dataset2 file")  # positional
parser.add_argument("ostem", type=str, help="output file prefix")  # positional
args = parser.parse_args()
ds1 = args.dataset1
ds2 = args.dataset2
region = args.region
if not region:
    sys.exit("ERROR: --region is required; it is parsed unconditionally below")
chr, minpos, maxpos = re.split(r"\D+", region)  # NOTE: shadows the builtin chr()
ostem = args.ostem
odir, ost, osuf = util.getDirStemSuffix(ostem + ".txt")
tmpdir = args.tmp
if not os.path.isdir(tmpdir):
    os.makedirs(tmpdir)
tostem = f"{tmpdir}/{ost}"
### global vars
tmpfiles = []  # appended to from clc1pair, which may run in several threads
regfile = ostem + ".reg"  # saves the region information; NOTE: one shared path for all pairs
tmpfiles.append(regfile)
lock1 = threading.Lock()  # serializes the locuszoom R runs (they all write the shared regfile)
lock2 = threading.Lock()  # guards writing the per-study ds1f csv
lock3 = threading.Lock()  # guards the region-file header and the bgzip/tabix indexing
lock4 = threading.Lock()  # NOTE: defined but never used
sumcsv = ostem + ".clc.summary.csv"
if args.no_override:
    if os.path.isfile(sumcsv):
        print(f"Result file {sumcsv} exists, NOT re-running")
        exit(0)
    else:
        print(f"Result file {sumcsv} does NOT exist, RE-RUNNING")
if args.name1:
    name1 = args.name1
else:
    if os.path.isfile(ds1):
        d1, stm1, suf1 = util.getDirStemSuffix(ds1)
        name1 = stm1 + "." + suf1
    else:
        name1 = ds1
if args.name2:
    name2 = args.name2
else:
    if os.path.isfile(ds2):
        d2, stm2, suf2 = util.getDirStemSuffix(ds2)
        name2 = stm2 + "." + suf2
    else:
        name2 = ds2
# strip the ' from name1 & name2 (they would break the quoted R strings below)
name1 = name1.replace("'", "")
name2 = name2.replace("'", "")
def writeNoResults(t1, t2, n1, n2, note, outf):
    with open(outf, "w") as f:
        f.write("\t".join(["trait1id", "trait2id", "trait1", "trait2", "nsnps", "H0", "H1", "H2", "H3", "H4", "reg", "minp2", "NOTE", "htmlid"]) + "\n")
        f.write("\t".join([t1, t2, n1, n2, "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", note, "NA"]) + "\n")
def getLocalDSRCode(fname, type, reg, dsn: "dataset number: 1 or 2"):
    print(f"getLocalDSRCode: fname = {fname}, reg = {reg}, dsn = {dsn}")
    fdir, fst, fsuf = util.getDirStemSuffix(fname)
    fstem = fst + "." + fsuf
    traitname = ""
    if dsn == "1":
        dfname = "df1"
        dsname = "ds1"
        if args.name1:
            traitname = args.name1
    elif dsn == "2":
        dfname = "df2"
        dsname = "ds2"
        if args.name2:
            traitname = args.name2
    else:
        print("Unknown dsn: " + str(dsn))
        sys.exit(1)
    if reg:  # region specified
        # get the minchr, minpos, maxpos
        minchr, minrange = reg.split(":")
        minpos, maxpos = minrange.split("-")
        # use tabix to get the regional data; otherwise reading the whole input might take a very long time
        #print(f"checking the file {fname} for pos and chr")
        if os.path.getsize(fname) == 0:
            print(f"ERROR 0: {fname} is empty")
            exit(1)
        df = pd.read_csv(fname, sep="\t", header=0, comment='#', nrows=2)
        if df.empty:
            print(f'ERROR 1: {fname} has no data row')
            exit(1)
        print(f"2 read {fname} to df, hope it has 'pos' & 'chr' columns")
        pos = str(df.columns.get_loc("pos") + 1)  # 1-based index
        chr = str(df.columns.get_loc("chr") + 1)
        with lock3:  # 'with' releases the lock even on exceptions, unlike the original bare acquire()/release()
            print(f"3 read {fname} to df, it does have 'pos' & 'chr' columns")
            regname = f"{tostem}.reg.{dsn}"
            if not os.path.isfile(regname):
                util.run("echo '" + "\t".join(list(df.columns)) + "' > " + regname)
            del df
            # bgzip and tabix-index {fname} if it's not bgzipped yet
            if not fname.endswith(".gz"):
                fgz = fname + ".gz"
                util.run(f"bgzip -f {fname}")
                util.run(f"tabixFiles.py -s -o {fgz}")
            else:  # otherwise use it as is
                fgz = fname
print(f"fname = {fname}, fgz = {fgz}") | |
if dsn == "1": | |
rcode = f""" | |
minchr = '{minchr}' | |
minpos = '{minpos}' | |
maxpos = '{maxpos}' | |
reg = '{reg}' | |
""" | |
else: # dsn == 2, don't need to specify (minchr, minpos, maxpos, reg) | |
rcode = "" | |
rcode += f""" | |
tbxcmd = paste0('tabix -S 1 -f -b {pos} -e {pos} -s {chr} ', '{fgz} ', reg, ' >> {regname}') # tricky, don't modify here without careful thoughts | |
print(tbxcmd) | |
system(tbxcmd) | |
""" | |
tmpfiles.append(regname) | |
rcode += dfname + """ = read.table('""" + regname + """', sep=' ', header=T, comment='#')""" | |
    else:  # no region specified: get it from the highest peak
        rcode = dfname + """ = read.table('""" + fname + """', sep='\t', header=T, comment='#')"""
        if dsn == "1":
            # NOTE: the original built this block as a plain string, so its {chr}/{maxpos}
            # placeholders were never interpolated (and minpos was mistakenly set from maxpos);
            # rewritten as an f-string over the globals parsed from --region, matching the
            # f-string used later in clc1pair
            rcode += f"""
minrow = subset({dfname}, pval == min(pval))
minchr = {chr}
minpos = {minpos}
minpval = minrow[1, 'pval', 1]
if(minpos < 0) {{
    minpos = 0
}}
maxpos = {maxpos}
reg = paste0(minchr, ":", minpos, "-", maxpos)  # for plotlzm.py usage
"""
        # else dsn == "2": reg should be defined already, nothing to add
    ### get the local dataset
    rcode += """
""" + dfname + """ = subset(""" + dfname + """, chr==minchr & pos > minpos & pos < maxpos)
# remove duplicated SNPs, keep the one with the smallest p-value
library(dplyr)
""" + dfname + """ = """ + dfname + """ %>%
    group_by(SNP) %>%
    slice_min(n = 1, order_by = pval) %>%
    ungroup
""" + dsname + """ = """ + dfname + """[1]
if("id" %in% colnames(""" + dfname + """)) {
    """ + dsname + """$id = """ + dfname + """$id[1]
} else {
    """ + dsname + """$id = '""" + fstem + """'
}
"""
    if traitname:
        traitstr = f'{dsname}$trait = "{traitname}"'
    else:  # the trait name is not specified
        # NOTE: the original quoted the right-hand side, assigning the literal string
        # 'df1$Phenotype[1]' instead of its value; the quotes are dropped here
        traitstr = f"""if("Phenotype" %in% colnames({dfname})) {{
    {dsname}$trait = {dfname}$Phenotype[1]
}} else {{
    {dsname}$trait = '{fstem}'
}}
"""
    rcode += traitstr + """
""" + dsname + """$N = """ + dfname + """$samplesize  # sample size not needed for (at least cc) outcome
""" + dsname + """$beta = """ + dfname + """$beta
""" + dsname + """$varbeta = """ + dfname + """$se ** 2  # "varbeta is simply the square of the standard error of beta. you don't need the SD." (https://github.com/chr1swallace/coloc/issues/108)
""" + dsname + """$pval = """ + dfname + """$pval  # needed for locuszoom
""" + dsname + """$ea = """ + dfname + """$effect_allele  # needed for locuszoom
""" + dsname + """$oa = """ + dfname + """$other_allele  # needed for locuszoom
""" + dsname + """$se = """ + dfname + """$se  # needed for locuszoom
""" + dsname + """$type = '""" + type + """'
""" + dsname + """$MAF = """ + dfname + """$eaf
""" + dsname + """$snp = """ + dfname + """$SNP
""" + dsname + """$position = """ + dfname + """$pos
# remove duplicated SNPs, keep the one with smallest p-value. This is still necessary because ds1 does not remove duplicated SNPs with the same p-value
""" + dsname + """ = """ + dsname + """[!duplicated(""" + dsname + """$snp),]
### remove the column "SNP" if it exists
if("SNP" %in% colnames(""" + dsname + """)) {
    """ + dsname + """ = select(""" + dsname + """, -SNP)
}
# remove rows with varbeta = 0
""" + dsname + """ = subset(""" + dsname + """, varbeta != 0)
# remove rows with MAF = 0 or 1
""" + dsname + """ = subset(""" + dsname + """, MAF != 0 & MAF != 1)
# make sure N is numeric
""" + dsname + """$N = as.numeric(""" + dsname + """$N)
"""
    return rcode
def getRemoteDSRCode(id, type, reg: "genomic region, e.g. '1:234-456'", dsn: "dataset number: 1 or 2"):
    dfname = "df" + str(dsn)
    dsname = "ds" + str(dsn)
    idhash = util.getHash(id, "getRemoteDSRCode", 12)
    if dsn == "1":
        # NOTE: the original built this block as a plain string, so {minpos}/{maxpos} were never
        # interpolated; rewritten as an f-string over the globals parsed from --region
        rcode = f"""tops = tophits('{id}')
# n p chr beta position se id rsid ea nea eaf trait
minchr = tops[1, 'chr', 1]
minpos = {minpos}
maxpos = {maxpos}
minpval = tops[1, 'p', 1]
if(minpos < 0) {{
    minpos = 0
}}
reg = paste0(minchr, ':', minpos, '-', maxpos)
reg
"""
    elif dsn == "2":
        rcode = """reg = paste0(minchr, ':', minpos, '-', maxpos)\n"""
    else:
        print("dsn has to be either '1' or '2'")
        sys.exit(1)  # the original fell through here, leaving rcode undefined (NameError)
    rcode += dfname + """ = ieugwasr::associations(reg, '""" + id + """')
### the MAF & sample size fixups are coded in "causal.fixRemoteDF"
### replace part of the following code in the future; leave it for now
# If all MAFs are missing, run the "getSNPAF" system script to get MAF
if(all(is.na(""" + dfname + """$eaf))) {
    # Check if the file with the MAF data exists
    regn = gsub(":", "_", reg)
    diskfile = paste0('""" + tmpdir + """/""" + idhash + """', '.', regn, '.withMAF.csv')
    if(file.exists(diskfile)) {
        # If the file exists, load it into the data frame
        """ + dfname + """ = read.table(diskfile, sep="\t", comment='#', header=T)
    } else {
        # If the file doesn't exist, save the data frame to disk
        disksave = paste0(diskfile, ".save")
        write.table(""" + dfname + """, file = disksave, sep="\t", row.names=F, quote=F)
        # Run "getSNPAF.py" to get the MAF
        diskmaf = paste0(diskfile, ".maf")
        cmd = paste0('getSNPAF.py ', disksave, ' chr rsid ', diskmaf)
        print(cmd)
        system(cmd)
        # remove the existing 'eaf' column
        disknoeaf = paste0(diskfile, ".noeaf")
        cmd1 = paste0('rmColumns.pl ', diskmaf, ' eaf ', disknoeaf)
        print(cmd1)
        system(cmd1)
        # rename MAF to eaf
        cmd2 = paste0('renameFields.pl ', disknoeaf, ' MAF eaf ', diskfile)
        print(cmd2)
        system(cmd2)
        # remove the tmp files
        #cmd3 = paste0("rm -rf ", disksave, " ", diskmaf, " ", disknoeaf)
        #print(cmd3)
        #system(cmd3)
        # Load the MAF-added file back into the data frame
        """ + dfname + """ <- read.table(diskfile, header = TRUE, sep = '\t')
    }
}
### read the sample size from a local file if it is missing
if(all(is.na(""" + dfname + """$n))) {
    libfile = "/home/moxu/softspace/d2m/2smr/data/list/2smr.outcomes.samplesize.csv"
    ssfile = read.table(libfile, sep="\t", header=T, comment='#')
    idrow = subset(ssfile, id == '""" + id + """')
    if(dim(idrow)[1] == 1) {
        """ + dfname + """$n = idrow[1,3]
    } # else don't specify
}
# else leave as is
# Remove SNPs without MAF, and SNPs without N
""" + dfname + """ <- """ + dfname + """[!is.na(""" + dfname + """$eaf), ]
""" + dfname + """ <- """ + dfname + """[!is.na(""" + dfname + """$n), ]
""" + dfname + """ <- """ + dfname + """[""" + dfname + """$eaf != 0 & """ + dfname + """$eaf != 1, ]
""" + dsname + """ = """ + dfname + """[1]
""" + dsname + """$id = '""" + id + """'
""" + dsname + """$trait = """ + dfname + """$trait[1]
""" + dsname + """$N = """ + dfname + """$n
""" + dsname + """$beta = """ + dfname + """$beta
""" + dsname + """$varbeta = """ + dfname + """$se ** 2  # "varbeta is simply the square of the standard error of beta. you don't need the SD."
""" + dsname + """$pval = """ + dfname + """$p  # needed for locuszoom
""" + dsname + """$ea = """ + dfname + """$ea  # needed for locuszoom
""" + dsname + """$oa = """ + dfname + """$nea  # needed for locuszoom
""" + dsname + """$se = """ + dfname + """$se  # needed for locuszoom
""" + dsname + """$type = '""" + type + """'
""" + dsname + """$MAF = """ + dfname + """$eaf
""" + dsname + """$snp = """ + dfname + """$rsid
""" + dsname + """$position = """ + dfname + """$position
# remove duplicated SNPs, keep the one with smallest p-value. This is still necessary because ds1 does not remove duplicated SNPs with the same p-value
""" + dsname + """ = """ + dsname + """[!duplicated(""" + dsname + """$snp),]
# remove rows with varbeta = 0
""" + dsname + """ = subset(""" + dsname + """, varbeta != 0)
# remove rows with MAF = 0 or 1
""" + dsname + """ = subset(""" + dsname + """, MAF != 0 & MAF != 1)
# make sure N is numeric
""" + dsname + """$N = as.numeric(""" + dsname + """$N)
"""
    return rcode
def clc1pair(ds1f, trait1, ds2f, trait2, cost):
    print(f"clc1pair: ds1f = {ds1f}, trait1 = {trait1}, ds2f = {ds2f}, trait2 = {trait2}, cost = " + cost)
    rstclc = cost + ".summary.csv"
    rstmf = cost + ".merged.summary.csv"
    # Define the column names to check
    columns_to_check = {"beta", "eaf", "samplesize"}
    # Check that none of the column names are missing from df1 or df2
    if os.path.isfile(ds1f):
        if os.path.getsize(ds1f) == 0:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} is empty", ostem + ".clc.summary.csv")
            exit(1)
        df1 = pd.read_csv(ds1f, sep="\t", header=0, comment='#')
        if df1.empty:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} has no data row", ostem + ".clc.summary.csv")
            exit(1)
        if not columns_to_check.issubset(df1.columns):
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} missing one of {columns_to_check}", ostem + ".clc.summary.csv")
            exit(1)
    if os.path.isfile(ds2f):
        if os.path.getsize(ds2f) == 0:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} is empty", ostem + ".clc.summary.csv")
            exit(1)
        df2 = pd.read_csv(ds2f, sep="\t", header=0, comment='#')
        if df2.empty:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} has no data row", ostem + ".clc.summary.csv")
            exit(1)
        if not columns_to_check.issubset(df2.columns):
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} missing one of {columns_to_check}", ostem + ".clc.summary.csv")
            exit(1)
    if os.path.isfile(ds1f):
        print(f'0 rc1 getlocaldsrcode for {ds1f}, args.region = {args.region}')
        rc1 = getLocalDSRCode(ds1f, "quant", args.region, "1")
        print(f'1 rc1 getlocaldsrcode for {ds1f}, args.region = {args.region}')
    else:
        print(f'rc1 getremotedsrcode')
        rc1 = getRemoteDSRCode(ds1f, "quant", args.region, "1")
    # if args.region:
    #     ds2reg = args.region
    # else:
    #     ds2reg = 'paste0(minchr, ":", minpos, "-", maxpos)'
    ds2reg = args.region
    print(f'ds2reg = {ds2reg}')
    if os.path.isfile(ds2f):
        print(f'rc2 getlocaldsrcode')
        rc2 = getLocalDSRCode(ds2f, "cc", ds2reg, "2")
    else:
        print(f'rc2 getremotedsrcode')
        rc2 = getRemoteDSRCode(ds2f, "cc", ds2reg, "2")
    rs4lz = open(cost + ".lz.R", "w")
    rs4lz.write("""
library(coloc)
library(ieugwasr)
""" + rc1 + """
reg
write(reg, file='""" + regfile + """')
""" + rc2 + """
trait1id = ds1$id[1]
trait2id = ds2$id[1]
trait1 = ds1$trait[1]
trait2 = ds2$trait[1]
ds1$chromosome = minchr
ds2$chromosome = minchr
ds1 = ds1[order(ds1$position),]  # locuszoom requires sorted coords
ds2 = ds2[order(ds2$position),]
""")
    cstem = cost + ".compare"  # compare result file prefix
    scstem = cost + ".compare.sigpval"  # compare result file prefix
    rs4lz.write("""
# write the datasets to files
regstr = gsub(":", "_", reg)  # replace ':' with '_'
lzf1 = paste0('""" + ds1f + """', '.', regstr, '.lz.csv')
lzf2 = paste0('""" + ds2f + """', '.', regstr, '.lz.csv')
regf1 = paste0('""" + ds1f + """', '.', regstr, '.reg.csv')
regf2 = paste0('""" + ds2f + """', '.', regstr, '.reg.csv')
#if(!file.exists(lzf1)) {
write.table(ds1, lzf1, sep="\t", row.names=F, quote=F)
#}
#if(!file.exists(lzf2)) {
write.table(ds2, lzf2, sep="\t", row.names=F, quote=F)
#}
#if(!file.exists(regf1)) {
write.table(df1, regf1, sep="\t", row.names=F, quote=F)  # for IV identification
#}
#if(!file.exists(regf2)) {
write.table(df2, regf2, sep="\t", row.names=F, quote=F)  # for IV identification
#}
""")
    rs4lz.close()
    tmpfiles.append(regfile)
    ### run the R script to get files lz1 & lz2
    with lock1:
        print(f'Running the lz R script {rs4lz.name}')
        util.run("R --no-save < " + rs4lz.name)
    # NOTE: regfile is one shared path for all pairs; between the R run above and the read
    # below, another thread's R run may overwrite it. Reading it inside the same lock1
    # block would close that window.
    if not os.path.isfile(regfile):
        print(f"ERROR 2: mr2clc.py: regfile {regfile} does not exist.")
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 3: mr2clc.py: regfile {regfile} does not exist.", rstmf)
        return
    with open(regfile, "r") as f:
        reg = f.read().strip()
    #print("reg = " + reg)
    rchr, rstart, rend = re.match(r'(\d+):(\d+)-(\d+)', reg).groups()
    regstr = reg.replace(":", "_")
    lz1 = ds1f + "." + regstr + ".lz.csv"
    lz2 = ds2f + "." + regstr + ".lz.csv"
    df1reg = ds1f + "." + regstr + ".reg.csv"  # files generated above
    df2reg = ds2f + "." + regstr + ".reg.csv"  # files generated above
    if not os.path.isfile(df1reg):
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 4: mr2clc.py: df1reg {df1reg} does not exist.", rstmf)
        return
    if not os.path.isfile(df2reg):
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 4: mr2clc.py: df2reg {df2reg} does not exist.", rstmf)
        return
    tmpfiles.append(df1reg)
    tmpfiles.append(df2reg)
    rs4clc = open(cost + ".clc.R", "w")
    rs4clc.write(f"""
library(coloc)
library(ieugwasr)
ds1 = read.table('{lz1}', sep="\t", header=T, quote="")
ds2 = read.table('{lz2}', sep="\t", header=T, quote="")
trait1id = ds1$id[1]
trait2id = ds2$id[1]
trait1 = ds1$trait[1]
trait2 = ds2$trait[1]
reg = '{reg}'
# remove rows with varbeta = 0
ds1 = subset(ds1, varbeta != 0)
ds2 = subset(ds2, varbeta != 0)
# remove rows with MAF = 0 or 1
ds1 = subset(ds1, MAF != 0 & MAF != 1)
ds2 = subset(ds2, MAF != 0 & MAF != 1)
if(nrow(ds1) == 0 | nrow(ds2) == 0) {{
    if(nrow(ds1) == 0) {{
        NOTE = "trait1 has no valid data"
        trait1id = '{lz1}'
        trait1 = '{name1}'
    }} else {{
        NOTE = "trait2 has no valid data"
        trait2id = '{lz2}'
        trait2 = '{name2}'
    }}
    nsnps = "NA"
    H0 = "NA"
    H1 = "NA"
    H2 = "NA"
    H3 = "NA"
    H4 = "NA"
    minp2 = "NA"
    dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
    write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
    quit()
}}
# make sure N is numeric
ds1$N = as.numeric(ds1$N)
ds2$N = as.numeric(ds2$N)
minrow = subset(ds1, pval == min(pval))
minchr = {chr}
minpos = {minpos}
maxpos = {maxpos}
minpval = minrow[1, 'pval', 1]
if(minpos < 0) {{
    minpos = 0
}}
### if the trait1 p-value is too big, don't run coloc
if(minpval > {args.pval}) {{
    NOTE = paste0("trait1 pval too big: ", minpval)
    nsnps = "NA"
    H0 = "NA"
    H1 = "NA"
    H2 = "NA"
    H3 = "NA"
    H4 = "NA"
    minp2 = "NA"
    dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
    write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
    quit()
}}
### draw the regional manhattan plot with the selected snps highlighted
ds1$chromosome = as.numeric(ds1$chromosome)
ds2$chromosome = as.numeric(ds2$chromosome)
minp2 = min(ds2$pval)
tc = tryCatch({{
    res <- coloc.abf(dataset1=ds1, dataset2=ds2)
    res
    ### save the major results to a file
    nsnps = res$summary[1]
    H0 = signif(res$summary[2], digits=3)
    H1 = signif(res$summary[3], digits=3)
    H2 = signif(res$summary[4], digits=3)
    H3 = signif(res$summary[5], digits=3)
    H4 = signif(res$summary[6], digits=3)
    NOTE = ""
    dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
    print("writing coloc results dfres to {cost}.summary.csv")
    write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
}},
error=function(err) {{
    print(err)
    NOTE = paste0("ERROR - ", conditionMessage(err))
    nsnps = "NA"
    H0 = "NA"
    H1 = "NA"
    H2 = "NA"
    H3 = "NA"
    H4 = "NA"
    dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
    write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
}},
finally = {{  # 'finally' takes an expression, not a function; the original handler never ran
    print("Done")
}})
""")
    rs4clc.close()
    if args.script:
        print(f'NOT running coloc R script: {rs4clc.name}')
        exit(2)
    else:
        print(f'Running coloc R script: {rs4clc.name}')
        with open(rs4clc.name, "r") as f:
            print("r code:")
            print(f.read())
        util.run(f"R --no-save < {rs4clc.name}")
    tmpfiles.append(rs4clc.name)
    ### construct the html report
    # write all the results
    if not os.path.isfile(rstclc):
        print(f"WARNING: coloc summary file {rstclc} does NOT exist. Check next ...")
        writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: coloc summary file {rstclc} does NOT exist. Check next ...", rstmf)
        return
    dfclc = pd.read_csv(rstclc, sep="\t", header=0, comment='#')
    print(f"rstclc = {rstclc}")
    ### rewrite the above to make it look better
    trait1id = dfclc.at[0, "trait1id"]
    trait2id = dfclc.at[0, "trait2id"]
    hfile = open(cost + ".summary.html", "w")
    # html anchor id, which will be embedded in the html file and can be used to identify it
    htmlid = util.getHash(hfile.name, "mr2clc", 8)
    rstm = open(rstmf, "w")  # clc + 2smr merged results
    rstm.write("\t".join(list(dfclc.columns) + ["htmlid"]) + "\n")
    rstm.write("\t".join(map(str, list(dfclc.iloc[0, :]) + [htmlid])) + "\n")
    rstm.close()
    # rename the result file
    util.run("mv %s %s" % (rstm.name, rstclc))
    rplot = os.path.abspath(cost + ".png")
    hfile.write(f"""<html>
<head><title>{name1} ~ {name2}</title>
<style>
table {{
    page-break-inside: avoid;
}}
</style>
</head>
<body>
""")
    # trait1id trait2id trait1 trait2 nsnps H0 H1 H2 H3 H4 reg NOTE 2smr_pval_min bbR
    # prot-c-5102_55_3 ukb-b-14521 MICB Illnesses of father: Lung cancer 2510 1.41e-34 1.66e-06 5.48e-29 0.644 0.356 6:30465047-32465047 nan 0.435 -0.6587235634261688
    dfsum = pd.read_csv(rstclc, sep="\t", header=0, comment='#')
    # print the stats
    nsnps = dfsum.iloc[0].loc["nsnps"]
    h0 = dfsum.iloc[0].loc["H0"]
    h1 = dfsum.iloc[0].loc["H1"]
    h2 = dfsum.iloc[0].loc["H2"]
    h3 = dfsum.iloc[0].loc["H3"]
    h4 = dfsum.iloc[0].loc["H4"]
    note = str(dfsum.iloc[0].loc["NOTE"])
    reg = dfsum.iloc[0].loc["reg"]
    #print("reg = " + str(reg))
    reg = reg.replace(":", "_")
    #print("update reg to be " + str(reg))
    # NOTE: the function ends here without writing the stats above into hfile or closing it;
    # the rest of the html report appears to be assembled by the caller
print(f'ds1 = {ds1}, ds2 = {ds2}')
# partition ds1 by study 'id'
df1s = pd.read_csv(ds1, sep="\t", comment='#', header=0)
df1ids = list(set(df1s["id"]))
#print("Dataset1 has %d unique IDs" %(len(df1ids)))
csvs = []
htms = []
for id in df1ids:  # NOTE: 'id' shadows the builtin
    #print("processing id " + str(id))
    df1 = df1s[df1s["id"] == id]
    pheno = df1["Phenotype"].iloc[0]
    # make the ds1f file name shorter
    if os.path.isfile(id):
        idir, istem, isuf = util.getDirStemSuffix(id)
        if len(istem) > 10:  # an overly long file name can cause problems on unix
            istem = istem[0:3] + util.getHash(istem, "mr2clc", 4) + istem[-4:-1]  # first 3 chars + 4 hash chars + 3 trailing chars = length 10
    else:
        istem = id
    ds1f = f"{tmpdir}/{istem}.lz.csv"
    with lock2:  # 'with' releases the lock even if to_csv raises
        if os.path.isfile(ds1f):
            if os.path.getsize(ds1f) == 0:
                df1.to_csv(ds1f, sep="\t", index=False)
            # else it exists and has content, don't save it again
        else:  # file does not exist
            df1.to_csv(ds1f, sep="\t", index=False)
    ostem1 = tostem + "." + util.getHash(istem, "ostem1", 8) + util.getHash(ds2, "ostem1", 8)
    print(f"ds1f = {ds1f}, pheno = {pheno}, name2 = {name2}, ostem1 = {ostem1}")
    clc1pair(ds1f, pheno, ds2, name2, ostem1)
    csvf = ostem1 + ".summary.csv"
    htmf = ostem1 + ".summary.html"
    csvs.append(csvf)
    htms.append(htmf)
    tmpfiles.append(csvf)
    tmpfiles.append(htmf)
### summarize all results
# summarize the ".summary.csv" files
util.run("cat " + " ".join(csvs) + " > " + sumcsv)
util.run("rmDuplicatedRows.pl " + sumcsv)
# summarize the ".html" files
sumhtm = ostem + ".clc.summary.html"
util.run("cat " + " ".join(htms) + " > " + sumhtm)
#if not args.debug:
#    for tf in tmpfiles:
#        util.run("rm -rf " + tf)
print(f'clc.py {ds1} {ds2} {ostem} is DONE')
with open(sumcsv, "r") as f:
    print(f'the summary file {sumcsv} content is {f.read()}')
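The locks above only matter when clc1pair is driven from several threads at once; this file itself calls it sequentially, so the threaded caller is presumably external (the -t help text points at mrsum2clc.py). A minimal sketch of such a driver, assuming (hypothetically) that this script were importable as clc with its module-level pipeline moved behind an if __name__ == "__main__": guard:

import threading
import clc  # hypothetical: import only works once the module-level pipeline is guarded

# hypothetical per-study inputs; signature: clc1pair(ds1f, trait1, ds2f, trait2, cost)
pairs = [("study_a.lz.csv", "phenoA"), ("study_b.lz.csv", "phenoB")]
threads = [
    threading.Thread(target=clc.clc1pair,
                     args=(ds1f, pheno, "outcome.tsv", "outcome", f"/tmp3/clc/pair{i}"))
    for i, (ds1f, pheno) in enumerate(pairs)
]
for t in threads:
    t.start()
for t in threads:
    t.join()

Even with that guard, the shared regfile path noted earlier means two concurrent clc1pair calls can still clobber each other's region, which is the central thread-safety bug of the script.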