@mocksu
Created August 1, 2023 15:23
Python script which has thread-safety issues
#!/usr/bin/env python
import os
import sys
import pandas as pd
import numpy as np
import argparse
from argparse import RawTextHelpFormatter
import util
import re
import glob
import causal
import threading
desc = "This script runs coloc only (no locuszoom, LDMAP, 2smr)"
epi = "Author: Mousheng Xu; Date: July 31, 2023"
parser = argparse.ArgumentParser(prog='~',
                                 description=desc,
                                 epilog=epi,
                                 formatter_class=RawTextHelpFormatter)
parser.add_argument("--name1", help="the name of the dataset1. Default is <dataset1>")
parser.add_argument("--name2", help="the name of the dataset2. Default is <dataset2>")
parser.add_argument("-p", "--pval", type=float, help="the p-value threshold for trait1. Default=%(default)s", default=5e-8)
parser.add_argument("-s", "--script", help="only generate R scripts without running them", action="store_true")
parser.add_argument("-R", "--region", help="region of interest, e.g. '12:111741666-112741866'")
parser.add_argument("-d", "--debug", action="store_true", help="debug mode")
parser.add_argument("-t", "--top", action="store_true", help="Generate an html ancor ID and a href to go back to top. This only makes sense when the html file generated by this script is used in a bigger scope like by 'mrsum2clc.py' which has specified <a href='#top'>")
parser.add_argument("-O", "--no_override", action="store_true", help="If the found studies or output files exist for a pair, do not refind the studies or rerun the analysis.")
parser.add_argument("--tmp", type=str, help="the tmp filder, default is %(default)s", default="/tmp3/clc")
parser.add_argument("dataset1", type=str, help="the input dataset1 file") # positional
parser.add_argument("dataset2", type=str, help="the input dataset2 file") # positional
parser.add_argument("ostem", type=str, help="output file prefix") # positional
args = parser.parse_args()
ds1 = args.dataset1
ds2 = args.dataset2
region = args.region
chr, minpos, maxpos = re.split(r"\D+", region)
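# e.g. re.split(r"\D+", "12:111741666-112741866") -> ['12', '111741666', '112741866']
# note: if --region is omitted, region is None and the split above raises a TypeError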
ostem = args.ostem
odir, ost, osuf = util.getDirStemSuffix(ostem + ".txt")
tmpdir = args.tmp
if not os.path.isdir(tmpdir):
    os.makedirs(tmpdir)
tostem = f"{tmpdir}/{ost}"
### global vars
tmpfiles = []
regfile = ostem + ".reg" # to save the region information
tmpfiles.append(regfile)
lock1 = threading.Lock()
lock2 = threading.Lock()
lock3 = threading.Lock()
lock4 = threading.Lock()
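# NOTE (editor): in this file lock1 guards the locuszoom R run (clc1pair), lock2 the
# ds1f cache writes, and lock3 the bgzip/tabix region setup; lock4 is defined but
# never acquired. Shared globals such as tmpfiles and the single regfile path are
# mutated without any lock, which is likely where the advertised thread-safety
# issues live. A minimal sketch of one possible fix (hypothetical helper, not used
# by the original code): serialize registration of temporary files behind a lock.
tmpfiles_lock = threading.Lock()
def addTmpFile(path):
    # list.append is atomic in CPython, but taking the lock also covers any future
    # read-modify-write on tmpfiles (e.g. deduplication before cleanup)
    with tmpfiles_lock:
        tmpfiles.append(path)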
sumcsv = ostem + ".clc.summary.csv"
if args.no_override:
    if os.path.isfile(sumcsv):
        print(f"Result file {sumcsv} exists, NOT re-running")
        exit(0)
    else:
        print(f"Result file {sumcsv} does NOT exist, RE-RUNNING")
if args.name1:
    name1 = args.name1
else:
    if os.path.isfile(ds1):
        d1, stm1, suf1 = util.getDirStemSuffix(ds1)
        name1 = stm1 + "." + suf1
    else:
        name1 = ds1
if args.name2:
    name2 = args.name2
else:
    if os.path.isfile(ds2):
        d2, stm2, suf2 = util.getDirStemSuffix(ds2)
        name2 = stm2 + "." + suf2
    else:
        name2 = ds2
# strip the ' from name1 & name2 (a quote would break the generated R code)
name1 = name1.replace("'", "")
name2 = name2.replace("'", "")
def writeNoResults(t1, t2, n1, n2, note, outf):
    with open(outf, "w+") as f:
        f.write("\t".join(["trait1id", "trait2id", "trait1", "trait2", "nsnps", "H0", "H1", "H2", "H3", "H4", "reg", "minp2", "NOTE", "htmlid"]) + "\n")
        f.write("\t".join([t1, t2, n1, n2, "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", note, "NA"]) + "\n")
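# Example (illustrative arguments): writeNoResults("gwas1.tsv", "ukb-b-14521",
#     "trait1", "trait2", "WARNING: empty input", "out.clc.summary.csv")
# writes the header row plus a single all-NA row that carries the NOTE.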
def getLocalDSRCode(fname, type, reg, dsn: "dataset number: 1 or 2"):
    print(f"getLocalDSRCode: fname = {fname}, reg = {reg}, dsn = {dsn}")
    fdir, fst, fsuf = util.getDirStemSuffix(fname)
    fstem = fst + "." + fsuf
    traitname = ""
    if dsn == "1":
        dfname = "df1"
        dsname = "ds1"
        if args.name1:
            traitname = args.name1
    elif dsn == "2":
        dfname = "df2"
        dsname = "ds2"
        if args.name2:
            traitname = args.name2
    else:
        print("Unknown dsn: " + str(dsn))
        sys.exit(1)
    if reg:  # region specified
        # get the minchr, minpos, maxpos
        minchr, minrange = reg.split(":")
        minpos, maxpos = minrange.split("-")
        # use tabix to get the regional data, otherwise reading the whole input might take a very long time
        #print(f"checking the file {fname} for pos and chr")
        if os.path.getsize(fname) == 0:
            print(f"ERROR 0: {fname} is empty")
            exit(1)
        df = pd.read_csv(fname, sep="\t", header=0, comment='#', nrows=2)
        if df.empty:
            print(f'ERROR 1: {fname} has no data row')
            exit(1)
        print(f"2 read {fname} to df, hope it has 'pos' & 'chr' columns")
        pos = str(df.columns.get_loc("pos") + 1)  # 1-based index
        chr = str(df.columns.get_loc("chr") + 1)
        lock3.acquire()
        print(f"3 read {fname} to df, it does have 'pos' & 'chr' columns")
        regname = f"{tostem}.reg.{dsn}"
        if not os.path.isfile(regname):
            util.run("echo '" + "\t".join(list(df.columns)) + "' > " + regname)
        del df
        # bgzip and tabix-index fname if it's not bgzipped yet
        if not fname.endswith(".gz"):
            fgz = fname + ".gz"
            util.run(f"bgzip -f {fname}")
            util.run(f"tabixFiles.py -s -o {fgz}")
        else:  # use it as is
            fgz = fname
        lock3.release()
        print(f"fname = {fname}, fgz = {fgz}")
        if dsn == "1":
            rcode = f"""
minchr = '{minchr}'
minpos = '{minpos}'
maxpos = '{maxpos}'
reg = '{reg}'
"""
        else:  # dsn == 2, no need to specify (minchr, minpos, maxpos, reg)
            rcode = ""
        rcode += f"""
tbxcmd = paste0('tabix -S 1 -f -b {pos} -e {pos} -s {chr} ', '{fgz} ', reg, ' >> {regname}') # tricky, don't modify here without careful thoughts
print(tbxcmd)
system(tbxcmd)
"""
        tmpfiles.append(regname)
        rcode += dfname + """ = read.table('""" + regname + """', sep=' ', header=T, comment='#')"""
    else:  # no region specified, get it from the highest peak
        rcode = dfname + """ = read.table('""" + fname + """', sep=' ', header=T, comment='#')"""
        if dsn == "1":
            rcode += """
minrow = subset(""" + dfname + """, pval == min(pval))
minchr = {chr}
minpos = {maxpos}
minpval = minrow[1, 'pval', 1]
if(minpos < 0) {
minpos = 0
}
maxpos = {maxpos}
reg = paste0(minchr, ":", minpos, "-", maxpos) # for plotlzm.py usage
"""
        else:  # dsn == 2, reg should be defined already
            rcode += ""  # no-op
    ### get the local dataset
    rcode += """
""" + dfname + """ = subset(""" + dfname + """, chr==minchr & pos > minpos & pos < maxpos)
# remove duplicated SNPs, keep the one with smallest p-value
library(dplyr)
""" + dfname + """ = """ + dfname + """ %>%
group_by(SNP) %>%
slice_min(n = 1, order_by = pval) %>%
ungroup
""" + dsname + """ = """ + dfname + """[1]
if("id" %in% colnames(""" + dfname + """)) {
""" + dsname + """$id = """ + dfname + """$id[1]
} else {
""" + dsname + """$id = '""" + fstem + """'
}
"""
    if traitname:
        traitstr = f'{dsname}$trait = "{traitname}"'
    else:  # trait name is not specified
        traitstr = f"""if("Phenotype" %in% colnames({dfname})) {{
{dsname}$trait = {dfname}$Phenotype[1]
}} else {{
{dsname}$trait = '{fstem}'
}}
"""
    rcode += traitstr + """
""" + dsname + """$N = """ + dfname + """$samplesize # sample size not needed for (at least cc) outcome
""" + dsname + """$beta = """ + dfname + """$beta
""" + dsname + """$varbeta = """ + dfname + """$se ** 2 # "varbeta is simply the square of the standard error of beta. you don't need the SD." (https://github.com/chr1swallace/coloc/issues/108)
""" + dsname + """$pval = """ + dfname + """$pval # needed for locuszoom
""" + dsname + """$ea = """ + dfname + """$effect_allele # needed for locuszoom
""" + dsname + """$oa = """ + dfname + """$other_allele # needed for locuszoom
""" + dsname + """$se = """ + dfname + """$se # needed for locuszoom
""" + dsname + """$type = '""" + type + """'
""" + dsname + """$MAF = """ + dfname + """$eaf
""" + dsname + """$snp = """ + dfname + """$SNP
""" + dsname + """$position = """ + dfname + """$pos
# remove duplicated SNPs, keep the one with smallest p-value. This is still necessary because ds1 does not remove duplicated SNPs with the same p-value
""" + dsname + """ = """ + dsname + """[!duplicated(""" + dsname + """$snp),]
### remove the column "SNP" if it exists
if("SNP" %in% colnames(""" + dsname + """)) {
""" + dsname + """ = select(""" + dsname + """, -SNP)
}
# remove rows with varbeta = 0
""" + dsname + """ = subset(""" + dsname + """, varbeta != 0)
# remove rows with MAF = 0 or 1
""" + dsname + """ = subset(""" + dsname + """, MAF != 0 & MAF != 1)
# make sure N is numeric
""" + dsname + """$N = as.numeric(""" + dsname + """$N)
"""
    return rcode
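# Example (hypothetical path): for a local tab-separated GWAS file,
#   rc1 = getLocalDSRCode("/data/gwas1.tsv", "quant", "12:111741666-112741866", "1")
# returns an R snippet that tabix-slices the region into df1 and builds ds1 from it;
# clc1pair() embeds that snippet in the generated .lz.R script.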
def getRemoteDSRCode(id, type, reg: "genomic region, e.g. '1:234-456'", dsn: "dataset number: 1 or 2"):
    dfname = "df" + str(dsn)
    dsname = "ds" + str(dsn)
    idhash = util.getHash(id, "getRemoteDSRCode", 12)
    if dsn == "1":
        rcode = """tops = tophits('""" + id + """')
# n p chr beta position se id rsid ea nea eaf trait
minchr = tops[1, 'chr', 1]
minpos = {minpos}
maxpos = {maxpos}
minpval = tops[1, 'p', 1]
if(minpos < 0) {
minpos = 0
}
reg = paste0(minchr, ':', minpos, '-', maxpos)
reg
"""
elif dsn == "2":
rcode = """reg = paste0(minchr, ':', minpos, '-', maxpos)\n"""
else:
print("dsn has to be either '1' or '2'")
rcode += dfname + """=ieugwasr::associations(reg, '""" + id + """')
### the MAF & sample size fixup are coded in "causal.fixRemoteDF"
### replace part of the following code in the future. leave it for now
# If all MAFs are missing, run the "getSNPAF" system script to get MAF
if(all(is.na(""" + dfname + """$eaf))) {
# Check if the file with the MAF data exists
regn = gsub(":", "_", reg)
diskfile = paste0('""" + tmpdir + """/""" + idhash + """', '.', regn, '.withMAF.csv')
if(file.exists(diskfile)) {
# If the file exists, load it into a data frame called `df`
""" + dfname + """ = read.table(diskfile, sep="\t", comment='#', header=T)
} else {
# If the file doesn't exist, save the data frame `df` to disk
disksave = paste0(diskfile, ".save")
write.table(""" + dfname + """, file = disksave, sep="\t", row.names=F, quote=F)
# Run "getSNPAF.py" to get the MAF
diskmaf = paste0(diskfile, ".maf")
cmd = paste0('getSNPAF.py ', disksave, ' chr rsid ', diskmaf)
print(cmd)
system(cmd)
# remove 'eaf'
disknoeaf = paste0(diskfile, ".noeaf")
cmd1 = paste0('rmColumns.pl ', diskmaf, ' eaf ', disknoeaf)
print(cmd1)
system(cmd1) # remove the existing eaf column
# rename MAF to eaf
cmd2 = paste0('renameFields.pl ', disknoeaf, ' MAF eaf ', diskfile)
print(cmd2)
system(cmd2) # rename MAF to eaf
# remove the tmp files
#cmd3 = paste0("rm -rf ", disksave, " ", diskmaf, " ", disknoeaf)
#print(cmd3)
#system(cmd3)
# Load the MAF added file back into the data frame `df`
""" + dfname + """ <- read.table(diskfile, header = TRUE, sep = '\t')
}
}
### read sample size from a local file if it does not exist
if(all(is.na(""" + dfname + """$n))) {
libfile = "/home/moxu/softspace/d2m/2smr/data/list/2smr.outcomes.samplesize.csv"
ssfile = read.table(libfile, sep="\t", header=T, comment='#')
idrow = subset(ssfile, id == '""" + id + """')
if(dim(idrow)[1] == 1) {
""" + dfname + """$n = idrow[1,3]
} # else don't specify
}
# else leave as is
# Remove SNPs without MAF, and SNPs without N
""" + dfname + """ <- """ + dfname + """[!is.na(""" + dfname + """$eaf), ]
""" + dfname + """ <- """ + dfname + """[!is.na(""" + dfname + """$n), ]
""" + dfname + """ <- """ + dfname + """[""" + dfname + """$eaf != 0 & """ + dfname + """$eaf != 1, ]
""" + dsname + """=""" + dfname + """[1]
""" + dsname + """$id = '""" + id + """'
""" + dsname + """$trait = """ + dfname + """$trait[1]
""" + dsname + """$N = """ + dfname + """$n
""" + dsname + """$beta = """ + dfname + """$beta
""" + dsname + """$varbeta = """ + dfname + """$se ** 2 # "varbeta is simply the square of the standard error of beta. you don't need the SD."
""" + dsname + """$pval = """ + dfname + """$p # needed for locuszoom
""" + dsname + """$ea = """ + dfname + """$ea # needed for locuszoom
""" + dsname + """$oa = """ + dfname + """$nea # needed for locuszoom
""" + dsname + """$se = """ + dfname + """$se # needed for locuszoom
""" + dsname + """$type = '""" + type + """'
""" + dsname + """$MAF = """ + dfname + """$eaf
""" + dsname + """$snp = """ + dfname + """$rsid
""" + dsname + """$position = """ + dfname + """$position
# remove duplicated SNPs, keep the one with smallest p-value. This is still necessary because ds1 does not remove duplicated SNPs with the same p-value
""" + dsname + """ = """ + dsname + """[!duplicated(""" + dsname + """$snp),]
# remove rows with varbeta = 0
""" + dsname + """ = subset(""" + dsname + """, varbeta != 0)
# remove rows with MAF = 0 or 1
""" + dsname + """ = subset(""" + dsname + """, MAF != 0 & MAF != 1)
# make sure N is numeric
""" + dsname + """$N = as.numeric(""" + dsname + """$N)
"""
    return rcode
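# Example (id taken from the sample row further below): for a remote OpenGWAS study,
#   rc2 = getRemoteDSRCode("ukb-b-14521", "cc", "12:111741666-112741866", "2")
# emits R that fetches the regional associations via ieugwasr::associations(),
# backfills missing MAF / sample size, and builds ds2 for coloc.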
def clc1pair(ds1f, trait1, ds2f, trait2, cost):
    print(f"clc1pair: ds1f = {ds1f}, trait1 = {trait1}, ds2f = {ds2f}, trait2 = {trait2}, cost = " + cost)
    rstclc = cost + ".summary.csv"
    rstmf = cost + ".merged.summary.csv"
    # Define the column names to check
    columns_to_check = {"beta", "eaf", "samplesize"}
    # Check if any of the column names are missing from df1 or df2
    if os.path.isfile(ds1f):
        if os.path.getsize(ds1f) == 0:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} is empty", ostem + ".clc.summary.csv")
            exit(1)
        df1 = pd.read_csv(ds1f, sep="\t", header=0, comment='#')
        if df1.empty:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} has no data row", ostem + ".clc.summary.csv")
            exit(1)
        if not columns_to_check.issubset(df1.columns):
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds1f} missing one of {columns_to_check}", ostem + ".clc.summary.csv")
            exit(1)
    if os.path.isfile(ds2f):
        if os.path.getsize(ds2f) == 0:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} is empty", ostem + ".clc.summary.csv")
            exit(1)
        df2 = pd.read_csv(ds2f, sep="\t", header=0, comment='#')
        if df2.empty:
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} has no data row", ostem + ".clc.summary.csv")
            exit(1)
        if not columns_to_check.issubset(df2.columns):
            writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: {ds2f} missing one of 'beta', 'eaf', or 'samplesize'", ostem + ".clc.summary.csv")
            exit(1)
    if os.path.isfile(ds1f):
        print(f'0 rc1 getLocalDSRCode for {ds1f}, args.region = {args.region}')
        rc1 = getLocalDSRCode(ds1f, "quant", args.region, "1")
        print(f'1 rc1 getLocalDSRCode for {ds1f}, args.region = {args.region}')
    else:
        print('rc1 getRemoteDSRCode')
        rc1 = getRemoteDSRCode(ds1f, "quant", args.region, "1")
    # if args.region:
    #     ds2reg = args.region
    # else:
    #     ds2reg = 'paste0(minchr, ":", minpos, "-", maxpos)'
    ds2reg = args.region
    print(f'ds2reg = {ds2reg}')
    if os.path.isfile(ds2f):
        print('rc2 getLocalDSRCode')
        rc2 = getLocalDSRCode(ds2f, "cc", ds2reg, "2")
    else:
        print('rc2 getRemoteDSRCode')
        rc2 = getRemoteDSRCode(ds2f, "cc", ds2reg, "2")
    rs4lz = open(cost + ".lz.R", "w+")
    rs4lz.write("""
library(coloc)
library(ieugwasr)
""" + rc1 + """
reg
write(reg, file='""" + regfile + """')
""" + rc2 + """
trait1id = ds1$id[1]
trait2id = ds2$id[1]
trait1 = ds1$trait[1]
trait2 = ds2$trait[1]
ds1$chromosome = minchr
ds2$chromosome = minchr
ds1 = ds1[order(ds1$position),] # locuszoom requires sorted coords
ds2 = ds2[order(ds2$position),]
""")
    cstem = cost + ".compare"  # compare result file prefix
    scstem = cost + ".compare.sigpval"  # compare (significant p-values) result file prefix
    rs4lz.write("""
# write datasets to files
regstr = gsub(":", "_", reg) # replace ':' with '_'
lzf1 = paste0('""" + ds1f + """', '.', regstr, '.lz.csv')
lzf2 = paste0('""" + ds2f + """', '.', regstr, '.lz.csv')
regf1 = paste0('""" + ds1f + """', '.', regstr, '.reg.csv')
regf2 = paste0('""" + ds2f + """', '.', regstr, '.reg.csv')
#if(!file.exists(lzf1)) {
write.table(ds1, lzf1, sep="\t", row.names=F, quote=F)
#}
#if(!file.exists(lzf2)) {
write.table(ds2, lzf2, sep="\t", row.names=F, quote=F)
#}
#if(!file.exists(regf1)) {
write.table(df1, regf1, sep="\t", row.names=F, quote=F) # for IV identification
#}
#if(!file.exists(regf2)) {
write.table(df2, regf2, sep="\t", row.names=F, quote=F) # for IV identification
#}
""")
    rs4lz.close()
    tmpfiles.append(regfile)
    ### run the R script to generate the lz1 & lz2 files
    with lock1:
        print(f'Running the lz R script {rs4lz.name}')
        util.run("R --no-save < " + rs4lz.name)
    if not os.path.isfile(regfile):
        print(f"ERROR 2: mr2clc.py: regfile {regfile} does not exist.")
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 3: mr2clc.py: regfile {regfile} does not exist.", rstmf)
        return
    with open(regfile, "r") as f:
        reg = f.read().strip()
    #print("reg = " + reg)
    rchr, rstart, rend = re.match(r'(\d+):(\d+)-(\d+)', reg).groups()
    regstr = reg.replace(":", "_")
    lz1 = ds1f + "." + regstr + ".lz.csv"
    lz2 = ds2f + "." + regstr + ".lz.csv"
    df1reg = ds1f + "." + regstr + ".reg.csv"  # generated by the R script above
    df2reg = ds2f + "." + regstr + ".reg.csv"  # generated by the R script above
    if not os.path.isfile(df1reg):
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 4: mr2clc.py: df1reg {df1reg} does not exist.", rstmf)
        return
    if not os.path.isfile(df2reg):
        writeNoResults(ds1f, ds2f, trait1, trait2, f"ERROR 4: mr2clc.py: df2reg {df2reg} does not exist.", rstmf)
        return
    tmpfiles.append(df1reg)
    tmpfiles.append(df2reg)
    rs4clc = open(cost + ".clc.R", "w+")
    rs4clc.write(f"""
library(coloc)
library(ieugwasr)
ds1 = read.table('{lz1}', sep="\t", header=T, quote="")
ds2 = read.table('{lz2}', sep="\t", header=T, quote="")
trait1id = ds1$id[1]
trait2id = ds2$id[1]
trait1 = ds1$trait[1]
trait2 = ds2$trait[1]
reg = '{reg}'
# remove rows with varbeta = 0
ds1 = subset(ds1, varbeta != 0)
ds2 = subset(ds2, varbeta != 0)
# remove rows with MAF = 0 or 1
ds1 = subset(ds1, MAF != 0 & MAF != 1)
ds2 = subset(ds2, MAF != 0 & MAF != 1)
if(nrow(ds1) == 0 | nrow(ds2) == 0) {{
if(nrow(ds1) == 0) {{
NOTE = "trait1 has no valid data"
trait1id = '{lz1}'
trait1 = '{name1}'
}} else {{
NOTE = "trait2 has no valid data"
trait2id = '{lz2}'
trait2 = '{name2}'
}}
nsnps = "NA"
H0 = "NA"
H1 = "NA"
H2 = "NA"
H3 = "NA"
H4 = "NA"
minp2 = "NA"
dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
quit()
}}
# make sure N is numeric
ds1$N = as.numeric(ds1$N)
ds2$N = as.numeric(ds2$N)
minrow = subset(ds1, pval == min(pval))
minchr = {chr}
minpos = {minpos}
maxpos = {maxpos}
minpval = minrow[1, 'pval', 1]
if(minpos < 0) {{
minpos = 0
}}
### if p-value is too big, don't do coloc
if(minpval > {args.pval}) {{
NOTE = paste0("trait1 pval too big: ", minpval)
nsnps = "NA"
H0 = "NA"
H1 = "NA"
H2 = "NA"
H3 = "NA"
H4 = "NA"
minp2 = "NA"
dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
quit()
}}
### draw regional manhattan plot with selected snps highlighted
ds1$chromosome = as.numeric(ds1$chromosome)
ds2$chromosome = as.numeric(ds2$chromosome)
minp2 = min(ds2$pval)
tc = tryCatch({{
res <- coloc.abf(dataset1=ds1, dataset2=ds2)
res
### save major results to a file
nsnps = res$summary[1]
H0 = signif(res$summary[2], digits=3)
H1 = signif(res$summary[3], digits=3)
H2 = signif(res$summary[4], digits=3)
H3 = signif(res$summary[5], digits=3)
H4 = signif(res$summary[6], digits=3)
NOTE = ""
dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
print("writing coloc results dfres to {cost}.summary.csv")
write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
}},
error=function(err) {{
print(err)
NOTE = paste0("ERROR - ", conditionMessage(err))
nsnps = "NA"
H0 = "NA"
H1 = "NA"
H2 = "NA"
H3 = "NA"
H4 = "NA"
dfres <- data.frame(trait1id, trait2id, trait1, trait2, nsnps, H0, H1, H2, H3, H4, reg, minp2, NOTE)
write.table(dfres, '{cost}.summary.csv', sep="\t", row.names=F, quote=F)
}},
finally={{ # tryCatch's finally takes an expression, not a function; a function(f) here would never run
print("Done")
}})
""")
    rs4clc.close()
    if args.script:
        print(f'NOT running coloc R script: {rs4clc.name}')
        exit(2)
    else:
        print(f'Running coloc R script: {rs4clc.name}')
        rshit = "/tmp/t.rshit.out"  # note: assigned but never used below
        with open(rs4clc.name, "r") as f:
            print("r code:")
            print(f.read())
        util.run(f"R --no-save < {rs4clc.name}")
    tmpfiles.append(rs4clc.name)
    ### construct the html report
    # write all the results
    if not os.path.isfile(rstclc):
        print(f"WARNING: coloc summary file {rstclc} does NOT exist. Check next ...")
        writeNoResults(ds1f, ds2f, trait1, trait2, f"WARNING: coloc summary file {rstclc} does NOT exist. Check next ...", rstmf)
        return
    dfclc = pd.read_csv(rstclc, sep="\t", header=0, comment='#')
    print(f"rstclc = {rstclc}")
    trait1id = dfclc.at[0, "trait1id"]
    trait2id = dfclc.at[0, "trait2id"]
    hfile = open(cost + ".summary.html", "w+")
    # html anchor id, embedded in the html file so the file can be identified later
    htmlid = util.getHash(hfile.name, "mr2clc", 8)
    rstm = open(rstmf, "w+")  # clc + 2smr merged results
    rstm.write("\t".join(list(dfclc.columns) + ["htmlid"]) + "\n")
    rstm.write("\t".join([str(x) for x in list(dfclc.iloc[0, :]) + [htmlid]]) + "\n")
    rstm.close()
    # rename the result file
    util.run("mv %s %s" % (rstm.name, rstclc))
    rplot = os.path.abspath(cost + ".png")
    hfile.write(f"""<html>
<head><title>{name1} ~ {name2}</title>
<style>
table {{
page-break-inside: avoid;
}}
</style>
</head>
<body>
""")
    # trait1id trait2id trait1 trait2 nsnps H0 H1 H2 H3 H4 reg NOTE 2smr_pval_min bbR
    # prot-c-5102_55_3 ukb-b-14521 MICB Illnesses of father: Lung cancer 2510 1.41e-34 1.66e-06 5.48e-29 0.644 0.356 6:30465047-32465047 nan 0.435 -0.6587235634261688
    dfsum = pd.read_csv(rstclc, sep="\t", header=0, comment='#')
    # print the stats
    nsnps = dfsum.iloc[0].loc["nsnps"]
    h0 = dfsum.iloc[0].loc["H0"]
    h1 = dfsum.iloc[0].loc["H1"]
    h2 = dfsum.iloc[0].loc["H2"]
    h3 = dfsum.iloc[0].loc["H3"]
    h4 = dfsum.iloc[0].loc["H4"]
    note = str(dfsum.iloc[0].loc["NOTE"])
    reg = dfsum.iloc[0].loc["reg"]
    #print("reg = " + str(reg))
    reg = reg.replace(":", "_")
    #print("updated reg to " + str(reg))
print(f'ds1 = {ds1}, ds2 = {ds2}')
# partition the ds1 by study 'id'
df1s = pd.read_csv(ds1, sep="\t", comment='#', header=0)
df1ids = list(set(df1s["id"]))
#print("Dataset1 has %d unique IDs" %(len(df1ids)))
csvs = []
htms = []
for i in range(len(df1ids)):
    id = df1ids[i]
    #print("processing id " + str(id))
    df1 = df1s[df1s["id"] == id]
    pheno = df1["Phenotype"].iloc[0]
    # make the ds1f file name shorter
    if os.path.isfile(id):
        idir, istem, isuf = util.getDirStemSuffix(id)
        if len(istem) > 10:  # a file name that is too long can cause problems on unix
            istem = istem[0:3] + util.getHash(istem, "mr2clc", 4) + istem[-4:-1]  # first 3 chars + 4 hash chars + 3 trailing chars -> length-10 istem
    else:
        istem = id
    ds1f = f"{tmpdir}/{istem}.lz.csv"
    lock2.acquire()
    if os.path.isfile(ds1f):
        if os.path.getsize(ds1f) == 0:
            df1.to_csv(ds1f, sep="\t", index=False)
        # else it exists and has content; don't save it again
    else:  # file does not exist
        df1.to_csv(ds1f, sep="\t", index=False)
    lock2.release()
    ostem1 = tostem + "." + util.getHash(istem, "ostem1", 8) + util.getHash(ds2, "ostem1", 8)
    print(f"ds1f = {ds1f}, pheno = {pheno}, name2 = {name2}, ostem1 = {ostem1}")
    clc1pair(ds1f, pheno, ds2, name2, ostem1)
    csvf = ostem1 + ".summary.csv"
    htmf = ostem1 + ".summary.html"
    csvs.append(csvf)
    htms.append(htmf)
    tmpfiles.append(csvf)
    tmpfiles.append(htmf)
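# NOTE (editor): clc1pair() calls exit(1)/exit(2) on several error paths, so one bad
# (study, dataset2) pair aborts the whole loop here rather than being skipped.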
### summarize all results
# summarize the ".summary.csv"
util.run("cat " + " ".join(csvs) + " > " + sumcsv)
util.run("rmDuplicatedRows.pl " + sumcsv)
# summarize the ".html"
sumhtm = ostem + ".clc.summary.html"
util.run("cat " + " ".join(htms) + " > " + sumhtm)
#if not args.debug:
#    for tf in tmpfiles:
#        util.run("rm -rf " + tf)
print(f'clc.py {ds1} {ds2} {ostem} is DONE')
with open(sumcsv, "r") as f:
    print(f'the summary file {sumcsv} content is {f.read()}')