Skip to content

Instantly share code, notes, and snippets.

@ericminikel
Last active May 2, 2020 11:49

Revisions

  1. ericminikel revised this gist Jan 3, 2014. 1 changed file with 32 additions and 92 deletions.
    124 changes: 32 additions & 92 deletions rcdk-sar-tutorial.r
    Original file line number Diff line number Diff line change
    @@ -101,104 +101,44 @@ for (i in 1:length(drug.objects)) {
    # lapply then does this to each element of the list
    mfrags_all = lapply(mfrags,do.call,what=c)



    view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir
    view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir

    allfrags[1:2]
    # [1] "c1ncc2nc[nH]c2(n1)" "c1ncc2ncn(c2(n1))C3C=CCC3"

    mfrags[[1]]
    # $rings
    # [1] "c1ncc2nc[nH]c2(n1)"
    # $frameworks
    # [1] "c1ncc2ncn(c2(n1))C3C=CCC3" "c2nc1[nH]cnc1c(n2)NC3CC3"
    # [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4"


    # test that this worked out right
    allfrags[1:5] %in% mfrags[[1]]
    # appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not

    length(allfrags %in% mfrags[[1]])
    # 1947

    statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
    drugs[tolower(drugs$generic_name) %in% statins,]
    view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins])

    is_statin = tolower(drugs$generic_name) %in% statins
    pvals = numeric(length(allfrags))
    for (j in 1:length(allfrags)) {
    hasfrag = mat[,j]
    contingency_table = table(data.frame(is_statin,hasfrag))
    pvals[j] = fisher.test(contingency_table)$p.value
    is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin?
    pvals = numeric(length(allrings)) # vector to hold p vals
    for (j in 1:length(allrings)) { # iterate over rings
    hasfrag = mat[,j] # does each drug contain this fragment?
    contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table
    pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence
    }
    png('murcko.ring.statin.qqplot.png',width=500,height=500)
    qqunif(pvals,pch=19,cex.main=.8,
    main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs')
    dev.off()

    qqunif(pvals,pch=19)

    allfrags[pvals < .00001]
    png('statins.most.enriched.ring.system.png',width=300,height=300)
    allrings[pvals < .000001]
    # [1] "C=1CCC2CCCC=C2(C=1)"
    # what statins contain it?
    drugs$generic_name[is_statin & mat[,which(pvals < .000001)]]
    # "Lovastatin" "Pravastatin" "Simvastatin"
    # how many other drugs contain it
    sum(!is_statin & mat[,which(pvals < .000001)])
    # 0
    rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001])
    dev.off()

    plotsmiles = function(smiles) {
    par(mfrow=c(1,length(smiles)))
    for (i in 1:length(smiles)) {
    temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100)
    plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i])
    rasterImage(temp1,1,1,10,10)
    }
    }



    view.molecule.2d(parse.smiles(allfrags[pvals < .00001]))

    drugs[is_statin & mat[,pvals < .00001],]
    # 3 of 7 (lova, prava & simva) contain this
    # what about the OH OH OH thing which 4/7 have and 5/7 have similar
    # or the fluorobenzene which 4/7 have

    view.molecule.2d(parse.smiles(allfrags[pvals < .001]))

    # is it that fluorobenzene wasn't significantly enriched (because present
    # in many drugs) or because it wasn't considered a fragment?
    mfrags[[which(drugs$generic_name=='Rosuvastatin')]]
    # rings1 rings2 frameworks
    # "c1cncnc1" "c1ccccc1" "c1cc(ncn1)c2ccccc2"
    # apparently not a Murcko fragment.

    start_time = Sys.time()
    get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7)
    Sys.time() - start_time
    # w/ min.frag.size=6, this takes over a minute and produces 107 fragments,
    # one of which is fluorobenzene:
    # [47] "Fc1ccccc1"

    # w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments:
    Sys.time() - start_time
    #Time difference of 12.52133 mins

    # Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122
    # 2 guys from Vertex Pharma
    #We define ring systems to be cycles within
    #the graph representation of molecules and cycles sharing an
    #edge (a connection between two atoms or a bond).
    # -- side chain atoms are separate.

    # Linker Atoms: Atoms that are on the direct path connecting
    # two ring systems are defined as linker atoms
    png('frag.freq.hist1.png',width=600,height=400)
    hist(colSums(mat),col='yellow',breaks=100,
    xlab='# of molecules containing a given fragment',
    ylab='# of fragments like this',
    main='Histogram of fragment frequencies')
    dev.off()

    # Framework. The framework is defined as the union of ring
    # systems and linkers in a molecule.
    # promiscuity scores
    sum(colSums(mat)==1)
    # [1] 351

    # re-try with only smaller molecules for exhaustive frags
    drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,]
    drug.objects = parse.smiles(drugs$smiles)
    dim(drugs)
    length(drug.objects) # 1321
    sum(colSums(mat[is_statin,]) > 0)
    # 8

    start_time = Sys.time()
    efrags = get.exhaustive.fragments(drug.objects[1:50])
    get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3)
    Sys.time() - start_time
    # Time difference of 3.394177 mins
  2. ericminikel created this gist Jan 3, 2014.
    204 changes: 204 additions & 0 deletions rcdk-sar-tutorial.r
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,204 @@
    require(rcdk) # chemical informatics functionality in R
    require(gap) # for qq plots later
    options(stringsAsFactors=FALSE)
    setwd('c:/sci/037cinf/analysis/sar/')

    # plot molecules in R plot window instead of separate Java window
    rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
    par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
    temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
    plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
    rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
    }

    # first look at curcumin
    curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]]
    rcdkplot(curcumin)

    curc.frags = get.murcko.fragments(curcumin)
    curc.frags
    #[[1]]
    #[[1]]$rings
    #[1] "c1ccccc1"
    #
    #[[1]]$frameworks
    #[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
    png('curcumin.murcko.fragments.png',width=600,height=300)
    par(mfrow=c(1,2))
    rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
    rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
    dev.off()

    anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
    rcdkplot(anle138b)

    anle138b.frags = get.murcko.fragments(anle138b)
    anle138b.frags
    # [[1]]
    # [[1]]$rings
    # [1] "c1ccccc1" "c1ccc2OCOc2(c1)"
    #
    # [[1]]$frameworks
    # [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)"
    # [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
    png('anle138b.murcko.fragments.png',width=600,height=120)
    par(mfrow=c(1,5))
    rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
    rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
    rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
    rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
    rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
    dev.off()

    anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
    anle138b.frags[[1]]$rings
    # [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)"

    # now for a pretend SAR

    # load FDA drug list
    drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
    sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
    # remove those with no SMILES or with a really huge smiles
    # otherwise R will hang on the macromolecules
    drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
    drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
    # check that lengths are same
    dim(drugs) # 1467 3
    length(drug.objects) # 1467

    statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
    drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list

    # examine statin structures
    png('statin.structures.png',width=1200,height=600)
    par(mfrow=c(2,4))
    for (statin in statins) {
    rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
    }
    dev.off()

    # get all murcko fragments
    mfrags = get.murcko.fragments(drug.objects,min.frag.size=3)

    # get a list of all possible fragments in any of these drugs
    allfrags = unique(unlist(mfrags))
    length(allfrags) # 2035

    # get only the ring systems
    allrings = unique(unlist(lapply(mfrags,"[[",1)))
    length(allrings) # 556


    # convert to a matrix
    mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
    for (i in 1:length(drug.objects)) {
    mat[i,] = allrings %in% mfrags[[i]]$rings
    }

    # aside: if you want rings and frameworks thrown in together,
    # do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
    # lapply then does this to each element of the list
    mfrags_all = lapply(mfrags,do.call,what=c)



    view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir
    view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir

    allfrags[1:2]
    # [1] "c1ncc2nc[nH]c2(n1)" "c1ncc2ncn(c2(n1))C3C=CCC3"

    mfrags[[1]]
    # $rings
    # [1] "c1ncc2nc[nH]c2(n1)"
    # $frameworks
    # [1] "c1ncc2ncn(c2(n1))C3C=CCC3" "c2nc1[nH]cnc1c(n2)NC3CC3"
    # [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4"


    # test that this worked out right
    allfrags[1:5] %in% mfrags[[1]]
    # appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not

    length(allfrags %in% mfrags[[1]])
    # 1947

    statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
    drugs[tolower(drugs$generic_name) %in% statins,]
    view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins])

    is_statin = tolower(drugs$generic_name) %in% statins
    pvals = numeric(length(allfrags))
    for (j in 1:length(allfrags)) {
    hasfrag = mat[,j]
    contingency_table = table(data.frame(is_statin,hasfrag))
    pvals[j] = fisher.test(contingency_table)$p.value
    }

    qqunif(pvals,pch=19)

    allfrags[pvals < .00001]
    # [1] "C=1CCC2CCCC=C2(C=1)"

    plotsmiles = function(smiles) {
    par(mfrow=c(1,length(smiles)))
    for (i in 1:length(smiles)) {
    temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100)
    plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i])
    rasterImage(temp1,1,1,10,10)
    }
    }



    view.molecule.2d(parse.smiles(allfrags[pvals < .00001]))

    drugs[is_statin & mat[,pvals < .00001],]
    # 3 of 7 (lova, prava & simva) contain this
    # what about the OH OH OH thing which 4/7 have and 5/7 have similar
    # or the fluorobenzene which 4/7 have

    view.molecule.2d(parse.smiles(allfrags[pvals < .001]))

    # is it that fluorobenzene wasn't significantly enriched (because present
    # in many drugs) or because it wasn't considered a fragment?
    mfrags[[which(drugs$generic_name=='Rosuvastatin')]]
    # rings1 rings2 frameworks
    # "c1cncnc1" "c1ccccc1" "c1cc(ncn1)c2ccccc2"
    # apparently not a Murcko fragment.

    start_time = Sys.time()
    get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7)
    Sys.time() - start_time
    # w/ min.frag.size=6, this takes over a minute and produces 107 fragments,
    # one of which is fluorobenzene:
    # [47] "Fc1ccccc1"

    # w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments:
    Sys.time() - start_time
    #Time difference of 12.52133 mins

    # Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122
    # 2 guys from Vertex Pharma
    #We define ring systems to be cycles within
    #the graph representation of molecules and cycles sharing an
    #edge (a connection between two atoms or a bond).
    # -- side chain atoms are separate.

    # Linker Atoms: Atoms that are on the direct path connecting
    # two ring systems are defined as linker atoms

    # Framework. The framework is defined as the union of ring
    # systems and linkers in a molecule.
    # promiscuity scores

    # re-try with only smaller molecules for exhaustive frags
    drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,]
    drug.objects = parse.smiles(drugs$smiles)
    dim(drugs)
    length(drug.objects) # 1321

    start_time = Sys.time()
    efrags = get.exhaustive.fragments(drug.objects[1:50])
    Sys.time() - start_time