ericminikel · May 2, 2020 11:49 · Jan 3, 2014 · Jan 3, 2014
diff --git a/rcdk-sar-tutorial.r b/rcdk-sar-tutorial.r
@@ -101,104 +101,44 @@ for (i in 1:length(drug.objects)) {
 # lapply then does this to each element of the list
 mfrags_all = lapply(mfrags,do.call,what=c)
 
-
-
-view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir
-view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir
-
-allfrags[1:2]
-# [1] "c1ncc2nc[nH]c2(n1)"        "c1ncc2ncn(c2(n1))C3C=CCC3"
-
-mfrags[[1]]
-# $rings
-# [1] "c1ncc2nc[nH]c2(n1)"
-# $frameworks
-# [1] "c1ncc2ncn(c2(n1))C3C=CCC3"         "c2nc1[nH]cnc1c(n2)NC3CC3"         
-# [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4"
-
-
-# test that this worked out right
-allfrags[1:5] %in% mfrags[[1]]
-# appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not
-
-length(allfrags %in% mfrags[[1]])
-# 1947
-
-statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
-drugs[tolower(drugs$generic_name) %in% statins,]
-view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins])
-
-is_statin = tolower(drugs$generic_name) %in% statins
-pvals = numeric(length(allfrags))
-for (j in 1:length(allfrags)) {
-    hasfrag = mat[,j]
-    contingency_table = table(data.frame(is_statin,hasfrag))
-    pvals[j] = fisher.test(contingency_table)$p.value
+is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin?
+pvals = numeric(length(allrings)) # vector to hold p vals
+for (j in 1:length(allrings)) { # iterate over rings
+    hasfrag = mat[,j] # does each drug contain this fragment?
+    contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table
+    pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence
 }
+png('murcko.ring.statin.qqplot.png',width=500,height=500)
+qqunif(pvals,pch=19,cex.main=.8,
+    main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs')
+dev.off()
 
-qqunif(pvals,pch=19)
-
-allfrags[pvals < .00001]
+png('statins.most.enriched.ring.system.png',width=300,height=300)
+allrings[pvals < .000001]
 # [1] "C=1CCC2CCCC=C2(C=1)"
+# what statins contain it?
+drugs$generic_name[is_statin & mat[,which(pvals < .000001)]]
+# "Lovastatin"  "Pravastatin" "Simvastatin"
+# how many other drugs contain it
+sum(!is_statin & mat[,which(pvals < .000001)])
+# 0
+rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001])
+dev.off()
 
-plotsmiles = function(smiles) {
-    par(mfrow=c(1,length(smiles))) 
-    for (i in 1:length(smiles)) {
-        temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100) 
-        plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i])
-        rasterImage(temp1,1,1,10,10)
-    }
-}
-
-
-
-view.molecule.2d(parse.smiles(allfrags[pvals < .00001]))
-
-drugs[is_statin & mat[,pvals < .00001],]
-# 3 of 7 (lova, prava & simva) contain this
-# what about the OH OH OH thing which 4/7 have and 5/7 have similar
-# or the fluorobenzene which 4/7 have
-
-view.molecule.2d(parse.smiles(allfrags[pvals < .001]))
-
-# is it that fluorobenzene wasn't significantly enriched (because present
-# in many drugs) or because it wasn't considered a fragment?
-mfrags[[which(drugs$generic_name=='Rosuvastatin')]]
-#              rings1               rings2           frameworks 
-#          "c1cncnc1"           "c1ccccc1" "c1cc(ncn1)c2ccccc2" 
-# apparently not a Murcko fragment.
-
-start_time = Sys.time()
-get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7)
-Sys.time() - start_time
-# w/ min.frag.size=6, this takes over a minute and produces 107 fragments, 
-# one of which is fluorobenzene:
-#  [47] "Fc1ccccc1"                              
-
-# w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments:
-Sys.time() - start_time
-#Time difference of 12.52133 mins
-
-# Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122
-# 2 guys from Vertex Pharma
-#We define ring systems to be cycles within
-#the graph representation of molecules and cycles sharing an
-#edge (a connection between two atoms or a bond).
-# -- side chain atoms are separate. 
-
-# Linker Atoms: Atoms that are on the direct path connecting
-# two ring systems are defined as linker atoms
+png('frag.freq.hist1.png',width=600,height=400)
+hist(colSums(mat),col='yellow',breaks=100,
+    xlab='# of molecules containing a given fragment',
+    ylab='# of fragments like this',
+    main='Histogram of fragment frequencies')
+dev.off()
 
-# Framework. The framework is defined as the union of ring
-# systems and linkers in a molecule.
-# promiscuity scores
+sum(colSums(mat)==1)
+# [1] 351
 
-# re-try with only smaller molecules for exhaustive frags
-drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,]
-drug.objects = parse.smiles(drugs$smiles)
-dim(drugs)
-length(drug.objects) # 1321
+sum(colSums(mat[is_statin,]) > 0)
+# 8
 
 start_time = Sys.time()
-efrags = get.exhaustive.fragments(drug.objects[1:50]) 
+get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3)
 Sys.time() - start_time
+# Time difference of 3.394177 mins
diff --git a/rcdk-sar-tutorial.r b/rcdk-sar-tutorial.r
@@ -0,0 +1,204 @@
+require(rcdk) # chemical informatics functionality in R
+require(gap)  # for qq plots later
+options(stringsAsFactors=FALSE)
+setwd('c:/sci/037cinf/analysis/sar/')
+
+# plot molecules in R plot window instead of separate Java window
+rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
+    par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
+    temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
+    plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
+    rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
+}
+
+# first look at curcumin
+curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]] 
+rcdkplot(curcumin)
+
+curc.frags = get.murcko.fragments(curcumin)
+curc.frags
+#[[1]]
+#[[1]]$rings
+#[1] "c1ccccc1"
+#
+#[[1]]$frameworks
+#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
+png('curcumin.murcko.fragments.png',width=600,height=300)
+par(mfrow=c(1,2))
+rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
+rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
+dev.off()
+
+anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
+rcdkplot(anle138b)
+
+anle138b.frags = get.murcko.fragments(anle138b)
+anle138b.frags
+# [[1]]
+# [[1]]$rings
+# [1] "c1ccccc1"        "c1ccc2OCOc2(c1)"
+# 
+# [[1]]$frameworks
+# [1] "c1cccc(c1)c2n[nH]cc2"                  "c1n[nH]c(c1)c2ccc3OCOc3(c2)"          
+# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
+png('anle138b.murcko.fragments.png',width=600,height=120)
+par(mfrow=c(1,5))
+rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
+rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
+rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
+rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
+rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
+dev.off()
+
+anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
+anle138b.frags[[1]]$rings
+# [1] "c1ccccc1"        "c1n[nH]cc1"      "c1ccc2OCOc2(c1)"
+
+# now for a pretend SAR
+
+# load FDA drug list
+drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
+                   sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
+# remove those with no SMILES or with a really huge smiles
+# otherwise R will hang on the macromolecules
+drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
+drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
+# check that lengths are same
+dim(drugs) # 1467 3
+length(drug.objects) # 1467
+
+statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
+drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list
+
+# examine statin structures
+png('statin.structures.png',width=1200,height=600)
+par(mfrow=c(2,4))
+for (statin in statins) {
+    rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
+}
+dev.off()
+
+# get all murcko fragments
+mfrags = get.murcko.fragments(drug.objects,min.frag.size=3) 
+
+# get a list of all possible fragments in any of these drugs
+allfrags = unique(unlist(mfrags))
+length(allfrags) # 2035
+
+# get only the ring systems
+allrings = unique(unlist(lapply(mfrags,"[[",1)))
+length(allrings) # 556
+
+
+# convert to a matrix
+mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
+for (i in 1:length(drug.objects)) {
+    mat[i,] = allrings %in% mfrags[[i]]$rings
+}
+
+# aside: if you want rings and frameworks thrown in together, 
+# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
+# lapply then does this to each element of the list
+mfrags_all = lapply(mfrags,do.call,what=c)
+
+
+
+view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir
+view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir
+
+allfrags[1:2]
+# [1] "c1ncc2nc[nH]c2(n1)"        "c1ncc2ncn(c2(n1))C3C=CCC3"
+
+mfrags[[1]]
+# $rings
+# [1] "c1ncc2nc[nH]c2(n1)"
+# $frameworks
+# [1] "c1ncc2ncn(c2(n1))C3C=CCC3"         "c2nc1[nH]cnc1c(n2)NC3CC3"         
+# [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4"
+
+
+# test that this worked out right
+allfrags[1:5] %in% mfrags[[1]]
+# appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not
+
+length(allfrags %in% mfrags[[1]])
+# 1947
+
+statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
+drugs[tolower(drugs$generic_name) %in% statins,]
+view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins])
+
+is_statin = tolower(drugs$generic_name) %in% statins
+pvals = numeric(length(allfrags))
+for (j in 1:length(allfrags)) {
+    hasfrag = mat[,j]
+    contingency_table = table(data.frame(is_statin,hasfrag))
+    pvals[j] = fisher.test(contingency_table)$p.value
+}
+
+qqunif(pvals,pch=19)
+
+allfrags[pvals < .00001]
+# [1] "C=1CCC2CCCC=C2(C=1)"
+
+plotsmiles = function(smiles) {
+    par(mfrow=c(1,length(smiles))) 
+    for (i in 1:length(smiles)) {
+        temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100) 
+        plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i])
+        rasterImage(temp1,1,1,10,10)
+    }
+}
+
+
+
+view.molecule.2d(parse.smiles(allfrags[pvals < .00001]))
+
+drugs[is_statin & mat[,pvals < .00001],]
+# 3 of 7 (lova, prava & simva) contain this
+# what about the OH OH OH thing which 4/7 have and 5/7 have similar
+# or the fluorobenzene which 4/7 have
+
+view.molecule.2d(parse.smiles(allfrags[pvals < .001]))
+
+# is it that fluorobenzene wasn't significantly enriched (because present
+# in many drugs) or because it wasn't considered a fragment?
+mfrags[[which(drugs$generic_name=='Rosuvastatin')]]
+#              rings1               rings2           frameworks 
+#          "c1cncnc1"           "c1ccccc1" "c1cc(ncn1)c2ccccc2" 
+# apparently not a Murcko fragment.
+
+start_time = Sys.time()
+get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7)
+Sys.time() - start_time
+# w/ min.frag.size=6, this takes over a minute and produces 107 fragments, 
+# one of which is fluorobenzene:
+#  [47] "Fc1ccccc1"                              
+
+# w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments:
+Sys.time() - start_time
+#Time difference of 12.52133 mins
+
+# Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122
+# 2 guys from Vertex Pharma
+#We define ring systems to be cycles within
+#the graph representation of molecules and cycles sharing an
+#edge (a connection between two atoms or a bond).
+# -- side chain atoms are separate. 
+
+# Linker Atoms: Atoms that are on the direct path connecting
+# two ring systems are defined as linker atoms
+
+# Framework. The framework is defined as the union of ring
+# systems and linkers in a molecule.
+# promiscuity scores
+
+# re-try with only smaller molecules for exhaustive frags
+drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,]
+drug.objects = parse.smiles(drugs$smiles)
+dim(drugs)
+length(drug.objects) # 1321
+
+start_time = Sys.time()
+efrags = get.exhaustive.fragments(drug.objects[1:50]) 
+Sys.time() - start_time