Last active
May 2, 2020 11:49
Revisions
-
ericminikel revised this gist
Jan 3, 2014 . 1 changed file with 32 additions and 92 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -101,104 +101,44 @@ for (i in 1:length(drug.objects)) { # lapply then does this to each element of the list mfrags_all = lapply(mfrags,do.call,what=c) is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin? pvals = numeric(length(allrings)) # vector to hold p vals for (j in 1:length(allrings)) { # iterate over rings hasfrag = mat[,j] # does each drug contain this fragment? contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence } png('murcko.ring.statin.qqplot.png',width=500,height=500) qqunif(pvals,pch=19,cex.main=.8, main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs') dev.off() png('statins.most.enriched.ring.system.png',width=300,height=300) allrings[pvals < .000001] # [1] "C=1CCC2CCCC=C2(C=1)" # what statins contain it? drugs$generic_name[is_statin & mat[,which(pvals < .000001)]] # "Lovastatin" "Pravastatin" "Simvastatin" # how many other drugs contain it sum(!is_statin & mat[,which(pvals < .000001)]) # 0 rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001]) dev.off() png('frag.freq.hist1.png',width=600,height=400) hist(colSums(mat),col='yellow',breaks=100, xlab='# of molecules containing a given fragment', ylab='# of fragments like this', main='Histogram of fragment frequencies') dev.off() sum(colSums(mat)==1) # [1] 351 sum(colSums(mat[is_statin,]) > 0) # 8 start_time = Sys.time() get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3) Sys.time() - start_time # Time difference of 3.394177 mins -
ericminikel created this gist
Jan 3, 2014 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,204 @@ require(rcdk) # chemical informatics functionality in R require(gap) # for qq plots later options(stringsAsFactors=FALSE) setwd('c:/sci/037cinf/analysis/sar/') # plot molecules in R plot window instead of separate Java window rcdkplot = function(molecule,width=500,height=500,marg=0,main='') { par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries } # first look at curcumin curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]] rcdkplot(curcumin) curc.frags = get.murcko.fragments(curcumin) curc.frags #[[1]] #[[1]]$rings #[1] "c1ccccc1" # #[[1]]$frameworks #[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2" png('curcumin.murcko.fragments.png',width=600,height=300) par(mfrow=c(1,2)) rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]]) rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]]) dev.off() anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]] rcdkplot(anle138b) anle138b.frags = get.murcko.fragments(anle138b) anle138b.frags # [[1]] # [[1]]$rings # [1] "c1ccccc1" "c1ccc2OCOc2(c1)" # # [[1]]$frameworks # [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)" # [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)" png('anle138b.murcko.fragments.png',width=600,height=120) par(mfrow=c(1,5)) rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]]) rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]]) rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]]) rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]]) rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]]) dev.off() anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3) anle138b.frags[[1]]$rings # [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)" # now for a pretend SAR # load FDA drug list drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt', sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='') # remove those with no SMILES or with a really huge smiles # otherwise R will hang on the macromolecules drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,] drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug # check that lengths are same dim(drugs) # 1467 3 length(drug.objects) # 1467 statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin") drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list # examine statin structures png('statin.structures.png',width=1200,height=600) par(mfrow=c(2,4)) for (statin in statins) { rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin) } dev.off() # get all murcko fragments mfrags = get.murcko.fragments(drug.objects,min.frag.size=3) # get a list of all possible fragments in any of these drugs allfrags = unique(unlist(mfrags)) length(allfrags) # 2035 # get only the ring systems allrings = unique(unlist(lapply(mfrags,"[[",1))) length(allrings) # 556 # convert to a matrix mat = matrix(nrow=length(drug.objects), ncol=length(allrings)) for (i in 1:length(drug.objects)) { mat[i,] = allrings %in% mfrags[[i]]$rings } # aside: if you want rings and frameworks thrown in together, # do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector # lapply then does this to each element of the list mfrags_all = lapply(mfrags,do.call,what=c) view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir allfrags[1:2] # [1] "c1ncc2nc[nH]c2(n1)" "c1ncc2ncn(c2(n1))C3C=CCC3" mfrags[[1]] # $rings # [1] "c1ncc2nc[nH]c2(n1)" # $frameworks # [1] "c1ncc2ncn(c2(n1))C3C=CCC3" "c2nc1[nH]cnc1c(n2)NC3CC3" # [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4" # test that this worked out right allfrags[1:5] %in% mfrags[[1]] # appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not length(allfrags %in% mfrags[[1]]) # 1947 statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin") drugs[tolower(drugs$generic_name) %in% statins,] view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins]) is_statin = tolower(drugs$generic_name) %in% statins pvals = numeric(length(allfrags)) for (j in 1:length(allfrags)) { hasfrag = mat[,j] contingency_table = table(data.frame(is_statin,hasfrag)) pvals[j] = fisher.test(contingency_table)$p.value } qqunif(pvals,pch=19) allfrags[pvals < .00001] # [1] "C=1CCC2CCCC=C2(C=1)" plotsmiles = function(smiles) { par(mfrow=c(1,length(smiles))) for (i in 1:length(smiles)) { temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100) plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i]) rasterImage(temp1,1,1,10,10) } } view.molecule.2d(parse.smiles(allfrags[pvals < .00001])) drugs[is_statin & mat[,pvals < .00001],] # 3 of 7 (lova, prava & simva) contain this # what about the OH OH OH thing which 4/7 have and 5/7 have similar # or the fluorobenzene which 4/7 have view.molecule.2d(parse.smiles(allfrags[pvals < .001])) # is it that fluorobenzene wasn't significantly enriched (because present # in many drugs) or because it wasn't considered a fragment? mfrags[[which(drugs$generic_name=='Rosuvastatin')]] # rings1 rings2 frameworks # "c1cncnc1" "c1ccccc1" "c1cc(ncn1)c2ccccc2" # apparently not a Murcko fragment. start_time = Sys.time() get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7) Sys.time() - start_time # w/ min.frag.size=6, this takes over a minute and produces 107 fragments, # one of which is fluorobenzene: # [47] "Fc1ccccc1" # w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments: Sys.time() - start_time #Time difference of 12.52133 mins # Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122 # 2 guys from Vertex Pharma #We define ring systems to be cycles within #the graph representation of molecules and cycles sharing an #edge (a connection between two atoms or a bond). # -- side chain atoms are separate. # Linker Atoms: Atoms that are on the direct path connecting # two ring systems are defined as linker atoms # Framework. The framework is defined as the union of ring # systems and linkers in a molecule. # promiscuity scores # re-try with only smaller molecules for exhaustive frags drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,] drug.objects = parse.smiles(drugs$smiles) dim(drugs) length(drug.objects) # 1321 start_time = Sys.time() efrags = get.exhaustive.fragments(drug.objects[1:50]) Sys.time() - start_time