Last active
May 2, 2020 11:49
-
-
Save ericminikel/8229983 to your computer and use it in GitHub Desktop.
Introductory tutorial on how to use rcdk to conduct an SAR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(rcdk) # chemical informatics functionality in R | |
require(gap) # for qq plots later | |
options(stringsAsFactors=FALSE) | |
setwd('c:/sci/037cinf/analysis/sar/') | |
# plot molecules in R plot window instead of separate Java window | |
rcdkplot = function(molecule,width=500,height=500,marg=0,main='') { | |
par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot | |
temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical | |
plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot | |
rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries | |
} | |
# first look at curcumin | |
curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]] | |
rcdkplot(curcumin) | |
curc.frags = get.murcko.fragments(curcumin) | |
curc.frags | |
#[[1]] | |
#[[1]]$rings | |
#[1] "c1ccccc1" | |
# | |
#[[1]]$frameworks | |
#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2" | |
png('curcumin.murcko.fragments.png',width=600,height=300) | |
par(mfrow=c(1,2)) | |
rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]]) | |
rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]]) | |
dev.off() | |
anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]] | |
rcdkplot(anle138b) | |
anle138b.frags = get.murcko.fragments(anle138b) | |
anle138b.frags | |
# [[1]] | |
# [[1]]$rings | |
# [1] "c1ccccc1" "c1ccc2OCOc2(c1)" | |
# | |
# [[1]]$frameworks | |
# [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)" | |
# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)" | |
png('anle138b.murcko.fragments.png',width=600,height=120) | |
par(mfrow=c(1,5)) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]]) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]]) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]]) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]]) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]]) | |
dev.off() | |
anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3) | |
anle138b.frags[[1]]$rings | |
# [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)" | |
# now for a pretend SAR | |
# load FDA drug list | |
drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt', | |
sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='') | |
# remove those with no SMILES or with a really huge smiles | |
# otherwise R will hang on the macromolecules | |
drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,] | |
drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug | |
# check that lengths are same | |
dim(drugs) # 1467 3 | |
length(drug.objects) # 1467 | |
statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin") | |
drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list | |
# examine statin structures | |
png('statin.structures.png',width=1200,height=600) | |
par(mfrow=c(2,4)) | |
for (statin in statins) { | |
rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin) | |
} | |
dev.off() | |
# get all murcko fragments | |
mfrags = get.murcko.fragments(drug.objects,min.frag.size=3) | |
# get a list of all possible fragments in any of these drugs | |
allfrags = unique(unlist(mfrags)) | |
length(allfrags) # 2035 | |
# get only the ring systems | |
allrings = unique(unlist(lapply(mfrags,"[[",1))) | |
length(allrings) # 556 | |
# convert to a matrix | |
mat = matrix(nrow=length(drug.objects), ncol=length(allrings)) | |
for (i in 1:length(drug.objects)) { | |
mat[i,] = allrings %in% mfrags[[i]]$rings | |
} | |
# aside: if you want rings and frameworks thrown in together, | |
# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector | |
# lapply then does this to each element of the list | |
mfrags_all = lapply(mfrags,do.call,what=c) | |
view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir | |
view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir | |
allfrags[1:2] | |
# [1] "c1ncc2nc[nH]c2(n1)" "c1ncc2ncn(c2(n1))C3C=CCC3" | |
mfrags[[1]] | |
# $rings | |
# [1] "c1ncc2nc[nH]c2(n1)" | |
# $frameworks | |
# [1] "c1ncc2ncn(c2(n1))C3C=CCC3" "c2nc1[nH]cnc1c(n2)NC3CC3" | |
# [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4" | |
# test that this worked out right | |
allfrags[1:5] %in% mfrags[[1]] | |
# appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not | |
length(allfrags %in% mfrags[[1]]) | |
# 1947 | |
statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin") | |
drugs[tolower(drugs$generic_name) %in% statins,] | |
view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins]) | |
is_statin = tolower(drugs$generic_name) %in% statins | |
pvals = numeric(length(allfrags)) | |
for (j in 1:length(allfrags)) { | |
hasfrag = mat[,j] | |
contingency_table = table(data.frame(is_statin,hasfrag)) | |
pvals[j] = fisher.test(contingency_table)$p.value | |
} | |
qqunif(pvals,pch=19) | |
allfrags[pvals < .00001] | |
# [1] "C=1CCC2CCCC=C2(C=1)" | |
plotsmiles = function(smiles) { | |
par(mfrow=c(1,length(smiles))) | |
for (i in 1:length(smiles)) { | |
temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100) | |
plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i]) | |
rasterImage(temp1,1,1,10,10) | |
} | |
} | |
view.molecule.2d(parse.smiles(allfrags[pvals < .00001])) | |
drugs[is_statin & mat[,pvals < .00001],] | |
# 3 of 7 (lova, prava & simva) contain this | |
# what about the OH OH OH thing which 4/7 have and 5/7 have similar | |
# or the fluorobenzene which 4/7 have | |
view.molecule.2d(parse.smiles(allfrags[pvals < .001])) | |
# is it that fluorobenzene wasn't significantly enriched (because present | |
# in many drugs) or because it wasn't considered a fragment? | |
mfrags[[which(drugs$generic_name=='Rosuvastatin')]] | |
# rings1 rings2 frameworks | |
# "c1cncnc1" "c1ccccc1" "c1cc(ncn1)c2ccccc2" | |
# apparently not a Murcko fragment. | |
start_time = Sys.time() | |
get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7) | |
Sys.time() - start_time | |
# w/ min.frag.size=6, this takes over a minute and produces 107 fragments, | |
# one of which is fluorobenzene: | |
# [47] "Fc1ccccc1" | |
# w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments: | |
Sys.time() - start_time | |
#Time difference of 12.52133 mins | |
# Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122 | |
# 2 guys from Vertex Pharma | |
#We define ring systems to be cycles within | |
#the graph representation of molecules and cycles sharing an | |
#edge (a connection between two atoms or a bond). | |
# -- side chain atoms are separate. | |
# Linker Atoms: Atoms that are on the direct path connecting | |
# two ring systems are defined as linker atoms | |
# Framework. The framework is defined as the union of ring | |
# systems and linkers in a molecule. | |
# promiscuity scores | |
# re-try with only smaller molecules for exhaustive frags | |
drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,] | |
drug.objects = parse.smiles(drugs$smiles) | |
dim(drugs) | |
length(drug.objects) # 1321 | |
start_time = Sys.time() | |
efrags = get.exhaustive.fragments(drug.objects[1:50]) | |
Sys.time() - start_time |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment