Skip to content

Instantly share code, notes, and snippets.

@ericminikel
Last active May 2, 2020 11:49
Show Gist options
  • Save ericminikel/8229983 to your computer and use it in GitHub Desktop.
Save ericminikel/8229983 to your computer and use it in GitHub Desktop.
Introductory tutorial on how to use rcdk to conduct an SAR
require(rcdk) # chemical informatics functionality in R
require(gap) # for qq plots later
options(stringsAsFactors=FALSE)
setwd('c:/sci/037cinf/analysis/sar/')
# plot molecules in R plot window instead of separate Java window
rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
}
# first look at curcumin
curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]]
rcdkplot(curcumin)
curc.frags = get.murcko.fragments(curcumin)
curc.frags
#[[1]]
#[[1]]$rings
#[1] "c1ccccc1"
#
#[[1]]$frameworks
#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
png('curcumin.murcko.fragments.png',width=600,height=300)
par(mfrow=c(1,2))
rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
dev.off()
anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
rcdkplot(anle138b)
anle138b.frags = get.murcko.fragments(anle138b)
anle138b.frags
# [[1]]
# [[1]]$rings
# [1] "c1ccccc1" "c1ccc2OCOc2(c1)"
#
# [[1]]$frameworks
# [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)"
# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
png('anle138b.murcko.fragments.png',width=600,height=120)
par(mfrow=c(1,5))
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
dev.off()
anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
anle138b.frags[[1]]$rings
# [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)"
# now for a pretend SAR
# load FDA drug list
drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
# remove those with no SMILES or with a really huge smiles
# otherwise R will hang on the macromolecules
drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
# check that lengths are same
dim(drugs) # 1467 3
length(drug.objects) # 1467
statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list
# examine statin structures
png('statin.structures.png',width=1200,height=600)
par(mfrow=c(2,4))
for (statin in statins) {
rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
}
dev.off()
# get all murcko fragments
mfrags = get.murcko.fragments(drug.objects,min.frag.size=3)
# get a list of all possible fragments in any of these drugs
allfrags = unique(unlist(mfrags))
length(allfrags) # 2035
# get only the ring systems
allrings = unique(unlist(lapply(mfrags,"[[",1)))
length(allrings) # 556
# convert to a matrix
mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
for (i in 1:length(drug.objects)) {
mat[i,] = allrings %in% mfrags[[i]]$rings
}
# aside: if you want rings and frameworks thrown in together,
# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
# lapply then does this to each element of the list
mfrags_all = lapply(mfrags,do.call,what=c)
view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir
view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir
allfrags[1:2]
# [1] "c1ncc2nc[nH]c2(n1)" "c1ncc2ncn(c2(n1))C3C=CCC3"
mfrags[[1]]
# $rings
# [1] "c1ncc2nc[nH]c2(n1)"
# $frameworks
# [1] "c1ncc2ncn(c2(n1))C3C=CCC3" "c2nc1[nH]cnc1c(n2)NC3CC3"
# [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4"
# test that this worked out right
allfrags[1:5] %in% mfrags[[1]]
# appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not
length(allfrags %in% mfrags[[1]])
# 1947
statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
drugs[tolower(drugs$generic_name) %in% statins,]
view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins])
is_statin = tolower(drugs$generic_name) %in% statins
pvals = numeric(length(allfrags))
for (j in 1:length(allfrags)) {
hasfrag = mat[,j]
contingency_table = table(data.frame(is_statin,hasfrag))
pvals[j] = fisher.test(contingency_table)$p.value
}
qqunif(pvals,pch=19)
allfrags[pvals < .00001]
# [1] "C=1CCC2CCCC=C2(C=1)"
plotsmiles = function(smiles) {
par(mfrow=c(1,length(smiles)))
for (i in 1:length(smiles)) {
temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100)
plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i])
rasterImage(temp1,1,1,10,10)
}
}
view.molecule.2d(parse.smiles(allfrags[pvals < .00001]))
drugs[is_statin & mat[,pvals < .00001],]
# 3 of 7 (lova, prava & simva) contain this
# what about the OH OH OH thing which 4/7 have and 5/7 have similar
# or the fluorobenzene which 4/7 have
view.molecule.2d(parse.smiles(allfrags[pvals < .001]))
# is it that fluorobenzene wasn't significantly enriched (because present
# in many drugs) or because it wasn't considered a fragment?
mfrags[[which(drugs$generic_name=='Rosuvastatin')]]
# rings1 rings2 frameworks
# "c1cncnc1" "c1ccccc1" "c1cc(ncn1)c2ccccc2"
# apparently not a Murcko fragment.
start_time = Sys.time()
get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7)
Sys.time() - start_time
# w/ min.frag.size=6, this takes over a minute and produces 107 fragments,
# one of which is fluorobenzene:
# [47] "Fc1ccccc1"
# w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments:
Sys.time() - start_time
#Time difference of 12.52133 mins
# Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122
# 2 guys from Vertex Pharma
#We define ring systems to be cycles within
#the graph representation of molecules and cycles sharing an
#edge (a connection between two atoms or a bond).
# -- side chain atoms are separate.
# Linker Atoms: Atoms that are on the direct path connecting
# two ring systems are defined as linker atoms
# Framework. The framework is defined as the union of ring
# systems and linkers in a molecule.
# promiscuity scores
# re-try with only smaller molecules for exhaustive frags
drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,]
drug.objects = parse.smiles(drugs$smiles)
dim(drugs)
length(drug.objects) # 1321
start_time = Sys.time()
efrags = get.exhaustive.fragments(drug.objects[1:50])
Sys.time() - start_time
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment