Last active
May 2, 2020 11:49
-
-
Save ericminikel/8229983 to your computer and use it in GitHub Desktop.
Introductory tutorial on how to use rcdk to conduct an SAR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(rcdk) # chemical informatics functionality in R | |
require(gap) # for qq plots later | |
options(stringsAsFactors=FALSE) | |
setwd('c:/sci/037cinf/analysis/sar/') | |
# plot molecules in R plot window instead of separate Java window | |
rcdkplot = function(molecule,width=500,height=500,marg=0,main='') { | |
par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot | |
temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical | |
plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot | |
rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries | |
} | |
# first look at curcumin | |
curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]] | |
rcdkplot(curcumin) | |
curc.frags = get.murcko.fragments(curcumin) | |
curc.frags | |
#[[1]] | |
#[[1]]$rings | |
#[1] "c1ccccc1" | |
# | |
#[[1]]$frameworks | |
#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2" | |
png('curcumin.murcko.fragments.png',width=600,height=300) | |
par(mfrow=c(1,2)) | |
rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]]) | |
rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]]) | |
dev.off() | |
anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]] | |
rcdkplot(anle138b) | |
anle138b.frags = get.murcko.fragments(anle138b) | |
anle138b.frags | |
# [[1]] | |
# [[1]]$rings | |
# [1] "c1ccccc1" "c1ccc2OCOc2(c1)" | |
# | |
# [[1]]$frameworks | |
# [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)" | |
# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)" | |
png('anle138b.murcko.fragments.png',width=600,height=120) | |
par(mfrow=c(1,5)) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]]) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]]) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]]) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]]) | |
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]]) | |
dev.off() | |
anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3) | |
anle138b.frags[[1]]$rings | |
# [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)" | |
# now for a pretend SAR | |
# load FDA drug list | |
drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt', | |
sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='') | |
# remove those with no SMILES or with a really huge smiles | |
# otherwise R will hang on the macromolecules | |
drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,] | |
drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug | |
# check that lengths are same | |
dim(drugs) # 1467 3 | |
length(drug.objects) # 1467 | |
statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin") | |
drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list | |
# examine statin structures | |
png('statin.structures.png',width=1200,height=600) | |
par(mfrow=c(2,4)) | |
for (statin in statins) { | |
rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin) | |
} | |
dev.off() | |
# get all murcko fragments | |
mfrags = get.murcko.fragments(drug.objects,min.frag.size=3) | |
# get a list of all possible fragments in any of these drugs | |
allfrags = unique(unlist(mfrags)) | |
length(allfrags) # 2035 | |
# get only the ring systems | |
allrings = unique(unlist(lapply(mfrags,"[[",1))) | |
length(allrings) # 556 | |
# convert to a matrix | |
mat = matrix(nrow=length(drug.objects), ncol=length(allrings)) | |
for (i in 1:length(drug.objects)) { | |
mat[i,] = allrings %in% mfrags[[i]]$rings | |
} | |
# aside: if you want rings and frameworks thrown in together, | |
# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector | |
# lapply then does this to each element of the list | |
mfrags_all = lapply(mfrags,do.call,what=c) | |
is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin? | |
pvals = numeric(length(allrings)) # vector to hold p vals | |
for (j in 1:length(allrings)) { # iterate over rings | |
hasfrag = mat[,j] # does each drug contain this fragment? | |
contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table | |
pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence | |
} | |
png('murcko.ring.statin.qqplot.png',width=500,height=500) | |
qqunif(pvals,pch=19,cex.main=.8, | |
main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs') | |
dev.off() | |
png('statins.most.enriched.ring.system.png',width=300,height=300) | |
allrings[pvals < .000001] | |
# [1] "C=1CCC2CCCC=C2(C=1)" | |
# what statins contain it? | |
drugs$generic_name[is_statin & mat[,which(pvals < .000001)]] | |
# "Lovastatin" "Pravastatin" "Simvastatin" | |
# how many other drugs contain it | |
sum(!is_statin & mat[,which(pvals < .000001)]) | |
# 0 | |
rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001]) | |
dev.off() | |
png('frag.freq.hist1.png',width=600,height=400) | |
hist(colSums(mat),col='yellow',breaks=100, | |
xlab='# of molecules containing a given fragment', | |
ylab='# of fragments like this', | |
main='Histogram of fragment frequencies') | |
dev.off() | |
sum(colSums(mat)==1) | |
# [1] 351 | |
sum(colSums(mat[is_statin,]) > 0) | |
# 8 | |
start_time = Sys.time() | |
get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3) | |
Sys.time() - start_time | |
# Time difference of 3.394177 mins |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment