ericminikel · May 2, 2020 11:49
diff --git a/rcdk-sar-tutorial.r b/rcdk-sar-tutorial.r
 require(rcdk) # chemical informatics functionality in R
 require(gap)  # for qq plots later
 options(stringsAsFactors=FALSE)
 setwd('c:/sci/037cinf/analysis/sar/')

 # plot molecules in R plot window instead of separate Java window
 rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
    par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
    temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
    plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
    rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
 }

 # first look at curcumin
 curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]] 
 rcdkplot(curcumin)

 curc.frags = get.murcko.fragments(curcumin)
 curc.frags
 #[[1]]
 #[[1]]$rings
 #[1] "c1ccccc1"
 #
 #[[1]]$frameworks
 #[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
 png('curcumin.murcko.fragments.png',width=600,height=300)
 par(mfrow=c(1,2))
 rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
 rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
 dev.off()

 anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
 rcdkplot(anle138b)

 anle138b.frags = get.murcko.fragments(anle138b)
 anle138b.frags
 # [[1]]
 # [[1]]$rings
 # [1] "c1ccccc1"        "c1ccc2OCOc2(c1)"
 # 
 # [[1]]$frameworks
 # [1] "c1cccc(c1)c2n[nH]cc2"                  "c1n[nH]c(c1)c2ccc3OCOc3(c2)"          
 # [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
 png('anle138b.murcko.fragments.png',width=600,height=120)
 par(mfrow=c(1,5))
 rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
 rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
 rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
 rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
 rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
 dev.off()

 anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
 anle138b.frags[[1]]$rings
 # [1] "c1ccccc1"        "c1n[nH]cc1"      "c1ccc2OCOc2(c1)"

 # now for a pretend SAR

 # load FDA drug list
 drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
                   sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
 # remove those with no SMILES or with a really huge smiles
 # otherwise R will hang on the macromolecules
 drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
 drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
 # check that lengths are same
 dim(drugs) # 1467 3
 length(drug.objects) # 1467

 statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
 drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list

 # examine statin structures
 png('statin.structures.png',width=1200,height=600)
 par(mfrow=c(2,4))
 for (statin in statins) {
    rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
 }
 dev.off()

 # get all murcko fragments
 mfrags = get.murcko.fragments(drug.objects,min.frag.size=3) 

 # get a list of all possible fragments in any of these drugs
 allfrags = unique(unlist(mfrags))
 length(allfrags) # 2035

 # get only the ring systems
 allrings = unique(unlist(lapply(mfrags,"[[",1)))
 length(allrings) # 556


 # convert to a matrix
 mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
 for (i in 1:length(drug.objects)) {
    mat[i,] = allrings %in% mfrags[[i]]$rings
 }

 # aside: if you want rings and frameworks thrown in together, 
 # do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
 # lapply then does this to each element of the list
 mfrags_all = lapply(mfrags,do.call,what=c)



 view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir
 view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir

 allfrags[1:2]
 # [1] "c1ncc2nc[nH]c2(n1)"        "c1ncc2ncn(c2(n1))C3C=CCC3"

 mfrags[[1]]
 # $rings
 # [1] "c1ncc2nc[nH]c2(n1)"
 # $frameworks
 # [1] "c1ncc2ncn(c2(n1))C3C=CCC3"         "c2nc1[nH]cnc1c(n2)NC3CC3"         
 # [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4"


 # test that this worked out right
 allfrags[1:5] %in% mfrags[[1]]
 # appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not

 length(allfrags %in% mfrags[[1]])
 # 1947

 statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
 drugs[tolower(drugs$generic_name) %in% statins,]
 view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins])

 is_statin = tolower(drugs$generic_name) %in% statins
 pvals = numeric(length(allfrags))
 for (j in 1:length(allfrags)) {
    hasfrag = mat[,j]
    contingency_table = table(data.frame(is_statin,hasfrag))
    pvals[j] = fisher.test(contingency_table)$p.value
 }

 qqunif(pvals,pch=19)

 allfrags[pvals < .00001]
 # [1] "C=1CCC2CCCC=C2(C=1)"

 plotsmiles = function(smiles) {
    par(mfrow=c(1,length(smiles))) 
    for (i in 1:length(smiles)) {
        temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100) 
        plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i])
        rasterImage(temp1,1,1,10,10)
    }
 }



 view.molecule.2d(parse.smiles(allfrags[pvals < .00001]))

 drugs[is_statin & mat[,pvals < .00001],]
 # 3 of 7 (lova, prava & simva) contain this
 # what about the OH OH OH thing which 4/7 have and 5/7 have similar
 # or the fluorobenzene which 4/7 have

 view.molecule.2d(parse.smiles(allfrags[pvals < .001]))

 # is it that fluorobenzene wasn't significantly enriched (because present
 # in many drugs) or because it wasn't considered a fragment?
 mfrags[[which(drugs$generic_name=='Rosuvastatin')]]
 #              rings1               rings2           frameworks 
 #          "c1cncnc1"           "c1ccccc1" "c1cc(ncn1)c2ccccc2" 
 # apparently not a Murcko fragment.

 start_time = Sys.time()
 get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7)
 Sys.time() - start_time
 # w/ min.frag.size=6, this takes over a minute and produces 107 fragments, 
 # one of which is fluorobenzene:
 #  [47] "Fc1ccccc1"                              

 # w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments:
 Sys.time() - start_time
 #Time difference of 12.52133 mins

 # Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122
 # 2 guys from Vertex Pharma
 #We define ring systems to be cycles within
 #the graph representation of molecules and cycles sharing an
 #edge (a connection between two atoms or a bond).
 # -- side chain atoms are separate. 

 # Linker Atoms: Atoms that are on the direct path connecting
 # two ring systems are defined as linker atoms

 # Framework. The framework is defined as the union of ring
 # systems and linkers in a molecule.
 # promiscuity scores

 # re-try with only smaller molecules for exhaustive frags
 drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,]
 drug.objects = parse.smiles(drugs$smiles)
 dim(drugs)
 length(drug.objects) # 1321

 start_time = Sys.time()
 efrags = get.exhaustive.fragments(drug.objects[1:50]) 
 Sys.time() - start_time
	require(rcdk) # chemical informatics functionality in R
	require(gap) # for qq plots later
	options(stringsAsFactors=FALSE)
	setwd('c:/sci/037cinf/analysis/sar/')

	# plot molecules in R plot window instead of separate Java window
	rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
	par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
	temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
	plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
	rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
	}

	# first look at curcumin
	curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]]
	rcdkplot(curcumin)

	curc.frags = get.murcko.fragments(curcumin)
	curc.frags
	#[[1]]
	#[[1]]$rings
	#[1] "c1ccccc1"
	#
	#[[1]]$frameworks
	#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
	png('curcumin.murcko.fragments.png',width=600,height=300)
	par(mfrow=c(1,2))
	rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
	rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
	dev.off()

	anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
	rcdkplot(anle138b)

	anle138b.frags = get.murcko.fragments(anle138b)
	anle138b.frags
	# [[1]]
	# [[1]]$rings
	# [1] "c1ccccc1" "c1ccc2OCOc2(c1)"
	#
	# [[1]]$frameworks
	# [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)"
	# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
	png('anle138b.murcko.fragments.png',width=600,height=120)
	par(mfrow=c(1,5))
	rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
	dev.off()

	anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
	anle138b.frags[[1]]$rings
	# [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)"

	# now for a pretend SAR

	# load FDA drug list
	drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
	sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
	# remove those with no SMILES or with a really huge smiles
	# otherwise R will hang on the macromolecules
	drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
	drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
	# check that lengths are same
	dim(drugs) # 1467 3
	length(drug.objects) # 1467

	statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
	drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list

	# examine statin structures
	png('statin.structures.png',width=1200,height=600)
	par(mfrow=c(2,4))
	for (statin in statins) {
	rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
	}
	dev.off()

	# get all murcko fragments
	mfrags = get.murcko.fragments(drug.objects,min.frag.size=3)

	# get a list of all possible fragments in any of these drugs
	allfrags = unique(unlist(mfrags))
	length(allfrags) # 2035

	# get only the ring systems
	allrings = unique(unlist(lapply(mfrags,"[[",1)))
	length(allrings) # 556


	# convert to a matrix
	mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
	for (i in 1:length(drug.objects)) {
	mat[i,] = allrings %in% mfrags[[i]]$rings
	}

	# aside: if you want rings and frameworks thrown in together,
	# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
	# lapply then does this to each element of the list
	mfrags_all = lapply(mfrags,do.call,what=c)



	view.molecule.2d(parse.smiles(drugs$smiles[1])) # Abacavir
	view.molecule.2d(parse.smiles(unlist(mfrags[[1]]))) # Murcko fragments of Abacavir

	allfrags[1:2]
	# [1] "c1ncc2nc[nH]c2(n1)" "c1ncc2ncn(c2(n1))C3C=CCC3"

	mfrags[[1]]
	# $rings
	# [1] "c1ncc2nc[nH]c2(n1)"
	# $frameworks
	# [1] "c1ncc2ncn(c2(n1))C3C=CCC3" "c2nc1[nH]cnc1c(n2)NC3CC3"
	# [3] "c2nc(NC1CC1)c3ncn(c3(n2))C4C=CCC4"


	# test that this worked out right
	allfrags[1:5] %in% mfrags[[1]]
	# appropriately, the first 4 are all in Abacavir and the 5th (which came from Drug #2) is not

	length(allfrags %in% mfrags[[1]])
	# 1947

	statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
	drugs[tolower(drugs$generic_name) %in% statins,]
	view.molecule.2d(drug.objects[tolower(drugs$generic_name) %in% statins])

	is_statin = tolower(drugs$generic_name) %in% statins
	pvals = numeric(length(allfrags))
	for (j in 1:length(allfrags)) {
	hasfrag = mat[,j]
	contingency_table = table(data.frame(is_statin,hasfrag))
	pvals[j] = fisher.test(contingency_table)$p.value
	}

	qqunif(pvals,pch=19)

	allfrags[pvals < .00001]
	# [1] "C=1CCC2CCCC=C2(C=1)"

	plotsmiles = function(smiles) {
	par(mfrow=c(1,length(smiles)))
	for (i in 1:length(smiles)) {
	temp1 = view.image.2d(parse.smiles(smiles[i])[[1]],100,100)
	plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=smiles[i])
	rasterImage(temp1,1,1,10,10)
	}
	}



	view.molecule.2d(parse.smiles(allfrags[pvals < .00001]))

	drugs[is_statin & mat[,pvals < .00001],]
	# 3 of 7 (lova, prava & simva) contain this
	# what about the OH OH OH thing which 4/7 have and 5/7 have similar
	# or the fluorobenzene which 4/7 have

	view.molecule.2d(parse.smiles(allfrags[pvals < .001]))

	# is it that fluorobenzene wasn't significantly enriched (because present
	# in many drugs) or because it wasn't considered a fragment?
	mfrags[[which(drugs$generic_name=='Rosuvastatin')]]
	# rings1 rings2 frameworks
	# "c1cncnc1" "c1ccccc1" "c1cc(ncn1)c2ccccc2"
	# apparently not a Murcko fragment.

	start_time = Sys.time()
	get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=7)
	Sys.time() - start_time
	# w/ min.frag.size=6, this takes over a minute and produces 107 fragments,
	# one of which is fluorobenzene:
	# [47] "Fc1ccccc1"

	# w/ min.frag.size=7, takes 12 minutes to do just Rosuvastatin and get 102 fragments:
	Sys.time() - start_time
	#Time difference of 12.52133 mins

	# Murcko frags appear to be from Bemis & Murcko 1996 http://www.ncbi.nlm.nih.gov/pubmed/8709122
	# 2 guys from Vertex Pharma
	#We define ring systems to be cycles within
	#the graph representation of molecules and cycles sharing an
	#edge (a connection between two atoms or a bond).
	# -- side chain atoms are separate.

	# Linker Atoms: Atoms that are on the direct path connecting
	# two ring systems are defined as linker atoms

	# Framework. The framework is defined as the union of ring
	# systems and linkers in a molecule.
	# promiscuity scores

	# re-try with only smaller molecules for exhaustive frags
	drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 50,]
	drug.objects = parse.smiles(drugs$smiles)
	dim(drugs)
	length(drug.objects) # 1321

	start_time = Sys.time()
	efrags = get.exhaustive.fragments(drug.objects[1:50])
	Sys.time() - start_time