ericminikel · May 2, 2020 11:49
diff --git a/rcdk-sar-tutorial.r b/rcdk-sar-tutorial.r
 require(rcdk) # chemical informatics functionality in R
 require(gap)  # for qq plots later
 options(stringsAsFactors=FALSE)
 setwd('c:/sci/037cinf/analysis/sar/')

 # plot molecules in R plot window instead of separate Java window
 rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
    par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
    temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
    plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
    rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
 }

 # first look at curcumin
 curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]] 
 rcdkplot(curcumin)

 curc.frags = get.murcko.fragments(curcumin)
 curc.frags
 #[[1]]
 #[[1]]$rings
 #[1] "c1ccccc1"
 #
 #[[1]]$frameworks
 #[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
 png('curcumin.murcko.fragments.png',width=600,height=300)
 par(mfrow=c(1,2))
 rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
 rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
 dev.off()

 anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
 rcdkplot(anle138b)

 anle138b.frags = get.murcko.fragments(anle138b)
 anle138b.frags
 # [[1]]
 # [[1]]$rings
 # [1] "c1ccccc1"        "c1ccc2OCOc2(c1)"
 # 
 # [[1]]$frameworks
 # [1] "c1cccc(c1)c2n[nH]cc2"                  "c1n[nH]c(c1)c2ccc3OCOc3(c2)"          
 # [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
 png('anle138b.murcko.fragments.png',width=600,height=120)
 par(mfrow=c(1,5))
 rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
 rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
 rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
 rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
 rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
 dev.off()

 anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
 anle138b.frags[[1]]$rings
 # [1] "c1ccccc1"        "c1n[nH]cc1"      "c1ccc2OCOc2(c1)"

 # now for a pretend SAR

 # load FDA drug list
 drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
                   sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
 # remove those with no SMILES or with a really huge smiles
 # otherwise R will hang on the macromolecules
 drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
 drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
 # check that lengths are same
 dim(drugs) # 1467 3
 length(drug.objects) # 1467

 statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
 drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list

 # examine statin structures
 png('statin.structures.png',width=1200,height=600)
 par(mfrow=c(2,4))
 for (statin in statins) {
    rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
 }
 dev.off()

 # get all murcko fragments
 mfrags = get.murcko.fragments(drug.objects,min.frag.size=3) 

 # get a list of all possible fragments in any of these drugs
 allfrags = unique(unlist(mfrags))
 length(allfrags) # 2035

 # get only the ring systems
 allrings = unique(unlist(lapply(mfrags,"[[",1)))
 length(allrings) # 556


 # convert to a matrix
 mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
 for (i in 1:length(drug.objects)) {
    mat[i,] = allrings %in% mfrags[[i]]$rings
 }

 # aside: if you want rings and frameworks thrown in together, 
 # do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
 # lapply then does this to each element of the list
 mfrags_all = lapply(mfrags,do.call,what=c)

 is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin?
 pvals = numeric(length(allrings)) # vector to hold p vals
 for (j in 1:length(allrings)) { # iterate over rings
    hasfrag = mat[,j] # does each drug contain this fragment?
    contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table
    pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence
 }
 png('murcko.ring.statin.qqplot.png',width=500,height=500)
 qqunif(pvals,pch=19,cex.main=.8,
    main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs')
 dev.off()

 png('statins.most.enriched.ring.system.png',width=300,height=300)
 allrings[pvals < .000001]
 # [1] "C=1CCC2CCCC=C2(C=1)"
 # what statins contain it?
 drugs$generic_name[is_statin & mat[,which(pvals < .000001)]]
 # "Lovastatin"  "Pravastatin" "Simvastatin"
 # how many other drugs contain it
 sum(!is_statin & mat[,which(pvals < .000001)])
 # 0
 rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001])
 dev.off()

 png('frag.freq.hist1.png',width=600,height=400)
 hist(colSums(mat),col='yellow',breaks=100,
    xlab='# of molecules containing a given fragment',
    ylab='# of fragments like this',
    main='Histogram of fragment frequencies')
 dev.off()

 sum(colSums(mat)==1)
 # [1] 351

 sum(colSums(mat[is_statin,]) > 0)
 # 8

 start_time = Sys.time()
 get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3)
 Sys.time() - start_time
 # Time difference of 3.394177 mins
	require(rcdk) # chemical informatics functionality in R
	require(gap) # for qq plots later
	options(stringsAsFactors=FALSE)
	setwd('c:/sci/037cinf/analysis/sar/')

	# plot molecules in R plot window instead of separate Java window
	rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
	par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
	temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
	plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
	rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
	}

	# first look at curcumin
	curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]]
	rcdkplot(curcumin)

	curc.frags = get.murcko.fragments(curcumin)
	curc.frags
	#[[1]]
	#[[1]]$rings
	#[1] "c1ccccc1"
	#
	#[[1]]$frameworks
	#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
	png('curcumin.murcko.fragments.png',width=600,height=300)
	par(mfrow=c(1,2))
	rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
	rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
	dev.off()

	anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
	rcdkplot(anle138b)

	anle138b.frags = get.murcko.fragments(anle138b)
	anle138b.frags
	# [[1]]
	# [[1]]$rings
	# [1] "c1ccccc1" "c1ccc2OCOc2(c1)"
	#
	# [[1]]$frameworks
	# [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)"
	# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
	png('anle138b.murcko.fragments.png',width=600,height=120)
	par(mfrow=c(1,5))
	rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
	dev.off()

	anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
	anle138b.frags[[1]]$rings
	# [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)"

	# now for a pretend SAR

	# load FDA drug list
	drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
	sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
	# remove those with no SMILES or with a really huge smiles
	# otherwise R will hang on the macromolecules
	drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
	drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
	# check that lengths are same
	dim(drugs) # 1467 3
	length(drug.objects) # 1467

	statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
	drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list

	# examine statin structures
	png('statin.structures.png',width=1200,height=600)
	par(mfrow=c(2,4))
	for (statin in statins) {
	rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
	}
	dev.off()

	# get all murcko fragments
	mfrags = get.murcko.fragments(drug.objects,min.frag.size=3)

	# get a list of all possible fragments in any of these drugs
	allfrags = unique(unlist(mfrags))
	length(allfrags) # 2035

	# get only the ring systems
	allrings = unique(unlist(lapply(mfrags,"[[",1)))
	length(allrings) # 556


	# convert to a matrix
	mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
	for (i in 1:length(drug.objects)) {
	mat[i,] = allrings %in% mfrags[[i]]$rings
	}

	# aside: if you want rings and frameworks thrown in together,
	# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
	# lapply then does this to each element of the list
	mfrags_all = lapply(mfrags,do.call,what=c)

	is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin?
	pvals = numeric(length(allrings)) # vector to hold p vals
	for (j in 1:length(allrings)) { # iterate over rings
	hasfrag = mat[,j] # does each drug contain this fragment?
	contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table
	pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence
	}
	png('murcko.ring.statin.qqplot.png',width=500,height=500)
	qqunif(pvals,pch=19,cex.main=.8,
	main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs')
	dev.off()

	png('statins.most.enriched.ring.system.png',width=300,height=300)
	allrings[pvals < .000001]
	# [1] "C=1CCC2CCCC=C2(C=1)"
	# what statins contain it?
	drugs$generic_name[is_statin & mat[,which(pvals < .000001)]]
	# "Lovastatin" "Pravastatin" "Simvastatin"
	# how many other drugs contain it
	sum(!is_statin & mat[,which(pvals < .000001)])
	# 0
	rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001])
	dev.off()

	png('frag.freq.hist1.png',width=600,height=400)
	hist(colSums(mat),col='yellow',breaks=100,
	xlab='# of molecules containing a given fragment',
	ylab='# of fragments like this',
	main='Histogram of fragment frequencies')
	dev.off()

	sum(colSums(mat)==1)
	# [1] 351

	sum(colSums(mat[is_statin,]) > 0)
	# 8

	start_time = Sys.time()
	get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3)
	Sys.time() - start_time
	# Time difference of 3.394177 mins