Created
October 25, 2020 19:05
-
-
Save MShafquat/5376d78151919769c353385101ddd821 to your computer and use it in GitHub Desktop.
Read compounds from multiple SDF files, calculate molecular descriptors for each unique compound, and write the results to a single SDF file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# necessary imports
import glob

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Crippen
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
# Collect every unique compound found in the SDF files below the current
# directory, compute a fixed set of descriptors for each one, and write the
# resulting table to a single SDF file.

# Columns of the descriptor table, in the order they appear in the DataFrame.
COLUMNS = ['ID', 'Smiles', 'Molecular_Formula', 'Molecular_Weight', 'H_Bond_Acceptors',
           'H_Bond_Donors', 'Molar_Refractivity', 'TPSA']

# SMILES strings (as returned by Chem.MolToSmiles) that were already
# processed; a compound whose SMILES is in this set is skipped as a duplicate.
seen_smiles = set()

# One dict per unique compound. Rows are gathered in a plain list and turned
# into a DataFrame once at the end: DataFrame.append copied the whole frame on
# every call and no longer exists in pandas >= 2.0.
rows = []

# NOTE(review): '**' also matches zero directories, so an SDF sitting directly
# in the current directory is picked up too -- including 'new_sdf.sdf' written
# by a previous run of this script. Confirm whether that is intended.
for sdf_path in glob.iglob('./**//*.sdf', recursive=True):
    for mol in Chem.SDMolSupplier(sdf_path):
        # The supplier yields None for records RDKit could not parse; such
        # records cannot be converted to SMILES, so they are skipped.
        if mol is None:
            continue

        smiles = Chem.MolToSmiles(mol)
        if smiles in seen_smiles:
            continue
        seen_smiles.add(smiles)

        rows.append({
            # IDs are 1-based and numbered in the order unique compounds are
            # encountered, zero-padded to five digits.
            'ID': 'Phytochem_' + str(len(rows) + 1).zfill(5),
            'Smiles': smiles,
            'Molecular_Formula': rdMolDescriptors.CalcMolFormula(mol),
            # NOTE(review): CalcExactMolWt is the exact (monoisotopic) mass,
            # not the average molecular weight -- confirm this is the value
            # wanted under the 'Molecular_Weight' column.
            'Molecular_Weight': rdMolDescriptors.CalcExactMolWt(mol),
            'H_Bond_Acceptors': rdMolDescriptors.CalcNumHBA(mol),
            'H_Bond_Donors': rdMolDescriptors.CalcNumHBD(mol),
            'Molar_Refractivity': Crippen.MolMR(mol),
            'TPSA': rdMolDescriptors.CalcTPSA(mol),
        })

# Passing the column list explicitly keeps the expected columns (and their
# order) even when no compound was found.
compounds = pd.DataFrame(rows, columns=COLUMNS)

# Add a molecule column ('ROMol') built from the 'Smiles' column.
PandasTools.AddMoleculeColumnToFrame(compounds, 'Smiles', 'ROMol', includeFingerprints=True)

# Write all compounds, with their descriptor columns as properties, to a
# single SDF file.
PandasTools.WriteSDF(compounds, 'new_sdf.sdf', molColName='ROMol', idName='ID',
                     properties=list(compounds.columns))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment