from rdkit import DataStructs
from rdkit.Chem import AllChem
import itertools
from rdkit import Chem
import mdtraj as md
import numpy as np
import pandas as pd
from scipy.stats import bootstrap
from sklearn.metrics import mean_absolute_error
[docs]
class Analyser:
"""
Analyser is a utility class for analyzing molecular docking results,
Currently offers computing distances Fe-ligand distances for CYP complexes.
Still has limited funcitonality of only Fe distances calculation and
calculating MAE with bootstrapped errors.
Contains some unused functions for future functionalities of full post-docking
analysis.
This class provides methods to:
- Compute Fe distances between docked and reference ligand/hem structures.
- Facilitate downstream statistical analysis of docking results.
Parameters
----------
workdir : str
The working directory containing the docking and reference results.
Expected subdirectories: 'docked' and 'ref'.
Examples
--------
>>> analyser = Analyser(workdir="./results")
>>> docked_data = ("./results/docked/sample1-docked.pdb", "./results/docked/sample1-hem.pdb")
>>> reference_data = ("./results/ref/sample1-ligand.pdb", "./results/ref/sample1-hem.pdb")
>>> ligand_dist, docked_dist = analyser.get_Fedistance(docked_data, reference_data, kind="Fed1")
>>> print("Ligand Fe distance:", ligand_dist)
>>> print("Docked Fe distance:", docked_dist)
"""
def __init__(self, workdir) -> None:
self.workdir = workdir
[docs]
def get_sample_Fedist(self, sample_names, kind):
"""
Wrapper function to compute Fe distances for a list of sample names.
!! DOES NOT WORK CURRENTLY !!
Main issue is to provide consistent filenaming to traverse over full dataset
of docking resuts. Currently contains hardcoded names for
docked_{ligand, hem} and reference_{ligand, hem} which are not consistent with the rest
of the library.
Args:
sample_names (list of str): List of sample names.
kind (str): The kind of Fe distance to compute ("Fed1", "Fed2", "Fed3").
Returns:
pd.DataFrame: DataFrame with columns ['sample', 'ligand_dist', 'docked_dist'].
"""
results = []
for sample in sample_names:
docked_ligand = f"{self.workdir}/docked/{sample}-docked.pdb"
docked_hem = f"{self.workdir}/docked/{sample}-hem.pdb"
reference_ligand = f"{self.workdir}/ref/{sample}-ligand.pdb"
reference_hem = f"{self.workdir}/ref/{sample}-hem.pdb"
docked_data = (docked_ligand, docked_hem)
reference_data = (reference_ligand, reference_hem)
ligand_dist, docked_dist = self.get_Fedistance(docked_data, reference_data, kind)
results.append({
"sample": sample,
"ligand_dist": ligand_dist,
"docked_dist": docked_dist
})
return pd.DataFrame(results)
[docs]
def get_Fedistance(self, docked_data, reference_data, kind):
docked_ligand, docked_hem = docked_data
reference_ligand, reference_hem = reference_data
try:
docked = md.load(docked_ligand, top=docked_ligand)
ligand = md.load(reference_ligand, top=reference_ligand)
hem = md.load(docked_hem, top=docked_hem)
hemref = md.load(reference_hem, top=reference_hem)
except Exception as e:
print(f"Processing error: {e}")
return np.nan, np.nan
docked = docked[0]
dockedxyz = docked.xyz
ligandxyz = ligand.xyz
Feindex = [atom.index for atom in hem.topology.atoms if atom.element.symbol == 'Fe']
hemxyz = hem.atom_slice(Feindex).xyz
Feindex_ref=[atom.index for atom in hemref.topology.atoms if atom.element.symbol=='Fe']
hemxyz_ref=hemref.atom_slice(Feindex_ref).xyz
distlig = np.linalg.norm(ligandxyz-hemxyz_ref, axis=2)
distdock = np.linalg.norm(dockedxyz-hemxyz, axis=2)
if kind=="Fed1":
return distlig.min(), distdock.min()
elif kind=="Fed2":
try:
idx=np.argmin(distlig)
idxdock=np.argmin(distdock)
# get the closest atom in crystalographic ligand structure
vallig=distlig[0,idx]
# get distance of that atom in docked ligand structure
valdock=distdock[0,idx]
return vallig, valdock
except:
return np.nan, np.nan
elif kind=="Fed3":
return distlig.max(), distdock.max()
[docs]
def bootstrap_mae_error(self, data, ref_data, n_bootstrap=1000, confidence_level=0.95):
def statistic(x):
return np.mean(np.abs(x))
bootstrap_results = bootstrap((data-ref_data,), statistic, n_resamples=n_bootstrap, confidence_level=confidence_level)
ci_lower, ci_upper = bootstrap_results.confidence_interval
initial_error = mean_absolute_error(y_pred=data, y_true=ref_data)
return pd.Series({"initial": initial_error, 'ci_lower': ci_lower, 'ci_upper': ci_upper})
[docs]
def bootstrap_ratio_error(self, Fe_dist_ratio, n_bootstrap=1000, confidence_level=0.95):
def statistic(x):
return np.mean(x)
bootstrap_results = bootstrap((Fe_dist_ratio,), statistic, n_resamples=n_bootstrap, confidence_level=confidence_level)
ci_lower, ci_upper = bootstrap_results.confidence_interval
return pd.Series({"mean": np.mean(Fe_dist_ratio), 'ci_lower': ci_lower, 'ci_upper': ci_upper})
[docs]
def get_similarity_scores(self, smiles_list: list[str]) -> list:
# Handle the case where a single string is provided instead of a list
if isinstance(smiles_list, str):
smiles_list = [smiles_list]
elif not isinstance(smiles_list, list):
raise ValueError("Input must be a list of SMILES strings or a single SMILES string.")
ligand_mol = [Chem.MolFromSmiles(m) for m in smiles_list]
# Create a list of all pairs of ligands
ligand_pairs = list(itertools.combinations(ligand_mol, 2))
# Calculate pairwise Tanimoto similarity
similarity_scores = []
for pair in ligand_pairs:
ligand1, ligand2 = pair
fp1 = AllChem.GetMorganFingerprintAsBitVect(ligand1, radius=2, nBits=2048)
fp2 = AllChem.GetMorganFingerprintAsBitVect(ligand2, radius=2, nBits=2048)
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
similarity_scores.append(similarity)
return similarity_scores
[docs]
def calculate_molecular_weights(self, smiles_list):
"""
Calculate molecular weights for a list of SMILES strings.
Args:
smiles_list (list of str): List of SMILES strings.
Returns:
list of float: Molecular weights for each molecule.
"""
# Handle the case where a single string is provided instead of a list
if isinstance(smiles_list, str):
smiles_list = [smiles_list]
elif not isinstance(smiles_list, list):
raise ValueError("Input must be a list of SMILES strings or a single SMILES string.")
try:
from rdkit import Chem
from rdkit.Chem import Descriptors
except ImportError:
raise ImportError("RDKit is required for this function.")
weights = []
for smi in smiles_list:
mol = Chem.MolFromSmiles(smi)
if mol is not None:
mw = Descriptors.MolWt(mol)
else:
mw = float('nan')
weights.append(mw)
return weights
[docs]
def calculate_num_rotatable_bonds(self, smiles_list):
"""
Calculate the number of rotatable bonds for a list of SMILES strings.
Args:
smiles_list (list of str): List of SMILES strings.
Returns:
list of int: Number of rotatable bonds for each molecule.
"""
# Handle the case where a single string is provided instead of a list
if isinstance(smiles_list, str):
smiles_list = [smiles_list]
elif not isinstance(smiles_list, list):
raise ValueError("Input must be a list of SMILES strings or a single SMILES string.")
try:
from rdkit import Chem
from rdkit.Chem import Descriptors
except ImportError:
raise ImportError("RDKit is required for this function.")
rot_bonds = []
for smi in smiles_list:
mol = Chem.MolFromSmiles(smi)
if mol is not None:
n_rot = Descriptors.NumRotatableBonds(mol)
else:
n_rot = None
rot_bonds.append(n_rot)
return rot_bonds