Source code for docktopus.analyser

from rdkit import DataStructs
from rdkit.Chem import AllChem
import itertools
from rdkit import Chem
import mdtraj as md
import numpy as np
import pandas as pd
from scipy.stats import bootstrap
from sklearn.metrics import mean_absolute_error


[docs] class Analyser: """ Analyser is a utility class for analyzing molecular docking results, Currently offers computing distances Fe-ligand distances for CYP complexes. Still has limited funcitonality of only Fe distances calculation and calculating MAE with bootstrapped errors. Contains some unused functions for future functionalities of full post-docking analysis. This class provides methods to: - Compute Fe distances between docked and reference ligand/hem structures. - Facilitate downstream statistical analysis of docking results. Parameters ---------- workdir : str The working directory containing the docking and reference results. Expected subdirectories: 'docked' and 'ref'. Examples -------- >>> analyser = Analyser(workdir="./results") >>> docked_data = ("./results/docked/sample1-docked.pdb", "./results/docked/sample1-hem.pdb") >>> reference_data = ("./results/ref/sample1-ligand.pdb", "./results/ref/sample1-hem.pdb") >>> ligand_dist, docked_dist = analyser.get_Fedistance(docked_data, reference_data, kind="Fed1") >>> print("Ligand Fe distance:", ligand_dist) >>> print("Docked Fe distance:", docked_dist) """ def __init__(self, workdir) -> None: self.workdir = workdir
[docs] def get_sample_Fedist(self, sample_names, kind): """ Wrapper function to compute Fe distances for a list of sample names. !! DOES NOT WORK CURRENTLY !! Main issue is to provide consistent filenaming to traverse over full dataset of docking resuts. Currently contains hardcoded names for docked_{ligand, hem} and reference_{ligand, hem} which are not consistent with the rest of the library. Args: sample_names (list of str): List of sample names. kind (str): The kind of Fe distance to compute ("Fed1", "Fed2", "Fed3"). Returns: pd.DataFrame: DataFrame with columns ['sample', 'ligand_dist', 'docked_dist']. """ results = [] for sample in sample_names: docked_ligand = f"{self.workdir}/docked/{sample}-docked.pdb" docked_hem = f"{self.workdir}/docked/{sample}-hem.pdb" reference_ligand = f"{self.workdir}/ref/{sample}-ligand.pdb" reference_hem = f"{self.workdir}/ref/{sample}-hem.pdb" docked_data = (docked_ligand, docked_hem) reference_data = (reference_ligand, reference_hem) ligand_dist, docked_dist = self.get_Fedistance(docked_data, reference_data, kind) results.append({ "sample": sample, "ligand_dist": ligand_dist, "docked_dist": docked_dist }) return pd.DataFrame(results)
[docs] def get_Fedistance(self, docked_data, reference_data, kind): docked_ligand, docked_hem = docked_data reference_ligand, reference_hem = reference_data try: docked = md.load(docked_ligand, top=docked_ligand) ligand = md.load(reference_ligand, top=reference_ligand) hem = md.load(docked_hem, top=docked_hem) hemref = md.load(reference_hem, top=reference_hem) except Exception as e: print(f"Processing error: {e}") return np.nan, np.nan docked = docked[0] dockedxyz = docked.xyz ligandxyz = ligand.xyz Feindex = [atom.index for atom in hem.topology.atoms if atom.element.symbol == 'Fe'] hemxyz = hem.atom_slice(Feindex).xyz Feindex_ref=[atom.index for atom in hemref.topology.atoms if atom.element.symbol=='Fe'] hemxyz_ref=hemref.atom_slice(Feindex_ref).xyz distlig = np.linalg.norm(ligandxyz-hemxyz_ref, axis=2) distdock = np.linalg.norm(dockedxyz-hemxyz, axis=2) if kind=="Fed1": return distlig.min(), distdock.min() elif kind=="Fed2": try: idx=np.argmin(distlig) idxdock=np.argmin(distdock) # get the closest atom in crystalographic ligand structure vallig=distlig[0,idx] # get distance of that atom in docked ligand structure valdock=distdock[0,idx] return vallig, valdock except: return np.nan, np.nan elif kind=="Fed3": return distlig.max(), distdock.max()
[docs] def bootstrap_mae_error(self, data, ref_data, n_bootstrap=1000, confidence_level=0.95): def statistic(x): return np.mean(np.abs(x)) bootstrap_results = bootstrap((data-ref_data,), statistic, n_resamples=n_bootstrap, confidence_level=confidence_level) ci_lower, ci_upper = bootstrap_results.confidence_interval initial_error = mean_absolute_error(y_pred=data, y_true=ref_data) return pd.Series({"initial": initial_error, 'ci_lower': ci_lower, 'ci_upper': ci_upper})
[docs] def bootstrap_ratio_error(self, Fe_dist_ratio, n_bootstrap=1000, confidence_level=0.95): def statistic(x): return np.mean(x) bootstrap_results = bootstrap((Fe_dist_ratio,), statistic, n_resamples=n_bootstrap, confidence_level=confidence_level) ci_lower, ci_upper = bootstrap_results.confidence_interval return pd.Series({"mean": np.mean(Fe_dist_ratio), 'ci_lower': ci_lower, 'ci_upper': ci_upper})
[docs] def get_similarity_scores(self, smiles_list: list[str]) -> list: # Handle the case where a single string is provided instead of a list if isinstance(smiles_list, str): smiles_list = [smiles_list] elif not isinstance(smiles_list, list): raise ValueError("Input must be a list of SMILES strings or a single SMILES string.") ligand_mol = [Chem.MolFromSmiles(m) for m in smiles_list] # Create a list of all pairs of ligands ligand_pairs = list(itertools.combinations(ligand_mol, 2)) # Calculate pairwise Tanimoto similarity similarity_scores = [] for pair in ligand_pairs: ligand1, ligand2 = pair fp1 = AllChem.GetMorganFingerprintAsBitVect(ligand1, radius=2, nBits=2048) fp2 = AllChem.GetMorganFingerprintAsBitVect(ligand2, radius=2, nBits=2048) similarity = DataStructs.TanimotoSimilarity(fp1, fp2) similarity_scores.append(similarity) return similarity_scores
[docs] def calculate_molecular_weights(self, smiles_list): """ Calculate molecular weights for a list of SMILES strings. Args: smiles_list (list of str): List of SMILES strings. Returns: list of float: Molecular weights for each molecule. """ # Handle the case where a single string is provided instead of a list if isinstance(smiles_list, str): smiles_list = [smiles_list] elif not isinstance(smiles_list, list): raise ValueError("Input must be a list of SMILES strings or a single SMILES string.") try: from rdkit import Chem from rdkit.Chem import Descriptors except ImportError: raise ImportError("RDKit is required for this function.") weights = [] for smi in smiles_list: mol = Chem.MolFromSmiles(smi) if mol is not None: mw = Descriptors.MolWt(mol) else: mw = float('nan') weights.append(mw) return weights
[docs] def calculate_num_rotatable_bonds(self, smiles_list): """ Calculate the number of rotatable bonds for a list of SMILES strings. Args: smiles_list (list of str): List of SMILES strings. Returns: list of int: Number of rotatable bonds for each molecule. """ # Handle the case where a single string is provided instead of a list if isinstance(smiles_list, str): smiles_list = [smiles_list] elif not isinstance(smiles_list, list): raise ValueError("Input must be a list of SMILES strings or a single SMILES string.") try: from rdkit import Chem from rdkit.Chem import Descriptors except ImportError: raise ImportError("RDKit is required for this function.") rot_bonds = [] for smi in smiles_list: mol = Chem.MolFromSmiles(smi) if mol is not None: n_rot = Descriptors.NumRotatableBonds(mol) else: n_rot = None rot_bonds.append(n_rot) return rot_bonds