Source code for docktopus.validator

import numpy as np
import pandas as pd
from rdkit.Chem import MolStandardize
from rdkit import Chem
from openbabel import pybel
from rdkit.Chem import AllChem
import logging
import subprocess
pybel.ob.obErrorLog.SetOutputLevel(0)


[docs] class RFAAValidator: """ Validator for RFAA docking results to detect potential hallucinations. This class provides methods to validate protein-ligand structures generated by RFAA to detect potential hallucinations (incorrectly predicted structures). It performs chemical validity checks and compares predicted structures with reference SMILES strings. Attributes: logger (logging.Logger): Logger instance for validation events Example: >>> validator = RFAAValidator() >>> sdf_string = validator.convert_pdb_to_sdf("mol.pdb") >>> is_valid = validator.validate_ligand(sdf_string, reference_smiles) >>> if is_valid: ... print("Structure passed validation") ... else: ... print("Structure may be hallucinated") """
[docs] def __init__(self): """ Initialize the RFAA validator. Sets up logging and prepares the validator for structure validation. Example: >>> validator = RFAAValidator() """ self.logger = logging.getLogger(__name__) self.logger.setLevel(logging.INFO)
[docs] def extract_ligand(self, input_pdb, output_pdb): """ Extract ligand coordinates from a protein-ligand complex PDB file. This method uses pdb_selchain and pdb_tidy to extract only the ligand atoms (chain B) from a protein-ligand complex and clean up the PDB format. Args: input_pdb (str): Path to input PDB file containing protein-ligand complex output_pdb (str): Path to output PDB file containing only ligand Raises: subprocess.CalledProcessError: If pdb_selchain or pdb_tidy fails IOError: If output file cannot be written Note: - Assumes ligand is in chain B of the complex - Requires pdb_selchain and pdb_tidy executables to be installed - Output PDB is cleaned and formatted for further processing """ try: # Run pdb_selchain -B and pipe to pdb_tidy, write to output_pdb selchain = subprocess.run( ["pdb_selchain", "-B", input_pdb], capture_output=True, check=True ) tidy = subprocess.run( ["pdb_tidy"], input=selchain.stdout, capture_output=True, check=True ) with open(output_pdb, "wb") as out_f: out_f.write(tidy.stdout) except Exception as e: self.logger.error(f"Error extracting ligand for id {id}: {e}")
[docs] def standardize_smiles(self, smiles: str) -> str: """ Standardize a SMILES string to canonical form. This method converts a SMILES string to its canonical tautomeric form using RDKit's MolStandardize module. This ensures consistent comparison between different representations of the same molecule. Args: smiles (str): Input SMILES string Returns: str: Canonical, standardized SMILES string, or None if invalid Raises: RuntimeError: If SMILES standardization fails Note: - Handles tautomeric forms automatically - Removes stereochemistry information - Returns None for invalid SMILES strings - Uses RDKit's MolStandardize for robust standardization """ mol = Chem.MolFromSmiles(smiles) if mol is None: return None tautomer = MolStandardize.rdMolStandardize.CanonicalTautomer(mol) return Chem.MolToSmiles(tautomer, isomericSmiles=False, canonical=True)
[docs] def adjacency_with_orders(self, mol: Chem.Mol) -> np.ndarray: """ Create adjacency matrix with bond orders from an RDKit molecule. This method generates a symmetric adjacency matrix where each element represents the bond order between two atoms (0 = no bond, 1 = single, 2 = double, 3 = triple). Args: mol (Chem.Mol): RDKit molecule object Returns: np.ndarray: Symmetric adjacency matrix with bond orders Note: - Matrix is symmetric (A[i,j] = A[j,i]) - Diagonal elements are 0 (no self-bonds) - Bond orders: 1=single, 2=double, 3=triple - Useful for comparing molecular connectivity """ n = mol.GetNumAtoms() A = np.zeros((n, n), dtype=int) for bond in mol.GetBonds(): i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx() # Convert bond type to integer (1=single, 2=double, etc.) A[i, j] = A[j, i] = int(bond.GetBondTypeAsDouble()) return A
[docs] def count_bond_diffs(self, A1: np.ndarray, A2: np.ndarray) -> int: """ Count the number of bond differences between two adjacency matrices. This method compares two adjacency matrices and counts how many bonds differ between them. Only the upper triangle is considered to avoid double counting. Args: A1 (np.ndarray): First adjacency matrix A2 (np.ndarray): Second adjacency matrix Returns: int: Number of bond differences between the molecules Raises: ValueError: If matrices have different dimensions Note: - Returns 0 if molecules have identical connectivity - Higher values indicate more structural differences """ diff_matrix = (A1 != A2).astype(int) i_upper = np.triu_indices(diff_matrix.shape[0], k=1) return diff_matrix[i_upper].sum()
[docs] def convert_pdb_to_sdf(self, pdb_file: str) -> str: """ Convert a PDB file to SDF format using OpenBabel. This method converts a PDB file containing molecular coordinates to SDF format using pybel which handle kekekulization of aromatic moieties better than rdkit. Args: pdb_file (str): Path to the input PDB file Returns: str: SDF format string block of the molecule, or None if conversion fails Raises: FileNotFoundError: If PDB file doesn't exist RuntimeError: If conversion fails Note: - Uses OpenBabel's pybel interface for conversion - Returns SDF string block, not file path - Handles coordinate information and basic molecular properties - Returns None if conversion fails """ try: # Read the PDB file mol = pybel.readfile("pdb", pdb_file).__next__() # Convert to SDF string sdf_string = mol.write("sdf") return sdf_string except Exception as e: self.logger.error(f"Error converting {pdb_file}: {str(e)}") return None
[docs] def validate_ligand(self, sdf_string: str, smiles: str, threshold=1) -> bool: """ Validate a ligand structure against a reference SMILES string. This method performs comprehensive validation of a predicted ligand structure by comparing it with a reference SMILES string. It checks chemical validity, atom counts, bond connectivity, and bond orders. Args: sdf_string (str): SDF format string of the predicted ligand structure smiles (str): Reference SMILES string of the same molecule as in the pdb file for comparison threshold (int, optional): Maximum allowed bond differences. Defaults to 1. Returns: bool: True if structure passes validation, False otherwise Note: Validation steps: 1. Chemical validity check (can be converted to SMILES) 2. Atom count comparison with reference 3. Bond connectivity comparison (adjacency matrices) 4. Bond order assignment and verification - Higher threshold allows more bond differences - Returns False if any step fails """ # print(f"Using source smiles {smiles}") try: # mol = Chem.MolFromMol2Block(mol2_string, removeHs=False, sanitize=False) mol = Chem.MolFromMolBlock(sdf_string, removeHs=False, sanitize=False) # print(mol) if mol is None: return False except Exception as e: return False self.logger.info("Performing chemical validity check...") # check 1: molecule smiles are chemically correct try: canonical_smiles = self.standardize_smiles(smiles) mol_reference = Chem.MolFromSmiles(canonical_smiles) mol_smiles = self.standardize_smiles(Chem.MolToSmiles(mol)) except Exception as e: self.logger.error(f"Failed to generate smiles: {e}") return False if mol_smiles is None: self.logger.error("Failed to generate smiles. No smiles produced") return False self.logger.info("Chemical validity check passed...") self.logger.info("Comparing molecule with reference...") #check 2: molecule and reference have the same number of atoms if mol.GetNumAtoms() != mol_reference.GetNumAtoms(): self.logger.error("Comparison with reference failed. Atom number does not match") return False # check 3: bonds order up to threshold A1 = self.adjacency_with_orders(Chem.MolFromSmiles(mol_smiles)) A2 = self.adjacency_with_orders(mol_reference) diffs_num = self.count_bond_diffs(A1, A2) if diffs_num > threshold: return False self.logger.info("Comparison with reference passed. Reconstructing bond orders of the molecule...") # check 4: Fix bonds order to match smiles. Fails if does not mol = AllChem.AssignBondOrdersFromTemplate(mol_reference, mol) bond_count = sum([b.GetBondTypeAsDouble() for b in mol.GetBonds()]) bond_count_ref = sum([b.GetBondTypeAsDouble() for b in mol_reference.GetBonds()]) if bond_count != bond_count_ref: return False return True
[docs] def fix_bond_orders(self, sdf_string: str, smiles: str): """ Fix bond orders in a molecular structure using a reference SMILES. This method attempts to correct bond orders in a molecular structure by using a reference SMILES string as a template. Args: sdf_string (str): SDF format string of the molecular structure smiles (str): Reference SMILES string to use as template Returns: Chem.Mol: RDKit molecule with corrected bond orders, or False if failed Note: - Template SMILES should represent the same molecule - Returns False if correction fails """ try: mol = Chem.MolFromMolBlock(sdf_string, removeHs=False, sanitize=False) if mol is None: return False except Exception as e: return False # construct template molecule if possible try: canonical_smiles = self.standardize_smiles(smiles) mol_reference = Chem.MolFromSmiles(canonical_smiles, sanitize=False) mol_smiles = self.standardize_smiles(Chem.MolToSmiles(mol)) except Exception as e: self.logger.error("Failed to generate smiles") return False mol.UpdatePropertyCache(strict=False) mol_reference.UpdatePropertyCache(strict=False) if mol_smiles is None: self.logger.error("Failed to generate smiles") return False mol = AllChem.AssignBondOrdersFromTemplate(mol_reference, mol) return mol