Source code for docktopus.validator

import numpy as np
import pandas as pd
from rdkit.Chem import MolStandardize
from rdkit import Chem
from openbabel import pybel
from rdkit.Chem import AllChem
import logging
import subprocess
pybel.ob.obErrorLog.SetOutputLevel(0)



[docs]
class RFAAValidator:
    """
    Validator for RFAA docking results to detect potential hallucinations.
    
    This class provides methods to validate protein-ligand structures generated
    by RFAA to detect potential hallucinations (incorrectly predicted structures).
    It performs chemical validity checks and compares predicted structures with
    reference SMILES strings.
    
    Attributes:
        logger (logging.Logger): Logger instance for validation events
    
    Example:
        >>> validator = RFAAValidator()
        >>> sdf_string = validator.convert_pdb_to_sdf("mol.pdb")
        >>> is_valid = validator.validate_ligand(sdf_string, reference_smiles)
        >>> if is_valid:
        ...     print("Structure passed validation")
        ... else:
        ...     print("Structure may be hallucinated")
    """
    

[docs]
    def __init__(self):
        """
        Initialize the RFAA validator.
        
        Sets up logging and prepares the validator for structure validation.
        
        Example:
            >>> validator = RFAAValidator()
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)

    


[docs]
    def extract_ligand(self, input_pdb, output_pdb):
        """
        Extract ligand coordinates from a protein-ligand complex PDB file.
        
        This method uses pdb_selchain and pdb_tidy to extract only the ligand
        atoms (chain B) from a protein-ligand complex and clean up the PDB format.
        
        Args:
            input_pdb (str): Path to input PDB file containing protein-ligand complex
            output_pdb (str): Path to output PDB file containing only ligand
        
        Raises:
            subprocess.CalledProcessError: If pdb_selchain or pdb_tidy fails
            IOError: If output file cannot be written
        
        
        Note:
            - Assumes ligand is in chain B of the complex
            - Requires pdb_selchain and pdb_tidy executables to be installed
            - Output PDB is cleaned and formatted for further processing
        """
        try:
            # Run pdb_selchain -B and pipe to pdb_tidy, write to output_pdb
            selchain = subprocess.run(
                ["pdb_selchain", "-B", input_pdb],
                capture_output=True,
                check=True
            )
            tidy = subprocess.run(
                ["pdb_tidy"],
                input=selchain.stdout,
                capture_output=True,
                check=True
            )
            with open(output_pdb, "wb") as out_f:
                out_f.write(tidy.stdout)
        except Exception as e:
            self.logger.error(f"Error extracting ligand for id {id}: {e}")

        
    

[docs]
    def standardize_smiles(self, smiles: str) -> str:
        """
        Standardize a SMILES string to canonical form.
        
        This method converts a SMILES string to its canonical tautomeric form
        using RDKit's MolStandardize module. This ensures consistent comparison
        between different representations of the same molecule.
        
        Args:
            smiles (str): Input SMILES string
        
        Returns:
            str: Canonical, standardized SMILES string, or None if invalid
        
        Raises:
            RuntimeError: If SMILES standardization fails
        
        
        Note:
            - Handles tautomeric forms automatically
            - Removes stereochemistry information
            - Returns None for invalid SMILES strings
            - Uses RDKit's MolStandardize for robust standardization
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        tautomer = MolStandardize.rdMolStandardize.CanonicalTautomer(mol)

        return Chem.MolToSmiles(tautomer, isomericSmiles=False, canonical=True)

    

[docs]
    def adjacency_with_orders(self, mol: Chem.Mol) -> np.ndarray:
        """
        Create adjacency matrix with bond orders from an RDKit molecule.
        
        This method generates a symmetric adjacency matrix where each element
        represents the bond order between two atoms (0 = no bond, 1 = single,
        2 = double, 3 = triple).
        
        Args:
            mol (Chem.Mol): RDKit molecule object
        
        Returns:
            np.ndarray: Symmetric adjacency matrix with bond orders
        
        
        Note:
            - Matrix is symmetric (A[i,j] = A[j,i])
            - Diagonal elements are 0 (no self-bonds)
            - Bond orders: 1=single, 2=double, 3=triple
            - Useful for comparing molecular connectivity
        """
        n = mol.GetNumAtoms()
        A = np.zeros((n, n), dtype=int)
        for bond in mol.GetBonds():
            i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            # Convert bond type to integer (1=single, 2=double, etc.)
            A[i, j] = A[j, i] = int(bond.GetBondTypeAsDouble())
        return A



[docs]
    def count_bond_diffs(self, A1: np.ndarray, A2: np.ndarray) -> int:
        """
        Count the number of bond differences between two adjacency matrices.
        
        This method compares two adjacency matrices and counts how many bonds
        differ between them. Only the upper triangle is considered to avoid
        double counting.
        
        Args:
            A1 (np.ndarray): First adjacency matrix
            A2 (np.ndarray): Second adjacency matrix
        
        Returns:
            int: Number of bond differences between the molecules
        
        Raises:
            ValueError: If matrices have different dimensions
        
        
        Note:
            - Returns 0 if molecules have identical connectivity
            - Higher values indicate more structural differences
        """
        diff_matrix = (A1 != A2).astype(int)
        i_upper = np.triu_indices(diff_matrix.shape[0], k=1)
        return diff_matrix[i_upper].sum()

    

[docs]
    def convert_pdb_to_sdf(self, pdb_file: str) -> str:
        """
        Convert a PDB file to SDF format using OpenBabel.
        
        This method converts a PDB file containing molecular coordinates to
        SDF format using pybel which handle kekekulization of aromatic moieties better than rdkit.
        
        Args:
            pdb_file (str): Path to the input PDB file
        
        Returns:
            str: SDF format string block of the molecule, or None if conversion fails
        
        Raises:
            FileNotFoundError: If PDB file doesn't exist
            RuntimeError: If conversion fails
        
        
        Note:
            - Uses OpenBabel's pybel interface for conversion
            - Returns SDF string block, not file path
            - Handles coordinate information and basic molecular properties
            - Returns None if conversion fails
        """
        
        try:
            # Read the PDB file
            mol = pybel.readfile("pdb", pdb_file).__next__()
            
            # Convert to SDF string
            sdf_string = mol.write("sdf")
            
            return sdf_string
            
        except Exception as e:
            self.logger.error(f"Error converting {pdb_file}: {str(e)}")
            return None

    
    

[docs]
    def validate_ligand(self, sdf_string: str, smiles: str, threshold=1) -> bool:
        """
        Validate a ligand structure against a reference SMILES string.
        
        This method performs comprehensive validation of a predicted ligand
        structure by comparing it with a reference SMILES string. It checks
        chemical validity, atom counts, bond connectivity, and bond orders.
        
        Args:
            sdf_string (str): SDF format string of the predicted ligand structure
            smiles (str): Reference SMILES string of the same molecule as in the pdb file for comparison
            threshold (int, optional): Maximum allowed bond differences.
                Defaults to 1.
        
        Returns:
            bool: True if structure passes validation, False otherwise
        
        Note:
            Validation steps:
            1. Chemical validity check (can be converted to SMILES)
            2. Atom count comparison with reference
            3. Bond connectivity comparison (adjacency matrices)
            4. Bond order assignment and verification
            - Higher threshold allows more bond differences
            - Returns False if any step fails
        """

        # print(f"Using source smiles {smiles}")
        try:
            # mol = Chem.MolFromMol2Block(mol2_string, removeHs=False, sanitize=False)
            mol = Chem.MolFromMolBlock(sdf_string, removeHs=False, sanitize=False)
            # print(mol)
            if mol is None:
                return False
        except Exception as e:
            return False
        
        self.logger.info("Performing chemical validity check...")
        # check 1: molecule smiles are chemically correct
        try:
            canonical_smiles = self.standardize_smiles(smiles)
            mol_reference = Chem.MolFromSmiles(canonical_smiles)
            mol_smiles = self.standardize_smiles(Chem.MolToSmiles(mol))

        except Exception as e:
            self.logger.error(f"Failed to generate smiles: {e}")
            return False
        
        if mol_smiles is None:
            self.logger.error("Failed to generate smiles. No smiles produced")
            return False

        self.logger.info("Chemical validity check passed...")
        self.logger.info("Comparing molecule with reference...")

        #check 2: molecule and reference have the same number of atoms
        if mol.GetNumAtoms() != mol_reference.GetNumAtoms():
            self.logger.error("Comparison with reference failed. Atom number does not match")
            return False
        
        # check 3: bonds order up to threshold
        A1 = self.adjacency_with_orders(Chem.MolFromSmiles(mol_smiles))
        A2 = self.adjacency_with_orders(mol_reference)

        diffs_num = self.count_bond_diffs(A1, A2)

        if diffs_num > threshold:
            return False
        
        self.logger.info("Comparison with reference passed. Reconstructing bond orders of the molecule...")
        
        # check 4: Fix bonds order to match smiles. Fails if does not    
        mol = AllChem.AssignBondOrdersFromTemplate(mol_reference, mol)

        bond_count = sum([b.GetBondTypeAsDouble() for b in mol.GetBonds()])
        bond_count_ref = sum([b.GetBondTypeAsDouble() for b in mol_reference.GetBonds()])
        if bond_count != bond_count_ref:
            return False

        return True

                

[docs]
    def fix_bond_orders(self, sdf_string: str, smiles: str):
        """
        Fix bond orders in a molecular structure using a reference SMILES.
        
        This method attempts to correct bond orders in a molecular structure
        by using a reference SMILES string as a template.         

        Args:
            sdf_string (str): SDF format string of the molecular structure
            smiles (str): Reference SMILES string to use as template
        
        Returns:
            Chem.Mol: RDKit molecule with corrected bond orders, or False if failed
        
        
        Note:
            - Template SMILES should represent the same molecule
            - Returns False if correction fails
        """
        try:
            mol = Chem.MolFromMolBlock(sdf_string, removeHs=False, sanitize=False)
            if mol is None:
                return False
        except Exception as e:
            return False

        # construct template molecule if possible
        try:
            canonical_smiles = self.standardize_smiles(smiles)
            mol_reference = Chem.MolFromSmiles(canonical_smiles, sanitize=False)
            mol_smiles = self.standardize_smiles(Chem.MolToSmiles(mol))
        
        except Exception as e:
            self.logger.error("Failed to generate smiles")
            return False

        mol.UpdatePropertyCache(strict=False)
        mol_reference.UpdatePropertyCache(strict=False)
        
        if mol_smiles is None:
            self.logger.error("Failed to generate smiles")
            return False
        
        mol = AllChem.AssignBondOrdersFromTemplate(mol_reference, mol)

        return mol
Source code for docktopus.validator

docktopus

Navigation

Related Topics