import numpy as np
import pandas as pd
from rdkit.Chem import MolStandardize
from rdkit import Chem
from openbabel import pybel
from rdkit.Chem import AllChem
import logging
import subprocess
pybel.ob.obErrorLog.SetOutputLevel(0)
[docs]
class RFAAValidator:
"""
Validator for RFAA docking results to detect potential hallucinations.
This class provides methods to validate protein-ligand structures generated
by RFAA to detect potential hallucinations (incorrectly predicted structures).
It performs chemical validity checks and compares predicted structures with
reference SMILES strings.
Attributes:
logger (logging.Logger): Logger instance for validation events
Example:
>>> validator = RFAAValidator()
>>> sdf_string = validator.convert_pdb_to_sdf("mol.pdb")
>>> is_valid = validator.validate_ligand(sdf_string, reference_smiles)
>>> if is_valid:
... print("Structure passed validation")
... else:
... print("Structure may be hallucinated")
"""
[docs]
def __init__(self):
"""
Initialize the RFAA validator.
Sets up logging and prepares the validator for structure validation.
Example:
>>> validator = RFAAValidator()
"""
self.logger = logging.getLogger(__name__)
self.logger.setLevel(logging.INFO)
[docs]
def extract_ligand(self, input_pdb, output_pdb):
"""
Extract ligand coordinates from a protein-ligand complex PDB file.
This method uses pdb_selchain and pdb_tidy to extract only the ligand
atoms (chain B) from a protein-ligand complex and clean up the PDB format.
Args:
input_pdb (str): Path to input PDB file containing protein-ligand complex
output_pdb (str): Path to output PDB file containing only ligand
Raises:
subprocess.CalledProcessError: If pdb_selchain or pdb_tidy fails
IOError: If output file cannot be written
Note:
- Assumes ligand is in chain B of the complex
- Requires pdb_selchain and pdb_tidy executables to be installed
- Output PDB is cleaned and formatted for further processing
"""
try:
# Run pdb_selchain -B and pipe to pdb_tidy, write to output_pdb
selchain = subprocess.run(
["pdb_selchain", "-B", input_pdb],
capture_output=True,
check=True
)
tidy = subprocess.run(
["pdb_tidy"],
input=selchain.stdout,
capture_output=True,
check=True
)
with open(output_pdb, "wb") as out_f:
out_f.write(tidy.stdout)
except Exception as e:
self.logger.error(f"Error extracting ligand for id {id}: {e}")
[docs]
def standardize_smiles(self, smiles: str) -> str:
"""
Standardize a SMILES string to canonical form.
This method converts a SMILES string to its canonical tautomeric form
using RDKit's MolStandardize module. This ensures consistent comparison
between different representations of the same molecule.
Args:
smiles (str): Input SMILES string
Returns:
str: Canonical, standardized SMILES string, or None if invalid
Raises:
RuntimeError: If SMILES standardization fails
Note:
- Handles tautomeric forms automatically
- Removes stereochemistry information
- Returns None for invalid SMILES strings
- Uses RDKit's MolStandardize for robust standardization
"""
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return None
tautomer = MolStandardize.rdMolStandardize.CanonicalTautomer(mol)
return Chem.MolToSmiles(tautomer, isomericSmiles=False, canonical=True)
[docs]
def adjacency_with_orders(self, mol: Chem.Mol) -> np.ndarray:
"""
Create adjacency matrix with bond orders from an RDKit molecule.
This method generates a symmetric adjacency matrix where each element
represents the bond order between two atoms (0 = no bond, 1 = single,
2 = double, 3 = triple).
Args:
mol (Chem.Mol): RDKit molecule object
Returns:
np.ndarray: Symmetric adjacency matrix with bond orders
Note:
- Matrix is symmetric (A[i,j] = A[j,i])
- Diagonal elements are 0 (no self-bonds)
- Bond orders: 1=single, 2=double, 3=triple
- Useful for comparing molecular connectivity
"""
n = mol.GetNumAtoms()
A = np.zeros((n, n), dtype=int)
for bond in mol.GetBonds():
i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
# Convert bond type to integer (1=single, 2=double, etc.)
A[i, j] = A[j, i] = int(bond.GetBondTypeAsDouble())
return A
[docs]
def count_bond_diffs(self, A1: np.ndarray, A2: np.ndarray) -> int:
"""
Count the number of bond differences between two adjacency matrices.
This method compares two adjacency matrices and counts how many bonds
differ between them. Only the upper triangle is considered to avoid
double counting.
Args:
A1 (np.ndarray): First adjacency matrix
A2 (np.ndarray): Second adjacency matrix
Returns:
int: Number of bond differences between the molecules
Raises:
ValueError: If matrices have different dimensions
Note:
- Returns 0 if molecules have identical connectivity
- Higher values indicate more structural differences
"""
diff_matrix = (A1 != A2).astype(int)
i_upper = np.triu_indices(diff_matrix.shape[0], k=1)
return diff_matrix[i_upper].sum()
[docs]
def convert_pdb_to_sdf(self, pdb_file: str) -> str:
"""
Convert a PDB file to SDF format using OpenBabel.
This method converts a PDB file containing molecular coordinates to
SDF format using pybel which handle kekekulization of aromatic moieties better than rdkit.
Args:
pdb_file (str): Path to the input PDB file
Returns:
str: SDF format string block of the molecule, or None if conversion fails
Raises:
FileNotFoundError: If PDB file doesn't exist
RuntimeError: If conversion fails
Note:
- Uses OpenBabel's pybel interface for conversion
- Returns SDF string block, not file path
- Handles coordinate information and basic molecular properties
- Returns None if conversion fails
"""
try:
# Read the PDB file
mol = pybel.readfile("pdb", pdb_file).__next__()
# Convert to SDF string
sdf_string = mol.write("sdf")
return sdf_string
except Exception as e:
self.logger.error(f"Error converting {pdb_file}: {str(e)}")
return None
[docs]
def validate_ligand(self, sdf_string: str, smiles: str, threshold=1) -> bool:
"""
Validate a ligand structure against a reference SMILES string.
This method performs comprehensive validation of a predicted ligand
structure by comparing it with a reference SMILES string. It checks
chemical validity, atom counts, bond connectivity, and bond orders.
Args:
sdf_string (str): SDF format string of the predicted ligand structure
smiles (str): Reference SMILES string of the same molecule as in the pdb file for comparison
threshold (int, optional): Maximum allowed bond differences.
Defaults to 1.
Returns:
bool: True if structure passes validation, False otherwise
Note:
Validation steps:
1. Chemical validity check (can be converted to SMILES)
2. Atom count comparison with reference
3. Bond connectivity comparison (adjacency matrices)
4. Bond order assignment and verification
- Higher threshold allows more bond differences
- Returns False if any step fails
"""
# print(f"Using source smiles {smiles}")
try:
# mol = Chem.MolFromMol2Block(mol2_string, removeHs=False, sanitize=False)
mol = Chem.MolFromMolBlock(sdf_string, removeHs=False, sanitize=False)
# print(mol)
if mol is None:
return False
except Exception as e:
return False
self.logger.info("Performing chemical validity check...")
# check 1: molecule smiles are chemically correct
try:
canonical_smiles = self.standardize_smiles(smiles)
mol_reference = Chem.MolFromSmiles(canonical_smiles)
mol_smiles = self.standardize_smiles(Chem.MolToSmiles(mol))
except Exception as e:
self.logger.error(f"Failed to generate smiles: {e}")
return False
if mol_smiles is None:
self.logger.error("Failed to generate smiles. No smiles produced")
return False
self.logger.info("Chemical validity check passed...")
self.logger.info("Comparing molecule with reference...")
#check 2: molecule and reference have the same number of atoms
if mol.GetNumAtoms() != mol_reference.GetNumAtoms():
self.logger.error("Comparison with reference failed. Atom number does not match")
return False
# check 3: bonds order up to threshold
A1 = self.adjacency_with_orders(Chem.MolFromSmiles(mol_smiles))
A2 = self.adjacency_with_orders(mol_reference)
diffs_num = self.count_bond_diffs(A1, A2)
if diffs_num > threshold:
return False
self.logger.info("Comparison with reference passed. Reconstructing bond orders of the molecule...")
# check 4: Fix bonds order to match smiles. Fails if does not
mol = AllChem.AssignBondOrdersFromTemplate(mol_reference, mol)
bond_count = sum([b.GetBondTypeAsDouble() for b in mol.GetBonds()])
bond_count_ref = sum([b.GetBondTypeAsDouble() for b in mol_reference.GetBonds()])
if bond_count != bond_count_ref:
return False
return True
[docs]
def fix_bond_orders(self, sdf_string: str, smiles: str):
"""
Fix bond orders in a molecular structure using a reference SMILES.
This method attempts to correct bond orders in a molecular structure
by using a reference SMILES string as a template.
Args:
sdf_string (str): SDF format string of the molecular structure
smiles (str): Reference SMILES string to use as template
Returns:
Chem.Mol: RDKit molecule with corrected bond orders, or False if failed
Note:
- Template SMILES should represent the same molecule
- Returns False if correction fails
"""
try:
mol = Chem.MolFromMolBlock(sdf_string, removeHs=False, sanitize=False)
if mol is None:
return False
except Exception as e:
return False
# construct template molecule if possible
try:
canonical_smiles = self.standardize_smiles(smiles)
mol_reference = Chem.MolFromSmiles(canonical_smiles, sanitize=False)
mol_smiles = self.standardize_smiles(Chem.MolToSmiles(mol))
except Exception as e:
self.logger.error("Failed to generate smiles")
return False
mol.UpdatePropertyCache(strict=False)
mol_reference.UpdatePropertyCache(strict=False)
if mol_smiles is None:
self.logger.error("Failed to generate smiles")
return False
mol = AllChem.AssignBondOrdersFromTemplate(mol_reference, mol)
return mol