Source code for docktopus.preprocessor

from openbabel import pybel
import subprocess
from pathlib import Path
from typing import Optional, Tuple

# TODO: add format conversions, and record for each docking engine which ligand and protein formats it requires
# TODO: add Gdock and RFAA docking engines and RFAA-specific preparation

class DataPreprocessor:
    """
    Handles molecular preparation and format conversion for docking workflows.

    This class provides methods for preparing molecular structures for docking
    simulations, including protonation, format conversion, and 3D coordinate
    generation. It uses Open Babel (the ``pybel`` bindings and the ``obabel``
    command-line binary) for molecular manipulation.

    This is a helper class called internally by the Docking class.

    The preprocessor handles:
        - Hydrogen addition at specified pH values
        - Format conversion between molecular file formats
        - Protein and ligand preparation for specific docking engines
        - 3D coordinate generation from SMILES strings

    Attributes:
        work_dir (Path): Directory where processed files are stored
    """

    def __init__(self, work_dir: str):
        """
        Initialize the preprocessor with a working directory.

        Args:
            work_dir (str): Directory where processed files will be stored.
                Will be created (including parents) if it doesn't exist.

        Example:
            >>> preprocessor = DataPreprocessor('./molecular_data')
        """
        self.work_dir = Path(work_dir)
        self.work_dir.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _resolve_output_dir(self, output_dir: Optional[str]) -> Path:
        """Return the output directory, defaulting to work_dir; create it if given."""
        if output_dir is None:
            return self.work_dir
        resolved = Path(output_dir)
        resolved.mkdir(parents=True, exist_ok=True)
        return resolved

    @staticmethod
    def _require_input_file(input_file: str) -> None:
        """Raise FileNotFoundError if input_file is not an existing file."""
        if not Path(input_file).is_file():
            raise FileNotFoundError(f"Input file not found: {input_file}")

    @staticmethod
    def _read_first_molecule(input_file: str) -> "pybel.Molecule":
        """Read the first molecule of input_file; format is taken from its extension."""
        DataPreprocessor._require_input_file(input_file)
        input_format = Path(input_file).suffix[1:]  # Remove leading dot
        try:
            return next(pybel.readfile(input_format, input_file))
        except StopIteration:
            # An empty/unparsable file yields no molecules; never leak the
            # iterator protocol's StopIteration to callers.
            raise RuntimeError(
                f"No molecule could be read from {input_file}"
            ) from None

    @staticmethod
    def _write_molecule(mol: "pybel.Molecule", output_format: str, output_file: str) -> None:
        """Write mol to output_file, always closing the file handle."""
        output = pybel.Outputfile(output_format, output_file, overwrite=True)
        try:
            output.write(mol)
        finally:
            output.close()

    @staticmethod
    def _infer_input_format(input_file: str, default: str) -> str:
        """Return the file extension if Open Babel can read it, else default."""
        extension = Path(input_file).suffix[1:].lower()
        if extension and extension in pybel.informats:
            return extension
        return default

    def _obabel_to_pdbqt(self, input_file: str, default_format: str,
                         output_file: Path, write_options: str, pH: float) -> str:
        """Run the obabel binary to convert input_file into a PDBQT file."""
        self._require_input_file(input_file)
        input_format = self._infer_input_format(input_file, default_format)
        cmd = [
            "obabel",
            f"-i{input_format}", input_file,
            "-opdbqt",
            "-O", str(output_file),
            write_options,   # PDBQT writer options (see callers)
            "-p", str(pH),   # Add hydrogens appropriate for the given pH
        ]
        try:
            subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Open Babel conversion failed: {e.stderr}") from e
        return str(output_file)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def protonate(self, input_file: str, output_file: str,
                  pH: float = 7.4, polar_only: bool = True) -> str:
        """
        Add hydrogens to a molecule at specified pH.

        Only the first molecule in the input file is processed.

        Args:
            input_file (str): Path to input structure file
            output_file (str): Path to save protonated structure
            pH (float, optional): pH value for protonation state calculation.
                Defaults to 7.4 (physiological pH).
            polar_only (bool, optional): If True, only add polar hydrogens.
                If False, add all hydrogens. Defaults to True.

        Returns:
            str: Path to the protonated structure file

        Raises:
            FileNotFoundError: If input file doesn't exist
            RuntimeError: If no molecule can be read from the input file
            ValueError: Propagated from pybel when a file extension is not a
                recognised Open Babel format

        Example:
            >>> protonated_file = preprocessor.protonate(
            ...     'molecule.sdf',
            ...     'molecule_protonated.sdf',
            ...     pH=7.4,
            ...     polar_only=False
            ... )

        Note:
            - Input and output formats are determined by file extension
            - pH affects the protonation state of titratable groups
        """
        output_format = Path(output_file).suffix[1:]
        mol = self._read_first_molecule(input_file)
        # Second argument is Open Babel's ``correctForPH``; it must be True,
        # otherwise the pH value is ignored entirely.
        mol.OBMol.AddHydrogens(polar_only, True, pH)
        self._write_molecule(mol, output_format, output_file)
        return output_file

    def convert_format(self, input_file: str, output_file: str,
                       remove_hydrogens: bool = False) -> str:
        """
        Convert between molecular file formats.

        Only the first molecule in the input file is converted.

        Args:
            input_file (str): Path to input file
            output_file (str): Path to output file
            remove_hydrogens (bool, optional): Whether to remove hydrogens
                during conversion. Defaults to False.

        Returns:
            str: Path to the converted file

        Raises:
            FileNotFoundError: If input file doesn't exist
            RuntimeError: If no molecule can be read from the input file
            ValueError: Propagated from pybel when a file extension is not a
                recognised Open Babel format

        Example:
            >>> mol2_file = preprocessor.convert_format(
            ...     'ligand.sdf',
            ...     'ligand.mol2'
            ... )

        Note:
            - Input and output formats are determined by file extensions
        """
        output_format = Path(output_file).suffix[1:]
        mol = self._read_first_molecule(input_file)
        if remove_hydrogens:
            mol.OBMol.DeleteHydrogens()
        self._write_molecule(mol, output_format, output_file)
        return output_file

    def prepare_protein(self, protein_file: str, format: str = "pdb",
                        output_dir: Optional[str] = None) -> str:
        """
        Prepare protein structure for docking.

        Adds polar hydrogens at pH 7.4 and writes
        ``<stem>_prepared.<format>`` into the output directory.

        Args:
            protein_file (str): Path to protein structure file
            format (str, optional): Output format for prepared protein.
                Defaults to "pdb".
            output_dir (Optional[str], optional): Directory to save processed
                file. If None, uses the preprocessor's work_dir.

        Returns:
            str: Path to the prepared protein file

        Raises:
            FileNotFoundError: If protein file doesn't exist
            RuntimeError: If no molecule can be read from the protein file

        Example:
            >>> prepared_protein = preprocessor.prepare_protein(
            ...     'receptor.pdb',
            ...     format='pdb',
            ...     output_dir='./prepared'
            ... )
        """
        target_dir = self._resolve_output_dir(output_dir)
        output_file = target_dir / f"{Path(protein_file).stem}_prepared.{format}"
        # Add polar hydrogens at physiological pH
        return self.protonate(protein_file, str(output_file), pH=7.4, polar_only=True)

    def prepare_ligand(self, ligand_file: str, format: str = "sdf",
                       output_dir: Optional[str] = None) -> str:
        """
        Prepare ligand structure for docking.

        Adds all hydrogens at pH 7.4 and writes
        ``<stem>_prepared.<format>`` into the output directory.

        Args:
            ligand_file (str): Path to ligand structure file
            format (str, optional): Output format for prepared ligand.
                Defaults to "sdf".
            output_dir (Optional[str], optional): Directory to save processed
                file. If None, uses the preprocessor's work_dir.

        Returns:
            str: Path to the prepared ligand file

        Raises:
            FileNotFoundError: If ligand file doesn't exist
            RuntimeError: If no molecule can be read from the ligand file

        Example:
            >>> prepared_ligand = preprocessor.prepare_ligand(
            ...     'molecule.sdf',
            ...     format='sdf',
            ...     output_dir='./prepared'
            ... )
        """
        target_dir = self._resolve_output_dir(output_dir)
        output_file = target_dir / f"{Path(ligand_file).stem}_prepared.{format}"
        # Add all hydrogens at physiological pH
        return self.protonate(ligand_file, str(output_file), pH=7.4, polar_only=False)

    def prepare_protein_vina(self, protein_file: str, pH: float = 7.4,
                             output_dir: Optional[str] = None) -> str:
        """
        Prepare protein structure specifically for Vina docking.

        Converts the protein to PDBQT by calling the ``obabel`` binary and
        writes ``<stem>_prepared.pdbqt`` into the output directory.

        Args:
            protein_file (str): Path to protein structure file. The input
                format is taken from the file extension when Open Babel
                recognises it; otherwise PDB is assumed.
            pH (float, optional): pH value for protonation state calculation.
                Defaults to 7.4.
            output_dir (Optional[str], optional): Directory to save processed
                file. If None, uses the preprocessor's work_dir.

        Returns:
            str: Path to the prepared protein file in PDBQT format

        Raises:
            FileNotFoundError: If protein file doesn't exist (also raised by
                ``subprocess`` if the ``obabel`` binary is not on PATH)
            RuntimeError: If obabel exits with a non-zero status

        Example:
            >>> vina_protein = preprocessor.prepare_protein_vina(
            ...     'receptor.pdb',
            ...     pH=7.4,
            ...     output_dir='./vina_prepared'
            ... )

        Note:
            - ``-xr`` is the PDBQT writer option "output as a rigid molecule"
              (no torsion tree), as appropriate for a receptor
            - ``-p <pH>`` adds hydrogens appropriate for the given pH
        """
        target_dir = self._resolve_output_dir(output_dir)
        output_file = target_dir / f"{Path(protein_file).stem}_prepared.pdbqt"
        return self._obabel_to_pdbqt(protein_file, "pdb", output_file, "-xr", pH)

    def prepare_ligand_vina(self, ligand_file: str, pH: float = 7.4,
                            output_dir: Optional[str] = None) -> str:
        """
        Prepare ligand structure specifically for Vina docking.

        Converts the ligand to PDBQT by calling the ``obabel`` binary and
        writes ``<stem>_prepared.pdbqt`` into the output directory.

        Args:
            ligand_file (str): Path to ligand structure file. The input
                format is taken from the file extension when Open Babel
                recognises it; otherwise SDF is assumed.
            pH (float, optional): pH value for protonation state calculation.
                Defaults to 7.4.
            output_dir (Optional[str], optional): Directory to save processed
                file. If None, uses the preprocessor's work_dir.

        Returns:
            str: Path to the prepared ligand file in PDBQT format

        Raises:
            FileNotFoundError: If ligand file doesn't exist (also raised by
                ``subprocess`` if the ``obabel`` binary is not on PATH)
            RuntimeError: If obabel exits with a non-zero status

        Example:
            >>> vina_ligand = preprocessor.prepare_ligand_vina(
            ...     'molecule.sdf',
            ...     pH=7.4,
            ...     output_dir='./vina_prepared'
            ... )

        Note:
            - ``-xpnh`` combines the PDBQT writer options "preserve atom
              indices" (p), "preserve atom names" (n) and "preserve
              hydrogens" (h)
            - ``-p <pH>`` adds hydrogens appropriate for the given pH
        """
        target_dir = self._resolve_output_dir(output_dir)
        output_file = target_dir / f"{Path(ligand_file).stem}_prepared.pdbqt"
        return self._obabel_to_pdbqt(ligand_file, "sdf", output_file, "-xpnh", pH)

    def generate_conformers(self, smiles: str, output_file: str) -> str:
        """
        Generate a 3D conformer from a SMILES string and write to an SDF file.

        Args:
            smiles (str): SMILES string of the molecule
            output_file (str): Path to save the 3D structure (should have
                .sdf extension)

        Returns:
            str: Path to the output SDF file containing the 3D conformer

        Raises:
            ValueError: If the SMILES string is blank or cannot be parsed

        Example:
            >>> output_file = preprocessor.generate_conformers(
            ...     'CC(=O)OC1=CC=CC=C1C(=O)O',
            ...     'aspirin_3d.sdf'
            ... )

        Note:
            - Generates only one conformer (not multiple conformers)
            - pybel's make3D() adds hydrogens before building coordinates,
              so the written structure contains explicit hydrogens
            - Output is always in SDF format regardless of output_file
              extension
        """
        if not smiles or not smiles.strip():
            raise ValueError("SMILES string must not be empty")

        # Read molecule from SMILES; pybel signals a parse failure with an
        # IOError, which is translated to the documented ValueError.
        try:
            mol = pybel.readstring("smi", smiles)
        except OSError as err:
            raise ValueError(f"Invalid SMILES string: {smiles!r}") from err

        # Generate 3D coordinates
        mol.make3D()

        # Always export as SDF
        self._write_molecule(mol, "sdf", output_file)
        return output_file