from openbabel import pybel
import subprocess
from pathlib import Path
from typing import Optional, Tuple
# TODO: add format conversions and tell each docking engine which ligand and protein formats it requires
# TODO: add Gdock and RFAA docking engines and RFAA-specific preparation
class DataPreprocessor:
    """
    Handles molecular preparation and format conversion for docking workflows.

    This class provides methods for preparing molecular structures for docking
    simulations: protonation at a given pH and conversion to the formats the
    docking engines expect. It uses Open Babel, both through the ``pybel``
    bindings and through the ``obabel`` command-line tool.

    It is designed as a helper for the docking workflow (the Docking class is
    expected to call it internally).

    The preprocessor handles:
        - Hydrogen addition at specified pH values
        - Writing the result in the format implied by the output file extension
        - Generic protein and ligand preparation
        - PDBQT preparation of protein and ligand for AutoDock Vina

    Attributes:
        work_dir (Path): Directory where processed files are stored
    """

    def __init__(self, work_dir: str):
        """
        Initialize the preprocessor with a working directory.

        Args:
            work_dir (str): Directory where processed files will be stored.
                Will be created (including parents) if it doesn't exist.

        Example:
            >>> preprocessor = DataPreprocessor('./molecular_data')
        """
        self.work_dir = Path(work_dir)
        self.work_dir.mkdir(parents=True, exist_ok=True)

    def _resolve_output_dir(self, output_dir: Optional[str]) -> Path:
        """Return the directory to write into, creating it when one is given explicitly."""
        if output_dir is None:
            # work_dir was already created in __init__.
            return self.work_dir
        resolved = Path(output_dir)
        resolved.mkdir(parents=True, exist_ok=True)
        return resolved

    @staticmethod
    def _require_file(path: str, description: str) -> None:
        """Raise FileNotFoundError unless ``path`` points to an existing file."""
        if not Path(path).is_file():
            raise FileNotFoundError(f"{description} not found: {path}")

    @staticmethod
    def _run_obabel(cmd, output_file: Path) -> str:
        """Run an ``obabel`` command line and return the output path as a string."""
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            # Keep the original exception attached so the exit code stays visible.
            raise RuntimeError(f"Open Babel conversion failed: {e.stderr}") from e
        # Guard against a run that exits cleanly but converts nothing.
        if not output_file.is_file() or output_file.stat().st_size == 0:
            raise RuntimeError(
                f"Open Babel conversion failed: no output written to {output_file}: {result.stderr}"
            )
        return str(output_file)

    def protonate(self, input_file: str, output_file: str, pH: float = 7.4, polar_only: bool = True) -> str:
        """
        Add hydrogens to a molecule at specified pH.

        Reads the first molecule from ``input_file`` with pybel, adds
        hydrogens with pH correction enabled, and writes the result to
        ``output_file`` (overwriting any existing file).

        Args:
            input_file (str): Path to input structure file
            output_file (str): Path to save protonated structure
            pH (float, optional): pH value for protonation state calculation.
                Defaults to 7.4 (physiological pH).
            polar_only (bool, optional): If True, only add polar hydrogens.
                If False, add all hydrogens. Defaults to True.

        Returns:
            str: Path to the protonated structure file (``output_file`` as given)

        Raises:
            FileNotFoundError: If input file doesn't exist
            RuntimeError: If the input file contains no readable molecule
            ValueError: Raised by pybel when a file extension is not a
                recognised Open Babel format

        Example:
            >>> protonated_file = preprocessor.protonate(
            ...     'molecule.sdf',
            ...     'molecule_protonated.sdf',
            ...     pH=7.4,
            ...     polar_only=False
            ... )

        Note:
            - Input and output formats are taken from the file extensions
            - Only the first molecule of a multi-molecule file is processed
        """
        self._require_file(input_file, "Input structure file")

        input_format = Path(input_file).suffix[1:]  # Remove leading dot
        output_format = Path(output_file).suffix[1:]

        try:
            mol = next(pybel.readfile(input_format, input_file))
        except StopIteration:
            # An empty or unparsable file yields no molecules at all.
            raise RuntimeError(f"No molecule could be read from {input_file}") from None

        # Second argument is correctForPH: it must be True, otherwise the pH
        # value is ignored and default protonation states are used.
        mol.OBMol.AddHydrogens(polar_only, True, pH)

        output = pybel.Outputfile(output_format, output_file, overwrite=True)
        try:
            output.write(mol)
        finally:
            # Always release the file handle, even if writing fails.
            output.close()
        return output_file

    def prepare_protein(self, protein_file: str, format: str = "pdb", output_dir: Optional[str] = None) -> str:
        """
        Prepare protein structure for docking.

        Adds polar hydrogens at pH 7.4 and writes the result as
        ``<stem>_prepared.<format>`` in the output directory.

        Args:
            protein_file (str): Path to protein structure file
            format (str, optional): Output format (file extension) for the
                prepared protein. Defaults to "pdb".
            output_dir (Optional[str], optional): Directory to save processed file.
                If None, uses the preprocessor's work_dir. Defaults to None.

        Returns:
            str: Path to the prepared protein file

        Raises:
            FileNotFoundError: If protein file doesn't exist
            RuntimeError: If no molecule can be read from the protein file

        Example:
            >>> prepared_protein = preprocessor.prepare_protein(
            ...     'receptor.pdb',
            ...     format='pdb',
            ...     output_dir='./prepared'
            ... )
            >>> print(f"Prepared protein: {prepared_protein}")

        Note:
            - Adds polar hydrogens at pH 7.4
            - Hydrogens already present in the input are not removed by this step
            - Output filename includes "_prepared" suffix
        """
        self._require_file(protein_file, "Protein file")
        target_dir = self._resolve_output_dir(output_dir)
        output_file = target_dir / f"{Path(protein_file).stem}_prepared.{format}"
        # Add polar hydrogens at physiological pH
        return self.protonate(protein_file, str(output_file), pH=7.4, polar_only=True)

    def prepare_ligand(self, ligand_file: str, format: str = "sdf", output_dir: Optional[str] = None) -> str:
        """
        Prepare ligand structure for docking.

        Adds all hydrogens (polar and non-polar) at pH 7.4 and writes the
        result as ``<stem>_prepared.<format>`` in the output directory.

        Args:
            ligand_file (str): Path to ligand structure file
            format (str, optional): Output format (file extension) for the
                prepared ligand. Defaults to "sdf".
            output_dir (Optional[str], optional): Directory to save processed file.
                If None, uses the preprocessor's work_dir. Defaults to None.

        Returns:
            str: Path to the prepared ligand file

        Raises:
            FileNotFoundError: If ligand file doesn't exist
            RuntimeError: If no molecule can be read from the ligand file

        Example:
            >>> prepared_ligand = preprocessor.prepare_ligand(
            ...     'molecule.sdf',
            ...     format='sdf',
            ...     output_dir='./prepared'
            ... )
            >>> print(f"Prepared ligand: {prepared_ligand}")

        Note:
            - Adds all hydrogens at pH 7.4
            - Output filename includes "_prepared" suffix
        """
        self._require_file(ligand_file, "Ligand file")
        target_dir = self._resolve_output_dir(output_dir)
        output_file = target_dir / f"{Path(ligand_file).stem}_prepared.{format}"
        # Add all hydrogens at physiological pH
        return self.protonate(ligand_file, str(output_file), pH=7.4, polar_only=False)

    def prepare_protein_vina(self, protein_file: str, pH: float = 7.4, output_dir: Optional[str] = None) -> str:
        """
        Prepare protein structure specifically for Vina docking.

        Converts the protein to PDBQT by calling the ``obabel`` binary
        (rather than pybel) and writes ``<stem>_prepared.pdbqt`` in the
        output directory.

        Args:
            protein_file (str): Path to protein structure file. It is always
                read as PDB (``-ipdb``), regardless of its extension.
            pH (float, optional): pH value passed to obabel's ``-p`` option.
                Defaults to 7.4.
            output_dir (Optional[str], optional): Directory to save processed file.
                If None, uses the preprocessor's work_dir. Defaults to None.

        Returns:
            str: Path to the prepared protein file in PDBQT format

        Raises:
            FileNotFoundError: If protein file doesn't exist (subprocess also
                raises it when the ``obabel`` executable cannot be found)
            RuntimeError: If obabel exits with an error or writes no output

        Example:
            >>> vina_protein = preprocessor.prepare_protein_vina(
            ...     'receptor.pdb',
            ...     pH=7.4,
            ...     output_dir='./vina_prepared'
            ... )
            >>> print(f"Vina-ready protein: {vina_protein}")

        Note:
            - Converts to PDBQT using a system call to the obabel binary
            - ``-xr`` writes the receptor as a rigid molecule (no torsion tree)
            - ``-p`` adds hydrogens appropriate for the specified pH
        """
        self._require_file(protein_file, "Protein file")
        target_dir = self._resolve_output_dir(output_dir)
        output_file = target_dir / f"{Path(protein_file).stem}_prepared.pdbqt"

        # Convert to PDBQT format using Open Babel
        cmd = [
            "obabel",
            "-ipdb", protein_file,
            "-opdbqt",
            "-O", str(output_file),
            "-xr",  # PDBQT write option: rigid molecule, no torsion tree
            "-p", str(pH),  # Add hydrogens appropriate for this pH
        ]
        return self._run_obabel(cmd, output_file)

    def prepare_ligand_vina(self, ligand_file: str, pH: float = 7.4, output_dir: Optional[str] = None) -> str:
        """
        Prepare ligand structure specifically for Vina docking.

        Converts the ligand to PDBQT by calling the ``obabel`` binary and
        writes ``<stem>_prepared.pdbqt`` in the output directory.

        Args:
            ligand_file (str): Path to ligand structure file. It is always
                read as SDF (``-isdf``), regardless of its extension.
            pH (float, optional): pH value passed to obabel's ``-p`` option.
                Defaults to 7.4.
            output_dir (Optional[str], optional): Directory to save processed file.
                If None, uses the preprocessor's work_dir. Defaults to None.

        Returns:
            str: Path to the prepared ligand file in PDBQT format

        Raises:
            FileNotFoundError: If ligand file doesn't exist (subprocess also
                raises it when the ``obabel`` executable cannot be found)
            RuntimeError: If obabel exits with an error or writes no output

        Example:
            >>> vina_ligand = preprocessor.prepare_ligand_vina(
            ...     'molecule.sdf',
            ...     pH=7.4,
            ...     output_dir='./vina_prepared'
            ... )
            >>> print(f"Vina-ready ligand: {vina_ligand}")

        Note:
            - Converts to PDBQT using a system call to the obabel binary
            - ``-xpnh`` are PDBQT write options: preserve atom indices (p),
              atom names (n) and hydrogens (h) from the input
            - ``-p`` adds hydrogens appropriate for the specified pH
            - Assumes SDF input format (modify cmd if using different format)
        """
        self._require_file(ligand_file, "Ligand file")
        target_dir = self._resolve_output_dir(output_dir)
        output_file = target_dir / f"{Path(ligand_file).stem}_prepared.pdbqt"

        # Convert to PDBQT format using Open Babel
        cmd = [
            "obabel",
            "-isdf", ligand_file,  # Assuming SDF input format
            "-opdbqt",
            "-O", str(output_file),
            "-xpnh",  # PDBQT write options: preserve indices, names, hydrogens
            "-p", str(pH),  # Add hydrogens appropriate for this pH
        ]
        return self._run_obabel(cmd, output_file)