Source code for docktopus.docking

from pathlib import Path
from typing import Optional, Dict, List, Union, Type
import logging
from datetime import datetime
from typing import Tuple

from .preprocessor import DataPreprocessor
from .gnina_engine import GninaDockingEngine
from .vina_engine import VinaDockingEngine
from .gdock_engine import GDockHEMEDockingEngine
from .rfaa_engine import RFAADockingEngine

#TODO: IDEA: maybe move target into class parameter which is natural for virtual screening. Single target and a library of ligands with some dataset metrics
[docs] class Docking: """ Main orchestrator class for molecular docking workflows. This class provides a unified interface for running molecular docking simulations using various docking engines. It handles input preparation, docking execution, and results processing in a streamlined workflow. The class supports multiple docking engines: - GNINA: Deep learning-based docking with CNN scoring - Vina: Traditional molecular docking with empirical scoring - GalaxyDock2 HEME: Specialized docking for heme-containing proteins - RFAA: AlphaFold2-based protein-ligand structure prediction Attributes: work_dir (Path): Base directory for all workflow outputs logger (logging.Logger): Logger instance for workflow events preprocessor (DataPreprocessor): Instance for molecular preparation engine: Docking engine instance (type depends on engine parameter) Example Usage: >>> from docktopus import Docking >>> # Initialize Vina docking >>> dock = Docking( ... engine="vina", ... work_dir="./test-data", ... box_center=[54.426, 78.117, 10.330], ... box_size=[15, 15, 15], ... seed=1000, ... cpu=4, ... exhaustiveness=8, ... num_modes=3 ... ) >>> # Prepare ligands from SMILES >>> smiles = ["CCNC(=O)c1ccc2c(c1)NC(=O)/C2=C(\\Nc1ccc(CN(C)C)cc1)c1ccccc1"] >>> ligands = dock.prepare_ligands(smiles) >>> # Prepare receptor >>> receptor = "test-data/1W0F-cyp.pdb" >>> target = dock.prepare_receptor(receptor_pdb=receptor) >>> # Run docking >>> results = dock.dock( ... receptor=target, ... ligands=ligands, ... smiles=smiles ... ) """
[docs] def __init__(self, engine: str, work_dir: str, model_runner_class=None, **engine_params): """ Initialize the docking workflow orchestrator. Args: engine (str): Name of docking engine to use. Supported values: - 'gnina': GNINA deep learning docking engine - 'vina': AutoDock Vina docking engine - 'galaxydock2-heme': GalaxyDock2 HEME specialized engine - 'rfaa': RFAA AlphaFold2-based engine work_dir (str): Base directory for all workflow outputs. Will be created if it doesn't exist. model_runner_class: ModelRunner class for RFAA engine (required if engine='rfaa') **engine_params: Additional parameters passed to the specific docking engine. Common parameters for non-rfaa engines include: - box_center: (x,y,z) coordinates of docking box center - box_size: (x,y,z) dimensions of docking box - exhaustiveness: Search exhaustiveness (higher = more thorough) - num_modes: Number of binding modes to generate - cpu: Number of CPU cores to use - seed: Random seed for reproducibility GNINA engine requires to pass path to gnina executable, GalaxyDock2 HEME requires path to the driver script while RosettaFold-All-Atoms requires ModelRunner object which you need to import directly from rf2aa.run_inference module shipped with RFAA. If you intend to use RFAA, your driver script should be in the top RFAA repository directory. Raises: ValueError: If unsupported engine is specified or required parameters are missing FileNotFoundError: If required executables or files are not found Example: >>> # Initialize GNINA docking >>> docking = Docking( ... engine='gnina', ... work_dir='./docking_results', ... box_center=(10.0, 20.0, 30.0), ... box_size=(20.0, 20.0, 20.0), ... gnina_path="/home/username/gnina" ... ) >>> # Initialize RFAA docking >>> from rf2aa.run_inference import ModelRunner >>> docking = Docking( ... engine='rfaa', ... work_dir='./rfaa_results', ... model_runner_class=ModelRunner ... ) """ self.work_dir = Path(work_dir) self.work_dir.mkdir(parents=True, exist_ok=True) # Set up logging log_file = self.work_dir / f"docking_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(log_file), logging.StreamHandler() ] ) self.logger = logging.getLogger(__name__) # Initialize preprocessor prep_dir = self.work_dir / "prepared" self.preprocessor = DataPreprocessor(str(prep_dir)) # Initialize docking engine engine_dir = self.work_dir / "docking" if engine.lower() == 'gnina': self.engine = GninaDockingEngine( work_dir=str(engine_dir), **engine_params ) elif engine.lower() == 'vina': self.engine = VinaDockingEngine( work_dir=str(engine_dir), **engine_params ) elif engine.lower() == 'galaxydock2-heme': self.engine = GDockHEMEDockingEngine( work_dir=str(self.work_dir), **engine_params ) elif engine.lower() == 'rfaa': if model_runner_class is None: raise ValueError("model_runner_class is required for RFAA engine") self.engine = RFAADockingEngine( work_dir=str(self.work_dir), model_runner_class=model_runner_class, **engine_params ) else: raise ValueError(f"Unsupported docking engine: {engine}")
[docs] def dock(self, ligands, receptor, smiles: Optional[str]= None, **kwargs): """ Orchestrate docking for single or multiple ligands. This method automatically determines whether to run single or batch docking based on the input type. It performs pre-docking checks and handles both file-based and SMILES-based ligand inputs. The method assumes the provided ligand (and receptor) files are already protonated to the correct pH. To prepare them refer to prepare_ligand (prepare_receptor) methods. Args: ligands: Ligand input specification. Can be: - str: Path to single ligand file - list: List of ligand file paths for batch docking - str: SMILES string (single molecule) - list: List of SMILES strings (batch processing) receptor (str): Path to receptor structure file smiles (Optional[str]): SMILES string corresponding to the ligand(s). Required for RFAA engine, optional for others. **kwargs: Additional arguments passed to dock_single or dock_many methods. Common parameters include: - output_prefix: Optional prefix for output files - prepare_inputs: Whether to preprocess input files (default: True) Returns: Union[Dict, List[Dict]]: Docking results. For single ligand: dictionary containing scores and output file paths. For multiple ligands: list of result dictionaries. Raises: ValueError: If ligands parameter is not a string or list FileNotFoundError: If input files are not found RuntimeError: If docking engine fails Example: >>> # Single ligand docking >>> result = docking.dock( ... ligands='ligand.sdf', ... receptor='protein.pdb' ... ) >>> print(f"Docking metrics: {result}") >>> # Batch docking with multiple ligands >>> results = docking.dock( ... ligands=['lig1.sdf', 'lig2.sdf', 'lig3.sdf'], ... receptor='protein.pdb' ... ) >>> for i, result in enumerate(results): ... print(f"Ligand {i+1}: {result['scores'][0]['affinity']}") >>> # SMILES-based docking >>> result = docking.dock( ... ligands='CC(=O)OC1=CC=CC=C1C(=O)O', ... receptor='protein.pdb', ... smiles='CC(=O)OC1=CC=CC=C1C(=O)O' ... ) """ if isinstance(ligands, str): self.logger.info("Running predocking checks...") if self.engine.precheck(ligands): self.logger.info("Files ready for docking") else: self.logger.error(f"Ligand file seems to be not available under {ligands}. Aborting") return self.dock_single(receptor_file=receptor, ligand_file=ligands, smiles=smiles, **kwargs) elif isinstance(ligands, list): precheck = [self.engine.precheck(lig) for lig in ligands] if all(precheck): self.logger.info("All files available for docking") else: bad_ligands = [lig for lig, ok in zip(ligands, precheck) if not ok] self.logger.error(f"Some ligands are not available. Check if the following directories contain your ligands {bad_ligands}") return self.dock_many(receptor_file=receptor, ligand_files=ligands, smiles=smiles, **kwargs) else: raise ValueError("ligands must be a string (file path) or a list of file paths")
[docs] def prepare_ligands(self, ligands): """ Prepare ligand structures from SMILES strings for docking. This method generates 3D conformers from SMILES strings and prepares them for docking using the appropriate preprocessor methods. The preparation process depends on the docking engine being used. Args: ligands (Union[str, List[str]]): SMILES string or list of SMILES strings representing the molecules to prepare. Returns: Union[str, List[str]]: Path(s) to prepared ligand file(s). Returns a single path if input was a single SMILES, or a list of paths if input was a list of SMILES. Raises: ValueError: If ligands parameter is not a string or list of strings RuntimeError: If conformer generation or preparation fails Example: >>> # Prepare single ligand >>> prepared_file = docking.prepare_ligands('CC(=O)OC1=CC=CC=C1C(=O)O') >>> print(f"Prepared ligand saved to: {prepared_file}") >>> # Prepare multiple ligands >>> smiles_list = [ ... 'CC(=O)OC1=CC=CC=C1C(=O)O', ... 'c1ccccc1', ... 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O' ... ] >>> prepared_files = docking.prepare_ligands(smiles_list) >>> print(f"Prepared {len(prepared_files)} ligands") Note: The preparation process includes: 1. 3D conformer generation from SMILES using openbabel 2. Hydrogen addition at physiological pH=7.4 3. Format conversion to engine-specific requirements 4. For RFAA engine, you don't need to use this function """ from pathlib import Path self.logger.info("Preparing ligand input structures from SMILES...") # Ensure working directories exist tmp_dir = Path(self.work_dir) / "tmp" prepared_dir = Path(self.work_dir) / "prepared" tmp_dir.mkdir(parents=True, exist_ok=True) prepared_dir.mkdir(parents=True, exist_ok=True) # Handle single SMILES or list of SMILES if isinstance(ligands, str): smiles_list = [ligands] elif isinstance(ligands, list): smiles_list = ligands else: raise ValueError("ligands must be a SMILES string or a list of SMILES strings") tmp_files = [] for idx, smiles in enumerate(smiles_list): tmp_file = tmp_dir / f"ligand_{idx+1}.sdf" self.preprocessor.generate_conformers(smiles, str(tmp_file)) tmp_files.append(tmp_file) prepared_files = [] for tmp_file in tmp_files: if isinstance(self.engine, VinaDockingEngine): prepared_file = self.preprocessor.prepare_ligand_vina(str(tmp_file), output_dir=str(prepared_dir)) elif not isinstance(self.engine, RFAADockingEngine): prepared_file = self.preprocessor.prepare_ligand(str(tmp_file), format=self.engine.ligand_format, output_dir=str(prepared_dir)) else: # For RFAA, just return the SDF file path (no further preparation) prepared_file = str(tmp_file) prepared_files.append(prepared_file) # Return a single file if input was a single SMILES, else a list if isinstance(ligands, str): return prepared_files[0] else: return prepared_files
[docs] def prepare_receptor(self, receptor_pdb): """ Prepare receptor structure for docking. This method handles receptor preparation including protonation and format conversion as required by the specific docking engine. Some engines (like RFAA) may not require receptor preparation. Args: receptor_pdb (str): Path to receptor structure file (typically PDB format) Returns: Optional[str]: Path to prepared receptor file, or None if no preparation is required (e.g., for RFAA engine). Raises: FileNotFoundError: If receptor file is not found RuntimeError: If receptor preparation fails Example: >>> prepared_receptor = docking.prepare_receptor('protein.pdb') >>> if prepared_receptor: ... print(f"Prepared receptor saved to: {prepared_receptor}") ... else: ... print("No receptor preparation required") Note: Preparation steps may include: - Hydrogen addition at physiological pH=7.4 - Format conversion (e.g., PDB to PDBQT for Vina) - Structure cleaning and validation """ prepared_dir = Path(self.work_dir) / "prepared" prepared_dir.mkdir(parents=True, exist_ok=True) if isinstance(self.engine, RFAADockingEngine): self.logger.warning("RFAA engine does not require receptor protonation") return None # RFAA doesn't need preparation if isinstance(self.engine, VinaDockingEngine): return self.preprocessor.prepare_protein_vina(receptor_pdb, output_dir=str(prepared_dir)) else: return self.preprocessor.prepare_protein(receptor_pdb, output_dir=str(prepared_dir))
[docs] def dock_single(self, receptor_file: str, ligand_file: str, smiles: Optional[str] = None, output_prefix: Optional[str] = None, box_center: Optional[Tuple[float, float, float]] = None, box_size: Tuple[float, float, float] = (30.0, 30.0, 30.0),) -> Dict: """ Perform docking for a single receptor-ligand pair. This method executes the actual docking calculation using the configured docking engine. It handles the specific requirements of each engine and returns comprehensive results including scores and output file paths. Args: receptor_file (str): Path to receptor structure file ligand_file (str): Path to ligand structure file smiles (Optional[str]): SMILES string corresponding to the ligand. Required for RFAA engine, optional for others. output_prefix (Optional[str]): Optional prefix for output files. If not provided, uses the ligand filename stem. box_center (Optional[Tuple[float, float, float]]): (x,y,z) coordinates of docking box center. If None, the engine will use ligand center. Defaults to None. box_size (Tuple[float, float, float]): (x,y,z) dimensions of search box in Angstroms. Defaults to (30.0, 30.0, 30.0). Returns: Dict: Dictionary containing docking results with the following keys: - output_file: Path to the main output file (docked structure) - log_file: Path to the docking log file - scores: List of dictionaries containing scores for each pose - valid: (RFAA only) Boolean indicating if the result passed validation - metrics: (RFAA only) Additional quality metrics Raises: FileNotFoundError: If input files are not found RuntimeError: If docking calculation fails ValueError: If required parameters are missing Example: >>> result = docking.dock_single( ... receptor_file='protein.pdb', ... ligand_file='ligand.sdf', ... output_prefix='my_docking' ... ) >>> print(f"Best pose score: result") >>> print(f"Output structure: {result}") Note: The exact content of the scores list depends on the docking engine: - GNINA: affinity, intramol, cnn_pose, cnn_affinity - Vina: affinity - GalaxyDock2 HEME: Energy - RFAA: ligand_mean_pae, mean_plddts """ # Prepare input structures if requested # if prepare_inputs: # Run docking self.logger.info("Running docking...") if isinstance(self.engine, RFAADockingEngine): results = self.engine.dock( ligand_sdf_file=ligand_file, smiles=smiles ) return results results = self.engine.dock( receptor_file=receptor_file, ligand_file=ligand_file, box_center=box_center, box_size=box_size, output_prefix=output_prefix, ) self.logger.info("Docking completed successfully") return results
[docs] def dock_many(self, receptor_file: str, ligand_files: List[str], smiles: Optional[str], box_center: Optional[Tuple[float, float, float]] = None, box_size: Tuple[float, float, float] = (30.0, 30.0, 30.0)) -> List[Dict]: """ Perform docking for multiple receptor-ligand pairs. This method executes batch docking for multiple ligands against a single receptor. It processes each ligand individually and collects results, with error handling to ensure that failures of individual ligands don't stop the entire batch. Args: receptor_file (str): Path to receptor structure file ligand_files (List[str]): List of paths to ligand structure files smiles (Optional[str]): SMILES string corresponding to the ligands. Required for RFAA engine, optional for others. box_center (Optional[Tuple[float, float, float]]): (x,y,z) coordinates of docking box center. If None, the engine will use ligand center. Defaults to None. box_size (Tuple[float, float, float]): (x,y,z) dimensions of search box in Angstroms. Defaults to (30.0, 30.0, 30.0). Returns: List[Dict]: List of docking result dictionaries. Each dictionary has the same structure as returned by dock_single(). Failed dockings will have an "error" key with the error message. Raises: FileNotFoundError: If receptor file is not found ValueError: If ligand_files is not a list Example: >>> ligand_files = ['lig1.sdf', 'lig2.sdf', 'lig3.sdf'] >>> results = docking.dock_many( ... receptor_file='protein.pdb', ... ligand_files=ligand_files ... ) >>> for i, result in enumerate(results): ... if 'error' in result: ... print(f"Ligand {i+1} failed: {result['error']}") ... else: ... print(f"Ligand {i+1} score: {result['scores'][0]['affinity']}") Note: - Each ligand is processed independently - Failed dockings are logged but don't stop the batch - Results maintain the same order as input ligand_files - Error handling ensures robust batch processing """ results = [] for ligand, s in zip(ligand_files, smiles): try: result = self.dock_single( receptor_file=receptor_file, ligand_file=ligand, smiles=s, box_center=box_center, box_size=box_size, ) results.append(result) except Exception as e: self.logger.error(f"Docking failed for {ligand} -> {receptor_file}: {e}") results.append({"error": str(e)}) return results