Source code for docktopus.gdock_engine

import subprocess
from pathlib import Path
from typing import Optional, Dict, Tuple, List, Any
import logging
import re
import shutil

# TODO: there is a problem with reading the receptor file, even though it seems to be generated correctly and the same preparation function works for gnina.
class GDockHEMEDockingEngine:
    """
    GalaxyDock2 HEME-specific docking engine implementation.

    This class provides an interface to GalaxyDock2 HEME, a specialized
    docking program designed for heme-containing proteins such as
    cytochromes P450. GalaxyDock2 HEME incorporates heme-specific scoring
    functions and binding site considerations.

    GalaxyDock2 HEME features:
        - Specialized scoring for heme-containing proteins
        - Heme-specific binding site detection
        - Support for heme-ligand interactions
        - Multiple output poses with comprehensive scoring
        - Optimized for cytochrome P450 and similar enzymes

    Attributes:
        gdock_dir (Path): Path to GalaxyDock2 HEME installation directory
        work_dir (Path): Directory for docking outputs
        gd2_scratch_dir (Path): Scratch directory GalaxyDock2 HEME runs in
        results (Path): Directory the renamed poses and logs are written to
        receptor_format (str): Expected receptor file format ("pdb")
        ligand_format (str): Expected ligand file format ("mol2")
        seed (int): Random seed for reproducibility
        gdock_script (Path): Path to GalaxyDock2 HEME Python script
        logger (logging.Logger): Logger instance for engine events
    """

    # File names GalaxyDock2 HEME always uses inside its working directory.
    _DEFAULT_POSES_NAME = "GD2_HEME_cl.mol2"
    _DEFAULT_SCORES_NAME = "GD2_HEME_cl.E.info"
    # Upper bound on the number of poses reported by _parse_scores().
    _MAX_POSES = 5

    def __init__(self, gdock_dir: str, work_dir: str, seed: int = 0):
        """
        Initialize GalaxyDock2 HEME docking engine.

        Args:
            gdock_dir (str): Path to GalaxyDock2 HEME installation directory.
                Must contain the script/run_GalaxyDock2_heme.py file.
            work_dir (str): Directory for docking outputs. Will be created
                if it doesn't exist.
            seed (int, optional): Random seed for reproducibility.
                Defaults to 0.

        Raises:
            FileNotFoundError: If GalaxyDock2 HEME script is not found

        Note:
            - Creates ``gd2_scratch`` and ``results`` sub-directories inside
              ``work_dir``
            - The docking box is supplied per call to :meth:`dock`
        """
        self.gdock_dir = Path(gdock_dir)
        self.work_dir = Path(work_dir)
        self.work_dir.mkdir(parents=True, exist_ok=True)

        self.gd2_scratch_dir = self.work_dir / "gd2_scratch"
        self.gd2_scratch_dir.mkdir(parents=True, exist_ok=True)
        self.results = self.work_dir / "results"
        self.results.mkdir(parents=True, exist_ok=True)

        self.receptor_format = "pdb"
        self.ligand_format = "mol2"
        self.seed = seed

        # Configure logging
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)

        # Check if GalaxyDock2 HEME script exists
        self.gdock_script = self.gdock_dir / "script" / "run_GalaxyDock2_heme.py"
        if not self.gdock_script.exists():
            raise FileNotFoundError(
                f"GalaxyDock2 HEME script not found at {self.gdock_script}"
            )

    def dock(self,
             receptor_file: str,
             ligand_file: str,
             box_center: Tuple[float, float, float],
             box_size: Tuple[float, float, float] = (30, 30, 30),
             output_prefix: Optional[str] = None
             ) -> Dict[str, Any]:
        """
        Perform docking using GalaxyDock2 HEME.

        The inputs are copied into the scratch directory, the GalaxyDock2
        HEME script is launched from there, and its fixed-name outputs are
        copied / parsed into the results.

        Args:
            receptor_file (str): Path to prepared receptor file
                (protonated PDB format)
            ligand_file (str): Path to prepared ligand file
                (protonated MOL2 format)
            box_center (Tuple[float, float, float]): (x,y,z) coordinates of
                docking box center.
            box_size (Tuple[float, float, float], optional): Size in
                Angstroms of the docking box. Defaults to (30, 30, 30)
            output_prefix (Optional[str], optional): Prefix for output files.
                If None, uses the ligand filename stem. Defaults to None.

        Returns:
            Dict[str, Any]: Dictionary containing docking results with keys:
                - output_file: Path to MOL2 file with docked poses
                - log_file: Path to GalaxyDock2 HEME log file
                - scores: Dictionary as returned by :meth:`_parse_scores`
                  (empty when no scores could be read)

        Raises:
            FileNotFoundError: If input files are not found
            subprocess.CalledProcessError: If GalaxyDock2 HEME execution fails

        Note:
            - Output poses are in MOL2 format
            - Outputs left in the scratch directory by a previous run are
              removed first so they cannot be mistaken for new results
        """
        # Fail early with a clear message instead of deep inside shutil.
        for label, path in (("Receptor", receptor_file), ("Ligand", ligand_file)):
            if not self.precheck(path):
                raise FileNotFoundError(f"{label} file not found: {path}")

        # copy receptor and ligand to scratch directory
        self._stage_input(receptor_file)
        self._stage_input(ligand_file)

        if output_prefix is None:
            output_prefix = Path(ligand_file).stem

        output_mol2 = self.results / f"{output_prefix}_docked.mol2"
        output_log = self.results / f"{output_prefix}_docked.log"

        # GalaxyDock2 HEME always writes to these fixed names; drop leftovers
        # from earlier runs so a run that produces nothing is not reported
        # with the previous run's poses and scores.
        default_output = self.gd2_scratch_dir / self._DEFAULT_POSES_NAME
        default_scores = self.gd2_scratch_dir / self._DEFAULT_SCORES_NAME
        for leftover in (default_output, default_scores):
            if leftover.exists():
                leftover.unlink()

        # Build GalaxyDock2 HEME command - launched from scratch directory.
        # The child runs with cwd=scratch, so the script and installation
        # paths must be absolute or they would be resolved against scratch.
        cmd = [
            str(self.gdock_script.absolute()),
            "-d", str(self.gdock_dir.absolute()),
            "-p", str(Path(receptor_file).name),
            "-l", str(Path(ligand_file).name),
            "--random_seed", str(self.seed),
            "-x", str(box_center[0]),
            "-y", str(box_center[1]),
            "-z", str(box_center[2]),
            "-size_x", str(box_size[0]),
            "-size_y", str(box_size[1]),
            "-size_z", str(box_size[2])
        ]
        self.logger.info(f"Launching GalaxyDock2-HEME...\n{' '.join(cmd)}")

        # Run docking
        try:
            process = subprocess.run(cmd, check=True, capture_output=True,
                                     text=True, cwd=self.gd2_scratch_dir)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Docking failed: {e.stderr}")
            raise

        # Save stdout to log file
        with open(output_log, 'w') as f:
            f.write(process.stdout)
        self.logger.info(process.stdout)

        # Copy the output file from GD2_HEME_cl.mol2 to our desired name
        if default_output.exists():
            shutil.copy(default_output, output_mol2)
        else:
            self.logger.warning(f"Docked poses not found: {default_output}")

        # Parse scores from output
        scores = self._parse_scores()

        return {
            "output_file": str(output_mol2),
            "log_file": str(output_log),
            "scores": scores
        }

    def _stage_input(self, source_file: str) -> Path:
        """Copy an input file into the scratch directory and return its new path."""
        target = self.gd2_scratch_dir / Path(source_file).name
        try:
            shutil.copy(source_file, target)
        except shutil.SameFileError:
            # The input already lives in the scratch directory.
            pass
        return target

    def _parse_scores(self) -> Dict[str, Any]:
        """
        Parse docking scores from GalaxyDock2 HEME output.

        Reads the GD2_HEME_cl.E.info file in the scratch directory and
        extracts the second column (energy) of each data line.

        Returns:
            Dict[str, Any]: Dictionary containing pose information:
                - poses: List of dictionaries, each containing:
                    - pose: 1-based position of the line among the data lines
                    - Energy: Value of the second column as float

            An empty dictionary is returned when the score file is missing,
            unreadable or contains no data lines.

        Note:
            - Lines starting with '!' or 'Bank No' (after stripping
              surrounding whitespace) and blank lines are ignored
            - Only the first 5 data lines are considered
            - A malformed data line is logged and skipped; it does not
              discard the other poses
        """
        score_file = self.gd2_scratch_dir / self._DEFAULT_SCORES_NAME
        if not score_file.exists():
            self.logger.warning(f"Score file not found: {score_file}")
            return {}

        try:
            with open(score_file, 'r') as f:
                stripped_lines = [line.strip() for line in f]
        except (OSError, UnicodeDecodeError) as e:
            self.logger.error(f"Failed to parse scores: {e}")
            return {}

        # Strip before testing the prefixes: the header and comment lines may
        # be indented in the file.
        data_lines = [
            line for line in stripped_lines
            if line and not line.startswith(('!', 'Bank No'))
        ]
        if not data_lines:
            self.logger.warning("No score data found in file")
            return {}

        # Format: Bank No Energy l_RMSD ATDK_E INT_E DS_E HM_E PLP PROT MYST
        poses = []
        for number, line in enumerate(data_lines[:self._MAX_POSES], start=1):
            fields = line.split()
            if len(fields) < 2:
                self.logger.warning(
                    f"Unexpected score file format in line {number}: {fields}"
                )
                continue
            try:
                energy = float(fields[1])
            except ValueError:
                self.logger.warning(
                    f"Unexpected score file format in line {number}: {fields}"
                )
                continue
            poses.append({"pose": number, "Energy": energy})

        return {"poses": poses}

    def precheck(self, file_path: str) -> bool:
        """
        Check if the provided file path exists.

        This is a plain existence check used by :meth:`dock` on both input
        files before anything is copied or launched.

        Args:
            file_path (str): Path to the file to check

        Returns:
            bool: True if the path exists, False otherwise

        Note:
            - Only checks existence, not file validity
            - Does not verify file format or content
        """
        return Path(file_path).exists()