import subprocess
from pathlib import Path
from typing import Optional, Dict, Tuple, List, Any
import logging
import re
import shutil
# TODO: There is a problem with reading the receptor file, even though it seems to be generated correctly and the same preparation function works for gnina.
class GDockHEMEDockingEngine:
    """
    GalaxyDock2 HEME-specific docking engine implementation.

    This class provides an interface to GalaxyDock2 HEME, a specialized docking
    program designed for heme-containing proteins such as cytochromes P450.
    It stages the inputs in a scratch directory, launches the
    ``run_GalaxyDock2_heme.py`` script there, copies the resulting poses into
    the results directory and parses the per-pose energies.

    Attributes:
        gdock_dir (Path): Path to GalaxyDock2 HEME installation directory
        work_dir (Path): Directory for docking outputs
        gd2_scratch_dir (Path): Scratch directory the docking script runs in
        results (Path): Directory receiving the renamed poses and log files
        receptor_format (str): Expected receptor file format ("pdb")
        ligand_format (str): Expected ligand file format ("mol2")
        seed (int): Random seed passed to the docking script
        gdock_script (Path): Path to GalaxyDock2 HEME Python script
        logger (logging.Logger): Logger instance for engine events
    """

    # File names the docking script writes into its working directory; they
    # do not depend on the input names, so they are shared by every run.
    _DEFAULT_POSES_NAME = "GD2_HEME_cl.mol2"
    _DEFAULT_SCORES_NAME = "GD2_HEME_cl.E.info"

    def __init__(self,
                 gdock_dir: str,
                 work_dir: str,
                 seed: int = 0):
        """
        Initialize GalaxyDock2 HEME docking engine.

        Args:
            gdock_dir (str): Path to GalaxyDock2 HEME installation directory.
                Must contain the script/run_GalaxyDock2_heme.py file.
            work_dir (str): Directory for docking outputs. Will be created if
                it doesn't exist, together with its ``gd2_scratch`` and
                ``results`` sub-directories.
            seed (int, optional): Random seed for reproducibility. Defaults to 0.

        Raises:
            FileNotFoundError: If GalaxyDock2 HEME script is not found
        """
        self.gdock_dir = Path(gdock_dir)
        self.work_dir = Path(work_dir)
        self.work_dir.mkdir(parents=True, exist_ok=True)
        self.gd2_scratch_dir = self.work_dir / "gd2_scratch"
        self.gd2_scratch_dir.mkdir(parents=True, exist_ok=True)
        self.results = self.work_dir / "results"
        self.results.mkdir(parents=True, exist_ok=True)
        self.receptor_format = "pdb"
        self.ligand_format = "mol2"
        self.seed = seed

        # Configure logging
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)

        # Check if GalaxyDock2 HEME script exists
        self.gdock_script = self.gdock_dir / "script" / "run_GalaxyDock2_heme.py"
        if not self.gdock_script.exists():
            raise FileNotFoundError(f"GalaxyDock2 HEME script not found at {self.gdock_script}")

    def dock(self,
             receptor_file: str,
             ligand_file: str,
             box_center: Tuple[float, float, float],
             box_size: Tuple[float, float, float] = (30, 30, 30),
             output_prefix: Optional[str] = None
             ) -> Dict[str, Any]:
        """
        Perform docking using GalaxyDock2 HEME.

        The receptor and ligand are copied into the scratch directory, the
        docking script is executed there, its stdout is saved as a log file and
        the default output MOL2 is copied to the results directory.

        Args:
            receptor_file (str): Path to prepared receptor file (PDB format)
            ligand_file (str): Path to prepared ligand file (MOL2 format)
            box_center (Tuple[float, float, float]): (x,y,z) coordinates of docking box center.
            box_size (Tuple[float, float, float], optional): Size of the docking box
                along x, y and z. Defaults to (30, 30, 30)
            output_prefix (Optional[str], optional): Prefix for output files.
                If None, uses the ligand filename stem. Defaults to None.

        Returns:
            Dict[str, Any]: Dictionary containing docking results with keys:
                - output_file: Path the docked poses are copied to
                - log_file: Path to the saved stdout of the docking script
                - scores: Result of ``_parse_scores`` (empty dict when no
                  scores could be read)

        Raises:
            FileNotFoundError: If the receptor or ligand file does not exist
            subprocess.CalledProcessError: If GalaxyDock2 HEME execution fails
        """
        receptor_path = Path(receptor_file)
        ligand_path = Path(ligand_file)
        for label, path in (("Receptor", receptor_path), ("Ligand", ligand_path)):
            if not self.precheck(str(path)):
                raise FileNotFoundError(f"{label} file not found: {path}")

        # The script is launched from the scratch directory and is given bare
        # file names, so both inputs have to live there.
        receptor_name = self._stage_input(receptor_path)
        ligand_name = self._stage_input(ligand_path)

        if output_prefix is None:
            output_prefix = ligand_path.stem
        output_mol2 = self.results / f"{output_prefix}_docked.mol2"
        output_log = self.results / f"{output_prefix}_docked.log"

        # The scratch directory is reused between runs: drop outputs left by a
        # previous run so they can never be mistaken for this run's results.
        default_output = self.gd2_scratch_dir / self._DEFAULT_POSES_NAME
        score_file = self.gd2_scratch_dir / self._DEFAULT_SCORES_NAME
        for stale in (default_output, score_file):
            if stale.exists():
                stale.unlink()

        # Build GalaxyDock2 HEME command. Installation paths are made absolute
        # because the child process runs with cwd set to the scratch
        # directory, where relative paths would no longer resolve.
        cmd = [
            str(self.gdock_script.resolve()),
            "-d", str(self.gdock_dir.resolve()),
            "-p", receptor_name,
            "-l", ligand_name,
            "--random_seed", str(self.seed),
            "-x", str(box_center[0]),
            "-y", str(box_center[1]),
            "-z", str(box_center[2]),
            "-size_x", str(box_size[0]),
            "-size_y", str(box_size[1]),
            "-size_z", str(box_size[2]),
        ]
        self.logger.info(f"Launching GalaxyDock2-HEME...\n{' '.join(cmd)}")

        # Run docking
        try:
            process = subprocess.run(cmd,
                                     check=True,
                                     capture_output=True,
                                     text=True,
                                     cwd=self.gd2_scratch_dir)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Docking failed: {e.stderr}")
            raise

        # Save stdout to log file
        with open(output_log, 'w') as f:
            f.write(process.stdout)
        self.logger.info(process.stdout)

        # Copy the default output file to our desired name in the results directory
        if default_output.exists():
            shutil.copy(default_output, output_mol2)
        else:
            self.logger.warning(f"Docking output not found: {default_output}")

        # Parse scores from output
        scores = self._parse_scores()

        return {
            "output_file": str(output_mol2),
            "log_file": str(output_log),
            "scores": scores
        }

    def _stage_input(self, source: Path) -> str:
        """
        Copy an input file into the scratch directory and return its file name.

        The copy is skipped when ``source`` already is the scratch copy, which
        would otherwise make ``shutil.copy`` raise ``SameFileError``.
        """
        target = self.gd2_scratch_dir / source.name
        if not (target.exists() and target.samefile(source)):
            shutil.copy(source, target)
        return target.name

    def _parse_scores(self, max_poses: Optional[int] = 5) -> Dict[str, Any]:
        """
        Parse docking scores from GalaxyDock2 HEME output.

        Reads the ``GD2_HEME_cl.E.info`` file in the scratch directory, skips
        blank lines, lines starting with '!' and the 'Bank No' column-title
        line, and takes the second column of each remaining line as the energy.

        Args:
            max_poses (Optional[int], optional): Maximum number of data lines
                to read. ``None`` reads all of them. Defaults to 5.

        Returns:
            Dict[str, Any]: Dictionary containing pose information:
                - poses: List of dictionaries, each containing:
                    - pose: 1-based position of the line among the data lines
                    - Energy: Value of the second column as a float
            An empty dictionary is returned when the score file is missing,
            holds no data lines, or cannot be read or parsed; the reason is
            logged instead of being raised.
        """
        score_file = self.gd2_scratch_dir / self._DEFAULT_SCORES_NAME
        if not score_file.exists():
            self.logger.warning(f"Score file not found: {score_file}")
            return {}

        try:
            with open(score_file, 'r') as f:
                stripped_lines = [line.strip() for line in f]

            # Keep only data lines. The checks run on the stripped text so an
            # indented header is not mistaken for a score line.
            # Format: Bank No Energy l_RMSD ATDK_E INT_E DS_E HM_E PLP PROT MYST
            data_lines = [
                line for line in stripped_lines
                if line and not line.startswith(('!', 'Bank No'))
            ]
            if not data_lines:
                self.logger.warning("No score data found in file")
                return {}

            poses: List[Dict[str, Any]] = []
            for i, line in enumerate(data_lines[:max_poses]):
                result = line.split()
                if len(result) >= 2:
                    poses.append({
                        "pose": i + 1,
                        "Energy": float(result[1])
                    })
                else:
                    self.logger.warning(f"Unexpected score file format in line {i+1}: {result}")
        except (OSError, ValueError) as e:
            # Best effort: a file that cannot be read or parsed yields no scores.
            self.logger.error(f"Failed to parse scores: {e}")
            return {}

        return {"poses": poses}

    def precheck(self, file_path: str) -> bool:
        """
        Check if the provided file path exists.

        ``dock`` calls this for the receptor and the ligand before staging
        them. Only existence is checked, not file format or content.

        Args:
            file_path (str): Path to the file to check

        Returns:
            bool: True if the path exists, False otherwise
        """
        return Path(file_path).exists()