Source code for docktopus.rfaa_engine

from pathlib import Path
from typing import Optional, Dict, Tuple, List, Type
import logging
import shutil
from importlib import resources


[docs] class RFAADockingEngine: """ RosettaFoldAll-Atoms (RFAA) specific docking engine implementation using the model. This class provides an interface to RFAA, which uses fully flexible protein structure prediction for protein-ligand complex modeling. The prediction results can be validated (enabled by default) to remove halucinated poses. RFAA features: - Fully flexible docking with explicit bonding for covalentlly bound cofactors - Quality assessment using pLDDT and PAE metrics - Post-docking validation to detect hallucinations - Support for heme-containing proteins (CYP3A4) Attributes: work_dir (Path): Directory for docking outputs model_runner_class: ModelRunner class from rf2aa.run_inference tmp_dir (Path): Directory for temporary files results_dir (Path): Directory for results files target_dir (Path): Directory for target-specific files template_pdb (str): Template PDB ID for crossdocking fasta_file (str): Path to target protein FASTA file hem_file (str): Path to heme SDF file _validator (Optional[RFAAValidator]): Validator instance for post-docking checks validation_enabled (bool): Whether validation is available logger (logging.Logger): Logger instance for engine events """
[docs] def __init__(self, work_dir: str, model_runner_class, target="humanCYP3A4"): """ Initialize RFAA docking engine. Args: work_dir (str): Directory for docking outputs. Will be created if it doesn't exist. model_runner_class: ModelRunner class from rf2aa.run_inference. Required for RFAA model execution. Raises: ImportError: If required RFAA dependencies are not available FileNotFoundError: If template files are not found RuntimeError: If initialization fails Note: - Requires rf2aa package and dependencies to be installed - Copies template files (FASTA, HEM SDF) to work directory - Attempts to initialize validation if dependencies are available - Creates necessary subdirectories for workflow """ # Configure logging self.logger = logging.getLogger(__name__) self.logger.setLevel(logging.INFO) self.model_runner_class = model_runner_class self.logger.info("RFAA docking engine initialized successfully") self.work_dir = Path(work_dir) self.work_dir.mkdir(parents=True, exist_ok=True) # self.config_path = self.work_dir / "config" self.tmp_dir = Path(self.work_dir / "tmp") self.results_dir = Path(self.work_dir / "results") # Create necessary directories self.tmp_dir.mkdir(parents=True, exist_ok=True) self.results_dir.mkdir(parents=True, exist_ok=True) # Prepare target-specific directory inside work_dir self.target_dir = self.work_dir / "target" self.target_dir.mkdir(parents=True, exist_ok=True) self.targets = { "humanCYP3A4": ("hCYP-3A4.fasta", 442), "humanCYP2A6": ("hCYP-2A6.fasta", 439), "humanCYP2C9": ("hCYP-2C9.fasta", 435), "humanCYP2D6": ("hCYP-2D6.fasta", 443), "humanCYP2B6": ("hCYP-2B6.fasta", 436), "humanCYP2C8": ("hCYP-2C8.fasta", 435), "humanCYP2D13": ("hCYP-2A13.fasta", 439), "humanCYP46A1": ("hCYP-46A1.fasta", 437), "CYP199A4" : ("CYP-199A4.fasta", 359), "CYP121" : ("CYP-121.fasta", 344), "CYP105A1" : ("CYP-105A1.fasta", 355), "CYPcam" : ("CYPcam.fasta", 358), "CYP125" : ("CYP-125.fasta", 377), "CYP102A1" : ("CYP-102A1.fasta", 401), } self.selected_target = self.targets[target] self.target_name = target # Using resources library allows packaging of template config, fasta files and hem in the pip pcakage # Copy fasta file from DOCKTOPUS.template to target_dir with resources.path("docktopus.templates", self.selected_target[0]) as fasta_path: fasta_dest = self.target_dir / self.selected_target[0] shutil.copy(fasta_path, fasta_dest) self.fasta_file = str(fasta_dest) # Copy HEM.sdf from DOCKTOPUS.template to target_dir with resources.path("docktopus.templates", "HEM.sdf") as hem_path: hem_dest = self.target_dir / "HEM.sdf" shutil.copy(hem_path, hem_dest) self.hem_file = str(hem_dest) # self.fasta_file = "templates/CYP-3A4.fasta" # self.hem_file = "templates/HEM.sdf" self._validator = None self.validation_enabled = False try: from .validator import RFAAValidator self._validator = RFAAValidator() self.validation_enabled = True self.logger.info("RFAA postdocking validation enabled") except ImportError as e: self.logger.warning(f"RFAA postdocking validation dependencies not available: {e}. Docking results will not be checked for halucination.") self.validation_enabled = False
[docs] def set_target(self, target_name): """ Set the target protein for the RFAA docking engine. This method updates the selected target protein and its associated configuration (such as FASTA file and residue count) based on the provided target name. It changes the internal state of the engine so that subsequent docking runs will use the new target. Args: target_name (str): The key corresponding to the desired target protein. Available values: - "humanCYP3A4" - "humanCYP2C8" - "humanCYP2C9" - "humanCYP2C19" - "humanCYP2D6" - "humanCYP2A6" - "humanCYP2B6" - "humanCYP2E1" - "humanCYP1A2" - "humanCYP2D13" - "humanCYP46A1" - "CYP199A4" - "CYP121" - "CYP105A1" - "CYPcam" - "CYP125" - "CYP102A1" Raises: KeyError: If the provided target_name is not found in the available targets. """ self.selected_target = self.targets[target_name] self.target_name = target_name
[docs] def dock(self, ligand_sdf_file: str, smiles: str = "", ) -> Dict[str, str]: """ Perform docking using RFAA. This method executes RFAA protein-ligand structure prediction using the specified ligand and target. It generates a complete protein-ligand complex structure with quality metrics. Args: ligand_sdf_file (str): Path to ligand SDF file smiles (str, optional): SMILES string corresponding to the ligand. Required for validation. Defaults to "". target (str, optional): Target protein identifier out of supported proteins. See set_target() method for list of supported targets Returns: Dict[str, Any]: Dictionary containing docking results with keys: - output_file: Path to PDB file with protein-ligand complex - log_file: Path to RFAA log file - valid: Boolean indicating if result passed validation (if available) - metrics: Dictionary containing quality metrics: - ligand_mean_pae: Mean PAE for ligand atoms - mean_plddts: Mean pLDDT for ligand atoms Raises: FileNotFoundError: If input files are not found RuntimeError: If RFAA execution fails ImportError: If RFAA dependencies are not available Note: - Generates complete protein-ligand complex structure - Uses template-based modeling approach - Performs post-docking validation if available - Quality metrics help assess prediction reliability - Output is a PDB file with both protein and ligand """ target = self.target_name try: from omegaconf import OmegaConf from torch import load self.logger.info("RFAA support imported succesfully") except ImportError as e: self.logger.warning("Cannot import RFAA support modules. Assuming RFAA is not used.") yaml = self.generate_config_files(ligand_sdf_file=ligand_sdf_file) # Extract the basename (without extension) from the ligand_sdf_file path using pathlib ligname = Path(str(ligand_sdf_file)).stem self.logger.info(f"Starting docking {ligname} -> {target}") output_metrics = self.results_dir / f"{target}-{ligname}-metrics.pt" output_pdb = self.results_dir / f"{target}-{ligname}-complex.pdb" output_log = self.results_dir / f"{target}-{ligname}-docked.log" # config_path = Path(*self.config_path.parts[1:]) try: # # Load config for specific PDB ID cfg = OmegaConf.create(yaml) # # Initialize and run RFAA model model = self.model_runner_class(cfg) model.infer() # # Copy output from tmp to results tmp_output = self.work_dir / "tmp" / "rfaa-job.pdb" tmp_metrics = self.work_dir / "tmp" / "rfaa-job_aux.pt" shutil.copy(str(tmp_output), str(output_pdb)) shutil.copy(str(tmp_metrics), str(output_metrics)) self.logger.info(f"Docking completed successfully. Output written to {output_pdb}") is_valid = None if self.validation_enabled: output_pdb_ligand = self.results_dir / f"{target}-{ligname}-docked.pdb" self.logger.info("Running validity check...") self._validator.extract_ligand(output_pdb, output_pdb_ligand) # Convert PDB to SDF string for validation sdf_string = self._validator.convert_pdb_to_sdf(str(output_pdb_ligand)) if sdf_string: is_valid = self._validator.validate_ligand(sdf_string, smiles) else: self.logger.warning("Failed to convert PDB to SDF for validation") is_valid = False metrics = self._parse_scores(output_metrics) return { "output_file": str(output_pdb), "log_file": str(output_log), "valid": is_valid, "metrics": metrics } except Exception as e: self.logger.error(f"Docking failed for {ligname}: {str(e)}") raise
def _parse_scores(self, pt_file: Path) -> float: """ Parse metrics tensor and calculate ligand-averaged PAE. This method extracts quality metrics from the RFAA output tensor file and calculates ligand-specific quality measures including PAE (Predicted Aligned Error) and pLDDT (predicted Local Distance Difference Test). Args: pt_file (Path): Path to metrics tensor file (.pt format) Returns: Dict[str, float]: Dictionary containing quality metrics: - ligand_mean_pae: Mean PAE for ligand atoms (lower is better) - mean_plddts: Mean pLDDT for ligand atoms (higher is better) Raises: FileNotFoundError: If metrics file doesn't exist RuntimeError: If metrics parsing fails ImportError: If torch is not available Note: - PAE measures prediction confidence (lower values = higher confidence) - pLDDT measures structure quality (higher values = better quality) - Metrics are calculated only for ligand atoms (chain B) - Target node count is hardcoded for CYP3A4 (533 residues) """ try: from torch import load self.logger.info("RFAA support imported succesfully") except ImportError as e: self.logger.warning("Cannot import RFAA support modules. Assuming RFAA is not used.") #hardcoded for the exact fasta file used by default target_nodecount = 533 metrics = load(pt_file) ligand_pae = metrics["pae"][0, target_nodecount:, target_nodecount:] ligand_plddt = metrics["plddts"][:, target_nodecount:] mean_pae = ligand_pae.mean().item() mean_plddt = ligand_plddt.mean().item() result = { "ligand_mean_pae" : mean_pae, "mean_plddts": mean_plddt } return result
[docs] def generate_config_files(self, ligand_sdf_file: str, output_path: str = None, config_dir: str = None) -> str: """ Generate RFAA config files based on provided inputs. This method creates the configuration files required by RFAA for protein-ligand structure prediction. It uses template files and substitutes the provided parameters. Args: ligand_sdf_file (str): Path to ligand SDF file output_path (str, optional): Path for output files. If None, uses tmp_dir. Defaults to None. config_dir (str, optional): Directory to save config files. If None, uses work_dir/config. Defaults to None. Returns: str: Generated configuration YAML string for RFAA Raises: FileNotFoundError: If template files are not found RuntimeError: If config generation fails Note: - Uses template configuration from package resources - Substitutes file paths and parameters in template - Configuration includes protein, ligand, and heme specifications - Output path is used for RFAA temporary files """ if output_path is None: output_path = self.work_dir / "tmp" # # Template directory # template_dir = "templates" # # Generate base.yaml # base_template = "templates" / "config_template.yaml" # base_config = base_template.read_text() base_template = resources.read_text("docktopus.templates", "config_template.yaml") config_yaml = base_template.format(output_path=str(output_path), fasta_file=self.fasta_file, hem_sdf_file=self.hem_file, cys_id=self.selected_target[1], ligand_sdf_file=ligand_sdf_file) return config_yaml
[docs] def precheck(self, file_path: str) -> bool: """ Check if the provided file path exists. This method performs a simple file existence check, which is useful for validating input files before attempting docking calculations. Runs automatically before each docking to make sure you have all the files you think you have. It does not check if those files are correct. Args: file_path (str): Path to the file to check Returns: bool: True if the file exists, False otherwise Note: - Only checks file existence, not file validity - Does not verify file format or content """ return Path(file_path).exists()