from pathlib import Path
from typing import Optional, Dict, Tuple, List, Type
import logging
import shutil
from importlib import resources
[docs]
class RFAADockingEngine:
"""
RosettaFoldAll-Atoms (RFAA) specific docking engine implementation using the model.
This class provides an interface to RFAA, which uses
fully flexible protein structure prediction for protein-ligand complex
modeling. The prediction results can be validated (enabled by default) to remove halucinated poses.
RFAA features:
- Fully flexible docking with explicit bonding for covalentlly bound cofactors
- Quality assessment using pLDDT and PAE metrics
- Post-docking validation to detect hallucinations
- Support for heme-containing proteins (CYP3A4)
Attributes:
work_dir (Path): Directory for docking outputs
model_runner_class: ModelRunner class from rf2aa.run_inference
tmp_dir (Path): Directory for temporary files
results_dir (Path): Directory for results files
target_dir (Path): Directory for target-specific files
template_pdb (str): Template PDB ID for crossdocking
fasta_file (str): Path to target protein FASTA file
hem_file (str): Path to heme SDF file
_validator (Optional[RFAAValidator]): Validator instance for post-docking checks
validation_enabled (bool): Whether validation is available
logger (logging.Logger): Logger instance for engine events
"""
[docs]
def __init__(self,
work_dir: str,
model_runner_class,
target="humanCYP3A4"):
"""
Initialize RFAA docking engine.
Args:
work_dir (str): Directory for docking outputs. Will be created if
it doesn't exist.
model_runner_class: ModelRunner class from rf2aa.run_inference.
Required for RFAA model execution.
Raises:
ImportError: If required RFAA dependencies are not available
FileNotFoundError: If template files are not found
RuntimeError: If initialization fails
Note:
- Requires rf2aa package and dependencies to be installed
- Copies template files (FASTA, HEM SDF) to work directory
- Attempts to initialize validation if dependencies are available
- Creates necessary subdirectories for workflow
"""
# Configure logging
self.logger = logging.getLogger(__name__)
self.logger.setLevel(logging.INFO)
self.model_runner_class = model_runner_class
self.logger.info("RFAA docking engine initialized successfully")
self.work_dir = Path(work_dir)
self.work_dir.mkdir(parents=True, exist_ok=True)
# self.config_path = self.work_dir / "config"
self.tmp_dir = Path(self.work_dir / "tmp")
self.results_dir = Path(self.work_dir / "results")
# Create necessary directories
self.tmp_dir.mkdir(parents=True, exist_ok=True)
self.results_dir.mkdir(parents=True, exist_ok=True)
# Prepare target-specific directory inside work_dir
self.target_dir = self.work_dir / "target"
self.target_dir.mkdir(parents=True, exist_ok=True)
self.targets = {
"humanCYP3A4": ("hCYP-3A4.fasta", 442),
"humanCYP2A6": ("hCYP-2A6.fasta", 439),
"humanCYP2C9": ("hCYP-2C9.fasta", 435),
"humanCYP2D6": ("hCYP-2D6.fasta", 443),
"humanCYP2B6": ("hCYP-2B6.fasta", 436),
"humanCYP2C8": ("hCYP-2C8.fasta", 435),
"humanCYP2D13": ("hCYP-2A13.fasta", 439),
"humanCYP46A1": ("hCYP-46A1.fasta", 437),
"CYP199A4" : ("CYP-199A4.fasta", 359),
"CYP121" : ("CYP-121.fasta", 344),
"CYP105A1" : ("CYP-105A1.fasta", 355),
"CYPcam" : ("CYPcam.fasta", 358),
"CYP125" : ("CYP-125.fasta", 377),
"CYP102A1" : ("CYP-102A1.fasta", 401),
}
self.selected_target = self.targets[target]
self.target_name = target
# Using resources library allows packaging of template config, fasta files and hem in the pip pcakage
# Copy fasta file from DOCKTOPUS.template to target_dir
with resources.path("docktopus.templates", self.selected_target[0]) as fasta_path:
fasta_dest = self.target_dir / self.selected_target[0]
shutil.copy(fasta_path, fasta_dest)
self.fasta_file = str(fasta_dest)
# Copy HEM.sdf from DOCKTOPUS.template to target_dir
with resources.path("docktopus.templates", "HEM.sdf") as hem_path:
hem_dest = self.target_dir / "HEM.sdf"
shutil.copy(hem_path, hem_dest)
self.hem_file = str(hem_dest)
# self.fasta_file = "templates/CYP-3A4.fasta"
# self.hem_file = "templates/HEM.sdf"
self._validator = None
self.validation_enabled = False
try:
from .validator import RFAAValidator
self._validator = RFAAValidator()
self.validation_enabled = True
self.logger.info("RFAA postdocking validation enabled")
except ImportError as e:
self.logger.warning(f"RFAA postdocking validation dependencies not available: {e}. Docking results will not be checked for halucination.")
self.validation_enabled = False
[docs]
def set_target(self, target_name):
"""
Set the target protein for the RFAA docking engine.
This method updates the selected target protein and its associated configuration
(such as FASTA file and residue count) based on the provided target name.
It changes the internal state of the engine so that subsequent docking runs
will use the new target.
Args:
target_name (str): The key corresponding to the desired target protein.
Available values:
- "humanCYP3A4"
- "humanCYP2C8"
- "humanCYP2C9"
- "humanCYP2C19"
- "humanCYP2D6"
- "humanCYP2A6"
- "humanCYP2B6"
- "humanCYP2E1"
- "humanCYP1A2"
- "humanCYP2D13"
- "humanCYP46A1"
- "CYP199A4"
- "CYP121"
- "CYP105A1"
- "CYPcam"
- "CYP125"
- "CYP102A1"
Raises:
KeyError: If the provided target_name is not found in the available targets.
"""
self.selected_target = self.targets[target_name]
self.target_name = target_name
[docs]
def dock(self,
ligand_sdf_file: str,
smiles: str = "",
) -> Dict[str, str]:
"""
Perform docking using RFAA.
This method executes RFAA protein-ligand structure prediction using
the specified ligand and target. It generates a complete protein-ligand
complex structure with quality metrics.
Args:
ligand_sdf_file (str): Path to ligand SDF file
smiles (str, optional): SMILES string corresponding to the ligand.
Required for validation. Defaults to "".
target (str, optional): Target protein identifier out of supported proteins.
See set_target() method for list of supported targets
Returns:
Dict[str, Any]: Dictionary containing docking results with keys:
- output_file: Path to PDB file with protein-ligand complex
- log_file: Path to RFAA log file
- valid: Boolean indicating if result passed validation (if available)
- metrics: Dictionary containing quality metrics:
- ligand_mean_pae: Mean PAE for ligand atoms
- mean_plddts: Mean pLDDT for ligand atoms
Raises:
FileNotFoundError: If input files are not found
RuntimeError: If RFAA execution fails
ImportError: If RFAA dependencies are not available
Note:
- Generates complete protein-ligand complex structure
- Uses template-based modeling approach
- Performs post-docking validation if available
- Quality metrics help assess prediction reliability
- Output is a PDB file with both protein and ligand
"""
target = self.target_name
try:
from omegaconf import OmegaConf
from torch import load
self.logger.info("RFAA support imported succesfully")
except ImportError as e:
self.logger.warning("Cannot import RFAA support modules. Assuming RFAA is not used.")
yaml = self.generate_config_files(ligand_sdf_file=ligand_sdf_file)
# Extract the basename (without extension) from the ligand_sdf_file path using pathlib
ligname = Path(str(ligand_sdf_file)).stem
self.logger.info(f"Starting docking {ligname} -> {target}")
output_metrics = self.results_dir / f"{target}-{ligname}-metrics.pt"
output_pdb = self.results_dir / f"{target}-{ligname}-complex.pdb"
output_log = self.results_dir / f"{target}-{ligname}-docked.log"
# config_path = Path(*self.config_path.parts[1:])
try:
# # Load config for specific PDB ID
cfg = OmegaConf.create(yaml)
# # Initialize and run RFAA model
model = self.model_runner_class(cfg)
model.infer()
# # Copy output from tmp to results
tmp_output = self.work_dir / "tmp" / "rfaa-job.pdb"
tmp_metrics = self.work_dir / "tmp" / "rfaa-job_aux.pt"
shutil.copy(str(tmp_output), str(output_pdb))
shutil.copy(str(tmp_metrics), str(output_metrics))
self.logger.info(f"Docking completed successfully. Output written to {output_pdb}")
is_valid = None
if self.validation_enabled:
output_pdb_ligand = self.results_dir / f"{target}-{ligname}-docked.pdb"
self.logger.info("Running validity check...")
self._validator.extract_ligand(output_pdb, output_pdb_ligand)
# Convert PDB to SDF string for validation
sdf_string = self._validator.convert_pdb_to_sdf(str(output_pdb_ligand))
if sdf_string:
is_valid = self._validator.validate_ligand(sdf_string, smiles)
else:
self.logger.warning("Failed to convert PDB to SDF for validation")
is_valid = False
metrics = self._parse_scores(output_metrics)
return {
"output_file": str(output_pdb),
"log_file": str(output_log),
"valid": is_valid,
"metrics": metrics
}
except Exception as e:
self.logger.error(f"Docking failed for {ligname}: {str(e)}")
raise
def _parse_scores(self, pt_file: Path) -> float:
"""
Parse metrics tensor and calculate ligand-averaged PAE.
This method extracts quality metrics from the RFAA output tensor file
and calculates ligand-specific quality measures including PAE (Predicted
Aligned Error) and pLDDT (predicted Local Distance Difference Test).
Args:
pt_file (Path): Path to metrics tensor file (.pt format)
Returns:
Dict[str, float]: Dictionary containing quality metrics:
- ligand_mean_pae: Mean PAE for ligand atoms (lower is better)
- mean_plddts: Mean pLDDT for ligand atoms (higher is better)
Raises:
FileNotFoundError: If metrics file doesn't exist
RuntimeError: If metrics parsing fails
ImportError: If torch is not available
Note:
- PAE measures prediction confidence (lower values = higher confidence)
- pLDDT measures structure quality (higher values = better quality)
- Metrics are calculated only for ligand atoms (chain B)
- Target node count is hardcoded for CYP3A4 (533 residues)
"""
try:
from torch import load
self.logger.info("RFAA support imported succesfully")
except ImportError as e:
self.logger.warning("Cannot import RFAA support modules. Assuming RFAA is not used.")
#hardcoded for the exact fasta file used by default
target_nodecount = 533
metrics = load(pt_file)
ligand_pae = metrics["pae"][0, target_nodecount:, target_nodecount:]
ligand_plddt = metrics["plddts"][:, target_nodecount:]
mean_pae = ligand_pae.mean().item()
mean_plddt = ligand_plddt.mean().item()
result = {
"ligand_mean_pae" : mean_pae,
"mean_plddts": mean_plddt
}
return result
[docs]
def generate_config_files(self,
ligand_sdf_file: str,
output_path: str = None,
config_dir: str = None) -> str:
"""
Generate RFAA config files based on provided inputs.
This method creates the configuration files required by RFAA for
protein-ligand structure prediction. It uses template files and
substitutes the provided parameters.
Args:
ligand_sdf_file (str): Path to ligand SDF file
output_path (str, optional): Path for output files. If None, uses
tmp_dir. Defaults to None.
config_dir (str, optional): Directory to save config files. If None,
uses work_dir/config. Defaults to None.
Returns:
str: Generated configuration YAML string for RFAA
Raises:
FileNotFoundError: If template files are not found
RuntimeError: If config generation fails
Note:
- Uses template configuration from package resources
- Substitutes file paths and parameters in template
- Configuration includes protein, ligand, and heme specifications
- Output path is used for RFAA temporary files
"""
if output_path is None:
output_path = self.work_dir / "tmp"
# # Template directory
# template_dir = "templates"
# # Generate base.yaml
# base_template = "templates" / "config_template.yaml"
# base_config = base_template.read_text()
base_template = resources.read_text("docktopus.templates", "config_template.yaml")
config_yaml = base_template.format(output_path=str(output_path),
fasta_file=self.fasta_file,
hem_sdf_file=self.hem_file,
cys_id=self.selected_target[1],
ligand_sdf_file=ligand_sdf_file)
return config_yaml
[docs]
def precheck(self, file_path: str) -> bool:
"""
Check if the provided file path exists.
This method performs a simple file existence check, which is useful
for validating input files before attempting docking calculations. Runs automatically
before each docking to make sure you have all the files you think you have. It does not check if
those files are correct.
Args:
file_path (str): Path to the file to check
Returns:
bool: True if the file exists, False otherwise
Note:
- Only checks file existence, not file validity
- Does not verify file format or content
"""
return Path(file_path).exists()