Source code for deeprank2.tools.target

import glob
import logging
import os

import h5py
import numpy as np
from pdb2sql import StructureSimilarity

from deeprank2.domain import targetstorage as targets

_log = logging.getLogger(__name__)
MIN_IRMS_FOR_BINARY = 4


[docs]def add_target( # noqa: C901 graph_path: str | list[str], target_name: str, target_list: str, sep: str = " ", ) -> None: """Add a target to all the graphs in hdf5 files. Args: graph_path: Either a directory containing all the hdf5 files, a single hdf5 filename, or a list of hdf5 filenames. target_name: The name of the new target. target_list: Name of the file containing the data. sep: Separator in target list. Defaults to " " (single space). Notes: The input target list should respect the following format : 1ATN_xxx-1 0 1ATN_xxx-2 1 1ATN_xxx-3 0 1ATN_xxx-4 0 """ labels = np.loadtxt(target_list, delimiter=sep, usecols=[0], dtype=str) values = np.loadtxt(target_list, delimiter=sep, usecols=[1]) target_dict = dict(zip(labels, values, strict=False)) if os.path.isdir(graph_path): graphs = glob.glob(f"{graph_path}/*.hdf5") elif os.path.isfile(graph_path): graphs = [graph_path] elif isinstance(graph_path, list): graphs = graph_path else: msg = "Incorrect input passed." raise TypeError(msg) for hdf5 in graphs: _log.info(hdf5) if not os.path.isfile(hdf5): msg = f"File {hdf5} not found." raise FileNotFoundError(msg) try: f5 = h5py.File(hdf5, "a") for model in target_dict: if model not in f5: msg = f"{hdf5} does not contain an entry named {model}." raise ValueError(msg) # noqa: TRY301 try: model_gp = f5[model] if targets.VALUES not in model_gp: model_gp.create_group(targets.VALUES) group = f5[f"{model}/{targets.VALUES}/"] if target_name in group: # Delete the target if it already existed del group[target_name] # Create the target group.create_dataset(target_name, data=target_dict[model]) except BaseException: # noqa: BLE001 _log.info(f"no graph for {model}") f5.close() except BaseException: # noqa: BLE001 _log.info(f"no graph for {hdf5}")
def compute_ppi_scores(
    pdb_path: str,
    reference_pdb_path: str,
) -> dict[str, float | int]:
    """Score a docking model against its reference structure.

    The returned dictionary holds the following structure similarity scores:
    `lrmsd` (ligand root mean square deviation), `irmsd` (interface rmsd),
    `fnat` (fraction of native contacts), `dockq` (docking model quality),
    `binary` (True - high quality, False - low quality) and `capri_class`
    (capri classification, 1 - high quality, 2 - medium, 3 - acceptable, 4 - incorrect).

    See https://deeprank2.readthedocs.io/en/latest/docking.html for more details about the scores.

    Args:
        pdb_path: Path to the decoy.
        reference_pdb_path: Path to the reference (native) structure.

    Returns:
        dict containing values for lrmsd, irmsd, fnat, dockq, binary, capri_class.
    """
    # Zone files are looked up by the reference file's base name (no directory, no extension).
    zone_stem = os.path.splitext(os.path.basename(reference_pdb_path))[0]
    similarity = StructureSimilarity(
        pdb_path,
        reference_pdb_path,
        enforce_residue_matching=False,
    )

    lzone_file = zone_stem + ".lzone"
    if os.path.exists(lzone_file):
        # Pre-computed zone files are available: hand them to pdb2sql.
        lrmsd = similarity.compute_lrmsd_fast(method="svd", lzone=lzone_file)
        irmsd = similarity.compute_irmsd_fast(method="svd", izone=zone_stem + ".izone")
    else:
        # No zone files given: let pdb2sql work the zones out itself.
        lrmsd = similarity.compute_lrmsd_fast(method="svd")
        irmsd = similarity.compute_irmsd_fast(method="svd")

    fnat = similarity.compute_fnat_fast()
    dockq = similarity.compute_DockQScore(fnat, lrmsd, irmsd)
    is_high_quality = irmsd < MIN_IRMS_FOR_BINARY

    # Thresholds go from loosest to strictest, so the last one satisfied decides the
    # class; 4 (incorrect) remains when none is satisfied.
    capri_class = 4
    for threshold, capri_label in ((4.0, 3), (2.0, 2), (1.0, 1)):
        if irmsd < threshold:
            capri_class = capri_label

    return {
        targets.LRMSD: lrmsd,
        targets.IRMSD: irmsd,
        targets.FNAT: fnat,
        targets.DOCKQ: dockq,
        targets.BINARY: is_high_quality,
        targets.CAPRI: capri_class,
    }