Source code for deeprank2.tools.target

import glob
import logging
import os

import h5py
import numpy as np
from pdb2sql import StructureSimilarity

from deeprank2.domain import targetstorage as targets

# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
# iRMSD cutoff for the binary target: compute_ppi_scores marks a model as True
# when its iRMSD is strictly below this value (presumably in Angstrom - TODO confirm).
MIN_IRMS_FOR_BINARY = 4


[docs]def add_target(  # noqa: C901
    graph_path: str | list[str],
    target_name: str,
    target_list: str,
    sep: str = " ",
) -> None:
    """Add a target to all the graphs in hdf5 files.

    Args:
        graph_path: Either a directory containing all the hdf5 files, a single hdf5 filename, or a list of hdf5 filenames.
        target_name: The name of the new target.
        target_list: Name of the file containing the data.
        sep: Separator in target list. Defaults to " " (single space).

    Notes:
        The input target list should respect the following format :
        1ATN_xxx-1 0
        1ATN_xxx-2 1
        1ATN_xxx-3 0
        1ATN_xxx-4 0
    """
    labels = np.loadtxt(target_list, delimiter=sep, usecols=[0], dtype=str)
    values = np.loadtxt(target_list, delimiter=sep, usecols=[1])
    target_dict = dict(zip(labels, values, strict=False))

    if os.path.isdir(graph_path):
        graphs = glob.glob(f"{graph_path}/*.hdf5")
    elif os.path.isfile(graph_path):
        graphs = [graph_path]
    elif isinstance(graph_path, list):
        graphs = graph_path
    else:
        msg = "Incorrect input passed."
        raise TypeError(msg)

    for hdf5 in graphs:
        _log.info(hdf5)
        if not os.path.isfile(hdf5):
            msg = f"File {hdf5} not found."
            raise FileNotFoundError(msg)

        try:
            f5 = h5py.File(hdf5, "a")
            for model in target_dict:
                if model not in f5:
                    msg = f"{hdf5} does not contain an entry named {model}."
                    raise ValueError(msg)  # noqa: TRY301
                try:
                    model_gp = f5[model]
                    if targets.VALUES not in model_gp:
                        model_gp.create_group(targets.VALUES)
                    group = f5[f"{model}/{targets.VALUES}/"]
                    if target_name in group:
                        # Delete the target if it already existed
                        del group[target_name]
                    # Create the target
                    group.create_dataset(target_name, data=target_dict[model])
                except BaseException:  # noqa: BLE001
                    _log.info(f"no graph for {model}")
            f5.close()

        except BaseException:  # noqa: BLE001
            _log.info(f"no graph for {hdf5}")


def compute_ppi_scores(
    pdb_path: str,
    reference_pdb_path: str,
) -> dict[str, float | int]:
    """Compute structure similarity scores for the input docking model and return them as a dictionary.

    The computed scores are: `lrmsd` (ligand root mean square deviation), `irmsd` (interface rmsd),
    `fnat` (fraction of native contacts), `dockq` (docking model quality), `binary` (True - high quality,
    False - low quality), `capri_class` (capri classification, 1 - high quality, 2 - medium, 3 - acceptable,
    4 - incorrect). See https://deeprank2.readthedocs.io/en/latest/docking.html for more details about the scores.

    Zone files are looked up in the current working directory under the reference file's base name
    (`<name>.lzone` / `<name>.izone`). Only the presence of the `.lzone` file is checked; when it exists,
    both zone file names are handed to pdb2sql, otherwise the zones are computed by pdb2sql.

    Args:
        pdb_path: Path to the decoy.
        reference_pdb_path: Path to the reference (native) structure.

    Returns:
        dict containing values for lrmsd, irmsd, fnat, dockq, binary, capri_class.
    """
    ref_name = os.path.splitext(os.path.basename(reference_pdb_path))[0]
    sim = StructureSimilarity(
        pdb_path,
        reference_pdb_path,
        enforce_residue_matching=False,
    )

    lzone = ref_name + ".lzone"
    if os.path.exists(lzone):
        # Input pre-computed zone files
        lrmsd = sim.compute_lrmsd_fast(method="svd", lzone=lzone)
        irmsd = sim.compute_irmsd_fast(method="svd", izone=ref_name + ".izone")
    else:
        # Compute zone files
        lrmsd = sim.compute_lrmsd_fast(method="svd")
        irmsd = sim.compute_irmsd_fast(method="svd")

    fnat = sim.compute_fnat_fast()

    scores = {
        targets.LRMSD: lrmsd,
        targets.IRMSD: irmsd,
        targets.FNAT: fnat,
        targets.DOCKQ: sim.compute_DockQScore(fnat, lrmsd, irmsd),
        targets.BINARY: irmsd < MIN_IRMS_FOR_BINARY,
    }

    # CAPRI class: the tightest iRMSD threshold the model satisfies wins; 4 when none is met.
    capri_thresholds = ((1.0, 1), (2.0, 2), (4.0, 3))
    scores[targets.CAPRI] = next(
        (capri_class for threshold, capri_class in capri_thresholds if irmsd < threshold),
        4,
    )

    return scores