# Source code for gunz_cm.reconstructions.third_party.shneigh

# -*- coding: utf-8 -*-
"""
Module.

Examples
--------
"""
__version__ = "1.0.0"
__author__ = "Yeremia Gunawan Adhisantoso"
__license__ = "Clear BSD"
__email__ = "adhisant@tnt.uni-hannover.de"

import typing as t
import os
import numpy as np
import pandas as pd
from scipy import stats
from ... import loaders as cm_loaders
from ..mds.mds_numpy import comp_edm_from_p
from ...converters import convert_to_cm_coo

# Column names assigned to SHNeigh coordinate files: locus id plus x/y/z.
SHNEIGH_COOR_COL_NAMES = ["IDS", "X", "Y", "Z"]

# Field delimiter used when writing COO files for SHNeigh.
COO_DELIM = "\t"

def gen_shneigh_coo(
    chr_region: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    output_fpath: str,
    overwrite: bool = False,
    exist_ok: bool = False,
):
    """
    Generates a COO format file for SHNeigh.

    Notes
    -----
    Thin wrapper around ``convert_to_cm_coo`` that fixes the output
    delimiter to tab (``COO_DELIM``) and disables pseudo-weight
    generation, as required by SHNeigh.

    Parameters
    ----------
    chr_region : str
        The chromosome region.
    resolution : int
        The resolution of the data.
    balancing : str
        The balancing method.
    input_fpath : str
        The path to the input file.
    output_fpath : str
        The path to the output file.
    overwrite : bool, optional
        Whether to overwrite the output file if it exists, by default False.
    exist_ok : bool, optional
        Whether an already-existing output file is tolerated without
        raising, by default False.  Passed through to
        ``convert_to_cm_coo`` — TODO confirm exact semantics there.

    Returns
    -------
    None

    Examples
    --------

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """
    convert_to_cm_coo(
        chr_region,
        resolution,
        balancing,
        input_fpath,
        output_fpath,
        gen_pseudo_weights=False,
        output_delimiter=COO_DELIM,
        overwrite=overwrite,
        exist_ok=exist_ok,
    )
def comp_shneigh_obj_perf(
    region1: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    points_fpath: str,
    region2: t.Optional[str] = None,
) -> dict:
    """
    Computes the performance metrics (Spearman and Pearson correlation)
    for Euclidean distances predicted by SHNeigh.

    Notes
    -----
    Computes the correlations between contact counts and Euclidean
    distances derived from the 3D coordinates.  Not all loci are valid,
    while the points file contains coordinates only for valid loci, so
    contacts touching loci without coordinates are dropped first.

    Parameters
    ----------
    region1 : str
        The chromosome region for the first chromosome.
    resolution : int
        The resolution of the data.
    balancing : str
        The balancing method.
    input_fpath : str
        The path to the input file containing contact counts.
    points_fpath : str
        The path to the file containing 3D coordinates.
    region2 : Optional[str], optional
        The chromosome region for the second chromosome, by default None.

    Returns
    -------
    dict
        Keys ``region``, ``spearman_r``, ``pearson_r`` and
        ``data_ratio`` (fraction of contacts retained after filtering).

    Raises
    ------
    FileNotFoundError
        If ``points_fpath`` does not exist.
    ValueError
        If no contact overlaps the loci present in the points file.

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """
    if not os.path.exists(points_fpath):
        raise FileNotFoundError(f"Points file {points_fpath} not found.")

    count_df = cm_loaders.load_cm_data(
        input_fpath,
        region1,
        resolution,
        balancing=balancing,
        region2=region2,
        ret_df=True,
    )

    row_ids = count_df[cm_loaders.ROW_IDS_COLNAME].to_numpy()
    col_ids = count_df[cm_loaders.COL_IDS_COLNAME].to_numpy()
    counts = count_df[cm_loaders.COUNTS_COLNAME].to_numpy()

    #? Create mapping from row/col ids to points
    #? Reason: not all loci are valid, yet the points contain only valid points
    unique_data_ids = np.unique([row_ids, col_ids])

    points_df = pd.read_csv(
        points_fpath,
        sep=r"\s+",  #? Delimiter is all possible whitespace
        # (replaces `delim_whitespace=True`, deprecated and removed in
        # pandas >= 2.2)
        header=None,
        names=SHNEIGH_COOR_COL_NAMES,
        index_col=None,
    )
    points = points_df.iloc[:, 1:4].to_numpy()

    # Drop contacts that touch loci without coordinates.
    unique_data_ids_mask = np.isin(unique_data_ids, points_df["IDS"])
    removed_data_ids = unique_data_ids[~unique_data_ids_mask]
    valid_data_mask = ~np.isin(row_ids, removed_data_ids)
    valid_data_mask &= ~np.isin(col_ids, removed_data_ids)

    row_ids = row_ids[valid_data_mask]
    col_ids = col_ids[valid_data_mask]
    counts = counts[valid_data_mask]
    unique_data_ids = unique_data_ids[unique_data_ids_mask]

    # Guard: `unique_data_ids.max()` below raises an opaque numpy error
    # on an empty array; fail with a clear message instead.
    if unique_data_ids.size == 0:
        raise ValueError(
            "No contact overlaps the loci present in the points file."
        )

    # Remap original locus ids to consecutive indices.
    # NOTE(review): indexing `points` with these indices assumes the
    # points file rows are sorted by locus id and contain exactly the
    # ids shared with the data — confirm against the SHNeigh output.
    mapping = np.searchsorted(
        unique_data_ids, np.arange(unique_data_ids.max() + 1)
    )
    new_row_ids = mapping[row_ids]
    new_col_ids = mapping[col_ids]

    edm = comp_edm_from_p(points, new_row_ids, new_col_ids)

    # scipy >= 1.9: both result objects expose `.statistic`.
    # `PearsonRResult` has no `.correlation` attribute, so the previous
    # `res.correlation` access raised AttributeError.
    spearman_r = stats.spearmanr(counts, edm).statistic
    pearson_r = stats.pearsonr(counts, edm).statistic

    data_ratio = valid_data_mask.sum() / valid_data_mask.size

    output_dict = {
        'region': region1,
        'spearman_r': spearman_r,
        'pearson_r': pearson_r,
        'data_ratio': data_ratio,
    }
    return output_dict