# Source code for gunz_cm.reconstructions.third_party.superrec

# -*- coding: utf-8 -*-
"""
Module.

Examples
--------
"""
__version__ = "1.0.0"
__author__ = "Yeremia Gunawan Adhisantoso"
__license__ = "Clear BSD"
__email__ = "adhisant@tnt.uni-hannover.de"

import typing as t
import numpy as np
import pandas as pd
from scipy import stats
from ... import loaders as cm_loaders
from ..mds.mds_numpy import comp_edm_from_p
from ...converters import convert_to_cm_coo

SUPERREC_COOR_COL_NAMES = ["X", "Y", "Z"]

COO_DELIM = " "


[docs]
def gen_superrec_coo(
    chr_region: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    output_fpath: str,
    overwrite: bool = False,
    exist_ok: bool = False,
) -> None:
    """
    Write the contact data of one region as a COO text file for SuperRec.

    Notes
    -----
    Thin wrapper around `convert_to_cm_coo`. All arguments are forwarded
    unchanged; the only thing this function adds is the fixed set of export
    options used for SuperRec: `gen_pseudo_weights=False`,
    `output_delimiter=COO_DELIM` (a single space) and `res_to_one=True`.
    The exact effect of these options is defined by `convert_to_cm_coo`.

    Parameters
    ----------
    chr_region : str
        The chromosome region to export.
    resolution : int
        The resolution of the data.
    balancing : str
        The balancing method.
    input_fpath : str
        The path to the input file.
    output_fpath : str
        The path to the COO file to be written.
    overwrite : bool, optional
        Forwarded to `convert_to_cm_coo`; whether to overwrite the output
        file if it exists, by default False.
    exist_ok : bool, optional
        Forwarded to `convert_to_cm_coo`; whether to skip the operation if
        the output file already exists, by default False.

    Returns
    -------
    None

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Osiris v3.2
    """

    #? Export settings that are fixed for SuperRec and not exposed to callers
    superrec_opts = {
        "gen_pseudo_weights": False,
        "output_delimiter": COO_DELIM,
        "res_to_one": True,
    }

    convert_to_cm_coo(
        input_fpath,
        output_fpath,
        chr_region,
        resolution,
        balancing,
        overwrite=overwrite,
        exist_ok=exist_ok,
        **superrec_opts,
    )



[docs]
def comp_superrec_obj_perf(
    region1: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    points_fpath: str,
    region2: t.Optional[str] = None,
) -> dict:
    """
    Computes the performance metrics (Spearman and Pearson correlation) for
    Euclidean distances predicted by SuperRec.

    Notes
    -----
    The function loads the contact table for the requested region(s), reads
    the 3D coordinates written by SuperRec, lets `comp_edm_from_p` compute the
    Euclidean distance for every loaded (row, column) pair and correlates
    those distances with the contact counts.

    The points file only contains valid loci, so locus ids cannot be used as
    point indices directly. Every id is therefore replaced by its rank among
    the sorted unique ids found in the loaded data.
    NOTE(review): this assumes the i-th line of the points file belongs to
    the i-th smallest locus id present in the data - confirm against the
    SuperRec output.

    Parameters
    ----------
    region1 : str
        The first region for which to compute the performance metrics.
    resolution : int
        The resolution at which to load the data.
    balancing : str
        The balancing method to use when loading the data.
    input_fpath : str
        The file path to the input data.
    points_fpath : str
        The file path to the whitespace-delimited, header-less points file
        (one point per line, columns X, Y, Z).
    region2 : Optional[str], optional
        The second region passed to the loader, by default None.

    Returns
    -------
    dict
        A dictionary with the keys ``'region'`` (`region1`), ``'spearman_r'``,
        ``'pearson_r'`` and ``'data_ratio'`` (always 1.0).

    Raises
    ------
    ValueError
        If no contact entries are loaded for the requested region.

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """

    count_df = cm_loaders.load_cm_data(
        input_fpath,
        region1,
        resolution,
        balancing=balancing,
        region2=region2,
        ret_df=True,
    )

    row_ids = count_df[cm_loaders.ROW_IDS_COLNAME].to_numpy()
    col_ids = count_df[cm_loaders.COL_IDS_COLNAME].to_numpy()
    counts = count_df[cm_loaders.COUNTS_COLNAME].to_numpy()

    #? Fail early with a readable message instead of an opaque numpy
    #? reduction error further down
    if counts.size == 0:
        raise ValueError(
            f"No contact entries loaded from {input_fpath!r} "
            f"for region {region1!r}"
        )

    #? Create mapping from row/col ids to points
    #? Reason: not all loci are valid, yet the points contains only valid points
    #? Every id is present in unique_ids, so searchsorted returns its rank,
    #? which is the index of the corresponding point
    unique_ids = np.unique(np.concatenate((row_ids, col_ids)))
    point_row_ids = np.searchsorted(unique_ids, row_ids)
    point_col_ids = np.searchsorted(unique_ids, col_ids)

    points = pd.read_csv(
        points_fpath,
        sep=r"\s+", #? Delimiter is all possible whitespace
        header=None,
        names=SUPERREC_COOR_COL_NAMES,
        index_col=None,
    ).to_numpy()

    edm = comp_edm_from_p(
        points,
        point_row_ids,
        point_col_ids,
    )

    #? Both results unpack as (statistic, pvalue); unpacking does not depend
    #? on the attribute names, which differ between scipy versions
    spearman_r, _ = stats.spearmanr(counts, edm)
    pearson_r, _ = stats.pearsonr(counts, edm)

    data_ratio = 1.0 #? No filtering of valid ids, thus ratio is 1.0

    return {
        'region': region1,
        'spearman_r': spearman_r,
        'pearson_r': pearson_r,
        'data_ratio': data_ratio,
    }