# Source code for gunz_cm.reconstructions.third_party.shneigh

# -*- coding: utf-8 -*-
"""
Module.

Examples
--------
"""
__version__ = "1.0.0"
__author__ = "Yeremia Gunawan Adhisantoso"
__license__ = "Clear BSD"
__email__ = "adhisant@tnt.uni-hannover.de"

import typing as t
import os
import numpy as np
import pandas as pd
from scipy import stats
from ... import loaders as cm_loaders
from ..mds.mds_numpy import comp_edm_from_p
from ...converters import convert_to_cm_coo

# Column names assigned to SHNeigh coordinate files: locus id plus x/y/z.
SHNEIGH_COOR_COL_NAMES = ["IDS", "X", "Y", "Z"]

# Field delimiter used when writing COO files for SHNeigh.
COO_DELIM = "\t"

def gen_shneigh_coo(
    chr_region: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    output_fpath: str,
    overwrite: bool = False,
    exist_ok: bool = False,
):
    """
    Generates a COO format file for SHNeigh.

    Notes
    -----
    Thin wrapper around ``convert_to_cm_coo`` that fixes the output
    delimiter to tab (``COO_DELIM``) and disables pseudo-weight
    generation, as required by SHNeigh.

    Parameters
    ----------
    chr_region : str
        The chromosome region.
    resolution : int
        The resolution of the data.
    balancing : str
        The balancing method.
    input_fpath : str
        The path to the input file.
    output_fpath : str
        The path to the output file.
    overwrite : bool, optional
        Whether to overwrite the output file if it exists, by default False.
    exist_ok : bool, optional
        Whether an already-existing output file is tolerated without
        raising, by default False.  Passed through to
        ``convert_to_cm_coo`` — TODO confirm exact semantics there.

    Returns
    -------
    None

    Examples
    --------

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """
    convert_to_cm_coo(
        chr_region,
        resolution,
        balancing,
        input_fpath,
        output_fpath,
        gen_pseudo_weights=False,
        output_delimiter=COO_DELIM,
        overwrite=overwrite,
        exist_ok=exist_ok,
    )
def comp_shneigh_obj_perf(
    region1: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    points_fpath: str,
    region2: t.Optional[str] = None,
) -> dict:
    """
    Computes the performance metrics (Spearman and Pearson correlation)
    for Euclidean distances predicted by SHNeigh.

    Notes
    -----
    Computes the correlations between contact counts and Euclidean
    distances derived from the 3D coordinates.  Not all loci are valid,
    while the points file contains coordinates only for valid loci, so
    contacts touching loci without coordinates are dropped first.

    Parameters
    ----------
    region1 : str
        The chromosome region for the first chromosome.
    resolution : int
        The resolution of the data.
    balancing : str
        The balancing method.
    input_fpath : str
        The path to the input file containing contact counts.
    points_fpath : str
        The path to the file containing 3D coordinates.
    region2 : Optional[str], optional
        The chromosome region for the second chromosome, by default None.

    Returns
    -------
    dict
        Keys ``region``, ``spearman_r``, ``pearson_r`` and
        ``data_ratio`` (fraction of contacts retained after filtering).

    Raises
    ------
    FileNotFoundError
        If ``points_fpath`` does not exist.
    ValueError
        If no contact overlaps the loci present in the points file.

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """
    if not os.path.exists(points_fpath):
        raise FileNotFoundError(f"Points file {points_fpath} not found.")

    count_df = cm_loaders.load_cm_data(
        input_fpath,
        region1,
        resolution,
        balancing=balancing,
        region2=region2,
        ret_df=True,
    )

    row_ids = count_df[cm_loaders.ROW_IDS_COLNAME].to_numpy()
    col_ids = count_df[cm_loaders.COL_IDS_COLNAME].to_numpy()
    counts = count_df[cm_loaders.COUNTS_COLNAME].to_numpy()

    #? Create mapping from row/col ids to points
    #? Reason: not all loci are valid, yet the points contain only valid points
    unique_data_ids = np.unique([row_ids, col_ids])

    points_df = pd.read_csv(
        points_fpath,
        sep=r"\s+",  #? Delimiter is all possible whitespace
        # (replaces `delim_whitespace=True`, deprecated and removed in
        # pandas >= 2.2)
        header=None,
        names=SHNEIGH_COOR_COL_NAMES,
        index_col=None,
    )
    points = points_df.iloc[:, 1:4].to_numpy()

    # Drop contacts that touch loci without coordinates.
    unique_data_ids_mask = np.isin(unique_data_ids, points_df["IDS"])
    removed_data_ids = unique_data_ids[~unique_data_ids_mask]
    valid_data_mask = ~np.isin(row_ids, removed_data_ids)
    valid_data_mask &= ~np.isin(col_ids, removed_data_ids)

    row_ids = row_ids[valid_data_mask]
    col_ids = col_ids[valid_data_mask]
    counts = counts[valid_data_mask]
    unique_data_ids = unique_data_ids[unique_data_ids_mask]

    # Guard: `unique_data_ids.max()` below raises an opaque numpy error
    # on an empty array; fail with a clear message instead.
    if unique_data_ids.size == 0:
        raise ValueError(
            "No contact overlaps the loci present in the points file."
        )

    # Remap original locus ids to consecutive indices.
    # NOTE(review): indexing `points` with these indices assumes the
    # points file rows are sorted by locus id and contain exactly the
    # ids shared with the data — confirm against the SHNeigh output.
    mapping = np.searchsorted(
        unique_data_ids, np.arange(unique_data_ids.max() + 1)
    )
    new_row_ids = mapping[row_ids]
    new_col_ids = mapping[col_ids]

    edm = comp_edm_from_p(points, new_row_ids, new_col_ids)

    # scipy >= 1.9: both result objects expose `.statistic`.
    # `PearsonRResult` has no `.correlation` attribute, so the previous
    # `res.correlation` access raised AttributeError.
    spearman_r = stats.spearmanr(counts, edm).statistic
    pearson_r = stats.pearsonr(counts, edm).statistic

    data_ratio = valid_data_mask.sum() / valid_data_mask.size

    output_dict = {
        'region': region1,
        'spearman_r': spearman_r,
        'pearson_r': pearson_r,
        'data_ratio': data_ratio,
    }
    return output_dict