# Source code for gunz_cm.reconstructions.third_party.flamingo

# -*- coding: utf-8 -*-
"""
Evaluation of FLAMINGO 3D reconstructions against contact map data.

Examples
--------
"""
# Module metadata: version, author, license and contact address.
__version__ = "1.0.0"
__author__ = "Yeremia Gunawan Adhisantoso"
__license__ = "Clear BSD"
__email__ = "adhisant@tnt.uni-hannover.de"

import typing as t
import os
import numpy as np
import pandas as pd
from scipy import stats
from ... import loaders
# from . import mds_numpy
from ..mds.mds_numpy import comp_edm_from_p
# from .coo import create_contact_coo

#? Names of the coordinate columns selected from the FLAMINGO points table
#? (kept as a list so it can be used directly as a DataFrame column selector)
FLAMINGO_COOR_COL_NAMES = ['x', 'y', 'z']

def comp_flamingo_obj_perf(
    region1: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    points_fpath: str,
    region2: t.Optional[str] = None,
) -> dict:
    """
    Computes the performance metrics (Spearman and Pearson correlation)
    between contact counts and the distances of points predicted by FLAMINGO.

    Notes
    -----
    - The contact map is loaded through ``loaders.load_cm_data`` and the
      points are read from a whitespace-delimited table that must provide a
      ``start`` column and the coordinate columns listed in
      ``FLAMINGO_COOR_COL_NAMES``.
    - A locus id is derived for every point as ``start // resolution``.
      Contacts whose row or column id has no matching point are discarded.
    - The remaining ids are mapped directly onto the rows of the points
      table, so the table does not need to be sorted by ``start`` and may
      contain loci that never occur in the contact data. If several points
      share the same locus id, the first one in file order is used.
    - ``region2`` is passed to the loader unchanged; its handling when it is
      ``None`` is defined by ``loaders.load_cm_data``.

    Parameters
    ----------
    region1 : str
        The first genomic region to analyze.
    resolution : int
        The resolution of the genomic data.
    balancing : str
        The balancing method to use for the contact map data.
    input_fpath : str
        The file path to the input contact map data.
    points_fpath : str
        The file path to the points data.
    region2 : t.Optional[str], optional
        The second genomic region to analyze, by default None.

    Returns
    -------
    dict
        A dictionary with the keys ``'region'`` (``region1``),
        ``'spearman_r'``, ``'pearson_r'`` and ``'data_ratio'`` (fraction of
        loaded contacts whose both loci have a point).

    Raises
    ------
    FileNotFoundError
        If the points file does not exist.
    ValueError
        If no contact has both of its loci present in the points file.

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """
    if not os.path.exists(points_fpath):
        raise FileNotFoundError(f"Points file {points_fpath} not found.")

    count_df = loaders.load_cm_data(
        input_fpath,
        region1,
        resolution,
        balancing=balancing,
        region2=region2,
        ret_df=True,
    )

    row_ids = count_df[loaders.ROW_IDS_COLNAME].to_numpy()
    col_ids = count_df[loaders.COL_IDS_COLNAME].to_numpy()
    counts = count_df[loaders.COUNTS_COLNAME].to_numpy()

    points_df = pd.read_csv(
        points_fpath,
        sep=r"\s+",  #? Any run of whitespace (replaces deprecated delim_whitespace)
        index_col=None,
    )
    points_ids = (points_df['start'] // resolution).to_numpy()

    #? Filter out data that got removed during simulation
    #? Reason: not all loci are valid, yet the points contain only valid points
    valid_data_mask = np.isin(row_ids, points_ids) & np.isin(col_ids, points_ids)

    if not valid_data_mask.any():
        raise ValueError(
            f"No contact in {input_fpath} has both loci present in "
            f"points file {points_fpath}."
        )

    row_ids = row_ids[valid_data_mask]
    col_ids = col_ids[valid_data_mask]
    counts = counts[valid_data_mask]

    #? Map every locus id onto the row of its point in the points table.
    #? A stable sort plus a left-sided search yields the first occurrence
    #? when a locus id appears more than once.
    order = np.argsort(points_ids, kind="stable")
    sorted_points_ids = points_ids[order]
    new_row_ids = order[np.searchsorted(sorted_points_ids, row_ids)]
    new_col_ids = order[np.searchsorted(sorted_points_ids, col_ids)]

    #? NOTE(review): comp_edm_from_p is assumed to return one distance per
    #? (row, col) pair, aligned with `counts` - confirm against its definition
    edm = comp_edm_from_p(
        points_df[FLAMINGO_COOR_COL_NAMES].to_numpy(),
        new_row_ids,
        new_col_ids,
    )

    #? Unpack the results as (statistic, p-value) tuples, which works
    #? regardless of which attribute names the installed SciPy exposes
    spearman_r, _ = stats.spearmanr(counts, edm)
    pearson_r, _ = stats.pearsonr(counts, edm)

    data_ratio = valid_data_mask.sum() / valid_data_mask.size

    output_dict = {
        'region': region1,
        'spearman_r': spearman_r,
        'pearson_r': pearson_r,
        'data_ratio': data_ratio,
    }

    return output_dict