# Source code for gunz_cm.reconstructions.third_party.flamingo

# -*- coding: utf-8 -*-
"""
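Evaluation helpers for 3D structures reconstructed by FLAMINGO: compares the
pairwise Euclidean distances of the predicted points against contact counts.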

Examples
--------
"""
__version__ = "1.0.0"
__author__ = "Yeremia Gunawan Adhisantoso"
__license__ = "Clear BSD"
__email__ = "adhisant@tnt.uni-hannover.de"

import typing as t
import os
import numpy as np
import pandas as pd
from scipy import stats
from ... import loaders
# from . import mds_numpy
from ..mds.mds_numpy import comp_edm_from_p
# from .coo import create_contact_coo

#? Column names of the points table that hold the 3D coordinates;
#? used to select the coordinate matrix from the loaded points file.
FLAMINGO_COOR_COL_NAMES = ['x', 'y', 'z']


# [docs]
def comp_flamingo_obj_perf(
    region1: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    points_fpath: str,
    region2: t.Optional[str] = None,
) -> dict:
    """
    Compute the Spearman and Pearson correlation between contact counts and
    the Euclidean distances of the points predicted by FLAMINGO.

    Parameters
    ----------
    region1 : str
        The first genomic region to analyze.
    resolution : int
        The resolution of the genomic data. Also used to convert the
        ``start`` column of the points file into bin indices
        (``start // resolution``).
    balancing : str
        The balancing method to use for the contact map data.
    input_fpath : str
        The file path to the input contact map data.
    points_fpath : str
        The file path to the points data: a whitespace-delimited table with
        a header providing at least the columns ``start``, ``x``, ``y``
        and ``z``.
    region2 : t.Optional[str], optional
        The second genomic region to analyze, by default None. It is
        forwarded unchanged to ``loaders.load_cm_data``.

    Returns
    -------
    dict
        A dictionary with the keys ``'region'`` (``region1``),
        ``'spearman_r'``, ``'pearson_r'`` and ``'data_ratio'`` (fraction of
        contact map entries whose row and column bins both have a point).

    Raises
    ------
    FileNotFoundError
        If the points file does not exist.
    ValueError
        If fewer than two contact map entries have both of their bins
        present in the points file, as no correlation can be computed.

    Notes
    -----
    - Not every locus of the contact map has a point: loci removed during
      the simulation are missing from the points file. Entries touching
      such loci are dropped before the correlations are computed.
    - Each remaining bin index is mapped to the row of the points table
      carrying the same bin index, so the points file is neither required
      to be sorted nor to contain only loci present in the contact map.

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """

    #? Fail early, before the (potentially expensive) contact map loading
    if not os.path.exists(points_fpath):
        raise FileNotFoundError(f"Points file {points_fpath} not found.")

    count_df = loaders.load_cm_data(
        input_fpath,
        region1,
        resolution,
        balancing=balancing,
        region2=region2,
        ret_df=True
    )

    row_ids = count_df[loaders.ROW_IDS_COLNAME].to_numpy()
    col_ids = count_df[loaders.COL_IDS_COLNAME].to_numpy()
    counts = count_df[loaders.COUNTS_COLNAME].to_numpy()

    points_df = pd.read_csv(
        points_fpath,
        #? Any run of whitespace is a delimiter. `delim_whitespace=True`
        #? is deprecated in recent pandas versions, `sep` is the equivalent.
        sep=r'\s+',
        index_col=None,
    )

    #? Bin index of every predicted point
    points_ids = (points_df['start']//resolution).to_numpy()

    #? Filter out data that got removed during simulation:
    #? keep an entry only if both of its bins have a predicted point
    valid_data_mask = np.isin(row_ids, points_ids)
    valid_data_mask &= np.isin(col_ids, points_ids)

    row_ids = row_ids[valid_data_mask]
    col_ids = col_ids[valid_data_mask]
    counts = counts[valid_data_mask]

    if counts.size < 2:
        raise ValueError(
            "At least two contact map entries with valid points are "
            f"required to compute correlations, got {counts.size}."
        )

    #? Create mapping from row/col ids to points
    #? Reason: not all loci are valid, yet the points contains only valid points
    #? The lookup goes through the sorted bin indices of the points and is
    #? translated back to the row position inside the points table, so the
    #? coordinates picked always belong to the requested bin. The stable sort
    #? makes the first occurrence win should a bin index appear twice.
    sort_order = np.argsort(points_ids, kind='stable')
    sorted_points_ids = points_ids[sort_order]
    new_row_ids = sort_order[np.searchsorted(sorted_points_ids, row_ids)]
    new_col_ids = sort_order[np.searchsorted(sorted_points_ids, col_ids)]

    #? NOTE(review): presumably one Euclidean distance per (row, col) pair,
    #? aligned with `counts` -- confirm against comp_edm_from_p
    edm = comp_edm_from_p(
        points_df[FLAMINGO_COOR_COL_NAMES].to_numpy(),
        new_row_ids,
        new_col_ids
    )

    #? Both results are tuple-like (coefficient, p-value) in every SciPy
    #? version, whereas the attribute names differ between versions.
    spearman_r = stats.spearmanr(counts, edm)[0]
    pearson_r = stats.pearsonr(counts, edm)[0]

    data_ratio = valid_data_mask.sum() / valid_data_mask.size

    output_dict = {
        'region': region1,
        'spearman_r': spearman_r,
        'pearson_r': pearson_r,
        'data_ratio': data_ratio,
    }

    return output_dict