# -*- coding: utf-8 -*-
"""
Module.
Examples
--------
"""
__version__ = "1.0.0"
__author__ = "Yeremia Gunawan Adhisantoso"
__license__ = "Clear BSD"
__email__ = "adhisant@tnt.uni-hannover.de"
import typing as t
import os
import numpy as np
import pandas as pd
from scipy import stats
from ... import loaders
# from . import mds_numpy
from ..mds.mds_numpy import comp_edm_from_p
# from .coo import create_contact_coo
# Names of the coordinate columns read from the FLAMINGO points file.
# NOTE: kept as a list (not a tuple) because it is used to select several
# DataFrame columns at once.
FLAMINGO_COOR_COL_NAMES = ['x', 'y', 'z']
def comp_flamingo_obj_perf(
    region1: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    points_fpath: str,
    region2: t.Optional[str] = None,
) -> dict:
    """
    Computes the performance metrics (Spearman and Pearson correlation) for
    Euclidean distances predicted by FLAMINGO.

    Notes
    -----
    - The contact map data is loaded through ``loaders.load_cm_data`` and the
      point coordinates are read from a whitespace-delimited table that must
      provide a ``start`` column and the coordinate columns listed in
      ``FLAMINGO_COOR_COL_NAMES``.
    - A locus id is obtained from the points table as ``start // resolution``.
      Contacts whose row or column id has no matching point are discarded.
    - `region2` is forwarded unchanged to the loader.

    Parameters
    ----------
    region1 : str
        The first genomic region to analyze.
    resolution : int
        The resolution of the genomic data.
    balancing : str
        The balancing method to use for the contact map data.
    input_fpath : str
        The file path to the input contact map data.
    points_fpath : str
        The file path to the points data.
    region2 : t.Optional[str], optional
        The second genomic region to analyze, by default None.

    Returns
    -------
    dict
        A dictionary with the keys ``'region'`` (equal to `region1`),
        ``'spearman_r'``, ``'pearson_r'`` and ``'data_ratio'`` (fraction of
        the loaded contacts that were kept for the comparison).

    Raises
    ------
    FileNotFoundError
        If `points_fpath` does not exist.
    ValueError
        If no contact has both of its loci present in the points file.

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """
    if not os.path.exists(points_fpath):
        raise FileNotFoundError(f"Points file {points_fpath} not found.")

    count_df = loaders.load_cm_data(
        input_fpath,
        region1,
        resolution,
        balancing=balancing,
        region2=region2,
        ret_df=True,
    )
    row_ids = count_df[loaders.ROW_IDS_COLNAME].to_numpy()
    col_ids = count_df[loaders.COL_IDS_COLNAME].to_numpy()
    counts = count_df[loaders.COUNTS_COLNAME].to_numpy()

    #? `sep=r'\s+'` splits on any run of whitespace; it replaces the
    #? deprecated `delim_whitespace=True`.
    points_df = pd.read_csv(
        points_fpath,
        sep=r'\s+',
        index_col=None,
    )
    points_ids = (points_df['start'] // resolution).to_numpy()

    #? Not all loci are valid, yet the points contain only valid loci:
    #? keep a contact only if both of its loci have a point.
    valid_data_mask = np.isin(row_ids, points_ids)
    valid_data_mask &= np.isin(col_ids, points_ids)

    if not valid_data_mask.any():
        raise ValueError(
            f"No contact of {input_fpath} has both loci in {points_fpath}."
        )

    row_ids = row_ids[valid_data_mask]
    col_ids = col_ids[valid_data_mask]
    counts = counts[valid_data_mask]

    #? Translate locus ids into row positions of the points table.
    #? Looking the ids up in `points_ids` itself keeps the coordinates aligned
    #? even if the table is unsorted or holds loci without any contact.
    order = np.argsort(points_ids, kind='stable')
    sorted_points_ids = points_ids[order]
    new_row_ids = order[np.searchsorted(sorted_points_ids, row_ids)]
    new_col_ids = order[np.searchsorted(sorted_points_ids, col_ids)]

    #? Presumably one distance per (row, col) pair -- the result is correlated
    #? element-wise with `counts` below.
    edm = comp_edm_from_p(
        points_df[FLAMINGO_COOR_COL_NAMES].to_numpy(),
        new_row_ids,
        new_col_ids,
    )

    #? Index 0 is the correlation coefficient for every SciPy return type
    #? (plain tuple, namedtuple or result object).
    spearman_r = stats.spearmanr(counts, edm)[0]
    pearson_r = stats.pearsonr(counts, edm)[0]

    data_ratio = valid_data_mask.sum() / valid_data_mask.size

    return {
        'region': region1,
        'spearman_r': spearman_r,
        'pearson_r': pearson_r,
        'data_ratio': data_ratio,
    }