# -*- coding: utf-8 -*-
"""
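Helpers for preparing SuperRec input and evaluating its output.

Provides `gen_superrec_coo`, which writes a COO-format file through
`convert_to_cm_coo`, and `comp_superrec_obj_perf`, which correlates
contact counts with Euclidean distances derived from a points file.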
"""
__version__ = "1.0.0"
__author__ = "Yeremia Gunawan Adhisantoso"
__license__ = "Clear BSD"
__email__ = "adhisant@tnt.uni-hannover.de"
import typing as t
import numpy as np
import pandas as pd
from scipy import stats
from ... import loaders as cm_loaders
from ..mds.mds_numpy import comp_edm_from_p
from ...converters import convert_to_cm_coo
#? Column labels assigned to the whitespace-separated columns of the points
#? file read in `comp_superrec_obj_perf`.
SUPERREC_COOR_COL_NAMES = ["X", "Y", "Z"]
#? Field delimiter passed to `convert_to_cm_coo` as `output_delimiter`.
COO_DELIM = " "
# [docs]
def gen_superrec_coo(
    chr_region: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    output_fpath: str,
    overwrite: bool = False,
    exist_ok: bool = False,
) -> None:
    """
    Write a COO-format file by delegating to `convert_to_cm_coo`.

    Notes
    -----
    The conversion is always invoked with ``gen_pseudo_weights=False``,
    ``output_delimiter=COO_DELIM`` and ``res_to_one=True``; only the
    arguments listed below are caller-controlled.

    NOTE(review): the previous docstring named SHNeigh as the consumer of the
    generated file, while the function name refers to SuperRec -- confirm
    which tool is intended.

    Parameters
    ----------
    chr_region : str
        The chromosome region.
    resolution : int
        The resolution of the data.
    balancing : str
        The balancing method.
    input_fpath : str
        The path to the input file.
    output_fpath : str
        The path to the output file.
    overwrite : bool, optional
        Forwarded to `convert_to_cm_coo`; whether to overwrite the output
        file if it exists, by default False.
    exist_ok : bool, optional
        Forwarded to `convert_to_cm_coo`; whether to ignore the operation if
        the output file already exists, by default False.

    Returns
    -------
    None

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Osiris v3.2
    """
    #? Settings that are fixed for this output format
    fixed_opts = {
        "gen_pseudo_weights": False,
        "output_delimiter": COO_DELIM,
        "res_to_one": True,
    }

    convert_to_cm_coo(
        input_fpath,
        output_fpath,
        chr_region,
        resolution,
        balancing,
        overwrite=overwrite,
        exist_ok=exist_ok,
        **fixed_opts,
    )
# [docs]
def comp_superrec_obj_perf(
    region1: str,
    resolution: int,
    balancing: str,
    input_fpath: str,
    points_fpath: str,
    region2: t.Optional[str] = None,
) -> dict:
    """
    Compute Spearman and Pearson correlations between contact counts and
    the Euclidean distances predicted by SuperRec.

    Notes
    -----
    The count data is loaded as a data frame with `cm_loaders.load_cm_data`.
    The row/column ids found in that data are re-indexed to their rank among
    all ids that occur, because the points file contains only valid loci.
    The points file is read as whitespace-separated text without a header
    and, together with the re-indexed ids, passed to `comp_edm_from_p`; the
    result is correlated with the counts.

    The function assumes that the points file holds one point per id that
    occurs in the count data, in ascending id order -- this is not checked.

    Parameters
    ----------
    region1 : str
        The first region for which to compute the performance metrics.
    resolution : int
        The resolution at which to load the data.
    balancing : str
        The balancing method to use when loading the data.
    input_fpath : str
        The file path to the input data.
    points_fpath : str
        The file path to the points data.
    region2 : Optional[str], optional
        The second region, forwarded to the loader, by default None.

    Returns
    -------
    dict
        A dictionary with the keys ``'region'`` (``region1``),
        ``'spearman_r'``, ``'pearson_r'`` and ``'data_ratio'`` (always 1.0,
        since no ids are filtered out).

    Authors
    -------
    - Yeremia G. Adhisantoso (adhisant@tnt.uni-hannover.de)
    - Qwen2.5 72B - 4.25bpw
    """
    count_df = cm_loaders.load_cm_data(
        input_fpath,
        region1,
        resolution,
        balancing=balancing,
        region2=region2,
        ret_df=True,
    )

    row_ids = count_df[cm_loaders.ROW_IDS_COLNAME].to_numpy()
    col_ids = count_df[cm_loaders.COL_IDS_COLNAME].to_numpy()
    counts = count_df[cm_loaders.COUNTS_COLNAME].to_numpy()

    #? Create mapping from row/col ids to points
    #? Reason: not all loci are valid, yet the points contains only valid points
    #? Every id is present in `unique_ids`, so searchsorted returns its exact
    #? rank; no dense lookup table of size max_id + 1 is needed.
    unique_ids = np.unique(np.concatenate((row_ids, col_ids)))
    new_row_ids = np.searchsorted(unique_ids, row_ids)
    new_col_ids = np.searchsorted(unique_ids, col_ids)

    #? Keep the path parameter intact; the coordinates get their own name.
    points = pd.read_csv(
        points_fpath,
        sep=r"\s+",  #? Any run of whitespace; replaces deprecated delim_whitespace
        header=None,
        names=SUPERREC_COOR_COL_NAMES,
        index_col=None,
    ).to_numpy()

    edm = comp_edm_from_p(
        points,
        new_row_ids,
        new_col_ids,
    )

    #? Index the result instead of using `.correlation`: both result objects
    #? unpack as (statistic, pvalue) across SciPy versions, whereas the
    #? `.correlation` attribute is not available on every version.
    spearman_r = stats.spearmanr(counts, edm)[0]
    pearson_r = stats.pearsonr(counts, edm)[0]

    data_ratio = 1.0  #? No filtering of valid ids, thus ratio is 1.0

    return {
        'region': region1,
        'spearman_r': spearman_r,
        'pearson_r': pearson_r,
        'data_ratio': data_ratio,
    }