# -*- coding: utf-8 -*-
"""
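Sparse conversion of contact matrices into distance values.

Contact counts are mapped to distances with a power law
(``distance = count ** alpha``). Entries on the main diagonal are dropped,
and NaN/infinite results are either replaced by a given value or removed.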
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"
import functools
import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pydantic import validate_call, ConfigDict
from scipy import sparse as sp
from .. import consts as cm_consts
#TODO: Implement comp_sparse_wish_dist
[docs]
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def comp_sparse_wish_dist(
    data,
    alpha: float = -0.25,
    na_inf_val: t.Optional[float] = None,
):
    """
    Convert a sparse contact matrix into distance values (generic entry point).

    This is the single-dispatch fallback: the implementation that actually
    runs is selected by the type of ``data``. This body is only reached when
    no implementation has been registered for that type.

    Parameters
    ----------
    data
        Contact matrix container; its type selects the implementation.
    alpha : float, optional
        Exponent applied to the contact values (default is -0.25).
    na_inf_val : t.Optional[float], optional
        Value to replace NaN or infinite distances with (default is None).

    Raises
    ------
    PreprocError
        Always, when called with a ``data`` type that has no registered
        implementation.
    """
    #? Reaching this body means dispatch found no type-specific overload
    type_name = type(data).__name__
    raise PreprocError(f"No implementation for data type: {type_name}")
[docs]
def comp_sparse_wish_dist_rc_ids(
    row_ids: t.Union[np.ndarray, t.List[int]],
    col_ids: t.Union[np.ndarray, t.List[int]],
    C_vals: np.ndarray,
    alpha: float = -0.25,
    na_inf_val: t.Optional[float] = None
) -> t.Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Calculate sparse form of Euclidean distance matrix from contact matrix.

    Each contact value is converted with ``C_vals ** alpha``. The result is a
    tuple of row indices, column indices, contact matrix values, and
    Euclidean distance values, all restricted to the entries that were kept.

    Parameters
    ----------
    row_ids : t.Union[np.ndarray, t.List[int]]
        Row indices of the stored entries.
    col_ids : t.Union[np.ndarray, t.List[int]]
        Column indices of the stored entries.
    C_vals : np.ndarray
        Contact matrix values of the stored entries.
    alpha : float, optional
        Conversion factor from contact matrix to Euclidean distance matrix
        (default is -0.25).
    na_inf_val : t.Optional[float], optional
        Value to replace NaN or infinite distances with. If None (default),
        entries with NaN or infinite distances are removed instead.

    Returns
    -------
    t.Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
        A tuple containing (row_ids, col_ids, C_vals, D_vals).

    Notes
    -----
    Removes the main diagonal of the matrix.
    All inputs are converted with ``np.asarray``, so the returned values are
    always NumPy arrays, even when lists were passed in.

    Examples
    --------
    >>> r, c, cv, d = comp_sparse_wish_dist_rc_ids([0, 0], [0, 1], [4.0, 16.0])
    >>> r.tolist(), c.tolist(), cv.tolist(), d.tolist()
    ([0], [1], [16.0], [0.5])
    """
    #? Coerce to arrays: with plain lists ``!=`` yields a single bool and
    #? boolean-mask indexing is not available
    row_ids = np.asarray(row_ids)
    col_ids = np.asarray(col_ids)
    C_vals = np.asarray(C_vals)

    #? Remove entries in the main diagonal as euclidean distance would be zero
    nondiag_mask = row_ids != col_ids
    row_ids = row_ids[nondiag_mask]
    col_ids = col_ids[nondiag_mask]
    C_vals = C_vals[nondiag_mask]

    #? Calculate Euclidean distance from contact matrix using the given alpha value
    #? inf/NaN results are handled explicitly below, so the floating point
    #? warnings they would trigger carry no extra information
    with np.errstate(divide="ignore", invalid="ignore"):
        D_vals = np.power(C_vals, alpha)

    #? Create a mask to select invalid values (NaN or infinite) in the distance array
    na_inf_mask = ~np.isfinite(D_vals)

    #? If there are invalid values, replace them with the given replace value or filter them out
    if na_inf_mask.any():
        if na_inf_val is not None:
            #? D_vals is a fresh array from np.power, the caller's data is untouched
            D_vals[na_inf_mask] = na_inf_val
        else:
            valid_mask = ~na_inf_mask
            row_ids = row_ids[valid_mask]
            col_ids = col_ids[valid_mask]
            C_vals = C_vals[valid_mask]
            D_vals = D_vals[valid_mask]

    return row_ids, col_ids, C_vals, D_vals
@comp_sparse_wish_dist.register(sp.coo_matrix)
def _(
    cm_coo: sp.coo_matrix,
    alpha: float = -0.25,
    na_inf_val: t.Optional[float] = None,
    **kwargs,
):
    """
    Convert a COO contact matrix into a COO distance matrix.

    The stored entries of ``cm_coo`` (``row``, ``col``, ``data``) are passed
    to ``comp_sparse_wish_dist_rc_ids`` and the surviving coordinates and
    distance values are packed into a new COO matrix.

    Parameters
    ----------
    cm_coo : sp.coo_matrix
        Input contact matrix.
    alpha : float, optional
        Exponent applied to the contact values (default is -0.25).
    na_inf_val : t.Optional[float], optional
        Value to replace NaN or infinite distances with (default is None).
    **kwargs
        Accepted but not used by this implementation.

    Returns
    -------
    sp.coo_matrix
        Distance matrix with the same shape as ``cm_coo``.

    Notes
    -----
    Every stored entry is converted on its own; the ``data`` array is used
    exactly as stored in ``cm_coo``.
    """
    #? The filtered contact values are returned by the helper but not needed here
    kept_rows, kept_cols, _contacts, distances = comp_sparse_wish_dist_rc_ids(
        cm_coo.row,
        cm_coo.col,
        cm_coo.data,
        alpha=alpha,
        na_inf_val=na_inf_val,
    )

    #? Keep the original shape even though entries may have been dropped
    return sp.coo_matrix(
        (distances, (kept_rows, kept_cols)),
        shape=cm_coo.shape
    )
@comp_sparse_wish_dist.register(pd.DataFrame)
def _(
    cm_df: pd.DataFrame,
    alpha: float = -0.25,
    na_inf_val: t.Optional[float] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Calculate sparse form of Euclidean distance matrix from a contact DataFrame.

    The distance is computed as ``counts ** alpha`` and appended as the last
    column. Column names are taken from ``cm_consts.DataFrameSpecs``
    (``ROW_IDS``, ``COL_IDS``, ``COUNTS`` for the input, ``DIST`` for the
    added column).

    Parameters
    ----------
    cm_df : pd.DataFrame
        Input contact matrix in sparse (row id, column id, count) form.
    alpha : float, optional
        Conversion factor from contact matrix to Euclidean distance matrix
        (default is -0.25).
    na_inf_val : t.Optional[float], optional
        Value to replace NaN or infinite distances with. If None (default),
        rows with NaN or infinite distances are removed instead.
    **kwargs
        Accepted but not used by this implementation.

    Returns
    -------
    pd.DataFrame
        Filtered selection of ``cm_df`` with an additional distance column.

    Notes
    -----
    Removes the main diagonal of the matrix.
    """
    specs = cm_consts.DataFrameSpecs

    #? Remove entries in the main diagonal as euclidean distance would be zero
    off_diag = cm_df[specs.ROW_IDS] != cm_df[specs.COL_IDS]
    frame = cm_df.loc[off_diag]

    #? Calculate Euclidean distance from contact matrix using the given alpha value
    distances = np.power(frame[specs.COUNTS], alpha)

    #? Flag distances that are NaN or infinite
    invalid = np.logical_or(np.isinf(distances), np.isnan(distances))

    if invalid.any():
        if na_inf_val is None:
            #? No replacement value given: drop the affected rows and distances
            distances = distances[~invalid]
            frame = frame.loc[~invalid, :]
        else:
            distances[invalid] = na_inf_val

    #? Append the distances as the last column of the contact matrix
    last_pos = frame.shape[1]
    frame.insert(last_pos, specs.DIST, distances)

    return frame