Source code for gunz_cm.preprocs.sparse_wish_dist

# -*- coding: utf-8 -*-
"""
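Convert sparse contact-matrix entries into distance values by raising the contact values to a power ``alpha``.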

Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"

import functools
import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pydantic import validate_call, ConfigDict
from scipy import sparse as sp
from .. import consts as cm_consts

#TODO: Implement comp_sparse_wish_dist
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def comp_sparse_wish_dist(
    data,
    alpha: float = -0.25,
    na_inf_val: t.Optional[float] = None,
):
    """
    Generic entry point for the contact-to-distance conversion.

    The function is a ``functools.singledispatch`` dispatcher: the concrete
    implementation is selected from the type of ``data``. This fallback body
    only runs when no implementation has been registered for that type.

    Parameters
    ----------
    data
        Contact matrix container. Implementations are registered per type
        through ``comp_sparse_wish_dist.register``.
    alpha : float, optional
        Exponent handed over to the selected implementation
        (default is -0.25). Unused by this fallback.
    na_inf_val : t.Optional[float], optional
        Replacement for NaN/infinite values, handed over to the selected
        implementation (default is None). Unused by this fallback.

    Raises
    ------
    PreprocError
        Always, when reached: the type of ``data`` has no registered
        implementation.
    """
    #? Reaching this body means dispatch found no type-specific implementation
    message = f"No implementation for data type: {type(data).__name__}"
    raise PreprocError(message)
def comp_sparse_wish_dist_rc_ids(
    row_ids: t.Union[np.ndarray, t.List[int]],
    col_ids: t.Union[np.ndarray, t.List[int]],
    C_vals: np.ndarray,
    alpha: float = -0.25,
    na_inf_val: t.Optional[float] = None
) -> t.Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Calculate sparse form of Euclidean distance matrix from contact matrix.

    Drops the main-diagonal entries, computes ``D = C ** alpha`` element-wise
    for the remaining entries, and then either replaces or removes entries
    whose distance is NaN or infinite.

    Parameters
    ----------
    row_ids : t.Union[np.ndarray, t.List[int]]
        Row indices of the sparse entries.
    col_ids : t.Union[np.ndarray, t.List[int]]
        Column indices of the sparse entries.
    C_vals : np.ndarray
        Contact matrix values, one per (row, column) pair.
    alpha : float, optional
        Exponent used to convert contact values to distances
        (default is -0.25).
    na_inf_val : t.Optional[float], optional
        If given, NaN/infinite distances are replaced by this value.
        If None (default), the affected entries are removed from all
        returned arrays.

    Returns
    -------
    t.Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
        A tuple containing (row_ids, col_ids, C_vals, D_vals), restricted to
        the entries that were kept.

    Notes
    -----
    Removes the main diagonal of the matrix.
    The inputs are not modified; all returned arrays are new arrays.

    Examples
    --------
    >>> r, c, cv, dv = comp_sparse_wish_dist_rc_ids(
    ...     [0, 0, 1], [0, 1, 2], np.array([5.0, 16.0, 0.0]))
    >>> r.tolist(), c.tolist(), cv.tolist(), dv.tolist()
    ([0], [1], [16.0], [0.5])
    """
    #? Coerce to arrays: plain lists would make `!=` return a single bool and
    #? turn the mask indexing below into a silent integer lookup
    row_ids = np.asarray(row_ids)
    col_ids = np.asarray(col_ids)
    C_vals = np.asarray(C_vals)

    #? Remove entries in the main diagonal as euclidean distance would be zero
    nondiag_mask = row_ids != col_ids
    row_ids = row_ids[nondiag_mask]
    col_ids = col_ids[nondiag_mask]
    C_vals = C_vals[nondiag_mask]

    #? Calculate Euclidean distance from contact matrix using the given alpha value
    #? inf/NaN results (e.g. zero contacts with negative alpha) are handled
    #? explicitly below, so the matching floating-point warnings are silenced
    with np.errstate(divide="ignore", invalid="ignore"):
        D_vals = np.power(C_vals, alpha)

    #? Create a mask to select invalid values (NaN or infinite) in the distance array
    na_inf_mask = np.logical_or(np.isinf(D_vals), np.isnan(D_vals))

    #? If there are invalid values, replace them with the given replace value
    #? or filter them out
    if na_inf_mask.any():
        if na_inf_val is not None:
            D_vals[na_inf_mask] = na_inf_val
        else:
            valid_mask = ~na_inf_mask
            row_ids = row_ids[valid_mask]
            col_ids = col_ids[valid_mask]
            C_vals = C_vals[valid_mask]
            D_vals = D_vals[valid_mask]

    return row_ids, col_ids, C_vals, D_vals
@comp_sparse_wish_dist.register(sp.coo_matrix)
def _(
    cm_coo: sp.coo_matrix,
    alpha: float = -0.25,
    na_inf_val: t.Optional[float] = None,
    **kwargs,
):
    """
    Compute the sparse distance matrix for a COO contact matrix.

    The row indices, column indices and stored values of ``cm_coo`` are
    handed to ``comp_sparse_wish_dist_rc_ids``; the distances it returns are
    packed into a new COO matrix of the same shape.

    Parameters
    ----------
    cm_coo : sp.coo_matrix
        Input contact matrix.
    alpha : float, optional
        Exponent forwarded to ``comp_sparse_wish_dist_rc_ids``
        (default is -0.25).
    na_inf_val : t.Optional[float], optional
        Replacement for NaN/infinite distances, forwarded to
        ``comp_sparse_wish_dist_rc_ids`` (default is None).
    **kwargs
        Accepted but not used by this implementation.

    Returns
    -------
    sp.coo_matrix
        Matrix with the same shape as ``cm_coo`` whose stored values are the
        distance values at the row/column positions returned by the helper.
    """
    #? The helper also returns the kept contact values, which are not needed here
    kept_rows, kept_cols, _kept_contacts, dist_vals = comp_sparse_wish_dist_rc_ids(
        cm_coo.row,
        cm_coo.col,
        cm_coo.data,
        alpha=alpha,
        na_inf_val=na_inf_val,
    )

    return sp.coo_matrix(
        (dist_vals, (kept_rows, kept_cols)),
        shape=cm_coo.shape,
    )


@comp_sparse_wish_dist.register(pd.DataFrame)
def _(
    cm_df: pd.DataFrame,
    alpha: float = -0.25,
    na_inf_val: t.Optional[float] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Compute distance values for a contact matrix stored as a DataFrame.

    Rows on the main diagonal are dropped, the counts column is raised to
    the power ``alpha``, and the result is appended as a new last column.

    Parameters
    ----------
    cm_df : pd.DataFrame
        Input contact matrix. The row-id, column-id and counts columns are
        looked up by the names defined in ``cm_consts.DataFrameSpecs``.
    alpha : float, optional
        Exponent applied to the counts column (default is -0.25).
    na_inf_val : t.Optional[float], optional
        If given, NaN/infinite distances are replaced by this value.
        If None (default), the affected rows are removed.
    **kwargs
        Accepted but not used by this implementation.

    Returns
    -------
    pd.DataFrame
        The filtered frame with the distance column
        (``cm_consts.DataFrameSpecs.DIST``) inserted at the end.

    Notes
    -----
    Removes the main diagonal of the matrix.
    """
    specs = cm_consts.DataFrameSpecs

    #? Diagonal entries are dropped as their euclidean distance would be zero
    off_diag = cm_df[specs.ROW_IDS] != cm_df[specs.COL_IDS]
    cm_df = cm_df.loc[off_diag]

    #? Contact counts -> distances via the power law with exponent alpha
    dists = np.power(cm_df[specs.COUNTS], alpha)

    #? Flag distances that came out as NaN or infinite
    invalid = np.logical_or(np.isinf(dists), np.isnan(dists))

    if invalid.any():
        if na_inf_val is None:
            #? No replacement value given: drop the affected rows
            keep = ~invalid
            dists = dists[keep]
            cm_df = cm_df.loc[keep, :]
        else:
            #? Replacement value given: overwrite the invalid distances
            dists[invalid] = na_inf_val

    #? Append the distances as the last column of the frame
    cm_df.insert(cm_df.shape[1], specs.DIST, dists)

    return cm_df