Source code for gunz_cm.preprocs.resamples

# -*- coding: utf-8 -*-
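"""
Resampling utilities for contact matrices.

Provides uniform rescaling of a matrix and random downsampling of
integer counts held in a dense NumPy array, a SciPy COO matrix, or a
pandas DataFrame.
"""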
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"

import functools
import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pydantic import validate_call, ConfigDict
from scipy import sparse as sp
from .. import consts as cm_consts

def uniform_resample_mat(
    cm_mat: np.ndarray,
    target_rate: float,
) -> np.ndarray:
    """
    Scale every entry of a contact matrix by a constant rate.

    Notes
    -----
    No random sampling is involved: the result is simply the element-wise
    product of ``cm_mat`` and ``target_rate``.

    Parameters
    ----------
    cm_mat : np.ndarray
        Input contact matrix to resample.
    target_rate : float
        Multiplicative factor applied to every entry.

    Returns
    -------
    np.ndarray
        New array holding ``cm_mat * target_rate``; the multiplication is
        not performed in place.

    Examples
    --------
    >>> uniform_resample_mat(np.array([[2, 4], [6, 8]]), 0.5)
    array([[1., 2.],
           [3., 4.]])
    """
    scaled_mat = cm_mat * target_rate
    return scaled_mat
def _rand_downsample_helper(
    vals: np.ndarray,
    ratio: int,
) -> t.Tuple[np.ndarray, np.ndarray]:
    """
    Draw ``sum(vals) // ratio`` individual counts without replacement.

    Notes
    -----
    Every unit of count is treated as one item in a virtual list of length
    ``sum(vals)``. A random subset of item indices is drawn without
    replacement and each drawn index is mapped back to the entry of ``vals``
    it belongs to via the cumulative sum. Entry ``i`` owns the half-open
    index range ``[cumsum[i-1], cumsum[i])``, so it can never receive more
    counts than it originally held, and ``ratio == 1`` returns the input
    counts unchanged (minus zero-valued entries).

    The draw uses the global legacy NumPy random state
    (``np.random.choice``), so it is reproducible through ``np.random.seed``.

    Parameters
    ----------
    vals : np.ndarray
        Non-negative integer counts to downsample.
    ratio : int
        Downsample ratio (>= 1); the total count is floor-divided by it.

    Returns
    -------
    t.Tuple[np.ndarray, np.ndarray]
        ``ids`` — sorted positions in ``vals`` that kept at least one count,
        and ``new_vals`` — the number of counts kept at each such position.

    Raises
    ------
    PreprocError
        If ``vals`` is not of integer dtype, contains negative values, or
        if ``ratio`` is smaller than 1.
    """
    if not np.issubdtype(vals.dtype, np.integer):
        raise PreprocError("Values must be an integer!")

    if np.any(vals < 0):
        raise PreprocError("Values should be non-negative")

    if ratio < 1:
        raise PreprocError("Downsample ratio must be a positive integer")

    #? Accumulate in int64 so that narrow input dtypes cannot overflow
    cumsum_vals = np.cumsum(vals, dtype=np.int64)
    total_vals = int(cumsum_vals[-1]) if cumsum_vals.size else 0

    #? compute target counts after downsampling
    total_ds_vals = int(total_vals // ratio)

    #? Pick which individual counts survive
    tmp_ids = np.random.choice(total_vals, total_ds_vals, replace=False)

    #? side="right": an index equal to a cumulative boundary belongs to the
    #? NEXT entry; the default side="left" would shift one count from the
    #? last entry to the first one.
    sel_ids = np.searchsorted(cumsum_vals, tmp_ids, side="right")

    ids, new_vals = np.unique(sel_ids, return_counts=True)
    return ids, new_vals
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def rand_downsample(
    data: t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame],
    ratio: int,
    val_colname: str = cm_consts.DataFrameSpecs.COUNTS,
) -> t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame]:
    """
    Randomly downsample contact data by an integer ratio.

    Notes
    -----
    This is the generic entry point of a ``functools.singledispatch``
    function: the implementation that runs is selected from the type of
    ``data``. The body below is only the fallback, reached when no
    implementation has been registered for that type. Arguments are checked
    by pydantic's ``validate_call`` before dispatching.

    Parameters
    ----------
    data : Union[np.ndarray, sp.coo_matrix, pd.DataFrame]
        Input data to downsample.
    ratio : int
        Downsample ratio.
    val_colname : str, optional
        Name of the column holding the values when ``data`` is a dataframe
        (default is ``cm_consts.DataFrameSpecs.COUNTS``).

    Returns
    -------
    Union[np.ndarray, sp.coo_matrix, pd.DataFrame]
        Downsampled data, produced by the type-specific implementation.

    Raises
    ------
    PreprocError
        If no implementation is registered for the type of ``data``.
    """
    msg = f"No implementation for data type: {type(data).__name__}"
    raise PreprocError(msg)
@rand_downsample.register(np.ndarray)
def _(
    cm_mat: np.ndarray,
    ratio: int,
    **kwargs,
) -> np.ndarray:
    """
    Randomly downsample a dense matrix by a specified ratio.

    Notes
    -----
    Only the non-zero entries are handed to `_rand_downsample_helper`; the
    surviving counts are written into a zero-initialised array with the
    same shape and dtype as the input. The two-way unpacking of
    ``np.nonzero`` means the input must be two-dimensional.

    Parameters
    ----------
    cm_mat : np.ndarray
        Input matrix to downsample.
    ratio : int
        Downsample ratio.
    **kwargs
        Ignored; accepted so that all registered implementations can be
        called with the same arguments.

    Returns
    -------
    np.ndarray
        Downsampled matrix; the input is not modified.
    """
    row_ids, col_ids = np.nonzero(cm_mat)
    vals = cm_mat[row_ids, col_ids]

    ids, new_vals = _rand_downsample_helper(vals, ratio)

    new_cm_mat = np.zeros_like(cm_mat)
    new_cm_mat[row_ids[ids], col_ids[ids]] = new_vals
    return new_cm_mat


@rand_downsample.register(sp.coo_matrix)
def _(
    cm_coo: sp.coo_matrix,
    ratio: int,
    **kwargs,
) -> sp.coo_matrix:
    """
    Randomly downsample a sparse matrix in COO format by a specified ratio.

    Notes
    -----
    The stored values (``cm_coo.data``) are passed to
    `_rand_downsample_helper`; a new COO matrix with the input's shape is
    built from the coordinates that kept at least one count.

    Parameters
    ----------
    cm_coo : sp.coo_matrix
        Input sparse matrix in COO format to downsample.
    ratio : int
        Downsample ratio.
    **kwargs
        Ignored; accepted so that all registered implementations can be
        called with the same arguments.

    Returns
    -------
    sp.coo_matrix
        Downsampled sparse matrix in COO format.
    """
    ids, new_vals = _rand_downsample_helper(cm_coo.data, ratio)

    #? (data, (row, col)) is the documented coordinate form of the constructor
    kept_coords = (cm_coo.row[ids], cm_coo.col[ids])
    return sp.coo_matrix((new_vals, kept_coords), shape=cm_coo.shape)


@rand_downsample.register(pd.DataFrame)
def _(
    cm_df: pd.DataFrame,
    ratio: int,
    val_colname: str = cm_consts.DataFrameSpecs.COUNTS,
    **kwargs,
) -> pd.DataFrame:
    """
    Randomly downsample a dataframe by a specified ratio.

    Notes
    -----
    The column ``val_colname`` is passed to `_rand_downsample_helper`. Rows
    that keep at least one count are returned (with their original index
    labels) and their value column is replaced by the new counts.

    Parameters
    ----------
    cm_df : pd.DataFrame
        Input dataframe to downsample.
    ratio : int
        Downsample ratio.
    val_colname : str, optional
        Column name for values in dataframe
        (default is ``cm_consts.DataFrameSpecs.COUNTS``).
    **kwargs
        Ignored; accepted so that all registered implementations can be
        called with the same arguments.

    Returns
    -------
    pd.DataFrame
        Downsampled dataframe; the input is not modified.
    """
    vals = cm_df[val_colname].to_numpy()

    ids, new_vals = _rand_downsample_helper(vals, ratio)

    #? Select first, then copy: only the surviving rows are duplicated
    #? instead of deep-copying the whole frame up front.
    new_df = cm_df.iloc[ids].copy()
    new_df[val_colname] = new_vals
    return new_df