# Source code for gunz_cm.preprocs.resamples

# -*- coding: utf-8 -*-
"""
Module.

Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"

import functools
import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pydantic import validate_call, ConfigDict
from scipy import sparse as sp
from .. import consts as cm_consts


# [docs]
def uniform_resample_mat(
    cm_mat: np.ndarray,
    target_rate: float,
) -> np.ndarray:
    """
    Scale every entry of a contact matrix by a constant rate.

    Notes
    -----
    No random sampling takes place: the result is the element-wise
    product ``cm_mat * target_rate``. The input matrix is not modified.

    Parameters
    ----------
    cm_mat : np.ndarray
        Contact matrix to rescale.
    target_rate : float
        Multiplicative factor applied to every entry.

    Returns
    -------
    np.ndarray
        New matrix holding the scaled entries.

    Examples
    --------
    >>> uniform_resample_mat(np.array([[2, 4]]), 0.5)
    array([[1., 2.]])
    """
    scaled = cm_mat * target_rate
    return scaled


def _rand_downsample_helper(
    vals: np.ndarray,
    ratio: int,
) -> t.Tuple[np.ndarray, np.ndarray]:
    """
    Draw ``sum(vals) // ratio`` unit counts from ``vals`` without replacement.

    Notes
    -----
    Each entry ``vals[i]`` is treated as ``vals[i]`` individual unit counts
    laid end to end on the integer range ``[0, sum(vals))``. A uniform
    random subset of ``sum(vals) // ratio`` distinct positions is drawn
    from that range with ``np.random.choice``; every drawn position is then
    mapped back to the entry that owns it through the cumulative sum, and
    the hits per entry are counted.

    Entry ``i`` owns the half-open interval
    ``[cumsum[i - 1], cumsum[i])``, which is why the lookup uses
    ``side="right"``: a position equal to ``cumsum[i]`` belongs to the next
    non-empty entry, and zero-valued entries own no positions at all.

    Parameters
    ----------
    vals : np.ndarray
        Non-negative integer counts to downsample.
    ratio : int
        Downsample ratio; the number of kept unit counts is
        ``sum(vals) // ratio``. Must be at least 1.

    Returns
    -------
    t.Tuple[np.ndarray, np.ndarray]
        ``(ids, new_vals)`` where ``ids`` are the sorted positions in
        ``vals`` that received at least one sample and ``new_vals`` are the
        corresponding downsampled counts. Entries that received no sample
        are omitted.

    Raises
    ------
    PreprocError
        If ``vals`` does not have an integer dtype, contains negative
        values, or if ``ratio`` is smaller than 1.

    Examples
    --------
    >>> ids, counts = _rand_downsample_helper(np.array([2, 0, 3]), 1)
    >>> ids.tolist(), counts.tolist()
    ([0, 2], [2, 3])
    """
    if not np.issubdtype(vals.dtype, np.integer):
        raise PreprocError("Values must be an integer!")
    if np.any(vals < 0):
        raise PreprocError("Values should be non-negative")
    if ratio < 1:
        raise PreprocError("Downsample ratio must be a positive integer")

    #? Accumulate in int64 explicitly: avoids overflow on platforms whose
    #? default integer is 32 bit and keeps the searchsorted comparison
    #? integer-vs-integer (uint64 vs int64 would be promoted to float64).
    cumsum_vals = np.cumsum(vals, dtype=np.int64)
    total_vals = int(cumsum_vals[-1]) if cumsum_vals.size else 0

    #? compute target counts after downsampling
    total_ds_vals = total_vals // ratio

    #? Pick distinct unit-count positions in [0, total_vals) ...
    tmp_ids = np.random.choice(total_vals, total_ds_vals, replace=False)
    #? ... and map each one to its owning entry. side="right" yields the
    #? first index whose cumulative sum is strictly greater than the
    #? position, so entry i receives exactly the vals[i] positions it owns.
    sel_ids = np.searchsorted(cumsum_vals, tmp_ids, side="right")
    ids, new_vals = np.unique(sel_ids, return_counts=True)

    return ids, new_vals

@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def rand_downsample(
    data: t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame],
    ratio: int,
    val_colname: str = cm_consts.DataFrameSpecs.COUNTS,
) -> t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame]:
    """
    Randomly downsample contact counts by an integer ratio.

    Notes
    -----
    This is the single-dispatch entry point: the implementation that runs
    is selected from the type of ``data``. The body below is only the
    fallback for types without a registered implementation. The whole
    dispatcher is wrapped in pydantic's ``validate_call``.

    Parameters
    ----------
    data : Union[np.ndarray, sp.coo_matrix, pd.DataFrame]
        Input data to downsample.
    ratio : int
        Downsample ratio.
    val_colname : str, optional
        Name of the column holding the values when ``data`` is a dataframe
        (default is ``cm_consts.DataFrameSpecs.COUNTS``).

    Returns
    -------
    Union[np.ndarray, sp.coo_matrix, pd.DataFrame]
        Downsampled data.

    Raises
    ------
    PreprocError
        If no implementation is registered for the type of ``data``.
    """
    unsupported = type(data).__name__
    raise PreprocError(f"No implementation for data type: {unsupported}")

@rand_downsample.register(np.ndarray)
def _(
    cm_mat: np.ndarray,
    ratio: int,
    **kwargs,
) -> np.ndarray:
    """
    Randomly downsample a dense matrix by a specified ratio.

    Notes
    -----
    Only the non-zero entries of ``cm_mat`` are handed to
    `_rand_downsample_helper`; the counts it returns are written into a
    zero-filled matrix created with ``np.zeros_like(cm_mat)``. The input
    matrix is not modified.

    Parameters
    ----------
    cm_mat : np.ndarray
        Input matrix to downsample. It is unpacked into row and column
        indices, i.e. it is expected to be two-dimensional.
    ratio : int
        Downsample ratio.
    **kwargs
        Accepted for signature compatibility with the dispatcher; ignored.

    Returns
    -------
    np.ndarray
        Downsampled matrix with the shape and dtype of ``cm_mat``.
    """
    nz_rows, nz_cols = np.nonzero(cm_mat)

    kept, kept_counts = _rand_downsample_helper(
        cm_mat[nz_rows, nz_cols],
        ratio,
    )

    #? Entries that received no sample stay at zero.
    downsampled = np.zeros_like(cm_mat)
    downsampled[nz_rows[kept], nz_cols[kept]] = kept_counts

    return downsampled
    
    # assert ratio > 0 and ratio <= 1.0, \
    #     f"Invalid target_rate:{ratio}"

    # sum_counts = cm_mat.sum()
    # if sum_counts == 0.0:
    #     return cm_mat
    # else:
    #     #? The sum is 1.0 in this stage
    #     #? DO NOT USE IN-PLACE OP otherwise the original matrix is replaced
    #     # mat /= sum_counts #? DO NOT USE THIS
    #     cm_mat = cm_mat / sum_counts

    #     sampler = rand_f(*cm_mat.shape)
    #     cm_mat *= sampler
    #     cm_mat /= cm_mat.sum() #? Make the sum 1.0 again after random sampling

    #     cm_mat *= ratio

    #     cm_mat *= sum_counts

    #     return cm_mat
    
@rand_downsample.register(sp.coo_matrix)
def _(
    cm_coo: sp.coo_matrix,
    ratio: int,
    **kwargs,
) -> sp.coo_matrix:
    """
    Randomly downsample a sparse COO matrix by a specified ratio.

    Notes
    -----
    The stored values (``cm_coo.data``) are handed to
    `_rand_downsample_helper`; a new COO matrix is assembled from the
    returned counts and the matching row/column coordinates.

    Parameters
    ----------
    cm_coo : sp.coo_matrix
        Input sparse matrix in COO format to downsample.
    ratio : int
        Downsample ratio.
    **kwargs
        Accepted for signature compatibility with the dispatcher; ignored.

    Returns
    -------
    sp.coo_matrix
        Downsampled sparse matrix in COO format with the shape of
        ``cm_coo``.
    """
    kept, kept_counts = _rand_downsample_helper(
        cm_coo.data,
        ratio,
    )

    #? Coordinates of the stored entries that received at least one sample.
    coords = [cm_coo.row[kept], cm_coo.col[kept]]

    return sp.coo_matrix(
        (kept_counts, coords),
        shape=cm_coo.shape
    )

@rand_downsample.register(pd.DataFrame)
def _(
    cm_df: pd.DataFrame,
    ratio: int,
    val_colname: str = cm_consts.DataFrameSpecs.COUNTS,
    **kwargs,
) -> pd.DataFrame:
    """
    Randomly downsample the value column of a dataframe by a specified ratio.

    Notes
    -----
    The column ``val_colname`` is handed to `_rand_downsample_helper`. The
    result keeps only the rows (selected by position) that received at
    least one sample, with ``val_colname`` replaced by the new counts.
    The input dataframe is not modified.

    Parameters
    ----------
    cm_df : pd.DataFrame
        Input dataframe to downsample.
    ratio : int
        Downsample ratio.
    val_colname : str, optional
        Name of the column holding the values
        (default is ``cm_consts.DataFrameSpecs.COUNTS``).
    **kwargs
        Accepted for signature compatibility with the dispatcher; ignored.

    Returns
    -------
    pd.DataFrame
        Downsampled dataframe.
    """
    counts = cm_df[val_colname].to_numpy()

    kept, kept_counts = _rand_downsample_helper(
        counts,
        ratio,
    )

    #? Work on a copy so the caller's dataframe is left untouched.
    downsampled = cm_df.copy().iloc[kept]
    downsampled[val_colname] = kept_counts

    return downsampled