# -*- coding: utf-8 -*-
"""
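Random downsampling of integer count data.

Provides ``rand_downsample``, a single-dispatch entry point with
implementations for dense NumPy arrays, SciPy COO sparse matrices and
pandas DataFrames.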
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"
import functools
import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pydantic import validate_call, ConfigDict
from scipy import sparse as sp
from .. import consts as cm_consts
def _rand_downsample_helper(
    vals: np.ndarray,
    ratio: int,
) -> t.Tuple[np.ndarray, np.ndarray]:
    """
    Draw ``sum(vals) // ratio`` unit counts uniformly without replacement.

    Notes
    -----
    ``vals`` is treated as a list of bins, bin ``i`` holding ``vals[i]``
    indistinguishable units. The units are numbered ``0 .. total - 1`` in
    bin order, ``total // ratio`` distinct unit numbers are drawn with
    ``np.random.choice`` (global NumPy random state), and every drawn unit
    is mapped back to its bin through the cumulative sum of ``vals``.

    Parameters
    ----------
    vals : np.ndarray
        Non-negative integer counts to downsample.
    ratio : int
        Downsample ratio (>= 1); ``total // ratio`` units are kept.

    Returns
    -------
    t.Tuple[np.ndarray, np.ndarray]
        ``ids`` - sorted, unique positions in ``vals`` that kept at least
        one unit, and ``new_vals`` - the number of units kept at each of
        those positions.

    Raises
    ------
    PreprocError
        If ``vals`` is not of integer dtype, contains negative values, or
        ``ratio`` is smaller than 1.

    Examples
    --------
    With ``ratio=1`` every unit is kept, so non-zero bins come back as-is:

    >>> ids, new_vals = _rand_downsample_helper(np.array([3, 0, 2]), 1)
    >>> ids.tolist(), new_vals.tolist()
    ([0, 2], [3, 2])
    """
    if not np.issubdtype(vals.dtype, np.integer):
        raise PreprocError("Values must be an integer!")

    if np.any(vals < 0):
        raise PreprocError("Values should be non-negative")

    if ratio < 1:
        raise PreprocError("Downsample ratio must be a positive integer")

    #? Accumulate in 64 bit so narrow input dtypes cannot wrap around
    total_vals = int(vals.sum(dtype=np.int64))

    #? compute target counts after downsampling
    total_ds_vals = total_vals // ratio

    #? _THE MAGIC_
    cumsum_vals = np.cumsum(vals, dtype=np.int64)
    tmp_ids = np.random.choice(total_vals, total_ds_vals, replace=False)

    #? Unit k lives in bin i iff cumsum[i-1] <= k < cumsum[i]; side="right"
    #? returns exactly that i (first position whose cumsum is > k). The
    #? default side="left" would push k == cumsum[i] into the previous bin.
    sel_ids = np.searchsorted(cumsum_vals, tmp_ids, side="right")

    ids, new_vals = np.unique(sel_ids, return_counts=True)
    return ids, new_vals
[docs]
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def rand_downsample(
    data: t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame],
    ratio: int,
    val_colname: str = cm_consts.DataFrameSpecs.COUNTS,
) -> t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame]:
    """
    Randomly downsample count data by an integer ratio.

    Notes
    -----
    This is the generic entry point of a ``functools.singledispatch``
    chain: the implementation is selected from the type of ``data``. The
    call is wrapped in pydantic's ``validate_call``. The body below is the
    fallback that runs when no implementation is registered for the type
    of ``data``.

    Parameters
    ----------
    data : Union[np.ndarray, sp.coo_matrix, pd.DataFrame]
        Input data to downsample.
    ratio : int
        Downsample ratio.
    val_colname : str, optional
        Name of the dataframe column holding the values
        (default is ``cm_consts.DataFrameSpecs.COUNTS``).

    Returns
    -------
    Union[np.ndarray, sp.coo_matrix, pd.DataFrame]
        Downsampled data, in the same container type as the input.

    Raises
    ------
    PreprocError
        If no implementation is registered for the type of ``data``.
    """
    #? Fallback: singledispatch found no registered implementation
    message = f"No implementation for data type: {type(data).__name__}"
    raise PreprocError(message)
@rand_downsample.register(np.ndarray)
def _(
    cm_mat: np.ndarray,
    ratio: int,
    **kwargs,
) -> np.ndarray:
    """
    Randomly downsample a dense matrix by a specified ratio.

    Notes
    -----
    Only the non-zero entries of ``cm_mat`` are handed to
    `_rand_downsample_helper`; the counts it returns are written into a
    fresh zero matrix, so the input is left untouched.

    Parameters
    ----------
    cm_mat : np.ndarray
        Input matrix to downsample.
    ratio : int
        Downsample ratio.
    **kwargs
        Accepted for signature compatibility with the dispatcher; unused.

    Returns
    -------
    np.ndarray
        Downsampled matrix with the shape and dtype of ``cm_mat``.
    """
    #? Coordinates of every non-zero cell
    nz_rows, nz_cols = cm_mat.nonzero()

    kept, kept_counts = _rand_downsample_helper(
        cm_mat[nz_rows, nz_cols],
        ratio,
    )

    #? `kept` indexes into the non-zero list, not into the matrix itself
    downsampled = np.zeros_like(cm_mat)
    downsampled[nz_rows[kept], nz_cols[kept]] = kept_counts
    return downsampled
# assert ratio > 0 and ratio <= 1.0, \
# f"Invalid target_rate:{ratio}"
# sum_counts = cm_mat.sum()
# if sum_counts == 0.0:
# return cm_mat
# else:
# #? The sum is 1.0 in this stage
# #? DO NOT USE IN-PLACE OP otherwise the original matrix is replaced
# # mat /= sum_counts #? DO NOT USE THIS
# cm_mat = cm_mat / sum_counts
# sampler = rand_f(*cm_mat.shape)
# cm_mat *= sampler
# cm_mat /= cm_mat.sum() #? Make the sum 1.0 again after random sampling
# cm_mat *= ratio
# cm_mat *= sum_counts
# return cm_mat
@rand_downsample.register(sp.coo_matrix)
def _(
    cm_coo: sp.coo_matrix,
    ratio: int,
    **kwargs,
) -> sp.coo_matrix:
    """
    Randomly downsample a sparse COO matrix by a specified ratio.

    Notes
    -----
    The stored entries (``cm_coo.data``) are passed as-is to
    `_rand_downsample_helper`; a new COO matrix of the same shape is built
    from the entries it keeps.

    Parameters
    ----------
    cm_coo : sp.coo_matrix
        Input sparse matrix in COO format to downsample.
    ratio : int
        Downsample ratio.
    **kwargs
        Accepted for signature compatibility with the dispatcher; unused.

    Returns
    -------
    sp.coo_matrix
        Downsampled sparse matrix in COO format.
    """
    entry_rows, entry_cols = cm_coo.row, cm_coo.col

    kept, kept_counts = _rand_downsample_helper(
        cm_coo.data,
        ratio,
    )

    #? `kept` indexes the stored entries; translate back to coordinates
    coords = [entry_rows[kept], entry_cols[kept]]
    return sp.coo_matrix(
        (kept_counts, coords),
        shape=cm_coo.shape
    )
@rand_downsample.register(pd.DataFrame)
def _(
    cm_df: pd.DataFrame,
    ratio: int,
    val_colname: str = cm_consts.DataFrameSpecs.COUNTS,
    **kwargs,
) -> pd.DataFrame:
    """
    Randomly downsample a dataframe by a specified ratio.

    Notes
    -----
    The values of column ``val_colname`` are passed to
    `_rand_downsample_helper`. Rows that keep at least one count are
    returned (original index labels preserved) with ``val_colname``
    replaced by the downsampled counts. The input frame is not modified.

    Parameters
    ----------
    cm_df : pd.DataFrame
        Input dataframe to downsample.
    ratio : int
        Downsample ratio.
    val_colname : str, optional
        Name of the column holding the values
        (default is ``cm_consts.DataFrameSpecs.COUNTS``).
    **kwargs
        Accepted for signature compatibility with the dispatcher; unused.

    Returns
    -------
    pd.DataFrame
        Downsampled dataframe.
    """
    vals = cm_df[val_colname].to_numpy()

    ids, new_vals = _rand_downsample_helper(
        vals,
        ratio,
    )

    #? Select first, then copy: only the surviving rows are duplicated
    #? instead of deep-copying the whole input frame up front.
    new_df = cm_df.iloc[ids].copy()
    new_df[val_colname] = new_vals
    return new_df