# Source code for gunz_cm.preprocs.noises

# -*- coding: utf-8 -*-
"""
Add random ligation noise to count matrices (NumPy arrays, SciPy COO matrices, pandas DataFrames).

Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"

import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from scipy import sparse as sp
from .. import consts as cm_consts
from .converters import to_coo_matrix, to_dataframe

def _true_rand_ligation_noise_helper(
    vals: np.ndarray,
    ratio: float,
    mat_shape: t.Tuple[int, int],
    is_triu_sym: bool = True
) -> t.Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Draw uniformly random ligation noise over a matrix of the given shape.

    Notes
    -----
    The number of noise ligations is ``sum(vals) * ratio`` truncated to an
    integer. Every ligation is placed on a coordinate drawn uniformly at
    random; ligations that land on the same coordinate are merged into a
    single entry whose count is the number of hits.

    Parameters
    ----------
    vals : numpy.ndarray
        Observed counts. Must have an integer dtype and be non-negative.
    ratio : float
        Noise ratio, i.e. number of noise ligations per observed count.
    mat_shape : Tuple[int, int]
        Shape ``(n_rows, n_cols)`` of the matrix the noise is drawn for.
    is_triu_sym : bool, optional
        If True (default), the matrix must be square and every drawn
        coordinate is ordered so that ``row <= col`` (upper triangle).

    Returns
    -------
    Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
        Row ids, column ids and counts of the unique noise coordinates.
        All three arrays are empty when no noise ligation is drawn.

    Raises
    ------
    AssertionError
        If ``vals`` is not of integer dtype, contains negative values, or
        if ``is_triu_sym`` is True and ``mat_shape`` is not square.
    """

    # The value array itself (not just its sum) is taken as input so that
    # its dtype and sign can be validated here.
    assert np.issubdtype(vals.dtype, np.integer), \
        "Values must be an integer!"
    assert not np.any(vals < 0), \
        "Values should be non-negative"

    n_noise = np.multiply(np.sum(vals), ratio).astype(int)

    if n_noise == 0:
        no_ids = np.array([], dtype=int)
        return no_ids, no_ids, no_ids

    if is_triu_sym is True:
        assert mat_shape[0] == mat_shape[1], "Matrix must be square for symmetric triangular noise."
        # One draw of (n_noise, 2) indices; sorting each pair puts the
        # smaller index first, which enforces row <= col.
        drawn = np.random.randint(0, mat_shape[0], size=(n_noise, 2))
        drawn.sort(axis=1)
    else:
        # Rows and columns are drawn independently, rows first.
        drawn_rows = np.random.randint(0, mat_shape[0], size=n_noise)
        drawn_cols = np.random.randint(0, mat_shape[1], size=n_noise)
        drawn = np.stack([drawn_rows, drawn_cols], axis=1)

    # Collapse repeated coordinates and count how often each one was hit.
    merged, noise_counts = np.unique(drawn, axis=0, return_counts=True)

    return merged[:, 0], merged[:, 1], noise_counts

def _pseudo_rand_ligation_noise_helper(
    row_ids: np.ndarray,
    col_ids: np.ndarray,
    vals: np.ndarray,
    ratio: float,
) -> t.Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Draw random ligation noise restricted to the observed coordinates.

    Notes
    -----
    The number of noise ligations is ``sum(vals) * ratio`` truncated to an
    integer. Each ligation is assigned to one of the observed entries,
    chosen uniformly at random with replacement (the choice is not
    weighted by ``vals``). Entries chosen several times are reported once
    together with the number of times they were chosen.

    Parameters
    ----------
    row_ids : numpy.ndarray
        Row ids of the observed entries.
    col_ids : numpy.ndarray
        Column ids of the observed entries; same length as ``row_ids``.
    vals : numpy.ndarray
        Observed counts. Must have an integer dtype and be non-negative.
    ratio : float
        Noise ratio, i.e. number of noise ligations per observed count.

    Returns
    -------
    Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
        Noise row ids, noise column ids, and noise counts. All three are
        empty when no noise ligation is drawn.

    Raises
    ------
    AssertionError
        If ``vals`` is not of integer dtype, contains negative values, or
        if ``row_ids`` and ``col_ids`` differ in length.
    """

    # The value array itself (not just its sum) is taken as input so that
    # its dtype and sign can be validated here.
    assert np.issubdtype(vals.dtype, np.integer), \
        "Values must be an integer!"
    assert ~np.any(vals < 0), \
        "Values should be non-negative"
    assert len(row_ids) == len(col_ids), \
        "row_ids and col_ids must have the same length"

    total_vals = np.sum(vals)
    num_rand_ligations = np.multiply(total_vals, ratio).astype(int)

    if num_rand_ligations == 0:
        # Nothing to draw. Returning early also avoids sampling from an
        # empty observation set; zero-length slices keep the id dtypes.
        return row_ids[:0], col_ids[:0], np.array([], dtype=np.intp)

    num_elems = len(row_ids)
    tmp_ids = np.random.choice(
        num_elems,
        num_rand_ligations,
        replace=True
    )
    # Merge repeated picks of the same observed entry into one count.
    sel_ids, noise_counts = np.unique(tmp_ids, return_counts=True)

    noise_row_ids = row_ids[sel_ids]
    noise_col_ids = col_ids[sel_ids]

    return noise_row_ids, noise_col_ids, noise_counts

def add_rand_ligation_noise(
    data: t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame],
    ratio: float,
    use_pseudo: bool = False,
    is_triu_sym: bool = True,
    inplace: bool = False
) -> t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame]:
    """
    Add random ligation noise to the input data.

    Notes
    -----
    This function only dispatches on the type of ``data``:

    * ``numpy.ndarray`` -> ``add_rand_ligation_noise_mat``
      (``use_pseudo`` is not forwarded on this path),
    * ``scipy.sparse.coo_matrix`` -> ``add_rand_ligation_noise_coo``,
    * ``pandas.DataFrame`` -> ``add_rand_ligation_noise_df``.

    All other arguments are passed through unchanged. The exact effect of
    ``inplace`` is decided by the type-specific function; always use the
    returned object rather than relying on the input being updated.

    Parameters
    ----------
    data : Union[numpy.ndarray, scipy.sparse.coo_matrix, pandas.DataFrame]
        Input data.
    ratio : float
        Noise ratio.
    use_pseudo : bool, optional
        Forwarded to the COO and DataFrame implementations to select
        pseudo-random noise (default is False).
    is_triu_sym : bool, optional
        Whether the matrix is triangular upper and symmetric (default is True).
    inplace : bool, optional
        Forwarded to the type-specific implementation (default is False).

    Returns
    -------
    Union[numpy.ndarray, scipy.sparse.coo_matrix, pandas.DataFrame]
        Data with added random ligation noise.

    Raises
    ------
    PreprocError
        If ``data`` is none of the supported types.
    """
    if isinstance(data, np.ndarray):
        return add_rand_ligation_noise_mat(
            data,
            ratio,
            is_triu_sym=is_triu_sym,
            inplace=inplace,
        )

    if isinstance(data, sp.coo_matrix):
        return add_rand_ligation_noise_coo(
            data,
            ratio,
            use_pseudo=use_pseudo,
            is_triu_sym=is_triu_sym,
            inplace=inplace,
        )

    if isinstance(data, pd.DataFrame):
        return add_rand_ligation_noise_df(
            data,
            ratio,
            use_pseudo=use_pseudo,
            is_triu_sym=is_triu_sym,
            inplace=inplace,
        )

    raise PreprocError("Unsupported input data type")
def add_rand_ligation_noise_mat(
    cm_mat: np.ndarray,
    ratio: float,
    is_triu_sym: bool = True,
    inplace: bool = False
) -> np.ndarray:
    """
    Add random ligation noise to a dense matrix (not implemented yet).

    Notes
    -----
    Placeholder for the dense-matrix variant. The body consists solely of
    raising ``NotImplementedError``; none of the arguments are used.

    Parameters
    ----------
    cm_mat : numpy.ndarray
        Input dense matrix.
    ratio : float
        Noise ratio.
    is_triu_sym : bool, optional
        Whether the matrix is triangular upper and symmetric (default is True).
    inplace : bool, optional
        Whether to modify the input data in place (default is False).

    Returns
    -------
    numpy.ndarray
        Intended return type; nothing is returned at the moment.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("Not yet implemented!")
def add_rand_ligation_noise_coo(
    cm_coo: sp.coo_matrix,
    ratio: float,
    use_pseudo: bool = False,
    is_triu_sym: bool = True,
    inplace: bool = False
) -> sp.coo_matrix:
    """
    Add random ligation noise to a scipy sparse COO matrix.

    Notes
    -----
    The noise is generated as a second COO matrix of the same shape and
    dtype and added to the input; the sum is converted back to COO format.

    If `is_triu_sym` is True and the matrix is not square, the working
    shape is enlarged to ``(n, n)`` with ``n = max(shape)`` before the
    noise is drawn.

    Parameters
    ----------
    cm_coo : scipy.sparse.coo_matrix
        Input scipy sparse matrix. Its data must be non-negative integers
        (checked by the noise helpers).
    ratio : float
        Noise ratio.
    use_pseudo : bool, optional
        If True, noise is only placed on coordinates already stored in
        `cm_coo`; otherwise coordinates are drawn over the whole matrix
        (default is False).
    is_triu_sym : bool, optional
        Whether the matrix is triangular upper and symmetric (default is True).
    inplace : bool, optional
        If False (default), the input is copied first and left untouched.
        If True, the copy is skipped: the input may be resized in place,
        but the noisy result is still a newly created matrix, so the
        entries of the input are not updated. Always use the return value.

    Returns
    -------
    scipy.sparse.coo_matrix
        Scipy sparse matrix with added random ligation noise.
    """
    if inplace is False:
        cm_coo = cm_coo.copy()

    vals = cm_coo.data
    cm_shape = cm_coo.shape

    if is_triu_sym is True:
        if cm_shape[0] != cm_shape[1]:
            # Square the working shape; plain Python ints keep the shape a
            # regular tuple for scipy and for the noise helper.
            n = int(np.amax(cm_shape))
            cm_shape = (n, n)

    if use_pseudo is True:
        row_ids = cm_coo.row
        col_ids = cm_coo.col

        out = _pseudo_rand_ligation_noise_helper(
            row_ids,
            col_ids,
            vals,
            ratio,
        )
    else:
        out = _true_rand_ligation_noise_helper(
            vals,
            ratio,
            cm_shape,
            is_triu_sym=is_triu_sym
        )

    noise_row_ids, noise_col_ids, noise_counts = out

    # resize() works in place on cm_coo (the copy unless inplace is True).
    cm_coo.resize(cm_shape)

    noise_cm_coo = sp.coo_matrix(
        (noise_counts, (noise_row_ids, noise_col_ids)),
        shape=cm_shape,
        dtype=cm_coo.dtype
    )

    # Sparse addition returns a new matrix in another format; convert it
    # back to COO for the caller.
    cm_coo = cm_coo + noise_cm_coo
    cm_coo = cm_coo.tocoo()

    return cm_coo
def add_rand_ligation_noise_df(
    cm_df: pd.DataFrame,
    ratio: float,
    use_pseudo: bool = False,
    is_triu_sym: bool = True,
    inplace: bool = False,
    row_ids_colname: str = cm_consts.DataFrameSpecs.ROW_IDS,
    col_ids_colname: str = cm_consts.DataFrameSpecs.COL_IDS,
    vals_colname: str = cm_consts.DataFrameSpecs.COUNTS,
) -> pd.DataFrame:
    """
    Add random ligation noise to a pandas DataFrame.

    Notes
    -----
    The DataFrame is converted to a COO matrix with ``to_coo_matrix``, the
    noise is added by ``add_rand_ligation_noise_coo`` and the result is
    converted back with ``to_dataframe``. The returned object is whatever
    ``to_dataframe`` produces; the noise is never written into `cm_df`
    by this function.

    Parameters
    ----------
    cm_df : pd.DataFrame
        Input pandas DataFrame.
    ratio : float
        Noise ratio.
    use_pseudo : bool, optional
        Forwarded to ``add_rand_ligation_noise_coo`` to select
        pseudo-random noise (default is False).
    is_triu_sym : bool, optional
        Whether the matrix is triangular upper and symmetric (default is True).
    inplace : bool, optional
        If False (default), `cm_df` is copied before the conversion to
        COO. In either case the result is returned, not stored in `cm_df`.
    row_ids_colname : str, optional
        Column name for row IDs
        (default is ``cm_consts.DataFrameSpecs.ROW_IDS``).
    col_ids_colname : str, optional
        Column name for column IDs
        (default is ``cm_consts.DataFrameSpecs.COL_IDS``).
    vals_colname : str, optional
        Column name for values
        (default is ``cm_consts.DataFrameSpecs.COUNTS``).

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with added random ligation noise.
    """
    if inplace is False:
        cm_df = cm_df.copy()

    colnames = dict(
        row_ids_colname=row_ids_colname,
        col_ids_colname=col_ids_colname,
        vals_colname=vals_colname,
    )

    # Convert to COO
    cm_coo = to_coo_matrix(
        cm_df,
        is_triu_sym=is_triu_sym,
        **colnames
    )

    # Call the COO version of the function
    noised_coo = add_rand_ligation_noise_coo(
        cm_coo,
        ratio,
        use_pseudo=use_pseudo,
        is_triu_sym=is_triu_sym,
        inplace=True  # cm_coo is a local intermediate; no extra copy needed
    )

    # Convert back to DataFrame
    return to_dataframe(
        noised_coo,
        **colnames
    )