# -*- coding: utf-8 -*-
"""
Add random ligation noise to matrices given as NumPy arrays, SciPy COO matrices, or pandas DataFrames.
Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"
import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from scipy import sparse as sp
from .. import consts as cm_consts
from .converters import to_coo_matrix, to_dataframe
def _true_rand_ligation_noise_helper(
    vals: np.ndarray,
    ratio: float,
    mat_shape: t.Tuple[int, int],
    is_triu_sym: bool = True
) -> t.Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Draw random noise coordinates inside a matrix and count their occurrences.

    Notes
    -----
    The number of draws is ``int(sum(vals) * ratio)``. Coordinates are drawn
    with ``np.random.randint`` (global NumPy random state), duplicates are
    merged, and the multiplicity of every distinct coordinate is returned as
    its noise count.

    When ``is_triu_sym`` is True, both indices of a pair are drawn from
    ``[0, mat_shape[0])`` and each pair is sorted so that ``row <= col``.
    Otherwise rows are drawn from ``[0, mat_shape[0])`` and columns from
    ``[0, mat_shape[1])`` independently.

    Parameters
    ----------
    vals : numpy.ndarray
        Observed values; must have an integer dtype and be non-negative.
    ratio : float
        Noise ratio, multiplied with the sum of ``vals`` to get the draw count.
    mat_shape : Tuple[int, int]
        Shape of the matrix the coordinates must fall into.
    is_triu_sym : bool, optional
        Whether to generate upper-triangular coordinates of a square matrix
        (default is True).

    Returns
    -------
    Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
        Distinct noise row ids, matching noise column ids, and the number of
        times each coordinate was drawn. All three are empty when the draw
        count is zero.

    Raises
    ------
    AssertionError
        If ``vals`` is not of integer dtype, contains negative entries, or if
        ``is_triu_sym`` is True and ``mat_shape`` is not square.
    """
    # The full array (not just its sum) is taken as input so that its dtype
    # can be validated here.
    assert np.issubdtype(vals.dtype, np.integer), \
        "Values must be an integer!"
    assert not np.any(vals < 0), \
        "Values should be non-negative"

    n_draws = np.multiply(vals.sum(), ratio).astype(int)

    # Nothing to draw: hand back one shared empty integer array three times.
    if n_draws == 0:
        nothing = np.array([], dtype=int)
        return nothing, nothing, nothing

    if is_triu_sym is True:
        assert mat_shape[0] == mat_shape[1], "Matrix must be square for symmetric triangular noise."
        # One (n_draws, 2) draw; sorting each pair enforces row <= col.
        pairs = np.random.randint(0, mat_shape[0], size=(n_draws, 2))
        pairs.sort(axis=1)
    else:
        # Rows first, then columns, each within its own axis bound.
        drawn_rows = np.random.randint(0, mat_shape[0], size=n_draws)
        drawn_cols = np.random.randint(0, mat_shape[1], size=n_draws)
        pairs = np.column_stack((drawn_rows, drawn_cols))

    # Merge repeated coordinates; the multiplicity becomes the noise count.
    distinct_pairs, noise_counts = np.unique(pairs, axis=0, return_counts=True)
    return distinct_pairs[:, 0], distinct_pairs[:, 1], noise_counts
def _pseudo_rand_ligation_noise_helper(
    row_ids: np.ndarray,
    col_ids: np.ndarray,
    vals: np.ndarray,
    ratio: float,
) -> t.Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Generate random ligation noise restricted to the observed coordinates.

    Notes
    -----
    The number of draws is ``int(sum(vals) * ratio)``. Each draw picks one of
    the ``len(row_ids)`` observed entries with replacement via
    ``np.random.choice`` (global NumPy random state); every entry has the same
    chance of being picked, independent of its value. Entries picked more than
    once are merged and their multiplicity is returned as the noise count.

    Parameters
    ----------
    row_ids : numpy.ndarray
        Row ids of the observed entries.
    col_ids : numpy.ndarray
        Column ids of the observed entries; same length as ``row_ids``.
    vals : numpy.ndarray
        Observed values; must have an integer dtype and be non-negative.
    ratio : float
        Noise ratio, multiplied with the sum of ``vals`` to get the draw count.

    Returns
    -------
    Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
        Noise row ids, noise column ids, and noise counts. All three are empty
        when the draw count is zero.

    Raises
    ------
    AssertionError
        If ``vals`` is not of integer dtype, contains negative entries, or if
        ``row_ids`` and ``col_ids`` differ in length.
    """
    # The full array (not just its sum) is taken as input so that its dtype
    # can be validated here.
    assert np.issubdtype(vals.dtype, np.integer), \
        "Values must be an integer!"
    assert not np.any(vals < 0), \
        "Values should be non-negative"
    assert len(row_ids) == len(col_ids), \
        "row_ids and col_ids must have the same length"

    total_vals = np.sum(vals)
    num_rand_ligations = np.multiply(total_vals, ratio).astype(int)

    # Nothing to draw: return empty results directly so that an empty
    # observation set never reaches np.random.choice. The id dtypes are kept.
    if num_rand_ligations == 0:
        return (
            row_ids[:0].copy(),
            col_ids[:0].copy(),
            np.array([], dtype=np.intp),
        )

    # Sample positions into the observation list, with replacement.
    tmp_ids = np.random.choice(
        len(row_ids),
        num_rand_ligations,
        replace=True
    )

    # Merge repeated picks; the multiplicity becomes the noise count.
    sel_ids, noise_counts = np.unique(tmp_ids, return_counts=True)

    return row_ids[sel_ids], col_ids[sel_ids], noise_counts
[docs]
def add_rand_ligation_noise(
    data: t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame],
    ratio: float,
    use_pseudo: bool = False,
    is_triu_sym: bool = True,
    inplace: bool = False
) -> t.Union[np.ndarray, sp.coo_matrix, pd.DataFrame]:
    """
    Add random ligation noise to the input data, dispatching on its type.

    Notes
    -----
    The call is forwarded to ``add_rand_ligation_noise_mat`` for NumPy arrays,
    ``add_rand_ligation_noise_coo`` for SciPy COO matrices and
    ``add_rand_ligation_noise_df`` for pandas DataFrames. The type checks are
    performed in that order.

    ``use_pseudo`` is not forwarded for NumPy arrays. ``is_triu_sym`` and
    ``inplace`` are forwarded unchanged; their exact effect is defined by the
    type-specific function.

    Parameters
    ----------
    data : Union[numpy.ndarray, scipy.sparse.coo_matrix, pandas.DataFrame]
        Input data.
    ratio : float
        Noise ratio.
    use_pseudo : bool, optional
        Forwarded to the COO and DataFrame implementations (default is False).
    is_triu_sym : bool, optional
        Whether the matrix is triangular upper and symmetric (default is True).
    inplace : bool, optional
        Whether to modify the input data in place (default is False).

    Returns
    -------
    Union[numpy.ndarray, scipy.sparse.coo_matrix, pandas.DataFrame]
        Whatever the type-specific function returns.

    Raises
    ------
    PreprocError
        If ``data`` is none of the supported types.
    """
    # Dense arrays go to the matrix implementation, which takes no
    # ``use_pseudo`` option.
    if isinstance(data, np.ndarray):
        return add_rand_ligation_noise_mat(
            data,
            ratio,
            is_triu_sym=is_triu_sym,
            inplace=inplace,
        )

    # The remaining implementations share the same keyword options.
    shared_options = {
        "use_pseudo": use_pseudo,
        "is_triu_sym": is_triu_sym,
        "inplace": inplace,
    }

    if isinstance(data, sp.coo_matrix):
        return add_rand_ligation_noise_coo(data, ratio, **shared_options)

    if isinstance(data, pd.DataFrame):
        return add_rand_ligation_noise_df(data, ratio, **shared_options)

    raise PreprocError("Unsupported input data type")
[docs]
def add_rand_ligation_noise_mat(
    cm_mat: np.ndarray,
    ratio: float,
    is_triu_sym: bool = True,
    inplace: bool = False
) -> np.ndarray:
    """
    Add random ligation noise to a dense NumPy matrix (not implemented yet).

    Notes
    -----
    This is a placeholder: every call raises ``NotImplementedError`` and none
    of the arguments are inspected.

    Parameters
    ----------
    cm_mat : numpy.ndarray
        Input dense matrix.
    ratio : float
        Noise ratio.
    is_triu_sym : bool, optional
        Whether the matrix is triangular upper and symmetric (default is True).
    inplace : bool, optional
        Whether to modify the input data in place (default is False).

    Returns
    -------
    numpy.ndarray
        Intended return type; currently the function never returns.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("Not yet implemented!")
[docs]
def add_rand_ligation_noise_coo(
    cm_coo: sp.coo_matrix,
    ratio: float,
    use_pseudo: bool = False,
    is_triu_sym: bool = True,
    inplace: bool = False
) -> sp.coo_matrix:
    """
    Add random ligation noise to a scipy sparse COO matrix.

    Notes
    -----
    If `inplace` is False, the input matrix is left untouched and a new
    matrix is returned. Otherwise the input object itself is resized (if
    needed), receives the noised entries, and is returned.

    If `is_triu_sym` is True and the matrix is not square, it is padded to a
    square matrix whose side is the larger of the two dimensions.

    If `use_pseudo` is True, noise is placed only on coordinates already
    stored in the matrix; otherwise coordinates are drawn from the whole
    (possibly padded) matrix shape.

    Parameters
    ----------
    cm_coo : scipy.sparse.coo_matrix
        Input scipy sparse matrix. Its stored values must be non-negative
        integers (checked by the noise helpers).
    ratio : float
        Noise ratio.
    use_pseudo : bool, optional
        Whether to restrict noise to the stored coordinates (default is False).
    is_triu_sym : bool, optional
        Whether the matrix is triangular upper and symmetric (default is True).
    inplace : bool, optional
        Whether to modify the input data in place (default is False).

    Returns
    -------
    scipy.sparse.coo_matrix
        Scipy sparse matrix with added random ligation noise.
    """
    # Work on a private copy unless in-place modification was requested.
    work = cm_coo.copy() if inplace is False else cm_coo

    vals = work.data

    # Plain tuple of ints so the shape is valid for resize() and coo_matrix().
    cm_shape = tuple(int(dim) for dim in work.shape)
    if is_triu_sym is True and cm_shape[0] != cm_shape[1]:
        side = max(cm_shape)
        cm_shape = (side, side)

    if use_pseudo is True:
        noise_row_ids, noise_col_ids, noise_counts = _pseudo_rand_ligation_noise_helper(
            work.row,
            work.col,
            vals,
            ratio,
        )
    else:
        noise_row_ids, noise_col_ids, noise_counts = _true_rand_ligation_noise_helper(
            vals,
            ratio,
            cm_shape,
            is_triu_sym=is_triu_sym
        )

    # Both operands of the addition below must share the same shape.
    work.resize(cm_shape)

    noise_cm_coo = sp.coo_matrix(
        (noise_counts, (noise_row_ids, noise_col_ids)),
        shape=cm_shape,
        dtype=work.dtype
    )

    # Sparse addition returns a new (non-COO) matrix; convert it back to COO.
    noised = (work + noise_cm_coo).tocoo()

    if inplace is False:
        return noised

    # In-place request: the addition above did not touch the caller's object,
    # so copy the noised entries back into it.
    work.row = noised.row
    work.col = noised.col
    work.data = noised.data
    work.has_canonical_format = noised.has_canonical_format
    return work
[docs]
def add_rand_ligation_noise_df(
    cm_df: pd.DataFrame,
    ratio: float,
    use_pseudo: bool = False,
    is_triu_sym: bool = True,
    inplace: bool = False,
    row_ids_colname: str = cm_consts.DataFrameSpecs.ROW_IDS,
    col_ids_colname: str = cm_consts.DataFrameSpecs.COL_IDS,
    vals_colname: str = cm_consts.DataFrameSpecs.COUNTS,
) -> pd.DataFrame:
    """
    Add random ligation noise to a pandas DataFrame.

    Notes
    -----
    The DataFrame is converted to a COO matrix with ``to_coo_matrix``, noised
    with ``add_rand_ligation_noise_coo`` and converted back with
    ``to_dataframe``; the result of that last conversion is returned.
    If `inplace` is False, a copy of the input DataFrame is made before the
    conversion.

    Parameters
    ----------
    cm_df : pd.DataFrame
        Input pandas DataFrame.
    ratio : float
        Noise ratio.
    use_pseudo : bool, optional
        Forwarded to ``add_rand_ligation_noise_coo`` (default is False).
    is_triu_sym : bool, optional
        Whether the matrix is triangular upper and symmetric (default is True).
    inplace : bool, optional
        If False, the input DataFrame is copied first (default is False).
    row_ids_colname : str, optional
        Column name for row IDs
        (default is ``cm_consts.DataFrameSpecs.ROW_IDS``).
    col_ids_colname : str, optional
        Column name for column IDs
        (default is ``cm_consts.DataFrameSpecs.COL_IDS``).
    vals_colname : str, optional
        Column name for values
        (default is ``cm_consts.DataFrameSpecs.COUNTS``).

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with added random ligation noise.
    """
    source_df = cm_df.copy() if inplace is False else cm_df

    # The same column-name options are needed for both conversions.
    colname_options = {
        "row_ids_colname": row_ids_colname,
        "col_ids_colname": col_ids_colname,
        "vals_colname": vals_colname,
    }

    # DataFrame -> COO
    sparse_cm = to_coo_matrix(
        source_df,
        is_triu_sym=is_triu_sym,
        **colname_options
    )

    # The COO matrix is an intermediate of this function, so the COO routine
    # is told to work in place rather than copy again.
    noised_sparse_cm = add_rand_ligation_noise_coo(
        sparse_cm,
        ratio,
        use_pseudo=use_pseudo,
        is_triu_sym=is_triu_sym,
        inplace=True
    )

    # COO -> DataFrame
    return to_dataframe(noised_sparse_cm, **colname_options)