# Source code for gunz_cm.preprocs.weight_filters

# -*- coding: utf-8 -*-
"""
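Filters for count DataFrames based on per-row weights, where a weight is the
ratio of the normalized counts to the raw counts.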

Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"

import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from .. import consts as cm_consts


# [docs]  (documentation-link marker left over from the rendered HTML page)
def filter_by_weights_quantile_df(
    cm_df: pd.DataFrame,
    q1: t.Optional[float] = 0,
    q3: t.Optional[float] = 1.0,
    log: t.Optional[bool] = True,
    val_colname: t.Optional[str] = cm_consts.DataFrameSpecs.COUNTS,
    orig_val_colname: t.Optional[str] = cm_consts.DataFrameSpecs.RAW_COUNTS,
) -> pd.DataFrame:
    """
    Filter a DataFrame based on weight quantiles.

    Notes
    -----
    The weight of a row is ``cm_df[val_colname] / cm_df[orig_val_colname]``,
    i.e. the ratio of the normalized counts to the raw counts. If ``log`` is
    true, the natural logarithm of that ratio is used instead. Rows whose
    weight lies inside the closed interval spanned by the ``q1`` and ``q3``
    quantiles of all weights are kept.

    Rows whose weight is NaN (for example ``0 / 0``, or the logarithm of a
    negative ratio) are ignored when the quantile limits are computed and are
    never part of the result, because a comparison against NaN is always
    false.

    Parameters
    ----------
    cm_df : pd.DataFrame
        The input DataFrame containing count data. It is not modified.
    q1 : float, optional
        The lower quantile, in the range [0, 1] (default is 0).
    q3 : float, optional
        The upper quantile, in the range [0, 1] (default is 1.0).
    log : bool, optional
        Whether to apply log transformation to the weights.
        Default is True.
    val_colname : str, optional
        The column name for normalized counts.
        Default is cm_consts.DataFrameSpecs.COUNTS.
    orig_val_colname : str, optional
        The column name for raw counts.
        Default is cm_consts.DataFrameSpecs.RAW_COUNTS.

    Returns
    -------
    pd.DataFrame
        A new DataFrame containing only the rows whose weight lies between
        the two quantile limits (both limits inclusive). The original index
        labels of the kept rows are preserved.

    Raises
    ------
    PreprocError
        If ``orig_val_colname`` or ``val_colname`` is not a column of
        ``cm_df``.

    Examples
    --------
    >>> df = pd.DataFrame({"norm": [2.0, 4.0, 8.0], "raw": [2, 2, 2]})
    >>> filter_by_weights_quantile_df(
    ...     df, q1=0.0, q3=0.5, val_colname="norm", orig_val_colname="raw"
    ... )
       norm  raw
    0   2.0    2
    1   4.0    2
    """
    # Fail early with a domain-specific error instead of a bare KeyError.
    if orig_val_colname not in cm_df.columns:
        raise PreprocError(f"Raw counts column '{orig_val_colname}' is not in the dataset!")

    if val_colname not in cm_df.columns:
        raise PreprocError(f"Normalized counts column '{val_colname}' is not in the dataset!")

    # Quantiles of an empty sample are undefined; there is nothing to filter.
    if cm_df.empty:
        return cm_df.copy()

    weight = cm_df[val_colname] / cm_df[orig_val_colname]
    if log:
        weight = np.log(weight)

    # nanquantile skips undefined (NaN) weights. With np.quantile a single
    # NaN turns both limits into NaN, which makes every comparison below
    # false and silently empties the result.
    lower_limit, upper_limit = np.nanquantile(weight, [q1, q3])

    # NaN weights compare false on both sides, so those rows are dropped.
    mask = (weight >= lower_limit) & (weight <= upper_limit)

    return cm_df[mask]