Source code for gunz_cm.preprocs.weight_filters

# -*- coding: utf-8 -*-
"""
Row filters for count DataFrames based on weights (the ratio of normalized to raw counts).

Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"

import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from .. import consts as cm_consts

def filter_by_weights_quantile_df(
    cm_df: pd.DataFrame,
    q1: float = 0,
    q3: float = 1.0,
    log: bool = True,
    val_colname: str = cm_consts.DataFrameSpecs.COUNTS,
    orig_val_colname: str = cm_consts.DataFrameSpecs.RAW_COUNTS,
) -> pd.DataFrame:
    """
    Filter a DataFrame based on weight quantiles.

    Notes
    -----
    The weight of each row is the ratio of its normalized count
    (``val_colname``) to its raw count (``orig_val_colname``), optionally
    log-transformed. Rows whose weight lies inside the closed interval
    spanned by the ``q1`` and ``q3`` quantiles of all weights are kept.

    NaN weights (e.g. a NaN normalized count, or 0 / 0) are ignored when the
    quantile limits are computed and the corresponding rows are dropped.
    Without this, a single NaN weight would turn both limits into NaN and
    every row would be filtered out.

    Parameters
    ----------
    cm_df : pd.DataFrame
        The input DataFrame containing count data.
    q1 : float, optional
        The lower quantile, in [0, 1] (default is 0).
    q3 : float, optional
        The upper quantile, in [0, 1] (default is 1.0).
    log : bool, optional
        Whether to apply the natural logarithm to the weights before the
        quantiles are computed. Default is True.
    val_colname : str, optional
        The column name for normalized counts.
        Default is cm_consts.DataFrameSpecs.COUNTS.
    orig_val_colname : str, optional
        The column name for raw counts.
        Default is cm_consts.DataFrameSpecs.RAW_COUNTS.

    Returns
    -------
    pd.DataFrame
        A new DataFrame containing only the rows whose weight lies within
        the quantile limits. The input DataFrame is not modified.

    Raises
    ------
    PreprocError
        If ``val_colname`` or ``orig_val_colname`` is not a column of
        ``cm_df``.
    """
    if orig_val_colname not in cm_df.columns:
        #? Mandatory check to prevent runtime errors on missing columns
        raise PreprocError(f"Raw counts column '{orig_val_colname}' is not in the dataset!")

    if val_colname not in cm_df.columns:
        #? Mandatory check to prevent runtime errors on missing columns
        raise PreprocError(f"Normalized counts column '{val_colname}' is not in the dataset!")

    if cm_df.empty:
        #? Quantiles of an empty sample are undefined; nothing to filter
        return cm_df.copy()

    weight = cm_df[val_colname] / cm_df[orig_val_colname]

    if log:
        weight = np.log(weight)

    #? nanquantile skips NaN weights so they cannot poison the limits
    lower_limit, upper_limit = np.nanquantile(weight, [q1, q3])

    #? Comparisons against NaN are False, so rows with NaN weight are dropped
    mask = (weight >= lower_limit) & (weight <= upper_limit)

    return cm_df[mask]