# -*- coding: utf-8 -*-
"""
Quantile-based filtering of count DataFrames by their normalization weights.
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__version__ = "1.0.0"
__license__ = "Clear BSD"
# __version__ = "1.0."
__email__ = "adhisant@tnt.uni-hannover.de"
# __status__ = "Production"
import typing as t
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from .. import consts as cm_consts
# [docs]  NOTE(review): looks like a leftover Sphinx source-link marker, not code -- confirm and remove
def filter_by_weights_quantile_df(
    cm_df: pd.DataFrame,
    q1: t.Optional[float] = 0,
    q3: t.Optional[float] = 1.0,
    log: t.Optional[bool] = True,
    val_colname: t.Optional[str] = cm_consts.DataFrameSpecs.COUNTS,
    orig_val_colname: t.Optional[str] = cm_consts.DataFrameSpecs.RAW_COUNTS,
) -> pd.DataFrame:
    """
    Filter a DataFrame by the quantile range of its per-row weights.

    The weight of a row is the ratio ``cm_df[val_colname] / cm_df[orig_val_colname]``
    (normalized counts over raw counts), optionally passed through the natural
    logarithm. Rows are kept when their weight lies inside the closed interval
    spanned by the ``q1`` and ``q3`` quantiles of the weights.

    Parameters
    ----------
    cm_df : pd.DataFrame
        The input DataFrame containing count data. It is not modified.
    q1 : float, optional
        The lower quantile, in the range [0, 1].
        Default is 0. ``None`` is treated as 0 (no lower cut).
    q3 : float, optional
        The upper quantile, in the range [0, 1].
        Default is 1.0. ``None`` is treated as 1.0 (no upper cut).
    log : bool, optional
        Whether to apply the natural logarithm to the weights before the
        quantiles are computed.
        Default is True.
    val_colname : str, optional
        The column name for normalized counts.
        Default is ``cm_consts.DataFrameSpecs.COUNTS``.
    orig_val_colname : str, optional
        The column name for raw counts.
        Default is ``cm_consts.DataFrameSpecs.RAW_COUNTS``.

    Returns
    -------
    pd.DataFrame
        The rows of ``cm_df`` whose weight is finite and lies within the
        quantile limits (both limits inclusive). The result is empty when
        ``cm_df`` is empty or when no row has a finite weight.

    Raises
    ------
    PreprocError
        If ``orig_val_colname`` or ``val_colname`` is not a column of ``cm_df``.
    ValueError
        Propagated from ``np.quantile`` when ``q1`` or ``q3`` lies outside [0, 1].

    Notes
    -----
    Non-finite weights (NaN from ``0 / 0`` or missing counts, ``inf`` from a zero
    raw count, ``-inf``/NaN from the logarithm of a non-positive ratio) are
    ignored when the quantile limits are computed, and the corresponding rows
    are always dropped. Otherwise a single such row would turn the limits into
    NaN and silently filter out every row.

    If ``q1`` is greater than ``q3`` the interval is usually empty, so few or no
    rows are returned; no error is raised in that case.
    """
    if orig_val_colname not in cm_df.columns:
        #? Mandatory check to prevent runtime errors on missing columns
        raise PreprocError(f"Raw counts column '{orig_val_colname}' is not in the dataset!")

    if val_colname not in cm_df.columns:
        #? Mandatory check to prevent runtime errors on missing columns
        raise PreprocError(f"Normalized counts column '{val_colname}' is not in the dataset!")

    #? The annotations allow None: interpret it as "no cut on this side"
    lower_q = 0.0 if q1 is None else q1
    upper_q = 1.0 if q3 is None else q3

    #? Work on a plain float array so that the mask is positional and cannot be
    #? affected by index alignment; missing values become NaN
    weight = (cm_df[val_colname] / cm_df[orig_val_colname]).to_numpy(
        dtype=float,
        na_value=np.nan,
    )

    if log:
        #? log(0) and log(<0) are expected here and are handled through the
        #? finite mask below, so the runtime warnings carry no information
        with np.errstate(divide="ignore", invalid="ignore"):
            weight = np.log(weight)

    is_finite = np.isfinite(weight)

    if not is_finite.any():
        #? Nothing to compute quantiles from (empty input or only invalid weights);
        #? the all-False mask yields an empty selection with the original columns
        return cm_df[is_finite]

    finite_weight = weight[is_finite]

    lower_limit, upper_limit = np.quantile(
        finite_weight,
        [lower_q, upper_q]
    )

    #? Start from the finite mask so that rows with invalid weights stay excluded,
    #? then apply the inclusive range check to the finite entries only
    mask = is_finite.copy()
    mask[is_finite] = np.logical_and(
        finite_weight >= lower_limit,
        finite_weight <= upper_limit
    )

    return cm_df[mask]