Source code for gunz_cm.preprocs.converters

# -*- coding: utf-8 -*-
"""
Module for converting between DataFrame and sparse matrix representations.

This module provides two primary, polymorphic functions: `to_coo_matrix` and
`to_dataframe`. These functions use single dispatch to handle conversions
from various data types (e.g., pandas DataFrame, tuples of arrays) to a
standard sparse matrix or DataFrame format, with robust validation provided
by Pydantic.


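Examples
--------
>>> import numpy as np
>>> from gunz_cm.preprocs.converters import to_coo_matrix, to_dataframe
>>> rows, cols = np.array([0, 0, 1]), np.array([0, 1, 1])
>>> coo = to_coo_matrix((rows, cols, np.array([5, 2, 7])), shape=(2, 2))
>>> coo.toarray()
array([[5, 2],
       [0, 7]])
>>> df = to_dataframe(
...     coo, row_ids_colname="row", col_ids_colname="col", vals_colname="count"
... )
>>> list(df.columns)
['row', 'col', 'count']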
"""
__author__ = "Yeremia Gunawan Adhisantoso"

# =============================================================================
# METADATA
# =============================================================================
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.3.0"

# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import functools
import typing as t

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype, is_numeric_dtype
from pydantic import ConfigDict, validate_call
from scipy import sparse as sp

# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..consts import DataFrameSpecs
from .infer_shape import infer_mat_shape


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def to_coo_matrix(
    matrix: t.Union[pd.DataFrame, t.Tuple[np.ndarray, np.ndarray, np.ndarray]],
    is_triu_sym: bool = True,
    *,
    row_ids_colname: str = DataFrameSpecs.ROW_IDS,
    col_ids_colname: str = DataFrameSpecs.COL_IDS,
    vals_colname: str = DataFrameSpecs.COUNTS,
    shape: t.Optional[t.Tuple[int, int]] = None,
) -> sp.coo_matrix:
    """
    Build a SciPy COO sparse matrix from coordinate-style input.

    This is the single-dispatch entry point: the concrete conversion is
    chosen from the runtime type of ``matrix``. Arguments are validated
    against the annotations by Pydantic before dispatch takes place.

    Parameters
    ----------
    matrix : pd.DataFrame or tuple
        The data to convert, either:
        - a pandas DataFrame holding row-ID, column-ID and value columns, or
        - a ``(rows, columns, values)`` tuple of NumPy arrays.
    is_triu_sym : bool, optional
        Whether the data describes a symmetric matrix stored in
        upper-triangular form. Only consulted when the shape has to be
        inferred. Defaults to True.
    row_ids_colname : str, optional
        Name of the row-ID column (DataFrame input only).
    col_ids_colname : str, optional
        Name of the column-ID column (DataFrame input only).
    vals_colname : str, optional
        Name of the value column (DataFrame input only).
    shape : tuple of ints, optional
        Explicit ``(n_rows, n_cols)`` of the result. When None, the shape
        is inferred from the coordinates.

    Returns
    -------
    sp.coo_matrix
        The input data as a COO format sparse matrix.

    Raises
    ------
    PreprocError
        If no conversion is registered for the type of ``matrix``.

    Examples
    --------
    >>> import numpy as np
    >>> rows, cols = np.array([0, 0, 1]), np.array([0, 1, 1])
    >>> coo = to_coo_matrix((rows, cols, np.array([5, 2, 7])), shape=(2, 2))
    >>> coo.toarray()
    array([[5, 2],
           [0, 7]])
    """
    # Fallback of the dispatcher: reached only for unregistered types.
    type_name = type(matrix).__name__
    raise PreprocError(f"No implementation for data type: {type_name}")
@to_coo_matrix.register(pd.DataFrame)
def _df_to_coo_matrix(
    matrix: pd.DataFrame,
    is_triu_sym: bool,
    *,
    row_ids_colname: str,
    col_ids_colname: str,
    vals_colname: str,
    shape: t.Optional[t.Tuple[int, int]] = None,
) -> sp.coo_matrix:
    """
    Convert a coordinate-style DataFrame into a COO sparse matrix.

    Parameters
    ----------
    matrix : pd.DataFrame
        Frame that must contain the three named columns.
    is_triu_sym : bool
        Forwarded to ``infer_mat_shape`` when ``shape`` is None.
    row_ids_colname : str
        Name of the row-ID column; must have an integer dtype.
    col_ids_colname : str
        Name of the column-ID column; must have an integer dtype.
    vals_colname : str
        Name of the value column; must have a numeric dtype.
    shape : tuple of ints, optional
        Explicit matrix shape. Inferred from the coordinates when None.

    Returns
    -------
    sp.coo_matrix
        Sparse matrix built from the three columns.

    Raises
    ------
    PreprocError
        If a required column is missing, an ID column is not of an integer
        dtype, or the value column is not numeric.
    """
    wanted = (row_ids_colname, col_ids_colname, vals_colname)
    absent = [name for name in wanted if name not in matrix.columns]
    if absent:
        raise PreprocError(f"Missing required columns: {', '.join(absent)}")

    # All three columns are known to exist from here on.
    row_series = matrix[row_ids_colname]
    col_series = matrix[col_ids_colname]
    val_series = matrix[vals_colname]

    if not (is_integer_dtype(row_series) and is_integer_dtype(col_series)):
        raise PreprocError("Row and column ID columns must be of an integer dtype.")
    if not is_numeric_dtype(val_series):
        raise PreprocError("Values column must be of a numeric dtype.")

    # Coordinates are cast to the package-wide index dtype; values keep theirs.
    row_idx = row_series.to_numpy(dtype=DataFrameSpecs.INDICES_DTYPE)
    col_idx = col_series.to_numpy(dtype=DataFrameSpecs.INDICES_DTYPE)
    data = val_series.to_numpy()

    dims = (
        shape
        if shape is not None
        else infer_mat_shape((row_idx, col_idx), is_triu_sym=is_triu_sym)
    )
    return sp.coo_matrix((data, (row_idx, col_idx)), shape=dims)


@to_coo_matrix.register(tuple)
def _arrays_to_coo_matrix(
    matrix: t.Tuple[np.ndarray, np.ndarray, np.ndarray],
    is_triu_sym: bool,
    shape: t.Optional[t.Tuple[int, int]] = None,
    **kwargs,
) -> sp.coo_matrix:
    """
    Convert a ``(rows, columns, values)`` tuple into a COO sparse matrix.

    Parameters
    ----------
    matrix : tuple of np.ndarray
        Three arrays: row indices, column indices and values, in that order.
    is_triu_sym : bool
        Forwarded to ``infer_mat_shape`` when ``shape`` is None.
    shape : tuple of ints, optional
        Explicit matrix shape. Inferred from the coordinates when None.
    **kwargs
        Accepted and ignored, so that the column-name keywords of the
        dispatching function can be passed through unchanged.

    Returns
    -------
    sp.coo_matrix
        Sparse matrix built from the three arrays.
    """
    row_idx, col_idx, data = matrix
    dims = (
        shape
        if shape is not None
        else infer_mat_shape((row_idx, col_idx), is_triu_sym=is_triu_sym)
    )
    return sp.coo_matrix((data, (row_idx, col_idx)), shape=dims)
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def to_dataframe(
    matrix: sp.coo_matrix,
    *,
    row_ids_colname: str = DataFrameSpecs.ROW_IDS,
    col_ids_colname: str = DataFrameSpecs.COL_IDS,
    vals_colname: str = DataFrameSpecs.COUNTS,
) -> pd.DataFrame:
    """
    Turn a sparse matrix into a coordinate-style pandas DataFrame.

    This is the single-dispatch entry point: the concrete conversion is
    chosen from the runtime type of ``matrix``. Arguments are validated
    against the annotations by Pydantic before dispatch takes place.

    Parameters
    ----------
    matrix : sp.coo_matrix
        The COO format sparse matrix to convert.
    row_ids_colname : str, optional
        Name given to the row-ID column of the result.
    col_ids_colname : str, optional
        Name given to the column-ID column of the result.
    vals_colname : str, optional
        Name given to the value column of the result.

    Returns
    -------
    pd.DataFrame
        One row per stored entry, with row-ID, column-ID and value columns.

    Raises
    ------
    PreprocError
        If no conversion is registered for the type of ``matrix``.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import sparse as sp
    >>> coo = sp.coo_matrix(
    ...     (np.array([5, 2]), (np.array([0, 1]), np.array([1, 1]))), shape=(2, 2)
    ... )
    >>> df = to_dataframe(
    ...     coo, row_ids_colname="row", col_ids_colname="col", vals_colname="count"
    ... )
    >>> list(df.columns)
    ['row', 'col', 'count']
    >>> len(df)
    2
    """
    # Fallback of the dispatcher: reached only for unregistered types.
    type_name = type(matrix).__name__
    raise PreprocError(f"No implementation for data type: {type_name}")
@to_dataframe.register(sp.coo_matrix)
def _coo_to_dataframe(
    matrix: sp.coo_matrix,
    *,
    row_ids_colname: str,
    col_ids_colname: str,
    vals_colname: str,
) -> pd.DataFrame:
    """
    Convert a COO sparse matrix into a coordinate-style DataFrame.

    Parameters
    ----------
    matrix : sp.coo_matrix
        Matrix whose ``row``, ``col`` and ``data`` arrays are exported.
    row_ids_colname : str
        Name of the output column holding ``matrix.row``.
    col_ids_colname : str
        Name of the output column holding ``matrix.col``.
    vals_colname : str
        Name of the output column holding ``matrix.data``.

    Returns
    -------
    pd.DataFrame
        Frame with the row-ID, column-ID and value columns, in that order.
    """
    # Both coordinate arrays are cast to the package-wide index dtype.
    index_dtype = DataFrameSpecs.INDICES_DTYPE
    columns = {
        row_ids_colname: matrix.row.astype(index_dtype),
        col_ids_colname: matrix.col.astype(index_dtype),
        vals_colname: matrix.data,
    }
    return pd.DataFrame(columns)