# -*- coding: utf-8 -*-
"""
Module for converting between DataFrame and sparse matrix representations.

This module provides two primary, polymorphic functions: `to_coo_matrix` and
`to_dataframe`. These functions use single dispatch to handle conversions
from various data types (e.g., pandas DataFrame, tuples of arrays) to a
standard sparse matrix or DataFrame format, with argument validation provided
by Pydantic's `validate_call`.

Examples
--------
Round trip with explicit column names and an explicit shape:

>>> import pandas as pd
>>> df = pd.DataFrame({"r": [0, 0, 1], "c": [0, 2, 1], "v": [1.0, 2.0, 3.0]})
>>> coo = to_coo_matrix(
...     df,
...     row_ids_colname="r",
...     col_ids_colname="c",
...     vals_colname="v",
...     shape=(3, 3),
... )
>>> coo.shape
(3, 3)
>>> out = to_dataframe(
...     coo, row_ids_colname="r", col_ids_colname="c", vals_colname="v"
... )
>>> list(out.columns)
['r', 'c', 'v']
"""
# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.3.0"
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import functools
import typing as t
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype, is_numeric_dtype
from pydantic import ConfigDict, validate_call
from scipy import sparse as sp
# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..consts import DataFrameSpecs
from .infer_shape import infer_mat_shape
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def to_coo_matrix(
    matrix: t.Union[pd.DataFrame, t.Tuple[np.ndarray, np.ndarray, np.ndarray]],
    is_triu_sym: bool = True,
    *,
    row_ids_colname: str = DataFrameSpecs.ROW_IDS,
    col_ids_colname: str = DataFrameSpecs.COL_IDS,
    vals_colname: str = DataFrameSpecs.COUNTS,
    shape: t.Optional[t.Tuple[int, int]] = None,
) -> sp.coo_matrix:
    """
    Convert various data types to a SciPy COO sparse matrix.

    This is the single-dispatch entry point: the concrete conversion is
    chosen from the type of ``matrix`` among the implementations registered
    in this module (``pd.DataFrame`` and ``tuple``). Arguments are checked
    against the annotations by ``pydantic.validate_call`` before dispatch.

    Parameters
    ----------
    matrix : pd.DataFrame or tuple
        Input data, which can be:

        - A pandas DataFrame with coordinate and value columns.
        - A tuple of (rows, columns, values) NumPy arrays.

        Pass it positionally: ``functools.singledispatch`` selects the
        implementation from the type of the first positional argument.
    is_triu_sym : bool, optional
        If True, assumes the matrix is symmetric and stored in upper-triangular
        format. Only used for inferring the matrix shape when ``shape`` is
        None. Defaults to True.
    row_ids_colname : str, optional
        Column name for row IDs (for DataFrame input).
    col_ids_colname : str, optional
        Column name for column IDs (for DataFrame input).
    vals_colname : str, optional
        Column name for values (for DataFrame input).
    shape : tuple of ints, optional
        The shape of the matrix. If None, it is inferred from the data.

    Returns
    -------
    sp.coo_matrix
        The COO format sparse matrix representation of the data.

    Raises
    ------
    PreprocError
        From this fallback body, when no implementation is registered for
        ``type(matrix)``. The DataFrame implementation also raises it for
        missing columns or unsupported column dtypes.

    Notes
    -----
    The column-name arguments are ignored for tuple input.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"r": [0, 0, 1], "c": [0, 2, 1], "v": [1.0, 2.0, 3.0]})
    >>> coo = to_coo_matrix(
    ...     df,
    ...     row_ids_colname="r",
    ...     col_ids_colname="c",
    ...     vals_colname="v",
    ...     shape=(3, 3),
    ... )
    >>> coo.shape
    (3, 3)
    >>> coo.nnz
    3
    """
    # Fallback for types without a registered implementation.
    raise PreprocError(f"No implementation for data type: {type(matrix).__name__}")


@to_coo_matrix.register(pd.DataFrame)
def _df_to_coo_matrix(
    matrix: pd.DataFrame,
    is_triu_sym: bool,
    *,
    row_ids_colname: str,
    col_ids_colname: str,
    vals_colname: str,
    shape: t.Optional[t.Tuple[int, int]] = None,
) -> sp.coo_matrix:
    """Build a COO matrix from a coordinate-format DataFrame.

    Parameters
    ----------
    matrix : pd.DataFrame
        Frame holding one column of row IDs, one of column IDs and one of
        values.
    is_triu_sym : bool
        Forwarded to ``infer_mat_shape`` when ``shape`` is None.
    row_ids_colname, col_ids_colname, vals_colname : str
        Names of the three columns to read. No defaults are declared here,
        so every name must be supplied by the caller.
    shape : tuple of ints, optional
        Shape of the result. Inferred from the index arrays when None.

    Returns
    -------
    sp.coo_matrix
        Matrix built from the values and the (row, column) index arrays.

    Raises
    ------
    PreprocError
        If a required column is missing, if either ID column is not of an
        integer dtype, or if the values column is not numeric.
    """
    wanted = (row_ids_colname, col_ids_colname, vals_colname)
    absent = [name for name in wanted if name not in matrix.columns]
    if absent:
        raise PreprocError(f"Missing required columns: {', '.join(absent)}")

    # All three columns are known to exist from here on.
    row_series = matrix[row_ids_colname]
    col_series = matrix[col_ids_colname]
    val_series = matrix[vals_colname]

    ids_are_integral = is_integer_dtype(row_series) and is_integer_dtype(col_series)
    if not ids_are_integral:
        raise PreprocError("Row and column ID columns must be of an integer dtype.")
    if not is_numeric_dtype(val_series):
        raise PreprocError("Values column must be of a numeric dtype.")

    # Indices are cast to the project-wide index dtype; values keep their own.
    index_dtype = DataFrameSpecs.INDICES_DTYPE
    row_idx = row_series.to_numpy(dtype=index_dtype)
    col_idx = col_series.to_numpy(dtype=index_dtype)
    data = val_series.to_numpy()

    mat_shape = (
        shape
        if shape is not None
        else infer_mat_shape((row_idx, col_idx), is_triu_sym=is_triu_sym)
    )
    return sp.coo_matrix((data, (row_idx, col_idx)), shape=mat_shape)


@to_coo_matrix.register(tuple)
def _arrays_to_coo_matrix(
    matrix: t.Tuple[np.ndarray, np.ndarray, np.ndarray],
    is_triu_sym: bool,
    shape: t.Optional[t.Tuple[int, int]] = None,
    **kwargs,
) -> sp.coo_matrix:
    """Build a COO matrix from a ``(rows, cols, values)`` tuple of arrays.

    Parameters
    ----------
    matrix : tuple of np.ndarray
        Three arrays, unpacked in the order row indices, column indices,
        values.
    is_triu_sym : bool
        Forwarded to ``infer_mat_shape`` when ``shape`` is None.
    shape : tuple of ints, optional
        Shape of the result. Inferred from the index arrays when None.
    **kwargs
        Accepted and ignored, so that keyword arguments meant for other
        input types (e.g. the DataFrame column names) do not cause an error.

    Returns
    -------
    sp.coo_matrix
        Matrix built from the values and the (row, column) index arrays.
    """
    row_idx, col_idx, data = matrix
    mat_shape = (
        shape
        if shape is not None
        else infer_mat_shape((row_idx, col_idx), is_triu_sym=is_triu_sym)
    )
    return sp.coo_matrix((data, (row_idx, col_idx)), shape=mat_shape)


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def to_dataframe(
    matrix: sp.coo_matrix,
    *,
    row_ids_colname: str = DataFrameSpecs.ROW_IDS,
    col_ids_colname: str = DataFrameSpecs.COL_IDS,
    vals_colname: str = DataFrameSpecs.COUNTS,
) -> pd.DataFrame:
    """
    Convert a sparse matrix to a pandas DataFrame.

    This is the single-dispatch entry point: the concrete conversion is
    chosen from the type of ``matrix`` among the implementations registered
    in this module (``sp.coo_matrix``). Arguments are checked against the
    annotations by ``pydantic.validate_call`` before dispatch.

    Parameters
    ----------
    matrix : sp.coo_matrix
        Input COO format sparse matrix. Pass it positionally:
        ``functools.singledispatch`` selects the implementation from the
        type of the first positional argument.
    row_ids_colname : str, optional
        The desired column name for row IDs in the output DataFrame.
    col_ids_colname : str, optional
        The desired column name for column IDs in the output DataFrame.
    vals_colname : str, optional
        The desired column name for values in the output DataFrame.

    Returns
    -------
    pd.DataFrame
        A DataFrame with columns for row IDs, column IDs, and values, one
        row per stored entry of ``matrix``.

    Raises
    ------
    PreprocError
        From this fallback body, when no implementation is registered for
        ``type(matrix)``.

    Examples
    --------
    >>> from scipy import sparse as sp
    >>> coo = sp.coo_matrix(([1.0, 2.0], ([0, 1], [1, 2])), shape=(3, 3))
    >>> df = to_dataframe(
    ...     coo, row_ids_colname="r", col_ids_colname="c", vals_colname="v"
    ... )
    >>> list(df.columns)
    ['r', 'c', 'v']
    >>> df["v"].tolist()
    [1.0, 2.0]
    """
    # Fallback for types without a registered implementation.
    raise PreprocError(f"No implementation for data type: {type(matrix).__name__}")


@to_dataframe.register(sp.coo_matrix)
def _coo_to_dataframe(
    matrix: sp.coo_matrix,
    *,
    row_ids_colname: str,
    col_ids_colname: str,
    vals_colname: str,
) -> pd.DataFrame:
    """Flatten a COO matrix into a three-column coordinate DataFrame.

    Parameters
    ----------
    matrix : sp.coo_matrix
        Matrix whose ``row``, ``col`` and ``data`` arrays are exported.
    row_ids_colname, col_ids_colname, vals_colname : str
        Names given to the row-ID, column-ID and value columns. No defaults
        are declared here, so every name must be supplied by the caller.

    Returns
    -------
    pd.DataFrame
        One row per stored entry. The index columns are cast to
        ``DataFrameSpecs.INDICES_DTYPE``; the values keep the dtype of
        ``matrix.data``.
    """
    index_dtype = DataFrameSpecs.INDICES_DTYPE

    # Insertion order fixes the column order: rows, columns, values.
    columns = {}
    columns[row_ids_colname] = matrix.row.astype(index_dtype)
    columns[col_ids_colname] = matrix.col.astype(index_dtype)
    columns[vals_colname] = matrix.data
    return pd.DataFrame(columns)