# -*- coding: utf-8 -*-
"""
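Module for inferring matrix shapes from various data structures.

This module provides a utility function, ``infer_mat_shape``, that determines
the dimensions of a matrix represented as a pandas DataFrame, a SciPy COO
matrix, or a tuple of (row, column) coordinate arrays. When the input is
flagged as an upper-triangular symmetric matrix (``is_triu_sym=True``), the
inferred shape is made square by using the larger of the two dimensions.

Examples
--------
>>> import numpy as np
>>> infer_mat_shape((np.array([0, 2]), np.array([1, 4]))) == (5, 5)
True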
"""
__author__ = "Yeremia Gunawan Adhisantoso"
# =============================================================================
# METADATA
# =============================================================================
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.2.1"
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import functools
import typing as t
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
from pydantic import ConfigDict, validate_call
from scipy import sparse as sp
# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..consts import DataFrameSpecs
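# =============================================================================
# PUBLIC API
# =============================================================================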
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def infer_mat_shape(
    matrix: t.Union[t.Tuple[np.ndarray, np.ndarray], sp.coo_matrix, pd.DataFrame],
    is_triu_sym: bool = True,
    *,
    row_ids_colname: str = DataFrameSpecs.ROW_IDS,
    col_ids_colname: str = DataFrameSpecs.COL_IDS,
) -> t.Tuple[int, int]:
    """
    Infer the shape of a matrix from one of several representations.

    Arguments are validated by Pydantic, after which
    ``functools.singledispatch`` selects the implementation registered for
    the type of ``matrix``. The body of this function is only the fallback
    used when no implementation is registered for that type.

    Parameters
    ----------
    matrix : tuple, sp.coo_matrix, or pd.DataFrame
        Input data, which can be:

        - A tuple of (row_indices, column_indices) NumPy arrays.
        - A SciPy COO sparse matrix.
        - A pandas DataFrame with coordinate columns.
    is_triu_sym : bool, optional
        If True, the result is forced to be square (N x N), where N is the
        larger of the two inferred dimensions. Defaults to True.
    row_ids_colname : str, optional
        Name of the column holding row indices. Only used for DataFrames.
        Defaults to `DataFrameSpecs.ROW_IDS`.
    col_ids_colname : str, optional
        Name of the column holding column indices. Only used for DataFrames.
        Defaults to `DataFrameSpecs.COL_IDS`.

    Returns
    -------
    t.Tuple[int, int]
        The inferred (rows, columns) shape of the matrix. Inputs without
        any entries yield ``(0, 0)``.

    Raises
    ------
    pydantic.ValidationError
        If an argument does not match its annotated type (e.g., a tuple
        with != 2 elements).
    PreprocError
        If the input data type has no registered implementation, if
        required columns are missing from a DataFrame, or if the index
        arrays are not of an integer dtype.

    Examples
    --------
    >>> import numpy as np
    >>> infer_mat_shape((np.array([0, 2]), np.array([1, 4]))) == (5, 5)
    True
    >>> infer_mat_shape((np.array([0, 2]), np.array([1, 4])), False) == (3, 5)
    True
    """
    # Fallback branch of the single-dispatch: nothing is registered for
    # this type of input.
    unsupported_type = type(matrix)
    raise PreprocError(f"Unsupported input data type: {unsupported_type}")
@infer_mat_shape.register(tuple)
def _infer_mat_shape_rci(
    matrix: t.Tuple[np.ndarray, np.ndarray],
    is_triu_sym: bool,
    **kwargs,  # absorb unused kwargs
) -> t.Tuple[int, int]:
    """Infer matrix shape from a tuple of row and column indices.

    Each dimension is the largest index found along it plus one.

    Parameters
    ----------
    matrix : tuple of (np.ndarray, np.ndarray)
        The (row_indices, column_indices) arrays. Both must have an
        integer dtype.
    is_triu_sym : bool
        If True, return a square shape whose side is the larger of the two
        inferred dimensions.
    **kwargs
        Ignored. Accepted so the dispatcher can forward the
        DataFrame-only keyword arguments without error.

    Returns
    -------
    t.Tuple[int, int]
        The inferred (rows, columns) shape as Python ints, or ``(0, 0)``
        if either index array is empty.

    Raises
    ------
    PreprocError
        If either array is not of an integer dtype, or if the maximum
        index cannot be determined.

    Examples
    --------
    >>> import numpy as np
    >>> _infer_mat_shape_rci((np.array([0, 2]), np.array([1, 4])), True)
    (5, 5)
    >>> _infer_mat_shape_rci((np.array([0, 2]), np.array([1, 4])), False)
    (3, 5)
    """
    row_ids, col_ids = matrix

    # Explicitly check that the input arrays have an integer dtype.
    if not is_integer_dtype(row_ids) or not is_integer_dtype(col_ids):
        raise PreprocError("Row and column ID arrays must be of an integer dtype.")

    if row_ids.size == 0 or col_ids.size == 0:
        return (0, 0)

    try:
        # Convert to a Python int *before* adding 1. Adding to the NumPy
        # scalar keeps the array's dtype under NumPy 2 promotion rules, so
        # e.g. a uint8 maximum of 255 would wrap around to 0.
        nrows = int(np.amax(row_ids)) + 1
        ncols = int(np.amax(col_ids)) + 1
    except (TypeError, ValueError) as e:
        # This remains as a fallback for other potential numpy errors.
        raise PreprocError("Could not determine max of row/column IDs.") from e

    if is_triu_sym:
        n = max(nrows, ncols)
        return (n, n)
    return (nrows, ncols)
@infer_mat_shape.register(sp.coo_matrix)
def _infer_mat_shape_coo(
    matrix: sp.coo_matrix,
    is_triu_sym: bool,
    **kwargs,
) -> t.Tuple[int, int]:
    """Infer matrix shape from a SciPy COO matrix.

    Parameters
    ----------
    matrix : sp.coo_matrix
        The sparse matrix whose shape is reported.
    is_triu_sym : bool
        If True, return a square shape whose side is the larger of the
        matrix's two dimensions.
    **kwargs
        Ignored. Accepted so the dispatcher can forward the
        DataFrame-only keyword arguments without error.

    Returns
    -------
    t.Tuple[int, int]
        ``(0, 0)`` when the matrix stores no entries (``nnz == 0``),
        otherwise the (optionally squared) ``matrix.shape``.

    Examples
    --------
    >>> from scipy import sparse as sp
    >>> m = sp.coo_matrix(([1.0], ([0], [2])), shape=(3, 4))
    >>> _infer_mat_shape_coo(m, False)
    (3, 4)
    >>> _infer_mat_shape_coo(m, True)
    (4, 4)
    """
    # A matrix without stored entries is reported as empty, whatever its
    # declared shape is.
    if matrix.nnz == 0:
        return (0, 0)

    #? Optimization: read the stored shape rather than scanning the index
    #? arrays, which avoids O(NNZ) work.
    n_rows, n_cols = matrix.shape

    if not is_triu_sym:
        return (n_rows, n_cols)

    side = n_rows if n_rows >= n_cols else n_cols
    return (side, side)
@infer_mat_shape.register(pd.DataFrame)
def _infer_mat_shape_df(
    matrix: pd.DataFrame,
    is_triu_sym: bool,
    *,
    row_ids_colname: str,
    col_ids_colname: str,
) -> t.Tuple[int, int]:
    """Infer matrix shape from a pandas DataFrame.

    The row and column index columns are extracted as NumPy arrays and
    handed to ``_infer_mat_shape_rci``, which performs the actual
    inference.

    Parameters
    ----------
    matrix : pd.DataFrame
        DataFrame holding one column of row indices and one column of
        column indices.
    is_triu_sym : bool
        Forwarded to ``_infer_mat_shape_rci``.
    row_ids_colname : str
        Name of the column holding row indices.
    col_ids_colname : str
        Name of the column holding column indices.

    Returns
    -------
    t.Tuple[int, int]
        The shape returned by ``_infer_mat_shape_rci`` for the two index
        columns.

    Raises
    ------
    PreprocError
        If either index column is missing from the DataFrame, or if
        ``_infer_mat_shape_rci`` rejects the extracted arrays.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"r": [0, 2], "c": [1, 4]})
    >>> _infer_mat_shape_df(
    ...     df, True, row_ids_colname="r", col_ids_colname="c"
    ... ) == (5, 5)
    True
    """
    available = set(matrix.columns)
    # Build the missing list in a fixed (row, col) order so the error
    # message is deterministic; dict.fromkeys drops a duplicate when both
    # names are the same.
    missing = [
        name
        for name in dict.fromkeys((row_ids_colname, col_ids_colname))
        if name not in available
    ]
    if missing:
        missing_str = ", ".join(missing)
        raise PreprocError(f"DataFrame must contain columns: {missing_str}")

    row_col_ids = (
        matrix[row_ids_colname].to_numpy(),
        matrix[col_ids_colname].to_numpy(),
    )
    # Delegate to the rci implementation
    return _infer_mat_shape_rci(row_col_ids, is_triu_sym)