# Source code for gunz_cm.preprocs.infer_shape

# -*- coding: utf-8 -*-
"""
Module for inferring matrix shapes from various data structures.

This module provides a utility function to determine the dimensions of a matrix
represented as a DataFrame, a SciPy COO matrix, or a tuple of coordinate arrays.
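For symmetric matrices (``is_triu_sym=True``) the result is forced to be square
(N x N), where N is the larger of the inferred row and column counts.


Examples
--------
>>> import numpy as np
>>> from gunz_cm.preprocs.infer_shape import infer_mat_shape
>>> shape = infer_mat_shape((np.array([0, 2]), np.array([1, 4])))
>>> shape == (5, 5)
True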
"""
# =============================================================================
# METADATA
# =============================================================================
# Module-level dunder metadata describing authorship, licensing and version.
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.2.1"

# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import functools
import typing as t

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
from gunz_cm.exceptions import PreprocError
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
from pydantic import ConfigDict, validate_call
from scipy import sparse as sp

# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..consts import DataFrameSpecs


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def infer_mat_shape(
    matrix: t.Union[t.Tuple[np.ndarray, np.ndarray], sp.coo_matrix, pd.DataFrame],
    is_triu_sym: bool = True,
    *,
    row_ids_colname: str = DataFrameSpecs.ROW_IDS,
    col_ids_colname: str = DataFrameSpecs.COL_IDS,
) -> t.Tuple[int, int]:
    """
    Infer the shape of a matrix from different data types.

    The call is validated by Pydantic (``validate_call``) and then routed by
    ``functools.singledispatch`` to the implementation registered for the type
    of ``matrix``. This generic body is only reached when no implementation is
    registered for that type.

    Parameters
    ----------
    matrix : tuple, sp.coo_matrix, or pd.DataFrame
        Input data, which can be:
        - A tuple of (row_indices, column_indices) NumPy arrays.
        - A SciPy COO sparse matrix.
        - A pandas DataFrame with coordinate columns.
    is_triu_sym : bool, optional
        If True, the shape is inferred as a square matrix (N x N), where N is
        the larger of the inferred row and column counts. Defaults to True.
    row_ids_colname : str, optional
        Name of the column for row indices. Only used for DataFrames.
        Defaults to `DataFrameSpecs.ROW_IDS`.
    col_ids_colname : str, optional
        Name of the column for column indices. Only used for DataFrames.
        Defaults to `DataFrameSpecs.COL_IDS`.

    Returns
    -------
    t.Tuple[int, int]
        The inferred (rows, columns) shape of the matrix.

    Raises
    ------
    pydantic.ValidationError
        If an argument fails validation (e.g., a tuple with != 2 elements).
    PreprocError
        If the input data type is not supported, if index arrays are not of an
        integer dtype, or if required columns are missing from a DataFrame.

    Examples
    --------
    >>> import numpy as np
    >>> rows, cols = np.array([0, 2]), np.array([1, 4])
    >>> infer_mat_shape((rows, cols)) == (5, 5)
    True
    >>> infer_mat_shape((rows, cols), is_triu_sym=False) == (3, 5)
    True
    """
    # Fallback of the single-dispatch chain: no registered implementation
    # matched the runtime type of ``matrix``.
    raise PreprocError(f"Unsupported input data type: {type(matrix)}")


@infer_mat_shape.register(tuple)
def _infer_mat_shape_rci(
    matrix: t.Tuple[np.ndarray, np.ndarray],
    is_triu_sym: bool,
    **kwargs,  # absorb unused kwargs
) -> t.Tuple[int, int]:
    """Infer matrix shape from a tuple of row and column indices.

    Parameters
    ----------
    matrix : tuple of (np.ndarray, np.ndarray)
        Row index array and column index array, in that order.
    is_triu_sym : bool
        If True, return a square shape using the larger of the two extents.
    **kwargs
        Extra keyword arguments (e.g. the DataFrame column names accepted by
        the generic function) are accepted and ignored.

    Returns
    -------
    t.Tuple[int, int]
        ``(max(row_ids) + 1, max(col_ids) + 1)`` as Python ints, squared up
        when ``is_triu_sym`` is True, or ``(0, 0)`` if either array is empty.

    Raises
    ------
    PreprocError
        If either array is not of an integer dtype, or if the maximum index
        cannot be determined.
    """
    row_ids, col_ids = matrix

    # Explicitly check that the input arrays have an integer dtype.
    if not is_integer_dtype(row_ids) or not is_integer_dtype(col_ids):
        raise PreprocError("Row and column ID arrays must be of an integer dtype.")

    if row_ids.size == 0 or col_ids.size == 0:
        return (0, 0)

    try:
        # Convert to Python int *before* adding 1: under NumPy 2 promotion
        # rules ``np.uint8(255) + 1`` stays uint8 and wraps around to 0.
        # This also guarantees plain ``int`` values in the returned tuple.
        nrows = int(np.amax(row_ids)) + 1
        ncols = int(np.amax(col_ids)) + 1
    except (TypeError, ValueError) as e:
        # This remains as a fallback for other potential numpy errors.
        raise PreprocError("Could not determine max of row/column IDs.") from e

    if is_triu_sym:
        n = max(nrows, ncols)
        return (n, n)
    return (nrows, ncols)


@infer_mat_shape.register(sp.coo_matrix)
def _infer_mat_shape_coo(
    matrix: sp.coo_matrix,
    is_triu_sym: bool,
    **kwargs,  # absorb unused kwargs
) -> t.Tuple[int, int]:
    """Infer matrix shape from a SciPy COO matrix.

    Parameters
    ----------
    matrix : sp.coo_matrix
        The sparse matrix whose shape is reported.
    is_triu_sym : bool
        If True, return a square shape using the larger of the two dimensions.
    **kwargs
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    t.Tuple[int, int]
        ``(0, 0)`` when the matrix stores no entries; otherwise the matrix's
        own ``shape``, squared up when ``is_triu_sym`` is True.
    """
    # A matrix without stored entries yields (0, 0) regardless of its declared
    # shape -- the same result the tuple implementation gives for empty arrays.
    if matrix.nnz == 0:
        return (0, 0)

    # Optimization: use the matrix shape directly instead of scanning the
    # index arrays, which avoids O(NNZ) work.
    nrows, ncols = matrix.shape

    if is_triu_sym:
        n = max(nrows, ncols)
        return (n, n)
    return (nrows, ncols)


@infer_mat_shape.register(pd.DataFrame)
def _infer_mat_shape_df(
    matrix: pd.DataFrame,
    is_triu_sym: bool,
    *,
    row_ids_colname: str,
    col_ids_colname: str,
) -> t.Tuple[int, int]:
    """Infer matrix shape from a pandas DataFrame.

    Parameters
    ----------
    matrix : pd.DataFrame
        Frame holding one column of row indices and one of column indices.
    is_triu_sym : bool
        If True, return a square shape using the larger of the two extents.
    row_ids_colname : str
        Name of the column containing row indices.
    col_ids_colname : str
        Name of the column containing column indices.

    Returns
    -------
    t.Tuple[int, int]
        The shape computed by the tuple implementation from the two columns.

    Raises
    ------
    PreprocError
        If either column is missing, or for any reason the tuple
        implementation raises (e.g. a non-integer column dtype).
    """
    required_columns = {row_ids_colname, col_ids_colname}
    missing_columns = required_columns - set(matrix.columns)
    if missing_columns:
        # Sort so the error message is deterministic across runs.
        missing = ", ".join(sorted(missing_columns))
        raise PreprocError(f"DataFrame must contain columns: {missing}")

    row_col_ids = (
        matrix[row_ids_colname].to_numpy(),
        matrix[col_ids_colname].to_numpy(),
    )

    # Delegate to the rci implementation
    return _infer_mat_shape_rci(row_col_ids, is_triu_sym)