# Source code for gunz_cm.matrix

# -*- coding: utf-8 -*-
"""
Defines the ContactMatrix data structure.
"""

# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"

from dataclasses import dataclass, field
import typing as t
import pandas as pd
from scipy import sparse as sp

@dataclass
class ContactMatrix:
    """
    A data container for a contact matrix and its associated metadata.

    This class acts as a simple, data-oriented container to group a contact
    matrix (as a pandas DataFrame or a SciPy sparse matrix) with important
    metadata like its genomic coordinates and resolution. It supports lazy
    loading of data via a loader function.

    Attributes
    ----------
    chromosome1 : str
        The name of the first chromosome.
    resolution : int
        The resolution of the contact matrix in base pairs.
    loader_func : callable
        A function or callable that returns the raw data when called.
    loader_kwargs : dict
        Keyword arguments to pass to the loader function.
    chromosome2 : str, optional
        The name of the second chromosome, if different from the first
        (for inter-chromosomal matrices). Defaults to chromosome1.
    metadata : dict
        A dictionary to hold any other relevant metadata.

    Notes
    -----
    The lazily loaded data is cached in the private ``_data`` field, which is
    excluded from ``__init__``, ``repr`` and equality comparison. Two
    instances therefore compare equal based on their coordinates, loader and
    metadata only, regardless of whether either has already loaded its data.

    Examples
    --------
    >>> from gunz_cm.matrix import ContactMatrix
    >>> import numpy as np
    >>> def dummy_loader(n): return np.eye(n)
    >>> cm = ContactMatrix("chr1", 10000, loader_func=dummy_loader, loader_kwargs={"n": 5})
    >>> print(cm.data.shape)
    (5, 5)
    """

    chromosome1: str
    resolution: int
    loader_func: t.Callable[..., t.Any] = field(repr=False)
    loader_kwargs: t.Dict[str, t.Any] = field(default_factory=dict, repr=False)
    chromosome2: t.Optional[str] = None
    metadata: t.Dict[str, t.Any] = field(default_factory=dict)
    # Cache for the lazily loaded payload. It is kept out of the generated
    # __eq__ (compare=False): comparing arrays/DataFrames/sparse matrices
    # inside the dataclass tuple comparison raises ValueError, and equality
    # must not depend on whether the data has been loaded yet.
    _data: t.Any = field(init=False, repr=False, default=None, compare=False)

    def __post_init__(self) -> None:
        """Default ``chromosome2`` to ``chromosome1`` when it is not given."""
        if self.chromosome2 is None:
            self.chromosome2 = self.chromosome1

    @property
    def data(self) -> t.Any:
        """
        The raw contact matrix data, loaded lazily.

        The loader is invoked as ``loader_func(**loader_kwargs)`` on first
        access and its result is cached. ``None`` doubles as the "not loaded"
        marker, so a loader that returns ``None`` is called again on the next
        access.

        Returns
        -------
        any
            The raw data returned by the loader function (usually a
            DataFrame or Sparse Matrix).
        """
        if self._data is None:
            self._data = self.loader_func(**self.loader_kwargs)
        return self._data

    def as_coo(self) -> sp.coo_matrix:
        """
        Returns the contact matrix as a SciPy COO sparse matrix.

        If the loaded data already is a ``scipy.sparse.coo_matrix`` it is
        returned as-is (the cached object, not a copy); otherwise it is
        passed to ``to_coo_matrix`` from ``.preprocs.converters``.

        Returns
        -------
        scipy.sparse.coo_matrix
            The contact matrix data in COO format.

        Examples
        --------
        >>> cm = load_cm_data("sample.cool", "chr1", 10000)
        >>> coo = cm.as_coo()
        >>> print(f"Non-zero elements: {coo.nnz}")
        """
        raw = self.data
        if isinstance(raw, sp.coo_matrix):
            return raw

        # Deferred import to break circular dependency; only needed (and only
        # executed) when an actual conversion has to happen.
        from .preprocs.converters import to_coo_matrix

        return to_coo_matrix(raw)

    def as_csr(self) -> sp.csr_matrix:
        """
        Returns the contact matrix as a SciPy CSR sparse matrix.

        The conversion goes through :meth:`as_coo` followed by ``tocsr()``.

        Returns
        -------
        scipy.sparse.csr_matrix
            The contact matrix data in CSR format.
        """
        return self.as_coo().tocsr()

    def as_csc(self) -> sp.csc_matrix:
        """
        Returns the contact matrix as a SciPy CSC sparse matrix.

        The conversion goes through :meth:`as_coo` followed by ``tocsc()``.

        Returns
        -------
        scipy.sparse.csc_matrix
            The contact matrix data in CSC format.
        """
        return self.as_coo().tocsc()

    def as_dataframe(self) -> pd.DataFrame:
        """
        Returns the contact matrix as a pandas DataFrame.

        If the loaded data already is a ``pandas.DataFrame`` it is returned
        as-is (the cached object, not a copy); otherwise it is passed to
        ``to_dataframe`` from ``.preprocs.converters``.

        Returns
        -------
        pandas.DataFrame
            The contact matrix data as a DataFrame with bin IDs and counts.

        Examples
        --------
        >>> df = cm.as_dataframe()
        >>> print(df.columns)
        Index(['bin1_id', 'bin2_id', 'count'], dtype='object')
        """
        raw = self.data
        if isinstance(raw, pd.DataFrame):
            return raw

        # Deferred import to break circular dependency; only needed (and only
        # executed) when an actual conversion has to happen.
        from .preprocs.converters import to_dataframe

        return to_dataframe(raw)