# Source code for gunz_cm.matrix
# -*- coding: utf-8 -*-
"""
Defines the ContactMatrix data structure.
"""
# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"
from dataclasses import dataclass, field
import typing as t
import pandas as pd
from scipy import sparse as sp
@dataclass
class ContactMatrix:
    """
    A data container for a contact matrix and its associated metadata.

    This class acts as a simple, data-oriented container to group a contact
    matrix (as a pandas DataFrame or a SciPy sparse matrix) with important
    metadata like its genomic coordinates and resolution. It supports lazy
    loading of data via a loader function.

    Equality between two instances is based on the declared fields
    (coordinates, resolution, loader and metadata); the lazily loaded payload
    is a cache and is intentionally excluded from comparison.

    Attributes
    ----------
    chromosome1 : str
        The name of the first chromosome.
    resolution : int
        The resolution of the contact matrix in base pairs.
    loader_func : callable
        A function or callable that returns the raw data when called.
    loader_kwargs : dict
        Keyword arguments to pass to the loader function.
    chromosome2 : str, optional
        The name of the second chromosome, if different from the first
        (for inter-chromosomal matrices). Defaults to chromosome1.
    metadata : dict
        A dictionary to hold any other relevant metadata.

    Examples
    --------
    >>> from gunz_cm.matrix import ContactMatrix
    >>> import numpy as np
    >>> def dummy_loader(n): return np.eye(n)
    >>> cm = ContactMatrix("chr1", 10000, loader_func=dummy_loader, loader_kwargs={"n": 5})
    >>> print(cm.data.shape)
    (5, 5)
    """

    chromosome1: str
    resolution: int
    loader_func: t.Callable = field(repr=False)
    loader_kwargs: t.Dict[str, t.Any] = field(default_factory=dict, repr=False)
    chromosome2: t.Optional[str] = None
    metadata: t.Dict[str, t.Any] = field(default_factory=dict)
    # Cache slot for the lazily loaded payload. It is excluded from the
    # generated __eq__ (compare=False): otherwise equality would depend on
    # whether each side has been loaded yet, and comparing two loaded
    # DataFrame/ndarray payloads would raise "truth value is ambiguous".
    _data: t.Any = field(init=False, repr=False, default=None, compare=False)

    def __post_init__(self):
        """Default ``chromosome2`` to ``chromosome1`` when it is not given."""
        if self.chromosome2 is None:
            self.chromosome2 = self.chromosome1

    @property
    def data(self) -> t.Any:
        """
        The raw contact matrix data, loaded lazily.

        The loader is called as ``loader_func(**loader_kwargs)`` on first
        access and its result is cached. ``None`` is used as the "not loaded"
        marker, so a loader that returns ``None`` is called again on every
        access.

        Returns
        -------
        any
            The raw data returned by the loader function (usually a DataFrame or Sparse Matrix).
        """
        if self._data is None:
            self._data = self.loader_func(**self.loader_kwargs)
        return self._data

    def as_coo(self) -> sp.coo_matrix:
        """
        Returns the contact matrix as a SciPy COO sparse matrix.

        If the underlying data already is a ``scipy.sparse.coo_matrix`` it is
        returned as-is (no copy); otherwise it is converted with
        ``to_coo_matrix``.

        Returns
        -------
        scipy.sparse.coo_matrix
            The contact matrix data in COO format.

        Examples
        --------
        >>> cm = load_cm_data("sample.cool", "chr1", 10000)
        >>> coo = cm.as_coo()
        >>> print(f"Non-zero elements: {coo.nnz}")
        """
        data = self.data
        if isinstance(data, sp.coo_matrix):
            return data
        # Deferred import to break circular dependency; only performed when a
        # conversion is actually required.
        from .preprocs.converters import to_coo_matrix
        return to_coo_matrix(data)

    def as_csr(self) -> sp.csr_matrix:
        """
        Returns the contact matrix as a SciPy CSR sparse matrix.

        Returns
        -------
        scipy.sparse.csr_matrix
            The contact matrix data in CSR format.
        """
        return self.as_coo().tocsr()

    def as_csc(self) -> sp.csc_matrix:
        """
        Returns the contact matrix as a SciPy CSC sparse matrix.

        Returns
        -------
        scipy.sparse.csc_matrix
            The contact matrix data in CSC format.
        """
        return self.as_coo().tocsc()

    def as_dataframe(self) -> pd.DataFrame:
        """
        Returns the contact matrix as a pandas DataFrame.

        If the underlying data already is a ``pandas.DataFrame`` it is
        returned as-is (no copy); otherwise it is converted with
        ``to_dataframe``.

        Returns
        -------
        pandas.DataFrame
            The contact matrix data as a DataFrame with bin IDs and counts.

        Examples
        --------
        >>> df = cm.as_dataframe()
        >>> print(df.columns)
        Index(['bin1_id', 'bin2_id', 'count'], dtype='object')
        """
        data = self.data
        if isinstance(data, pd.DataFrame):
            return data
        # Deferred import to break circular dependency; only performed when a
        # conversion is actually required.
        from .preprocs.converters import to_dataframe
        return to_dataframe(data)