from __future__ import annotations
# -*- coding: utf-8 -*-
"""
Module for loading contact matrix objects, such as those stored in pickle files.
Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "2.4"
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import pathlib
import pickle
import typing as t
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import pandas as pd
import scipy.sparse as ssparse
from pydantic import validate_call
# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..consts import DataStructure
from ..exceptions import LoaderError
from ..matrix import ContactMatrix
from ..preprocs.converters import to_dataframe, to_coo_matrix
@validate_call(config={"arbitrary_types_allowed": True})
def _load_pickle_data(
    fpath: str | pathlib.Path,
    region1: str | None = None,
    resolution: int | None = None,
    region2: str | None = None,
    balancing: str | None = None,
    output_format: DataStructure = DataStructure.DF,
) -> t.Any:
    """
    Unpickle a contact matrix from disk and hand it back in the requested format.

    The unpickled object is returned untouched when it already has the
    requested type; otherwise it is converted with ``to_dataframe`` or
    ``to_coo_matrix``.

    Parameters
    ----------
    fpath : str | pathlib.Path
        Location of the pickle file.
    region1 : str | None, optional
        Not read by this function (accepted, presumably, so the signature
        matches the keyword arguments forwarded by ``load_pickle``).
    resolution : int | None, optional
        Not read by this function.
    region2 : str | None, optional
        Not read by this function.
    balancing : str | None, optional
        Not read by this function.
    output_format : DataStructure, optional
        Target container, either ``DataStructure.DF`` (default) or
        ``DataStructure.COO``.

    Returns
    -------
    t.Any
        The unpickled object, or the result of converting it to the
        requested format.

    Raises
    ------
    LoaderError
        If ``output_format`` is neither ``DataStructure.DF`` nor
        ``DataStructure.COO``.
    FileNotFoundError
        If ``fpath`` does not exist.
    TypeError
        If the unpickled object is neither a ``pandas.DataFrame`` nor a
        ``scipy.sparse.coo_matrix``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code embedded in the file; only
    point this loader at files from a trusted source.
    """
    wants_df = output_format == DataStructure.DF
    wants_coo = output_format == DataStructure.COO

    # Reject unsupported formats before touching the filesystem.
    if not (wants_df or wants_coo):
        raise LoaderError(
            f"Unsupported output format: '{output_format}'. "
            "Must be 'df' or 'coo'."
        )

    source = pathlib.Path(fpath)
    if not source.exists():
        raise FileNotFoundError(f"File not found at the specified path: {source}")

    # SECURITY: unpickling runs code stored in the file -- trusted input only.
    with source.open("rb") as handle:
        payload = pickle.load(handle)

    if wants_df:
        if isinstance(payload, pd.DataFrame):
            return payload
        if isinstance(payload, ssparse.coo_matrix):
            return to_dataframe(payload)
        raise TypeError(
            f"Loaded object of type {type(payload).__name__} cannot be "
            "converted to a pandas DataFrame."
        )

    # Only the COO format can reach this point (see the guard above).
    if isinstance(payload, ssparse.coo_matrix):
        return payload
    if isinstance(payload, pd.DataFrame):
        return to_coo_matrix(payload)
    raise TypeError(
        f"Loaded object of type {type(payload).__name__} cannot be "
        "converted to a COO sparse matrix."
    )
@validate_call(config={"arbitrary_types_allowed": True})
def load_pickle(
    fpath: str | pathlib.Path,
    region1: str | None = None,
    resolution: int | None = None,
    region2: str | None = None,
    balancing: str | None = None,
    output_format: DataStructure = DataStructure.DF,
) -> ContactMatrix:
    """
    Create a ``ContactMatrix`` backed by a pickle file.

    This function does not open ``fpath`` itself. It bundles its arguments
    into ``loader_kwargs`` and passes them, together with
    ``_load_pickle_data`` as ``loader_func``, to ``ContactMatrix``.
    Presumably ``ContactMatrix`` invokes the loader only when the data is
    needed -- confirm against the ``ContactMatrix`` implementation.

    Parameters
    ----------
    fpath : str | pathlib.Path
        Location of the pickle file.
    region1 : str | None, optional
        First genomic region; also passed to ``ContactMatrix`` as
        ``chromosome1``.
    resolution : int | None, optional
        Resolution passed to ``ContactMatrix`` and forwarded to the loader.
    region2 : str | None, optional
        Second genomic region; also passed to ``ContactMatrix`` as
        ``chromosome2``.
    balancing : str | None, optional
        Balancing method; only forwarded to the loader.
    output_format : DataStructure, optional
        Format the loader should produce. Defaults to ``DataStructure.DF``.

    Returns
    -------
    ContactMatrix
        A contact matrix whose ``loader_func`` is ``_load_pickle_data`` and
        whose metadata is ``{"format": "pickle"}``.
    """
    # Every argument is forwarded verbatim so the loader sees exactly what
    # the caller supplied (after validation by ``validate_call``).
    loader_kwargs = {
        "fpath": fpath,
        "region1": region1,
        "resolution": resolution,
        "region2": region2,
        "balancing": balancing,
        "output_format": output_format,
    }
    return ContactMatrix(
        chromosome1=region1,
        chromosome2=region2,
        resolution=resolution,
        loader_func=_load_pickle_data,
        loader_kwargs=loader_kwargs,
        metadata={"format": "pickle"},
    )