Source code for gunz_cm.loaders.memmap_loader

from __future__ import annotations
# -*- coding: utf-8 -*-
"""
This module provides utility functions for creating, checking, and loading
NumPy memory-mapped (memmap) files, which consist of a binary data file
and a corresponding JSON metadata file.


Examples
--------
"""

# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "2.0.0"


# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import json
import pathlib

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import numpy as np
from pydantic import validate_call

# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..matrix import ContactMatrix

# =============================================================================
# MODULE CONSTANTS
# =============================================================================
# Extension (without the leading dot) of the raw binary file holding the array data.
BIN_EXT = "npdat"
# Extension (without the leading dot) of the JSON sidecar file holding the metadata.
META_EXT = "json"
# Default value of the `mode` parameter of the loaders in this module; it is
# forwarded unchanged to `np.memmap` ("r" opens the file read-only).
DEFAULT_MODE = "r"


# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

@validate_call(config=dict(arbitrary_types_allowed=True))
def gen_memmap_fpaths(
    #? --- Input Files ---
    base_fpath: str | pathlib.Path,
) -> tuple[pathlib.Path, pathlib.Path]:
    """
    Derive the binary-data and JSON-metadata file paths of a memmap.

    Both paths are obtained by applying ``Path.with_suffix`` to `base_fpath`,
    so a trailing suffix already present on the base name is replaced rather
    than appended to (e.g. ``chr1.10kb`` becomes ``chr1.npdat``).

    Parameters
    ----------
    base_fpath : str | pathlib.Path
        The base path for the memmap, without an extension.

    Returns
    -------
    tuple[pathlib.Path, pathlib.Path]
        ``(bin_fpath, meta_fpath)``: the path of the binary (.npdat) file
        followed by the path of the metadata (.json) file.

    Examples
    --------
    >>> bin_fpath, meta_fpath = gen_memmap_fpaths("data/chr1")
    >>> bin_fpath.name, meta_fpath.name
    ('chr1.npdat', 'chr1.json')
    """
    stem_path = pathlib.Path(base_fpath)
    return (
        stem_path.with_suffix(f".{BIN_EXT}"),
        stem_path.with_suffix(f".{META_EXT}"),
    )
@validate_call(config=dict(arbitrary_types_allowed=True))
def is_memmap_exists(
    #? --- Input Files ---
    base_fpath: str | pathlib.Path,
) -> bool:
    """
    Report whether a memmap is complete on disk.

    A memmap counts as existing only when both of its files, as derived by
    `gen_memmap_fpaths`, are present.

    Parameters
    ----------
    base_fpath : str | pathlib.Path
        The base path for the memmap to check.

    Returns
    -------
    bool
        True if both the .npdat and .json files exist, False otherwise.
    """
    # The pair is ordered (binary, metadata); `all` stops at the first
    # missing file, so the metadata file is only probed when the binary exists.
    fpath_pair = gen_memmap_fpaths(pathlib.Path(base_fpath))
    return all(fpath.exists() for fpath in fpath_pair)
@validate_call(config=dict(arbitrary_types_allowed=True))
def _load_memmap_data(
    base_fpath: str | pathlib.Path,
    mode: str = DEFAULT_MODE,
) -> np.memmap:
    """
    Internal function to load memmap data.

    Reads the ``dtype`` and ``shape`` entries from the JSON metadata file and
    uses them to map the binary file with ``np.memmap``.

    Parameters
    ----------
    base_fpath : str | pathlib.Path
        The base path to the memmap files.
    mode : str, optional
        The file open mode, forwarded unchanged to ``np.memmap``.

    Returns
    -------
    np.memmap
        A NumPy memory-mapped array over the binary file.

    Raises
    ------
    FileNotFoundError
        If the binary file or the metadata file is missing.
    KeyError
        If the metadata has no ``dtype`` or ``shape`` entry.
    """
    # Derive both paths once; they are needed for the existence check, the
    # error message and the actual loading.
    bin_fpath, meta_fpath = gen_memmap_fpaths(pathlib.Path(base_fpath))

    bin_exists = bin_fpath.exists()
    meta_exists = meta_fpath.exists()
    if not (bin_exists and meta_exists):
        raise FileNotFoundError(
            "Memmap files not found. "
            f"Binary exists: {bin_exists}. "
            f"Metadata exists: {meta_exists}."
        )

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with meta_fpath.open("r", encoding="utf-8") as f:
        meta = json.load(f)

    return np.memmap(
        bin_fpath,
        mode=mode,
        dtype=np.dtype(meta["dtype"]),
        # JSON stores the shape as a list; np.memmap is given a tuple.
        shape=tuple(meta["shape"]),
    )
@validate_call(config=dict(arbitrary_types_allowed=True))
def load_memmap(
    base_fpath: str | pathlib.Path,
    mode: str = DEFAULT_MODE,
) -> ContactMatrix:
    """
    Loads a NumPy array from a memory-mapped file lazily.

    Only the JSON metadata file is read here. The binary file is not opened
    by this function: `_load_memmap_data` is handed to the returned
    ``ContactMatrix`` as its ``loader_func`` together with the arguments it
    needs (presumably invoked by ``ContactMatrix`` when the data is first
    required -- confirm against that class).

    Parameters
    ----------
    base_fpath : str | pathlib.Path
        The base path to the memmap files.
    mode : str, optional
        The file open mode stored in the loader arguments for the later
        ``np.memmap`` call.

    Returns
    -------
    ContactMatrix
        A contact matrix whose ``chromosome1``, ``chromosome2`` and
        ``resolution`` come from the metadata (``None`` for absent keys) and
        whose ``metadata`` is the full parsed metadata.

    Raises
    ------
    FileNotFoundError
        If the metadata file does not exist.
    json.JSONDecodeError
        If the metadata file does not contain valid JSON.
    """
    # For memmap, we need to read the metadata to know the chromosome and
    # resolution, so we can't be fully lazy.
    _, meta_fpath = gen_memmap_fpaths(pathlib.Path(base_fpath))
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with meta_fpath.open("r", encoding="utf-8") as f:
        meta = json.load(f)

    # Arguments replayed into `_load_memmap_data` when the array is loaded.
    loader_kwargs = {
        "base_fpath": base_fpath,
        "mode": mode,
    }

    return ContactMatrix(
        chromosome1=meta.get("chromosome1"),
        chromosome2=meta.get("chromosome2"),
        resolution=meta.get("resolution"),
        loader_func=_load_memmap_data,
        loader_kwargs=loader_kwargs,
        metadata=meta,
    )