# Source code for gunz_cm.converters.memmap

# -*- coding: utf-8 -*-
"""
Module for converting various contact matrix formats into a memory-mapped
(memmap) file for efficient, on-disk matrix operations.


Examples
--------
"""

# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__maintainer__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "2.0.0"


# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import functools
import json
import pathlib
import typing as t
import warnings

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import numpy as np
import numpy.typing as npt
import pandas as pd
from pydantic import ConfigDict, validate_call

# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from .. import loaders
from ..exceptions import ConverterError
from ..consts import Balancing, DataStructure, DataFrameSpecs

# =============================================================================
# CORE CONVERSION LOGIC (SINGLE DISPATCH)
# =============================================================================

@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@functools.singledispatch
def convert_to_memmap(
    data: t.Union[pathlib.Path, pd.DataFrame, t.Tuple[npt.ArrayLike, ...]],
    output_fpath: pathlib.Path,
    # Extra keyword arguments are forwarded untouched to the registered
    # implementation selected by the type of `data`.
    **kwargs,
) -> None:
    """
    Convert contact matrix data to a NumPy memory-mapped file (memmap).

    Polymorphic entry point: ``functools.singledispatch`` selects the
    concrete implementation from the runtime type of ``data``.

    Parameters
    ----------
    data : pathlib.Path or pd.DataFrame or tuple
        The input to convert. Supported forms:
        - a path to a standard contact matrix file (.hic, .cool, etc.),
        - a pandas DataFrame in COO format,
        - a tuple of (rows, cols, values) arrays.
    output_fpath : pathlib.Path
        Base path for the output memmap file.
    **kwargs :
        Implementation-specific options such as ``resolution``,
        ``balancing``, ``output_full_matrix``, etc.

    Raises
    ------
    NotImplementedError
        If no implementation is registered for ``type(data)``.

    Examples
    --------
    """
    # NOTE(review): `validate_call` wraps the singledispatch dispatcher; this
    # relies on the dispatcher's `.register`/`.dispatch` attributes being
    # copied onto the wrapper — confirm against the pinned pydantic version.
    raise NotImplementedError(f"No conversion implementation for type: {type(data)}")
@convert_to_memmap.register(pd.DataFrame)
def _convert_df_to_memmap(
    data: pd.DataFrame,
    output_fpath: pathlib.Path,
    **kwargs,
) -> None:
    """Dispatched implementation: convert a COO-format DataFrame to memmap.

    Extracts the row-id, col-id and count columns named by
    ``DataFrameSpecs`` and delegates to the array-based implementation.

    Parameters
    ----------
    data : pd.DataFrame
        Contact matrix in COO layout with ``DataFrameSpecs`` columns.
    output_fpath : pathlib.Path
        Base path for the output memmap file.
    **kwargs :
        Forwarded unchanged to :func:`_convert_coo_to_memmap`.

    Examples
    --------
    """
    row_ids = data[DataFrameSpecs.ROW_IDS].to_numpy()
    col_ids = data[DataFrameSpecs.COL_IDS].to_numpy()
    counts = data[DataFrameSpecs.COUNTS].to_numpy()

    # Delegate to the array-based implementation
    _convert_coo_to_memmap((row_ids, col_ids, counts), output_fpath, **kwargs)


@convert_to_memmap.register(tuple)
def _convert_coo_to_memmap(
    data: t.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike],
    output_fpath: pathlib.Path,
    output_full_matrix: bool = True,
    dtype: t.Optional[npt.DTypeLike] = None,
    shape: t.Optional[t.Tuple[int, int]] = None,
    check_output: bool = True,
    overwrite: bool = False,
    metadata: t.Optional[t.Dict[str, t.Any]] = None,
    **kwargs,  # Consume any extra arguments
) -> None:
    """Dispatched implementation: convert COO arrays to a memmap file.

    Parameters
    ----------
    data : tuple of array-like
        ``(row_ids, col_ids, counts)`` in COO layout.
    output_fpath : pathlib.Path
        Base path; the binary and metadata paths are derived via
        ``loaders.gen_memmap_fpaths``.
    output_full_matrix : bool, default True
        If True, mirror off-diagonal entries so the stored matrix is
        symmetric; otherwise only the given triplets are written.
    dtype : numpy dtype-like, optional
        Storage dtype. Defaults to the dtype of ``counts``.
    shape : (int, int), optional
        Matrix shape. If omitted, a square shape is inferred from the
        largest index in ``row_ids``/``col_ids``.
    check_output : bool, default True
        Re-read the written file and verify it matches the input.
    overwrite : bool, default False
        Allow replacing an existing memmap file.
    metadata : dict, optional
        Extra entries merged into the JSON metadata sidecar.

    Raises
    ------
    FileExistsError
        If the target memmap exists and ``overwrite`` is False.
    ConverterError
        If ``shape`` cannot be inferred from empty input, or the
        post-write integrity check fails.

    Examples
    --------
    """
    if loaders.is_memmap_exists(output_fpath) and not overwrite:
        raise FileExistsError(f"Memmap file already exists: {output_fpath}")

    bin_fpath, meta_fpath = loaders.gen_memmap_fpaths(output_fpath)
    bin_fpath.parent.mkdir(parents=True, exist_ok=True)

    row_ids = np.asarray(data[0])
    col_ids = np.asarray(data[1])
    counts = np.asarray(data[2])

    # Replace NaNs/Infs with 0 to ensure they are treated as "empty" in
    # sparse usage and to allow safe quantization to integer types.
    if np.issubdtype(counts.dtype, np.floating):
        mask_invalid = ~np.isfinite(counts)
        if np.any(mask_invalid):
            # Copy before mutating: np.asarray may return a *view* of the
            # caller's array, and in-place assignment would clobber the
            # caller's data (and fail outright on read-only arrays).
            counts = counts.copy()
            counts[mask_invalid] = 0

    # Explicit None-test instead of `dtype or ...`: truthiness is not the
    # intended condition for a dtype-like object.
    final_dtype = counts.dtype if dtype is None else dtype

    # Overflow protection: clip (with a warning) values that do not fit
    # into an integer storage dtype rather than wrapping silently.
    if np.issubdtype(final_dtype, np.integer):
        info = np.iinfo(final_dtype)
        if np.any(counts > info.max) or np.any(counts < info.min):
            warnings.warn(
                f"Data contains values outside the range of {final_dtype} "
                f"([{info.min}, {info.max}]). Values will be clipped. "
                "Consider using a larger dtype (e.g., int32) to avoid data loss.",
                RuntimeWarning
            )
            counts = np.clip(counts, info.min, info.max)

    if shape is None:
        # Infer a square matrix just large enough for the largest index.
        if row_ids.size == 0 or col_ids.size == 0:
            raise ConverterError(
                "Cannot infer matrix shape from empty COO data; "
                "pass `shape` explicitly."
            )
        n = max(np.max(row_ids), np.max(col_ids)) + 1
        final_shape = (int(n), int(n))
    else:
        final_shape = shape

    fp = np.memmap(bin_fpath, dtype=final_dtype, mode='w+', shape=final_shape)

    # Populate the memmap file
    if output_full_matrix:
        # Mirror off-diagonal entries to ensure symmetry of the stored matrix.
        nondiag_mask = row_ids != col_ids
        fp[row_ids, col_ids] = counts
        fp[col_ids[nondiag_mask], row_ids[nondiag_mask]] = counts[nondiag_mask]
    else:
        fp[row_ids, col_ids] = counts

    fp.flush()  # Ensure data is written to disk

    # Write metadata sidecar (shape + dtype, plus caller-supplied entries).
    meta_dict = {"shape": final_shape, "dtype": np.dtype(final_dtype).str}
    if metadata:
        meta_dict.update(metadata)
    with open(meta_fpath, 'w') as f:
        json.dump(meta_dict, f, indent=4, sort_keys=True)

    if check_output:
        recon_fp = loaders.load_memmap(output_fpath)

        # Cast original counts to the stored dtype to emulate quantization.
        expected_counts = counts.astype(final_dtype)

        # NOTE(review): duplicate (row, col) pairs in the input resolve as
        # last-write-wins during fancy assignment, which would trip this
        # elementwise comparison — assumes coordinates are unique; verify.
        if not np.allclose(recon_fp.data[row_ids, col_ids], expected_counts, equal_nan=True):
            raise ConverterError(
                "Memmap data integrity check failed: wrote vs read mismatch."
            )
        if output_full_matrix:
            if not np.allclose(recon_fp.data[col_ids, row_ids], expected_counts, equal_nan=True):
                raise ConverterError(
                    "Memmap data integrity check failed: full matrix mismatch."
                )


@convert_to_memmap.register(pathlib.Path)
def _convert_file_to_memmap(
    data: pathlib.Path,
    output_fpath: pathlib.Path,
    region1: str,
    resolution: int,
    balancing: t.Optional[Balancing],
    **kwargs,
) -> None:
    """Dispatched implementation: convert a source contact matrix file to memmap.

    Loads the region from a standard contact matrix file as a COO
    DataFrame, then delegates to :func:`_convert_df_to_memmap`.

    Parameters
    ----------
    data : pathlib.Path
        Path to a standard contact matrix file (.hic, .cool, etc.).
    output_fpath : pathlib.Path
        Base path for the output memmap file.
    region1 : str
        Region/chromosome to extract.
    resolution : int
        Bin resolution to load.
    balancing : Balancing or None
        Balancing scheme; ``None`` records "NONE" in the metadata.
    **kwargs :
        Memmap-writer options (``dtype``, ``output_full_matrix``,
        ``shape``, ``check_output``, ``overwrite``, ``metadata``) are
        extracted here; everything else goes to ``loaders.load_cm_data``.

    Raises
    ------
    FileNotFoundError
        If ``data`` does not exist.
    ConverterError
        If ``data`` is not a standard contact matrix format.

    Examples
    --------
    """
    if not data.exists():
        raise FileNotFoundError(f"Input file not found: {data}")
    if not loaders.is_file_standard_cm(str(data)):
        raise ConverterError("Input file must be a standard contact matrix format.")

    # Split kwargs: memmap-writer options must not reach load_cm_data.
    memmap_keys = ("dtype", "output_full_matrix", "shape", "check_output", "overwrite")
    memmap_kwargs = {key: kwargs.pop(key) for key in memmap_keys if key in kwargs}
    # Also intercept caller-supplied metadata (previously it would leak
    # into load_cm_data); merged over the defaults below.
    user_metadata = kwargs.pop("metadata", None)

    cm_df = loaders.load_cm_data(
        fpath=data,
        region1=region1,
        resolution=resolution,
        balancing=balancing,
        output_format=DataStructure.DF,
        **kwargs
    )

    # Default metadata recorded alongside the memmap.
    meta = {
        "resolution": resolution,
        "chromosome1": region1,
        "chromosome2": region1,  # Default intra-chromosomal
        "balancing": balancing.value if balancing else "NONE"
    }
    if user_metadata:
        meta.update(user_metadata)

    # Delegate to the DataFrame-based implementation
    _convert_df_to_memmap(cm_df, output_fpath, metadata=meta, **memmap_kwargs)