# Source code for gunz_cm.loaders.ginteractions_loader

from __future__ import annotations
# -*- coding: utf-8 -*-
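"""
Load and process genomic interaction data stored in a GInteractions-like
tabular format.

The public entry point is ``load_ginteractions``, which wraps the file in a
``ContactMatrix``.

Examples
--------
>>> from gunz_cm.loaders.ginteractions_loader import load_ginteractions
>>> cm = load_ginteractions(  # doctest: +SKIP
...     "interactions.txt", resolution=10_000, region1="chr1"
... )
"""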

# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "2.1.0"


# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import pathlib
import typing as t
import io

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import pandas as pd
from scipy import sparse as sp

# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..exceptions import LoaderError
from ..consts import (
    DEFAULT_SEPARATOR,
    GINTERACTIONS_COLUMNS,
    DataStructure,
    DataFrameSpecs,
)
from ..matrix import ContactMatrix
# Updated import: the module is now 'converters' (previously 'conversions').
from ..preprocs.converters import to_coo_matrix
from gunz_utils.validation import type_checked

# =============================================================================
# LOADER FUNCTION
# =============================================================================

@type_checked(config=dict(arbitrary_types_allowed=True))
def _load_ginteractions_data(
    fpath: str | pathlib.Path | io.BytesIO,
    resolution: int,
    region1: str,
    region2: str | None = None,
    encoding: str = "utf-8",
    output_format: DataStructure = DataStructure.DF,
    **kwargs,
) -> t.Any:
    """
    Read a GInteractions-like table and bin the contacts of one chromosome pair.

    The file is parsed with ``pandas.read_csv`` using ``DEFAULT_SEPARATOR`` and
    the column names in ``GINTERACTIONS_COLUMNS``. Rows are kept when ``chr1``
    equals ``region1`` and ``chr2`` equals ``region2`` (or ``region1`` when
    ``region2`` is not given). Each kept row is assigned a row/column bin id
    computed as ``(start + end) // (2 * resolution)``, i.e. the interval
    midpoint floor-divided by the resolution.

    Parameters
    ----------
    fpath : str | pathlib.Path | io.BytesIO
        The file path or an in-memory byte stream to read from.
    resolution : int
        The bin size used to convert genomic coordinates into bin ids.
        Must be a positive integer.
    region1 : str
        Chromosome name matched against the ``chr1`` column.
    region2 : str | None, optional
        Chromosome name matched against the ``chr2`` column. Falls back to
        ``region1`` when None (or empty).
    encoding : str, optional
        The file encoding passed to ``pandas.read_csv``.
    output_format : DataStructure, optional
        ``DataStructure.DF`` returns the binned DataFrame,
        ``DataStructure.COO`` returns the result of ``to_coo_matrix`` on that
        DataFrame, and ``DataStructure.RCV`` returns the
        ``(row, col, data)`` attributes of that matrix as a tuple.
    **kwargs :
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    t.Any
        A DataFrame, a COO matrix, or a ``(row, col, data)`` tuple, depending
        on ``output_format``.

    Raises
    ------
    LoaderError
        If ``resolution`` is not positive or ``output_format`` is not one of
        the supported formats.
    """
    # Reject bad arguments before touching the file: a non-positive resolution
    # would produce a zero/negative bin divisor and meaningless bin ids.
    if resolution <= 0:
        raise LoaderError(
            f"Resolution must be a positive integer, got {resolution}!"
        )

    supported_formats = (DataStructure.DF, DataStructure.COO, DataStructure.RCV)
    if output_format not in supported_formats:
        raise LoaderError(f"Invalid output format: {output_format}!")

    if isinstance(fpath, pathlib.Path):
        fpath = str(fpath)

    chromosome2 = region2 or region1

    df = pd.read_csv(
        fpath,
        sep=DEFAULT_SEPARATOR,
        names=GINTERACTIONS_COLUMNS,
        encoding=encoding,
        # Chromosome names such as "1" or "22" would otherwise be inferred as
        # integers and never compare equal to the string region names below.
        dtype={"chr1": str, "chr2": str},
    )

    mask = (df["chr1"] == region1) & (df["chr2"] == chromosome2)
    filtered_df = df[mask].copy()

    # (start + end) // (2 * resolution) == floor(midpoint / resolution)
    bin_divisor = 2 * resolution
    row_ids = (filtered_df["start1"] + filtered_df["end1"]) // bin_divisor
    col_ids = (filtered_df["start2"] + filtered_df["end2"]) // bin_divisor

    cm_df = pd.DataFrame(
        {
            DataFrameSpecs.ROW_IDS: row_ids,
            DataFrameSpecs.COL_IDS: col_ids,
            DataFrameSpecs.COUNTS: filtered_df["counts"],
        }
    )

    if output_format == DataStructure.DF:
        return cm_df

    coo_matrix: sp.coo_matrix = to_coo_matrix(cm_df)
    if output_format == DataStructure.COO:
        return coo_matrix
    elif output_format == DataStructure.RCV:
        return (coo_matrix.row, coo_matrix.col, coo_matrix.data)

    # Defensive guard; the up-front validation should make this unreachable.
    raise LoaderError(f"Invalid output format: {output_format}!")

@type_checked(config=dict(arbitrary_types_allowed=True))
def load_ginteractions(
    fpath: str | pathlib.Path | io.BytesIO,
    resolution: int,
    region1: str,
    region2: str | None = None,
    encoding: str = "utf-8",
    output_format: DataStructure = DataStructure.DF,
    **kwargs,
) -> ContactMatrix:
    """
    Loads and processes data from a GInteractions-like tabular file lazily.

    This function does not read the file itself. It bundles its arguments
    into ``loader_kwargs`` and hands them, together with
    ``_load_ginteractions_data`` as ``loader_func``, to ``ContactMatrix``.
    NOTE(review): when the loader is actually invoked is decided by
    ``ContactMatrix`` -- presumably on first data access; confirm there.

    Parameters
    ----------
    fpath : str | pathlib.Path | io.BytesIO
        The file path or an in-memory byte stream to read from.
    resolution : int
        The resolution for binning the genomic coordinates.
    region1 : str
        The first chromosome to include in the output (e.g., 'chr1').
    region2 : str | None, optional
        The second chromosome. If None, it defaults to `region1` for
        intra-chromosomal interactions.
    encoding : str, optional
        The file encoding to use when reading the file.
    output_format : DataStructure, optional
        The desired output format, forwarded to the loader function.
    **kwargs :
        Extra keyword arguments; merged into the loader keyword arguments.

    Returns
    -------
    ContactMatrix
        A contact matrix for ``region1`` / ``region2 or region1`` at the
        given resolution, tagged with ``metadata={"format": "ginteractions"}``.
    """
    loader_kwargs = {
        "fpath": fpath,
        "resolution": resolution,
        "region1": region1,
        "region2": region2,
        "encoding": encoding,
        "output_format": output_format,
        **kwargs,
    }

    return ContactMatrix(
        chromosome1=region1,
        chromosome2=region2 or region1,
        resolution=resolution,
        loader_func=_load_ginteractions_data,
        loader_kwargs=loader_kwargs,
        metadata={"format": "ginteractions"},
    )