from __future__ import annotations
# -*- coding: utf-8 -*-
"""
This module provides functionality to load and process genomic interaction
data stored in a GInteractions-like tabular format.
Examples
--------
"""
# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "2.1.0"
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import pathlib
import typing as t
import io
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import pandas as pd
from scipy import sparse as sp
# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..exceptions import LoaderError
from ..consts import (
DEFAULT_SEPARATOR,
GINTERACTIONS_COLUMNS,
DataStructure,
DataFrameSpecs,
)
from ..matrix import ContactMatrix
# Updated import from 'conversions' to 'converters'
from ..preprocs.converters import to_coo_matrix
from gunz_utils.validation import type_checked
# =============================================================================
# LOADER FUNCTION
# =============================================================================
@type_checked(config=dict(arbitrary_types_allowed=True))
def _load_ginteractions_data(
    fpath: str | pathlib.Path | io.BytesIO,
    resolution: int,
    region1: str,
    region2: str | None = None,
    encoding: str = "utf-8",
    output_format: DataStructure = DataStructure.DF,
    **kwargs,
) -> t.Any:
    """
    Internal function to load GInteractions data.

    Reads the table with ``pandas.read_csv``, keeps the rows whose
    ``chr1``/``chr2`` values match the requested chromosome pair, and maps
    every interval to a bin index computed as
    ``(start + end) // (2 * resolution)``.

    Parameters
    ----------
    fpath : str | pathlib.Path | io.BytesIO
        The file path or an in-memory byte stream to read from.
    resolution : int
        The resolution for binning the genomic coordinates. Must be
        positive.
    region1 : str
        The first chromosome to include in the output.
    region2 : str | None, optional
        The second chromosome. Falls back to ``region1`` when it is
        ``None`` or empty.
    encoding : str, optional
        The file encoding to use.
    output_format : DataStructure, optional
        The desired output format.
    **kwargs :
        Catches extra keyword arguments. They are accepted and ignored.

    Returns
    -------
    t.Any
        Depends on ``output_format``:

        * ``DataStructure.DF``: a ``pandas.DataFrame`` with the columns
          ``DataFrameSpecs.ROW_IDS``, ``DataFrameSpecs.COL_IDS`` and
          ``DataFrameSpecs.COUNTS``.
        * ``DataStructure.COO``: the result of ``to_coo_matrix`` applied
          to that frame.
        * ``DataStructure.RCV``: the ``(row, col, data)`` tuple taken from
          that matrix.

    Raises
    ------
    LoaderError
        If ``resolution`` is not positive, or if ``output_format`` is not
        one of the formats listed above.
    """
    # A zero or negative resolution would turn the floor division below into
    # a division by zero or yield negative bin indices, so reject it early.
    if resolution <= 0:
        raise LoaderError(
            f"Resolution must be a positive integer, got {resolution}!"
        )

    if isinstance(fpath, pathlib.Path):
        fpath = str(fpath)
    chromosome2 = region2 or region1

    df = pd.read_csv(
        fpath,
        sep=DEFAULT_SEPARATOR,
        names=GINTERACTIONS_COLUMNS,
        encoding=encoding,
        # Without an explicit dtype pandas parses purely numeric chromosome
        # names ("1", "2", ...) as integers, and the string comparison below
        # would then silently match no rows.
        dtype={"chr1": str, "chr2": str},
    )

    mask = (df["chr1"] == region1) & (df["chr2"] == chromosome2)
    # Boolean indexing already yields a new frame and nothing below mutates
    # it, so no additional copy is required.
    filtered_df = df[mask]

    # (start + end) // (2 * resolution) is the floor of the interval midpoint
    # divided by the resolution, i.e. the bin that contains the midpoint.
    bin_divisor = 2 * resolution
    row_ids = (filtered_df["start1"] + filtered_df["end1"]) // bin_divisor
    col_ids = (filtered_df["start2"] + filtered_df["end2"]) // bin_divisor

    cm_df = pd.DataFrame(
        {
            DataFrameSpecs.ROW_IDS: row_ids,
            DataFrameSpecs.COL_IDS: col_ids,
            DataFrameSpecs.COUNTS: filtered_df["counts"],
        }
    )
    if output_format == DataStructure.DF:
        return cm_df

    # Both remaining formats are derived from the sparse representation.
    coo_matrix: sp.coo_matrix = to_coo_matrix(cm_df)
    if output_format == DataStructure.COO:
        return coo_matrix
    if output_format == DataStructure.RCV:
        return (coo_matrix.row, coo_matrix.col, coo_matrix.data)

    raise LoaderError(f"Invalid output format: {output_format}!")
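# =============================================================================
# PUBLIC LOADER
# =============================================================================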
@type_checked(config=dict(arbitrary_types_allowed=True))
def load_ginteractions(
    fpath: str | pathlib.Path | io.BytesIO,
    resolution: int,
    region1: str,
    region2: str | None = None,
    encoding: str = "utf-8",
    output_format: DataStructure = DataStructure.DF,
    **kwargs,
) -> ContactMatrix:
    """Build a lazily loaded contact matrix for a GInteractions-like file.

    This function does not read the file itself. It bundles its arguments
    and passes them, together with ``_load_ginteractions_data``, to
    ``ContactMatrix``.

    Parameters
    ----------
    fpath : str | pathlib.Path | io.BytesIO
        Path of the file, or an in-memory byte stream, holding the table.
    resolution : int
        Resolution used to bin the genomic coordinates.
    region1 : str
        First chromosome of the requested pair (e.g., 'chr1').
    region2 : str | None, optional
        Second chromosome of the requested pair. When it is None (or
        empty), `region1` is used instead, which selects
        intra-chromosomal interactions.
    encoding : str, optional
        Text encoding applied when the file is read.
    output_format : DataStructure, optional
        Output format forwarded to the loader function.
    **kwargs :
        Extra keyword arguments; forwarded unchanged to the loader function.

    Returns
    -------
    ContactMatrix
        Object constructed with the chromosome pair, the resolution, the
        loader function and its keyword arguments, and the metadata
        ``{"format": "ginteractions"}``.
    """
    # The named parameters come first, then any extra keyword arguments, so
    # the loader receives exactly what this function was called with.
    forwarded = dict(
        fpath=fpath,
        resolution=resolution,
        region1=region1,
        region2=region2,
        encoding=encoding,
        output_format=output_format,
    )
    forwarded.update(kwargs)

    # An unset second region means the pair is (region1, region1).
    second_chromosome = region2 if region2 else region1

    return ContactMatrix(
        chromosome1=region1,
        chromosome2=second_chromosome,
        resolution=resolution,
        loader_func=_load_ginteractions_data,
        loader_kwargs=forwarded,
        metadata={"format": "ginteractions"},
    )