# -*- coding: utf-8 -*-
"""
Defines shared constants, enumerations, and data structures for the library.
This module centralizes common values used throughout the application,
including DataFrame column names, data types, supported file formats,
and standard genomic build information. Using this module ensures consistency
and simplifies maintenance.
Examples
--------
"""
# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.1"
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import logging
import typing as t
from dataclasses import dataclass
from types import MappingProxyType
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import numpy as np
# =============================================================================
# LOCAL IMPORTS
# =============================================================================
from gunz_utils.enums import BaseStrEnum
# =============================================================================
# LOGGING CONSTANTS
# =============================================================================
# Read-only lookup from a lower-case level name to the numeric constant of the
# same (upper-case) name in the standard ``logging`` module. The tuple below
# fixes the iteration order, from most to least severe.
LOG_LEVELS: t.Mapping[str, int] = MappingProxyType({
    level_name: getattr(logging, level_name.upper())
    for level_name in ("critical", "error", "warning", "info", "debug")
})
# =============================================================================
# ENUMERATIONS
# =============================================================================
class GenomeBuild(BaseStrEnum):
    """Enumeration for standard genome builds.

    Each member's value is the lower-case assembly identifier.

    Examples
    --------
    >>> GenomeBuild.HG19.value
    'hg19'
    """

    # Mouse assemblies.
    MM9 = "mm9"
    MM10 = "mm10"
    # Human assemblies.
    HG19 = "hg19"
    HG38 = "hg38"
class Counts(BaseStrEnum):
    """Enumeration for different types of interaction counts.

    Examples
    --------
    >>> Counts.OE.value
    'oe'
    """

    OBSERVED = "observed"
    # NOTE(review): "oe" presumably stands for observed/expected — confirm.
    OE = "oe"
    EXPECTED = "expected"
class Balancing(BaseStrEnum):
    """Enumeration for matrix balancing (normalization) methods.

    Unlike most other enumerations in this module, the member values are
    upper-case strings.

    Examples
    --------
    >>> Balancing.VC_SQRT.value
    'VC_SQRT'
    """

    # NOTE(review): the identifiers look like Juicer/straw normalization names
    # (KR = Knight-Ruiz, VC = vanilla coverage) — confirm against the loaders.
    NONE = "NONE"
    KR = "KR"
    VC = "VC"
    VC_SQRT = "VC_SQRT"
class BpFrag(BaseStrEnum):
    """Enumeration for binning units (Base Pairs vs. Fragments).

    Examples
    --------
    >>> BpFrag.BP.value
    'BP'
    """

    BP = "BP"
    FRAG = "FRAG"
class Backend(BaseStrEnum):
    """Enumeration for interaction matrix loader backends.

    Each member's value is the lower-case backend name.

    Examples
    --------
    >>> Backend.COOLER.value
    'cooler'
    """

    COOLER = "cooler"
    HICTK = "hictk"
    STRAW = "straw"
    HICSTRAW = "hicstraw"
class DataStructure(BaseStrEnum):
    """Enumeration for in-memory data representations.

    Examples
    --------
    >>> DataStructure.COO.value
    'coo'
    """

    # NOTE(review): presumably COO = sparse coordinate matrix, DF = DataFrame,
    # RCV = row/column/value arrays, RC = row/column arrays — confirm against
    # the code that consumes these members.
    COO = "coo"
    DF = "df"
    RCV = "rcv"
    RC = "rc"


# Alias for convenience, maintaining compatibility with other modules
DS = DataStructure
# =============================================================================
# DATAFRAME SPECIFICATIONS
# =============================================================================
@dataclass(frozen=True)
class _DataFrameSpecs:
    """A frozen dataclass defining DataFrame column names and data types.

    The fields hold the individual column names and NumPy dtypes. The
    read-only properties derive column-name lists and ``{column: dtype}``
    dictionaries from those fields. Every property builds a new ``list`` or
    ``dict`` on each access, so mutating a returned object does not affect
    the instance or later accesses.

    Attributes
    ----------
    ROW_IDS, COL_IDS : str
        Names of the row-ID and column-ID columns.
    COUNTS : str
        Name of the counts column used by the COO-style layouts.
    RAW_COUNTS, NORM_COUNTS : str
        Names of the raw-count and normalized-count columns.
    DIST : str
        Name of the distance column (presumably genomic distance; this module
        does not use it in any derived list).
    INDICES_DTYPE : type
        Dtype applied to ``ROW_IDS`` and ``COL_IDS``.
    RAW_COUNTS_DTYPE, NORM_COUNTS_DTYPE : type
        Dtypes applied to raw and normalized count columns.

    Examples
    --------
    >>> DataFrameSpecs.COO_COLUMN_NAMES
    ['row_ids', 'col_ids', 'counts']
    >>> list(DataFrameSpecs.MCOO_DTYPES)
    ['row_ids', 'col_ids', 'counts', 'raw_counts']
    """

    # --- Column Names ---
    ROW_IDS: str
    COL_IDS: str
    COUNTS: str
    RAW_COUNTS: str
    NORM_COUNTS: str
    DIST: str

    # --- Data Types ---
    INDICES_DTYPE: t.Type[np.uint32]
    RAW_COUNTS_DTYPE: t.Type[np.uint32]
    NORM_COUNTS_DTYPE: t.Type[np.float64]

    # --- Column Name Lists ---
    @property
    def COO_COLUMN_NAMES(self) -> t.List[str]:
        """Column names of the COO layout.

        Returns
        -------
        list of str
            ``[ROW_IDS, COL_IDS, COUNTS]``, as a new list.
        """
        return [self.ROW_IDS, self.COL_IDS, self.COUNTS]

    @property
    def MCOO_COLUMN_NAMES(self) -> t.List[str]:
        """Column names of the MCOO layout (COO plus a raw-counts column).

        Returns
        -------
        list of str
            ``[ROW_IDS, COL_IDS, COUNTS, RAW_COUNTS]``, as a new list.
        """
        return [self.ROW_IDS, self.COL_IDS, self.COUNTS, self.RAW_COUNTS]

    # --- DType Dictionaries ---
    @property
    def RC_DTYPES(self) -> t.Dict[str, t.Type[np.uint32]]:
        """Dtypes of the row/col ID columns, both set to ``INDICES_DTYPE``.

        Returns
        -------
        dict
            ``{ROW_IDS: INDICES_DTYPE, COL_IDS: INDICES_DTYPE}``.
        """
        return {self.ROW_IDS: self.INDICES_DTYPE, self.COL_IDS: self.INDICES_DTYPE}

    @property
    def RAW_COO_DTYPES(self) -> t.Dict[str, t.Any]:
        """Dtypes of a COO layout whose counts column holds raw counts.

        Returns
        -------
        dict
            ``RC_DTYPES`` extended with ``COUNTS: RAW_COUNTS_DTYPE``.
        """
        return {**self.RC_DTYPES, self.COUNTS: self.RAW_COUNTS_DTYPE}

    @property
    def NORM_COO_DTYPES(self) -> t.Dict[str, t.Any]:
        """Dtypes of a COO layout whose counts column holds normalized counts.

        Returns
        -------
        dict
            ``RC_DTYPES`` extended with ``COUNTS: NORM_COUNTS_DTYPE``.
        """
        return {**self.RC_DTYPES, self.COUNTS: self.NORM_COUNTS_DTYPE}

    @property
    def MCOO_DTYPES(self) -> t.Dict[str, t.Any]:
        """Dtypes of the MCOO layout.

        In this layout the ``COUNTS`` column uses the normalized dtype, and
        the raw counts sit in their own ``RAW_COUNTS`` column.

        Returns
        -------
        dict
            ``RC_DTYPES`` extended with ``COUNTS: NORM_COUNTS_DTYPE`` and
            ``RAW_COUNTS: RAW_COUNTS_DTYPE``, in that order.
        """
        return {
            **self.RC_DTYPES,
            self.COUNTS: self.NORM_COUNTS_DTYPE,
            self.RAW_COUNTS: self.RAW_COUNTS_DTYPE,
        }


# Create a single, immutable instance to be used as the constant.
DataFrameSpecs = _DataFrameSpecs(
    ROW_IDS="row_ids",
    COL_IDS="col_ids",
    COUNTS="counts",
    RAW_COUNTS="raw_counts",
    NORM_COUNTS="norm_counts",
    DIST="dists",
    INDICES_DTYPE=np.uint32,
    RAW_COUNTS_DTYPE=np.uint32,
    NORM_COUNTS_DTYPE=np.float64,
)
# Column order of a GInteractions-style table: two genomic intervals followed
# by the interaction count.
GINTERACTIONS_COLUMNS: t.Final[t.List[str]] = [
    # first interval
    "chr1",
    "start1",
    "end1",
    # second interval
    "chr2",
    "start2",
    "end2",
    # value
    "counts",
]

# Column order of a narrowPeak table.
NARROWPEAK_COLUMNS: t.Final[t.List[str]] = [
    "chromosome",
    "start",
    "end",
    "name",
    "score",
    "strand",
    "signalValue",
    "pValue",
    "qValue",
    "peak",
]

# Column order of an Arrowhead domain-list table.
ARROWHEAD_COLUMNS: t.Final[t.List[str]] = [
    # coordinates
    "chr1",
    "x1",
    "x2",
    "chr2",
    "y1",
    "y2",
    # annotation and statistics
    "color",
    "observed",
    "expectedBL",
    "expectedDonut",
    "expectedVertical",
    "expectedHorizontal",
    "binSize",
    "fdr",
    "stars",
]
# Default field separator for delimited text files: a single tab character.
DEF_SEP: t.Final[str] = "\t"

# Names of the supported compression schemes, stored without a leading dot.
SUPPORTED_COMPRESSION_SCHEMES: t.Final[frozenset[str]] = frozenset({
    "gz", "bz2", "zip", "xz", "zst"
})

# Read-only default keyword arguments for reading delimited files.
# The annotation is ``Mapping`` (not ``Dict``) because the value is a
# ``MappingProxyType``, which does not support item assignment.
# NOTE(review): the keys look like ``pandas.read_csv`` parameters — confirm
# against the call sites.
CSV_SPEC: t.Final[t.Mapping[str, t.Any]] = MappingProxyType({
    "sep": DEF_SEP,
    "index_col": None,
    "engine": "c",
})

# Long-form name for the same separator; aliased to DEF_SEP so the two
# constants cannot drift apart.
DEFAULT_SEPARATOR: t.Final[str] = DEF_SEP
# =============================================================================
# GENOMIC BUILD CONSTANTS (using immutable mappings)
# =============================================================================
# Chromosome lengths in base pairs for the mm9 mouse assembly, exposed as a
# read-only mapping (autosomes chr1-chr19, then chrX, chrY, chrM).
MM9_CHROM_LENGTHS: t.Mapping[str, int] = MappingProxyType({
    "chr1": 197_195_432,
    "chr2": 181_748_087,
    "chr3": 159_599_783,
    "chr4": 155_630_120,
    "chr5": 152_537_259,
    "chr6": 149_517_037,
    "chr7": 152_524_553,
    "chr8": 131_738_871,
    "chr9": 124_076_172,
    "chr10": 129_993_255,
    "chr11": 121_843_856,
    "chr12": 121_257_530,
    "chr13": 120_284_312,
    "chr14": 125_194_864,
    "chr15": 103_494_974,
    "chr16": 98_319_150,
    "chr17": 95_272_651,
    "chr18": 90_772_031,
    "chr19": 61_342_430,
    "chrX": 166_650_296,
    "chrY": 15_902_555,
    "chrM": 16_299,
})
# Chromosome lengths in base pairs for the mm10 mouse assembly, exposed as a
# read-only mapping (autosomes chr1-chr19, then chrX, chrY, chrM).
MM10_CHROM_LENGTHS: t.Mapping[str, int] = MappingProxyType({
    "chr1": 195_471_971,
    "chr2": 182_113_224,
    "chr3": 160_039_680,
    "chr4": 156_508_116,
    "chr5": 151_834_684,
    "chr6": 149_736_546,
    "chr7": 145_441_459,
    "chr8": 129_401_213,
    "chr9": 124_595_110,
    "chr10": 130_694_993,
    "chr11": 122_082_543,
    "chr12": 120_129_022,
    "chr13": 120_421_639,
    "chr14": 124_902_244,
    "chr15": 104_043_685,
    "chr16": 98_207_768,
    "chr17": 94_987_271,
    "chr18": 90_702_639,
    "chr19": 61_431_566,
    "chrX": 171_031_299,
    "chrY": 91_744_698,
    "chrM": 16_299,
})
# Chromosome lengths in base pairs for the hg19 (GRCh37) human assembly,
# exposed as a read-only mapping (autosomes chr1-chr22, then chrX, chrY, chrM).
# Values follow the UCSC ``hg19.chrom.sizes`` table. The chr5, chr6, chr7 and
# chr15 entries were corrected: they previously held lengths from other
# assemblies (hg18/hg38) or a digit typo.
HG19_CHROM_LENGTHS: t.Mapping[str, int] = MappingProxyType({
    "chr1": 249_250_621, "chr2": 243_199_373, "chr3": 198_022_430,
    "chr4": 191_154_276, "chr5": 180_915_260, "chr6": 171_115_067,
    "chr7": 159_138_663, "chr8": 146_364_022, "chr9": 141_213_431,
    "chr10": 135_534_747, "chr11": 135_006_516, "chr12": 133_851_895,
    "chr13": 115_169_878, "chr14": 107_349_540, "chr15": 102_531_392,
    "chr16": 90_354_753, "chr17": 81_195_210, "chr18": 78_077_248,
    "chr19": 59_128_983, "chr20": 63_025_520, "chr21": 48_129_895,
    "chr22": 51_304_566, "chrX": 155_270_560, "chrY": 59_373_566,
    "chrM": 16_571,
})
# Chromosome lengths in base pairs for the hg38 (GRCh38) human assembly,
# following the UCSC ``hg38.chrom.sizes`` table (primary chromosomes only).
HG38_CHROM_LENGTHS: t.Mapping[str, int] = MappingProxyType({
    "chr1": 248_956_422, "chr2": 242_193_529, "chr3": 198_295_559,
    "chr4": 190_214_555, "chr5": 181_538_259, "chr6": 170_805_979,
    "chr7": 159_345_973, "chr8": 145_138_636, "chr9": 138_394_717,
    "chr10": 133_797_422, "chr11": 135_086_622, "chr12": 133_275_309,
    "chr13": 114_364_328, "chr14": 107_043_718, "chr15": 101_991_189,
    "chr16": 90_338_345, "chr17": 83_257_441, "chr18": 80_373_285,
    "chr19": 58_617_616, "chr20": 64_444_167, "chr21": 46_709_983,
    "chr22": 50_818_468, "chrX": 156_040_895, "chrY": 57_227_415,
    "chrM": 16_569,
})

# Read-only registry from genome build name to its chromosome-length table.
# The keys are the same strings as the ``GenomeBuild`` member values; "hg38"
# is included so that every declared build has an entry.
AVAIL_GENOME_BUILDS: t.Mapping[str, t.Mapping[str, int]] = MappingProxyType({
    "mm9": MM9_CHROM_LENGTHS,
    "mm10": MM10_CHROM_LENGTHS,
    "hg19": HG19_CHROM_LENGTHS,
    "hg38": HG38_CHROM_LENGTHS,
})