# Source code for gunz_cm.consts

# -*- coding: utf-8 -*-
"""
Defines shared constants, enumerations, and data structures for the library.

This module centralizes common values used throughout the application,
including DataFrame column names, data types, supported file formats,
and standard genomic build information. Using this module ensures consistency
and simplifies maintenance.


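Examples
--------
>>> from gunz_cm.consts import DataFrameSpecs
>>> DataFrameSpecs.COO_COLUMN_NAMES
['row_ids', 'col_ids', 'counts']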
"""

# =============================================================================
# METADATA
# =============================================================================
# Package metadata exposed as module-level dunder attributes.
# NOTE(review): keep these as plain string-literal assignments; packaging and
# documentation tools often read such values statically -- confirm against the
# project's build configuration before restructuring them.
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.1"

# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import logging
import typing as t
from dataclasses import dataclass
from types import MappingProxyType

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import numpy as np

# =============================================================================
# LOCAL IMPORTS
# =============================================================================
from gunz_utils.enums import BaseStrEnum

# =============================================================================
# LOGGING CONSTANTS
# =============================================================================
# Read-only lookup from a lowercase level name to the matching numeric level
# of the standard ``logging`` module, ordered from most to least severe.
LOG_LEVELS: t.Mapping[str, int] = MappingProxyType(
    dict(
        critical=logging.CRITICAL,
        error=logging.ERROR,
        warning=logging.WARNING,
        info=logging.INFO,
        debug=logging.DEBUG,
    )
)

# =============================================================================
# ENUMERATIONS
# =============================================================================

class GenomeBuild(BaseStrEnum):
    """Enumeration for standard genome builds.

    Each member's value is the lowercase build name.

    Members
    -------
    MM9 : ``"mm9"``
    MM10 : ``"mm10"``
    HG19 : ``"hg19"``
    HG38 : ``"hg38"``
    """

    MM9 = "mm9"
    MM10 = "mm10"
    HG19 = "hg19"
    HG38 = "hg38"
class Counts(BaseStrEnum):
    """Enumeration for different types of interaction counts.

    Members
    -------
    OBSERVED : ``"observed"``
    OE : ``"oe"``
        NOTE(review): presumably the observed/expected ratio -- confirm
        against the loaders that consume this value.
    EXPECTED : ``"expected"``
    """

    OBSERVED = "observed"
    OE = "oe"
    EXPECTED = "expected"
class Balancing(BaseStrEnum):
    """Enumeration for matrix balancing (normalization) methods.

    Unlike most enumerations in this module, the member values are
    upper-case strings.

    Members
    -------
    NONE : ``"NONE"``
        The literal string ``"NONE"``, not Python's ``None``.
    KR : ``"KR"``
    VC : ``"VC"``
    VC_SQRT : ``"VC_SQRT"``

    Notes
    -----
    NOTE(review): KR / VC / VC_SQRT presumably stand for Knight-Ruiz,
    vanilla coverage and square-root vanilla coverage -- confirm against the
    backends that receive these strings.
    """

    NONE = "NONE"
    KR = "KR"
    VC = "VC"
    VC_SQRT = "VC_SQRT"
class BpFrag(BaseStrEnum):
    """Enumeration for binning units (Base Pairs vs. Fragments).

    Members
    -------
    BP : ``"BP"``
        Bins measured in base pairs.
    FRAG : ``"FRAG"``
        Bins measured in fragments.
    """

    BP = "BP"
    FRAG = "FRAG"
class Backend(BaseStrEnum):
    """Enumeration for interaction matrix loader backends.

    Members
    -------
    COOLER : ``"cooler"``
    HICTK : ``"hictk"``
    STRAW : ``"straw"``
    HICSTRAW : ``"hicstraw"``
    """

    COOLER = "cooler"
    HICTK = "hictk"
    STRAW = "straw"
    HICSTRAW = "hicstraw"
class Format(BaseStrEnum):
    """
    Enumeration for supported file formats.

    Uses BaseStrEnum for case-insensitivity and aliases.

    Notes
    -----
    ``__ALIASES__`` maps shorthand names to member *values*: both ``"cool"``
    and ``"mcool"`` point at ``"cooler"``. How the table is consulted is
    decided by ``BaseStrEnum``, which is defined outside this module.

    ``MEMMAP`` is the only member whose value (``"npdat"``) differs from its
    lowercase name.
    """

    #? Define aliases for shorthand lookup
    __ALIASES__: t.ClassVar[dict[str, str]] = {
        "cool": "cooler",
        "mcool": "cooler",
        #? "npdat" is not needed here because MEMMAP value is already "npdat"
    }

    HIC = "hic"
    COOLER = "cooler"
    NPY = "npy"
    PICKLE = "pickle"
    CSV = "csv"
    MCSV = "mcsv"
    TSV = "tsv"
    COO = "coo"
    MCOO = "mcoo"
    MEMMAP = "npdat"
    GINTERACTIONS = "ginteractions"
class DataStructure(BaseStrEnum):
    """Enumeration for in-memory data representations.

    Members
    -------
    COO : ``"coo"``
    DF : ``"df"``
    RCV : ``"rcv"``
    RC : ``"rc"``

    Notes
    -----
    NOTE(review): the names presumably mean coordinate-format sparse matrix,
    DataFrame, (row, col, value) and (row, col) -- confirm against the
    converters that branch on this enum.
    """

    COO = "coo"
    DF = "df"
    RCV = "rcv"
    RC = "rc"
# Alias for convenience, maintaining compatibility with other modules
DS = DataStructure

# =============================================================================
# DATAFRAME SPECIFICATIONS
# =============================================================================


@dataclass(frozen=True)
class _DataFrameSpecs:
    """A frozen dataclass defining DataFrame column names and data types.

    The fields hold the column names and NumPy dtypes; the properties derive
    the column-name lists and ``{column name: dtype}`` dictionaries from
    them. Every property builds a new list or dict on each access, so a
    caller may modify the returned object without affecting later accesses.

    Examples
    --------
    >>> DataFrameSpecs.COO_COLUMN_NAMES
    ['row_ids', 'col_ids', 'counts']
    >>> sorted(DataFrameSpecs.RC_DTYPES)
    ['col_ids', 'row_ids']
    """

    # --- Column Names ---
    ROW_IDS: str
    COL_IDS: str
    COUNTS: str
    RAW_COUNTS: str
    NORM_COUNTS: str
    DIST: str

    # --- Data Types ---
    INDICES_DTYPE: t.Type[np.uint32]
    RAW_COUNTS_DTYPE: t.Type[np.uint32]
    NORM_COUNTS_DTYPE: t.Type[np.float64]

    # --- Column Name Lists ---
    @property
    def COO_COLUMN_NAMES(self) -> t.List[str]:
        """Column names ``[ROW_IDS, COL_IDS, COUNTS]``, in that order.

        Returns
        -------
        list of str
            A new list on every access.
        """
        return [self.ROW_IDS, self.COL_IDS, self.COUNTS]

    @property
    def MCOO_COLUMN_NAMES(self) -> t.List[str]:
        """Column names ``[ROW_IDS, COL_IDS, COUNTS, RAW_COUNTS]``, in that order.

        Returns
        -------
        list of str
            A new list on every access.
        """
        return [self.ROW_IDS, self.COL_IDS, self.COUNTS, self.RAW_COUNTS]

    # --- DType Dictionaries ---
    @property
    def RC_DTYPES(self) -> t.Dict[str, t.Type[np.uint32]]:
        """Dtypes of the row/col ID columns; both use ``INDICES_DTYPE``.

        Returns
        -------
        dict
            ``{ROW_IDS: INDICES_DTYPE, COL_IDS: INDICES_DTYPE}``.
        """
        return {self.ROW_IDS: self.INDICES_DTYPE, self.COL_IDS: self.INDICES_DTYPE}

    @property
    def RAW_COO_DTYPES(self) -> t.Dict[str, t.Any]:
        """``RC_DTYPES`` plus ``COUNTS`` typed as ``RAW_COUNTS_DTYPE``.

        Returns
        -------
        dict
            Mapping of column name to dtype for ROW_IDS, COL_IDS and COUNTS.
        """
        return {**self.RC_DTYPES, self.COUNTS: self.RAW_COUNTS_DTYPE}

    @property
    def NORM_COO_DTYPES(self) -> t.Dict[str, t.Any]:
        """``RC_DTYPES`` plus ``COUNTS`` typed as ``NORM_COUNTS_DTYPE``.

        Returns
        -------
        dict
            Mapping of column name to dtype for ROW_IDS, COL_IDS and COUNTS.
        """
        return {**self.RC_DTYPES, self.COUNTS: self.NORM_COUNTS_DTYPE}

    @property
    def MCOO_DTYPES(self) -> t.Dict[str, t.Any]:
        """Dtypes for the four ``MCOO_COLUMN_NAMES`` columns.

        ``COUNTS`` is typed as ``NORM_COUNTS_DTYPE`` and ``RAW_COUNTS`` as
        ``RAW_COUNTS_DTYPE``.

        Returns
        -------
        dict
            Mapping of column name to dtype for ROW_IDS, COL_IDS, COUNTS and
            RAW_COUNTS.
        """
        return {
            **self.RC_DTYPES,
            self.COUNTS: self.NORM_COUNTS_DTYPE,
            self.RAW_COUNTS: self.RAW_COUNTS_DTYPE,
        }


# Create a single, immutable instance to be used as the constant.
DataFrameSpecs = _DataFrameSpecs(
    ROW_IDS="row_ids",
    COL_IDS="col_ids",
    COUNTS="counts",
    RAW_COUNTS="raw_counts",
    NORM_COUNTS="norm_counts",
    DIST="dists",
    INDICES_DTYPE=np.uint32,
    RAW_COUNTS_DTYPE=np.uint32,
    NORM_COUNTS_DTYPE=np.float64,
)

# Column layouts of the supported tabular formats. These stay plain lists so
# existing callers that concatenate or pass them on as lists keep working.
GINTERACTIONS_COLUMNS: t.Final[t.List[str]] = [
    "chr1", "start1", "end1", "chr2", "start2", "end2", "counts",
]

NARROWPEAK_COLUMNS: t.Final[t.List[str]] = [
    "chromosome", "start", "end", "name", "score", "strand",
    "signalValue", "pValue", "qValue", "peak",
]

ARROWHEAD_COLUMNS: t.Final[t.List[str]] = [
    "chr1", "x1", "x2", "chr2", "y1", "y2", "color", "observed",
    "expectedBL", "expectedDonut", "expectedVertical", "expectedHorizontal",
    "binSize", "fdr", "stars",
]

DEF_SEP: t.Final[str] = "\t"

SUPPORTED_COMPRESSION_SCHEMES: t.Final[frozenset[str]] = frozenset({
    "gz", "bz2", "zip", "xz", "zst",
})

# The value is a read-only mapping proxy, so it is annotated as a Mapping
# rather than a Dict; it can still be unpacked with ``**CSV_SPEC``.
CSV_SPEC: t.Final[t.Mapping[str, t.Any]] = MappingProxyType(dict(
    sep=DEF_SEP,
    index_col=None,
    engine="c",
))

# Longer spelling of DEF_SEP; aliased so the two names cannot drift apart.
DEFAULT_SEPARATOR: t.Final[str] = DEF_SEP

# =============================================================================
# GENOMIC BUILD CONSTANTS (using immutable mappings)
# =============================================================================
MM9_CHROM_LENGTHS: t.Mapping[str, int] = MappingProxyType({
    "chr1": 197_195_432,
    "chr2": 181_748_087,
    "chr3": 159_599_783,
    "chr4": 155_630_120,
    "chr5": 152_537_259,
    "chr6": 149_517_037,
    "chr7": 152_524_553,
    "chr8": 131_738_871,
    "chr9": 124_076_172,
    "chr10": 129_993_255,
    "chr11": 121_843_856,
    "chr12": 121_257_530,
    "chr13": 120_284_312,
    "chr14": 125_194_864,
    "chr15": 103_494_974,
    "chr16": 98_319_150,
    "chr17": 95_272_651,
    "chr18": 90_772_031,
    "chr19": 61_342_430,
    "chrX": 166_650_296,
    "chrY": 15_902_555,
    "chrM": 16_299,
})

MM10_CHROM_LENGTHS: t.Mapping[str, int] = MappingProxyType({
    "chr1": 195_471_971,
    "chr2": 182_113_224,
    "chr3": 160_039_680,
    "chr4": 156_508_116,
    "chr5": 151_834_684,
    "chr6": 149_736_546,
    "chr7": 145_441_459,
    "chr8": 129_401_213,
    "chr9": 124_595_110,
    "chr10": 130_694_993,
    "chr11": 122_082_543,
    "chr12": 120_129_022,
    "chr13": 120_421_639,
    "chr14": 124_902_244,
    "chr15": 104_043_685,
    "chr16": 98_207_768,
    "chr17": 94_987_271,
    "chr18": 90_702_639,
    "chr19": 61_431_566,
    "chrX": 171_031_299,
    "chrY": 91_744_698,
    "chrM": 16_299,
})

# Values follow UCSC hg19.chrom.sizes. chr5, chr6, chr7 and chr15 previously
# held lengths from other assemblies (or a digit typo) and were corrected.
HG19_CHROM_LENGTHS: t.Mapping[str, int] = MappingProxyType({
    "chr1": 249_250_621,
    "chr2": 243_199_373,
    "chr3": 198_022_430,
    "chr4": 191_154_276,
    "chr5": 180_915_260,
    "chr6": 171_115_067,
    "chr7": 159_138_663,
    "chr8": 146_364_022,
    "chr9": 141_213_431,
    "chr10": 135_534_747,
    "chr11": 135_006_516,
    "chr12": 133_851_895,
    "chr13": 115_169_878,
    "chr14": 107_349_540,
    "chr15": 102_531_392,
    "chr16": 90_354_753,
    "chr17": 81_195_210,
    "chr18": 78_077_248,
    "chr19": 59_128_983,
    "chr20": 63_025_520,
    "chr21": 48_129_895,
    "chr22": 51_304_566,
    "chrX": 155_270_560,
    "chrY": 59_373_566,
    "chrM": 16_571,
})

# Values follow UCSC hg38.chrom.sizes (primary chromosomes only). Added so
# that every GenomeBuild member has an entry in AVAIL_GENOME_BUILDS.
HG38_CHROM_LENGTHS: t.Mapping[str, int] = MappingProxyType({
    "chr1": 248_956_422,
    "chr2": 242_193_529,
    "chr3": 198_295_559,
    "chr4": 190_214_555,
    "chr5": 181_538_259,
    "chr6": 170_805_979,
    "chr7": 159_345_973,
    "chr8": 145_138_636,
    "chr9": 138_394_717,
    "chr10": 133_797_422,
    "chr11": 135_086_622,
    "chr12": 133_275_309,
    "chr13": 114_364_328,
    "chr14": 107_043_718,
    "chr15": 101_991_189,
    "chr16": 90_338_345,
    "chr17": 83_257_441,
    "chr18": 80_373_285,
    "chr19": 58_617_616,
    "chr20": 64_444_167,
    "chr21": 46_709_983,
    "chr22": 50_818_468,
    "chrX": 156_040_895,
    "chrY": 57_227_415,
    "chrM": 16_569,
})

# Keys are the build names used as GenomeBuild member values.
AVAIL_GENOME_BUILDS: t.Mapping[str, t.Mapping[str, int]] = MappingProxyType({
    "mm9": MM9_CHROM_LENGTHS,
    "mm10": MM10_CHROM_LENGTHS,
    "hg19": HG19_CHROM_LENGTHS,
    "hg38": HG38_CHROM_LENGTHS,
})