Source code for gunz_cm.compressions.bsc_cmc_encoder

"""
BSC + CMC Transforms Encoder for GZCM v3 compression.

Applies CMC domain-specific transforms (diagonal transform, binarization)
before BSC entropy coding. Combines BSC's speed with CMC's structured transforms.

Examples
--------
"""

__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"

import sys
import pathlib
import tempfile
import os
import numpy as np

# Absolute, symlink-resolved location of this module; the CMC checkout is
# located relative to it.
_FILE_PATH = pathlib.Path(__file__).resolve()
# Climb five directory levels above this file (one `.parent` here plus four
# in the loop below) to reach the directory treated as the workspace root.
_WS_ROOT = _FILE_PATH.parent
for _ in range(4):
    _WS_ROOT = _WS_ROOT.parent
# Expected location of the CMC sources inside the workspace.
_CMC_PATH = _WS_ROOT / "3d_recon" / "thirdparty" / "cmc"
# Fail at import time with an explicit message when the CMC directory is
# missing, before the `import cmc.transform` below is attempted.
if not _CMC_PATH.exists():
    raise FileNotFoundError(f"CMC not found at {_CMC_PATH}")

# Put the CMC directory at the front of the module search path (only once)
# so that the import below can find the `cmc` package there.
if str(_CMC_PATH) not in sys.path:
    sys.path.insert(0, str(_CMC_PATH))

import cmc.transform  # noqa: E402

# Module-level aliases for the two CMC transforms called by
# BscCmcEncoder.encode_tile.
binarize_rc_bin_split_v2 = cmc.transform.binarize_rc_bin_split_v2
diag_transform = cmc.transform.diag_transform


class BscCmcEncoder:
    """BSC + CMC transforms encoder for contact matrix tiles.

    Applies CMC's domain-specific transforms (diagonal transform,
    binarization) to a tile and then compresses the result by invoking the
    external ``bsc`` command-line tool.

    Parameters
    ----------
    tile_size : int, default=512
        Tile size for block processing. Stored and reported by
        :meth:`get_compression_info`.
    resolution : int, default=50000
        Hi-C resolution in bp. Stored and reported by
        :meth:`get_compression_info`.
    level : int, default=3
        BSC compression level (0-9, higher = better compression).
        NOTE(review): the value is stored and reported in the header
        metadata, but it is not passed to the ``bsc`` command line by
        :meth:`encode_tile` -- confirm whether it should be.
    diag_mode : int, default=0
        Forwarded as ``mode`` to ``cmc.transform.diag_transform``.
    bsc_path : str or os.PathLike, optional
        Location of the ``bsc`` executable. Defaults to the path that was
        previously hard-coded in this class.
    lib_path : str or os.PathLike, optional
        Value placed in ``LD_LIBRARY_PATH`` for the ``bsc`` subprocess.
        Defaults to the path that was previously hard-coded in this class.
    """

    # Historical defaults, kept so that existing callers behave as before.
    _DEFAULT_BSC_PATH = "/home/adhisant/tmp/bin/bsc"
    _DEFAULT_LIB_PATH = "/home/adhisant/tmp/miniforge3/envs/gunz_cm/lib"
    # Upper bound, in seconds, on a single ``bsc`` invocation.
    _BSC_TIMEOUT_S = 30

    def __init__(
        self,
        tile_size: int = 512,
        resolution: int = 50000,
        level: int = 3,
        diag_mode: int = 0,
        *,
        bsc_path: "str | os.PathLike | None" = None,
        lib_path: "str | os.PathLike | None" = None,
    ):
        """Store the encoder parameters and prepare the subprocess environment.

        See the class docstring for the meaning of each parameter.
        """
        self.tile_size = tile_size
        self.resolution = resolution
        self.level = level
        self.diag_mode = diag_mode

        if bsc_path is None:
            bsc_path = self._DEFAULT_BSC_PATH
        if lib_path is None:
            lib_path = self._DEFAULT_LIB_PATH

        self._bsc_path = pathlib.Path(bsc_path)
        # Copy of the current environment with LD_LIBRARY_PATH replaced
        # (not extended) by the configured library directory.
        self._env = os.environ.copy()
        self._env["LD_LIBRARY_PATH"] = os.fspath(lib_path)

    def encode_tile(self, mat: np.ndarray) -> bytes:
        """Encode a single contact matrix tile.

        Parameters
        ----------
        mat : np.ndarray
            2D contact matrix tile (upper triangular).

        Returns
        -------
        bytes
            The shape of the binarized matrix serialized as ``int32``
            values, immediately followed by the ``bsc`` output.

        Raises
        ------
        RuntimeError
            If ``bsc`` exits with a non-zero return code.
        subprocess.TimeoutExpired
            If ``bsc`` does not finish within the timeout.
        """
        # Function-scope import: subprocess is not part of the module's
        # top-level import block.
        import subprocess

        mat = diag_transform(mat, mode=self.diag_mode)
        bin_mat = binarize_rc_bin_split_v2(mat, axis=0)

        shape = np.array(bin_mat.shape, dtype=np.int32).tobytes()

        # Both scratch files live in one private directory that is removed
        # on every exit path (success, bsc failure, timeout), so nothing is
        # left behind in the system temp dir.
        with tempfile.TemporaryDirectory() as tmp_dir:
            in_path = pathlib.Path(tmp_dir) / "in.dat"
            out_path = pathlib.Path(tmp_dir) / "out.dat"
            in_path.write_bytes(bin_mat.tobytes())

            result = subprocess.run(
                [
                    str(self._bsc_path),
                    "e",
                    str(in_path),
                    str(out_path),
                ],
                env=self._env,
                capture_output=True,
                timeout=self._BSC_TIMEOUT_S,
            )
            if result.returncode != 0:
                # Decode leniently so that undecodable bytes in stderr do
                # not mask the actual failure.
                message = result.stderr.decode(errors="replace")
                raise RuntimeError(f"BSC encode failed: {message}")

            compressed = out_path.read_bytes()

        return shape + compressed

    def encode_tiles(self, tiles: np.ndarray) -> list[bytes]:
        """Encode multiple tiles.

        Parameters
        ----------
        tiles : np.ndarray
            4D array of shape (n_tile_rows, n_tile_cols, tile_size, tile_size).

        Returns
        -------
        list[bytes]
            List of encoded bitstreams, one per tile, in row-major tile
            order (all columns of tile row 0, then tile row 1, ...).
        """
        n_tile_rows, n_tile_cols = tiles.shape[0], tiles.shape[1]
        return [
            self.encode_tile(tiles[i, j])
            for i in range(n_tile_rows)
            for j in range(n_tile_cols)
        ]

    def get_compression_info(self) -> dict:
        """Return compression metadata.

        Returns
        -------
        dict
            Compression parameters for header: codec name, version and the
            ``tile_size``, ``resolution``, ``level`` and ``diag_mode`` this
            encoder was constructed with.
        """
        return {
            "codec": "bsc_cmc",
            "version": "1.0",
            "tile_size": self.tile_size,
            "resolution": self.resolution,
            "level": self.level,
            "diag_mode": self.diag_mode,
        }