Source code for gunz_cm.compressions.bsc_cmc_decoder

"""
BSC + CMC Transforms Decoder for GZCM v3 compression.

Decodes BSC-compressed data that was encoded with CMC transforms.
Reverses BSC entropy coding then CMC's domain-specific transforms.

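Examples
--------
>>> from gunz_cm.compressions.bsc_cmc_decoder import BscCmcDecoder  # doctest: +SKIP
>>> decoder = BscCmcDecoder(tile_size=512, diag_mode=0)  # doctest: +SKIP
>>> tile = decoder.decode_tile(payload)  # doctest: +SKIP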
"""

__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"

import sys
import pathlib
import tempfile
import os
import numpy as np

# Locate the workspace root relative to this file: one `.parent` yields the
# containing directory, and the loop then climbs four more levels.
_FILE_PATH = pathlib.Path(__file__).resolve()
_WS_ROOT = _FILE_PATH.parent
for _ in range(4):
    _WS_ROOT = _WS_ROOT.parent
# The CMC package is expected at a fixed location below the workspace root.
_CMC_PATH = _WS_ROOT / "3d_recon" / "thirdparty" / "cmc"
# Fail at import time with an explicit message if that directory is missing,
# instead of letting the `import cmc.transform` below fail less clearly.
if not _CMC_PATH.exists():
    raise FileNotFoundError(f"CMC not found at {_CMC_PATH}")

# Put the CMC directory at the front of the import search path (only once),
# so the import below can resolve `cmc`.
if str(_CMC_PATH) not in sys.path:
    sys.path.insert(0, str(_CMC_PATH))

# Must come after the sys.path manipulation above, hence the E402 suppression.
import cmc.transform  # noqa: E402

# Module-level aliases for the CMC inverse transforms used by the decoder.
debinarize_rc_bin_split_v2 = cmc.transform.debinarize_rc_bin_split_v2
reverse_diag_transform_mode0 = cmc.transform.reverse_diag_transform_mode0
reverse_diag_transform = cmc.transform.reverse_diag_transform


class BscCmcDecoder:
    """BSC + CMC Transforms decoder for contact matrix tiles.

    Decodes BSC-compressed data that was encoded with CMC transforms.
    Reverses BSC entropy coding (by invoking the external ``bsc`` binary)
    and then CMC's domain-specific transforms.

    Parameters
    ----------
    tile_size : int, default=512
        Tile size for block processing. Only used by this class to shape the
        empty result returned by :meth:`decode_tiles` for an empty input.
    resolution : int, default=50000
        Hi-C resolution in bp. Stored on the instance; not used by the
        decoding methods in this class.
    dtype : np.dtype, default=np.uint32
        Data type of the array returned by :meth:`decode_tiles`.
    diag_mode : int, default=0
        Diagonal transform mode. ``0`` selects
        ``reverse_diag_transform_mode0``; any other value is forwarded as
        ``mode`` to ``reverse_diag_transform``.
    bsc_path : str or os.PathLike, optional
        Path to the ``bsc`` executable. Defaults to the location that was
        previously hard-coded.
    ld_library_path : str or os.PathLike, optional
        Value assigned to ``LD_LIBRARY_PATH`` in the environment of the
        ``bsc`` subprocess. Defaults to the location that was previously
        hard-coded.

    Examples
    --------
    >>> decoder = BscCmcDecoder(tile_size=512, diag_mode=0)
    >>> tile = decoder.decode_tile(payload)  # doctest: +SKIP
    """

    # Historical defaults, kept so that existing callers behave as before.
    _DEFAULT_BSC_PATH = "/home/adhisant/tmp/bin/bsc"
    _DEFAULT_LD_LIBRARY_PATH = "/home/adhisant/tmp/miniforge3/envs/gunz_cm/lib"
    # The payload starts with the tile shape stored as two int32 values.
    _SHAPE_HEADER_BYTES = 8
    # Upper bound (seconds) for a single ``bsc`` invocation.
    _BSC_TIMEOUT_S = 30

    def __init__(
        self,
        tile_size: int = 512,
        resolution: int = 50000,
        dtype: np.dtype = np.uint32,
        diag_mode: int = 0,
        *,
        bsc_path: "str | os.PathLike | None" = None,
        ld_library_path: "str | os.PathLike | None" = None,
    ):
        """Store the decoder configuration and prepare the subprocess environment."""
        self.tile_size = tile_size
        self.resolution = resolution
        self.dtype = np.dtype(dtype)
        self.diag_mode = diag_mode

        if bsc_path is None:
            bsc_path = self._DEFAULT_BSC_PATH
        if ld_library_path is None:
            ld_library_path = self._DEFAULT_LD_LIBRARY_PATH

        self._bsc_path = pathlib.Path(bsc_path)
        # Copy the current environment so the override below never leaks
        # into this process' own environment.
        self._env = os.environ.copy()
        self._env["LD_LIBRARY_PATH"] = str(ld_library_path)

    def _run_bsc_decode(self, encoded_data: bytes) -> bytes:
        """Run ``bsc d`` on ``encoded_data`` and return the decoded bytes.

        The data is exchanged with the external binary through two temporary
        files, both of which are removed on every exit path.

        Raises
        ------
        RuntimeError
            If ``bsc`` exits with a non-zero return code.
        subprocess.TimeoutExpired
            If ``bsc`` does not finish within the timeout.
        """
        # Local import: the module's top-level imports do not include it.
        import subprocess

        in_path = None
        out_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".dat", delete=False) as f_in:
                in_path = pathlib.Path(f_in.name)
                f_in.write(encoded_data)
            # Only the name is needed; bsc writes the file content itself.
            with tempfile.NamedTemporaryFile(suffix=".dat", delete=False) as f_out:
                out_path = pathlib.Path(f_out.name)

            result = subprocess.run(
                [str(self._bsc_path), "d", str(in_path), str(out_path)],
                env=self._env,
                capture_output=True,
                timeout=self._BSC_TIMEOUT_S,
            )
            if result.returncode != 0:
                # errors="replace" so undecodable stderr cannot mask the failure.
                stderr_text = result.stderr.decode(errors="replace")
                raise RuntimeError(f"BSC decode failed: {stderr_text}")
            return out_path.read_bytes()
        finally:
            # Clean up both files even when bsc fails or times out.
            for path in (in_path, out_path):
                if path is not None:
                    path.unlink(missing_ok=True)

    def decode_tile(self, payload: bytes) -> np.ndarray:
        """Decode a single compressed tile.

        Parameters
        ----------
        payload : bytes
            Compressed bitstream: an 8-byte shape header (two int32 values)
            followed by the BSC-encoded data.

        Returns
        -------
        np.ndarray
            Decoded contact matrix tile.

        Raises
        ------
        ValueError
            If ``payload`` is shorter than the shape header, or the decoded
            data does not match the shape stored in the header.
        RuntimeError
            If the ``bsc`` subprocess reports a failure.
        """
        header_size = self._SHAPE_HEADER_BYTES
        if len(payload) < header_size:
            raise ValueError(
                f"payload too short: expected at least {header_size} bytes "
                f"for the shape header, got {len(payload)}"
            )

        header = np.frombuffer(payload[:header_size], dtype=np.int32)
        shape = tuple(int(dim) for dim in header)

        raw = self._run_bsc_decode(payload[header_size:])

        bin_mat = np.frombuffer(raw, dtype=np.bool_).reshape(shape)
        debinarized = debinarize_rc_bin_split_v2(bin_mat, axis=0)
        if self.diag_mode == 0:
            return reverse_diag_transform_mode0(debinarized)
        return reverse_diag_transform(debinarized, mode=self.diag_mode)

    def decode_tiles(self, payloads: list[bytes]) -> np.ndarray:
        """Decode multiple tiles into a 4D array.

        Tiles are placed in row-major order. The grid uses the largest row
        count that does not exceed ``sqrt(len(payloads))`` and divides the
        number of tiles evenly, so no decoded tile is ever dropped. For
        counts where ``int(sqrt(n))`` divides ``n`` (e.g. perfect squares)
        this is the same layout as before.

        Parameters
        ----------
        payloads : list[bytes]
            List of encoded bitstreams.

        Returns
        -------
        np.ndarray
            4D array of decoded tiles
            (n_tile_rows, n_tile_cols, tile_height, tile_width). An empty
            ``payloads`` yields an empty array of shape
            ``(0, 1, tile_size, tile_size)``.
        """
        n_tiles = len(payloads)
        if n_tiles == 0:
            # Nothing to decode: return an empty grid instead of failing on
            # the first-tile lookup below.
            return np.empty(
                (0, 1, self.tile_size, self.tile_size), dtype=self.dtype
            )

        decoded = [self.decode_tile(p) for p in payloads]
        tile_shape = decoded[0].shape

        # Start from floor(sqrt(n)) and step down to the nearest divisor so
        # that rows * cols == n_tiles; terminates at 1 at the latest.
        tile_rows = max(int(np.sqrt(n_tiles)), 1)
        while n_tiles % tile_rows:
            tile_rows -= 1
        tile_cols = n_tiles // tile_rows

        result = np.empty((tile_rows, tile_cols, *tile_shape), dtype=self.dtype)
        for idx, tile in enumerate(decoded):
            # divmod maps the flat index to its (row, col) grid position.
            result[divmod(idx, tile_cols)] = tile
        return result