"""
BSC + CMC Transforms Encoder for GZCM v3 compression.
Applies CMC domain-specific transforms (diagonal transform, binarization)
before BSC entropy coding. Combines BSC's speed with CMC's structured transforms.
Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
import sys
import pathlib
import tempfile
import os
import numpy as np
# Resolved location of this module; every other path below is derived from it.
_FILE_PATH = pathlib.Path(__file__).resolve()

# Workspace root: begin at this file's directory, then climb four more levels.
_WS_ROOT = _FILE_PATH.parent
for _ in range(4):
    _WS_ROOT = _WS_ROOT.parent

# CMC is looked up at a fixed location underneath the workspace root.
_CMC_PATH = _WS_ROOT.joinpath("3d_recon", "thirdparty", "cmc")
if not _CMC_PATH.exists():
    raise FileNotFoundError(f"CMC not found at {_CMC_PATH}")

# Put the CMC directory at the front of sys.path (only once) so that the
# import below can resolve; this is why the import cannot sit at the file top.
if str(_CMC_PATH) not in sys.path:
    sys.path.insert(0, str(_CMC_PATH))

import cmc.transform  # noqa: E402

# Module-level aliases for the two CMC transforms used by the encoder.
binarize_rc_bin_split_v2 = cmc.transform.binarize_rc_bin_split_v2
diag_transform = cmc.transform.diag_transform
class BscCmcEncoder:
    """BSC + CMC Transforms encoder for contact matrix tiles.

    Applies CMC's domain-specific transforms (diagonal transform, binarization)
    before BSC entropy coding. Combines BSC's speed with CMC's structured
    transforms.

    Parameters
    ----------
    tile_size : int, default=512
        Tile size for block processing.
    resolution : int, default=50000
        Hi-C resolution in bp.
    level : int, default=3
        BSC compression level (0-9, higher = better compression). The value is
        stored and reported by :meth:`get_compression_info`; this class does
        not forward it to the ``bsc`` command line.
    diag_mode : int, default=0
        Value passed as ``mode`` to the CMC ``diag_transform``.

    Notes
    -----
    The location of the ``bsc`` executable and the ``LD_LIBRARY_PATH`` given to
    it are fixed in ``__init__`` (attributes ``_bsc_path`` and ``_env``).

    Examples
    --------
    >>> enc = BscCmcEncoder(tile_size=256, level=5)
    >>> enc.get_compression_info()["codec"]
    'bsc_cmc'
    """

    def __init__(
        self,
        tile_size: int = 512,
        resolution: int = 50000,
        level: int = 3,
        diag_mode: int = 0,
    ):
        """Store the codec parameters and prepare the ``bsc`` launch settings."""
        self.tile_size = tile_size
        self.resolution = resolution
        self.level = level
        self.diag_mode = diag_mode
        # NOTE(review): both locations below are machine-specific absolute
        # paths; they are kept unchanged so existing behaviour is preserved.
        self._bsc_path = pathlib.Path("/home/adhisant/tmp/bin/bsc")
        # The child process gets a copy of the current environment in which
        # LD_LIBRARY_PATH is replaced (not extended) by the path below.
        self._env = os.environ.copy()
        self._env["LD_LIBRARY_PATH"] = "/home/adhisant/tmp/miniforge3/envs/gunz_cm/lib"

    def _run_bsc(self, payload: bytes) -> bytes:
        """Compress ``payload`` with the external ``bsc`` executable.

        The payload is written to a file inside a private temporary directory,
        ``bsc e <input> <output>`` is executed, and the output file is read
        back. The directory, and therefore both files, is removed on every
        exit path: success, non-zero exit status, timeout, or launch failure.

        Parameters
        ----------
        payload : bytes
            Raw bytes to compress.

        Returns
        -------
        bytes
            Content of the output file written by ``bsc``.

        Raises
        ------
        RuntimeError
            If ``bsc`` exits with a non-zero status.
        subprocess.TimeoutExpired
            If ``bsc`` does not finish within 30 seconds.
        """
        import subprocess

        with tempfile.TemporaryDirectory() as scratch:
            scratch_dir = pathlib.Path(scratch)
            in_path = scratch_dir / "in.dat"
            out_path = scratch_dir / "out.dat"
            in_path.write_bytes(payload)

            result = subprocess.run(
                [
                    str(self._bsc_path),
                    "e",
                    str(in_path),
                    str(out_path),
                ],
                env=self._env,
                capture_output=True,
                timeout=30,
            )
            if result.returncode != 0:
                # errors="replace": undecodable stderr must not hide the failure.
                stderr_text = result.stderr.decode(errors="replace")
                raise RuntimeError(f"BSC encode failed: {stderr_text}")
            return out_path.read_bytes()

    def encode_tile(self, mat: np.ndarray) -> bytes:
        """Encode a single contact matrix tile.

        The tile goes through ``diag_transform`` (with ``mode=self.diag_mode``)
        and ``binarize_rc_bin_split_v2`` (with ``axis=0``); the bytes of the
        resulting array are then compressed with ``bsc``.

        Parameters
        ----------
        mat : np.ndarray
            2D contact matrix tile (upper triangular).

        Returns
        -------
        bytes
            Compressed bitstream: the shape of the binarized array as native
            ``int32`` values, followed by the ``bsc`` output.

        Raises
        ------
        RuntimeError
            If ``bsc`` exits with a non-zero status.
        subprocess.TimeoutExpired
            If ``bsc`` does not finish within 30 seconds.
        """
        mat = diag_transform(mat, mode=self.diag_mode)
        bin_mat = binarize_rc_bin_split_v2(mat, axis=0)
        # Shape header for the bitstream, taken from the binarized array.
        shape = np.array(bin_mat.shape, dtype=np.int32).tobytes()
        return shape + self._run_bsc(bin_mat.tobytes())

    def encode_tiles(self, tiles: np.ndarray) -> list[bytes]:
        """Encode multiple tiles.

        Parameters
        ----------
        tiles : np.ndarray
            4D array of shape (n_tile_rows, n_tile_cols, tile_size, tile_size).

        Returns
        -------
        list[bytes]
            List of encoded bitstreams, one per tile, in row-major tile order
            (all columns of tile row 0, then tile row 1, ...).
        """
        n_tile_rows, n_tile_cols = tiles.shape[0], tiles.shape[1]
        return [
            self.encode_tile(tiles[i, j])
            for i in range(n_tile_rows)
            for j in range(n_tile_cols)
        ]

    def get_compression_info(self) -> dict:
        """Return compression metadata.

        Returns
        -------
        dict
            Compression parameters for header: the fixed ``codec`` and
            ``version`` strings plus the ``tile_size``, ``resolution``,
            ``level`` and ``diag_mode`` this encoder was constructed with.

        Examples
        --------
        >>> BscCmcEncoder(level=7).get_compression_info()["level"]
        7
        """
        return {
            "codec": "bsc_cmc",
            "version": "1.0",
            "tile_size": self.tile_size,
            "resolution": self.resolution,
            "level": self.level,
            "diag_mode": self.diag_mode,
        }