Source code for gunz_cm.metrics.ren.third_parties.hicrep_wrapper

# -*- coding: utf-8 -*-
"""
Wrapper for the third-party HiCRep implementation.


Examples
--------
"""

# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"


# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import typing as t

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
from scipy import sparse as sp

# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from gunz_cm import preprocs as cm_preprocs
from .hicrep.hicrep import sccByDiag

[docs] def comp_hicrep_third_party( cm1_coo: sp.coo_matrix, cm2_coo: sp.coo_matrix, max_k: t.Optional[int] = None, max_dist_in_bp: t.Optional[int] = None, resolution: t.Optional[int] = None, downsample: bool = False, half_win_size: t.Optional[int] = None, ena_common_region: bool = True, common_region_op: str = "union", ena_reshaping: bool = True, remove_main_diag: bool = True, ) -> float: """ Compute the similarity score between two contact matrices using Hi-CRep. This function serves as a wrapper around a third-party implementation of the HiCRep algorithm. Parameters ---------- cm1_coo : sp.coo_matrix The first contact matrix in COO format. cm2_coo : sp.coo_matrix The second contact matrix in COO format. max_k : int, optional The maximum distance in bins. Defaults to None. max_dist_in_bp : int, optional The maximum distance in base pairs. Defaults to None. resolution : int, optional The resolution of the contact matrices. Defaults to None. downsample : bool, optional Whether to downsample the contact matrices. Defaults to False. half_win_size : int, optional The half window size for mean filtering. Defaults to None. ena_common_region : bool, optional Whether to enable common region filtering. Defaults to True. common_region_op : str, optional The operation for common region filtering. Defaults to "union". ena_reshaping : bool, optional Whether to enable reshaping to enforce square shape. Defaults to True. remove_main_diag : bool, optional Whether to remove the main diagonal. Defaults to True. Returns ------- float The similarity score between the two contact matrices. Examples -------- """ #TODO: max_k None causes error #TODO: Add step `resample` #TODO: Add step `meanFilterSparse` #? Check input matrices assert cm1_coo.size > 0, \ "Contact matrix 1 is empty" assert cm2_coo.size > 0, \ "Contact matrix 2 is empty" if half_win_size is not None: assert half_win_size > 0, "half_win_size must be greater than 0!" if max_k is None: if max_dist_in_bp is not None and resolution is not None: max_k = max_dist_in_bp // resolution if ena_common_region: out = cm_preprocs.filter_common_empty_rowcols_coo( cm1_coo, cm2_coo, op=common_region_op, is_triu_sym=True, axis=None, ) cm1_coo, cm2_coo = out #? Do reshaping to enforce square shape if ena_reshaping: if ena_common_region: #? with ena_common_region, the matrices is automatically square pass else: cm_coo1_shape = cm_preprocs.infer_mat_shape_coo( cm1_coo, is_triu_sym=True ) cm1_coo = cm1_coo.reshape(cm_coo1_shape) cm_coo2_shape = cm_preprocs.infer_mat_shape_coo( cm2_coo, is_triu_sym=True ) cm2_coo = cm2_coo.reshape(cm_coo2_shape) assert cm1_coo.shape[0] == cm1_coo.shape[1], \ "Contact matrix 1 is not square" assert cm2_coo.shape[0] == cm2_coo.shape[1], \ "Contact matrix 2 is not square" assert cm1_coo.shape == cm2_coo.shape, \ "Contact matrices have different shapes" cm1_total_counts = cm1_coo.sum() cm2_total_counts = cm2_coo.sum() #? Trim diagonals to save computation time cm1_coo = cm_preprocs.create_triu_matrix_coo( cm1_coo, max_k=max_k, remove_main_diag=remove_main_diag ) cm2_coo = cm_preprocs.create_triu_matrix_coo( cm2_coo, max_k=max_k, remove_main_diag=remove_main_diag ) if downsample: raise NotImplementedError("Downsampling step is missing!") else: cm1_coo = cm1_coo.astype(float) / cm1_total_counts cm2_coo = cm2_coo.astype(float) / cm2_total_counts if half_win_size is not None: raise NotImplementedError("meanFilterSparse is missing!") score = sccByDiag(cm1_coo, cm2_coo, max_k) return score