# -*- coding: utf-8 -*-
"""
Wrapper for the third-party HiCRep implementation.
Examples
--------
"""
# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import typing as t
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
from scipy import sparse as sp
# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from gunz_cm import preprocs as cm_preprocs
from .hicrep.hicrep import sccByDiag
[docs]
def comp_hicrep_third_party(
cm1_coo: sp.coo_matrix,
cm2_coo: sp.coo_matrix,
max_k: t.Optional[int] = None,
max_dist_in_bp: t.Optional[int] = None,
resolution: t.Optional[int] = None,
downsample: bool = False,
half_win_size: t.Optional[int] = None,
ena_common_region: bool = True,
common_region_op: str = "union",
ena_reshaping: bool = True,
remove_main_diag: bool = True,
) -> float:
"""
Compute the similarity score between two contact matrices using Hi-CRep.
This function serves as a wrapper around a third-party implementation of
the HiCRep algorithm.
Parameters
----------
cm1_coo : sp.coo_matrix
The first contact matrix in COO format.
cm2_coo : sp.coo_matrix
The second contact matrix in COO format.
max_k : int, optional
The maximum distance in bins. Defaults to None.
max_dist_in_bp : int, optional
The maximum distance in base pairs. Defaults to None.
resolution : int, optional
The resolution of the contact matrices. Defaults to None.
downsample : bool, optional
Whether to downsample the contact matrices. Defaults to False.
half_win_size : int, optional
The half window size for mean filtering. Defaults to None.
ena_common_region : bool, optional
Whether to enable common region filtering. Defaults to True.
common_region_op : str, optional
The operation for common region filtering. Defaults to "union".
ena_reshaping : bool, optional
Whether to enable reshaping to enforce square shape. Defaults to True.
remove_main_diag : bool, optional
Whether to remove the main diagonal. Defaults to True.
Returns
-------
float
The similarity score between the two contact matrices.
Examples
--------
"""
#TODO: max_k None causes error
#TODO: Add step `resample`
#TODO: Add step `meanFilterSparse`
#? Check input matrices
assert cm1_coo.size > 0, \
"Contact matrix 1 is empty"
assert cm2_coo.size > 0, \
"Contact matrix 2 is empty"
if half_win_size is not None:
assert half_win_size > 0, "half_win_size must be greater than 0!"
if max_k is None:
if max_dist_in_bp is not None and resolution is not None:
max_k = max_dist_in_bp // resolution
if ena_common_region:
out = cm_preprocs.filter_common_empty_rowcols_coo(
cm1_coo,
cm2_coo,
op=common_region_op,
is_triu_sym=True,
axis=None,
)
cm1_coo, cm2_coo = out
#? Do reshaping to enforce square shape
if ena_reshaping:
if ena_common_region:
#? with ena_common_region, the matrices is automatically square
pass
else:
cm_coo1_shape = cm_preprocs.infer_mat_shape_coo(
cm1_coo,
is_triu_sym=True
)
cm1_coo = cm1_coo.reshape(cm_coo1_shape)
cm_coo2_shape = cm_preprocs.infer_mat_shape_coo(
cm2_coo,
is_triu_sym=True
)
cm2_coo = cm2_coo.reshape(cm_coo2_shape)
assert cm1_coo.shape[0] == cm1_coo.shape[1], \
"Contact matrix 1 is not square"
assert cm2_coo.shape[0] == cm2_coo.shape[1], \
"Contact matrix 2 is not square"
assert cm1_coo.shape == cm2_coo.shape, \
"Contact matrices have different shapes"
cm1_total_counts = cm1_coo.sum()
cm2_total_counts = cm2_coo.sum()
#? Trim diagonals to save computation time
cm1_coo = cm_preprocs.create_triu_matrix_coo(
cm1_coo,
max_k=max_k,
remove_main_diag=remove_main_diag
)
cm2_coo = cm_preprocs.create_triu_matrix_coo(
cm2_coo,
max_k=max_k,
remove_main_diag=remove_main_diag
)
if downsample:
raise NotImplementedError("Downsampling step is missing!")
else:
cm1_coo = cm1_coo.astype(float) / cm1_total_counts
cm2_coo = cm2_coo.astype(float) / cm2_total_counts
if half_win_size is not None:
raise NotImplementedError("meanFilterSparse is missing!")
score = sccByDiag(cm1_coo, cm2_coo, max_k)
return score