# Source code for gunz_cm.utils.intervals

# -*- coding: utf-8 -*-
"""
Genomic interval utilities for binnification and set operations.
Implemented to minimize dependencies on bioframe for core dataloading tasks.


Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"

import numpy as np
import pandas as pd
import typing as t
from pydantic import validate_call, ConfigDict

@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def binnify(
    chromsizes: t.Dict[str, int],
    binsize: int,
) -> pd.DataFrame:
    """
    Divide a genome into evenly sized bins.

    Every chromosome is cut into consecutive half-open bins of ``binsize``
    bp starting at 0; the last bin of a chromosome is truncated so that it
    ends exactly at the chromosome length. Chromosomes appear in the
    iteration order of ``chromsizes``. Intended to match the logic of
    ``bioframe.binnify``.

    Parameters
    ----------
    chromsizes : dict
        Dictionary mapping chromosome names to lengths in bp.
    binsize : int
        Size of bins in bp. Must be positive.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: 'chrom', 'start', 'end' and a fresh
        0..n-1 index. A chromosome of length 0 contributes no rows, and an
        empty ``chromsizes`` yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If ``binsize`` is not positive or a chromosome length is negative.

    Examples
    --------
    >>> binnify({"chr1": 250}, 100).values.tolist()
    [['chr1', 0, 100], ['chr1', 100, 200], ['chr1', 200, 250]]
    """
    if binsize <= 0:
        raise ValueError(f"binsize must be a positive integer, got {binsize}.")

    def _bins_for(chrom: str, clen: int) -> pd.DataFrame:
        """Build the bin table of a single chromosome."""
        if clen < 0:
            raise ValueError(
                f"Length of chromosome '{chrom}' must be non-negative, got {clen}."
            )
        # Integer ceiling division: avoids the float round-trip of np.ceil.
        n_bins = -(-clen // binsize)
        edges = np.arange(n_bins + 1) * binsize
        # Clamp the final edge so the last bin stops at the chromosome end.
        edges[-1] = clen
        return pd.DataFrame({
            "chrom": [chrom] * n_bins,
            "start": edges[:-1],
            "end": edges[1:],
        })

    frames = [_bins_for(chrom, clen) for chrom, clen in chromsizes.items()]
    if not frames:
        # pd.concat refuses an empty list; return an empty table instead.
        return pd.DataFrame({
            "chrom": pd.Series(dtype=object),
            "start": pd.Series(dtype="int64"),
            "end": pd.Series(dtype="int64"),
        })
    return pd.concat(frames, axis=0, ignore_index=True)
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def subtract(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
) -> pd.DataFrame:
    """
    Remove intervals from df1 that overlap with any interval in df2.

    Simplified implementation of interval subtraction: an interval of
    ``df1`` is dropped as a whole as soon as it overlaps one interval of
    ``df2`` on the same chromosome; intervals are never trimmed or split.
    Two intervals overlap when
    ``max(start1, start2) < min(end1, end2)``, so intervals that merely
    touch, and zero-length intervals, do not overlap.

    Parameters
    ----------
    df1 : pd.DataFrame
        Target intervals (e.g., training windows). Needs the columns
        'chrom', 'start' and 'end'.
    df2 : pd.DataFrame
        Excluded intervals (e.g., centromeres, blacklisted regions). Needs
        the columns 'chrom', 'start' and 'end'.

    Returns
    -------
    pd.DataFrame
        Filtered df1 containing only intervals that do NOT overlap with df2,
        sorted by ('chrom', 'start') with a fresh 0..n-1 index. If either
        input is empty, an unsorted copy of ``df1`` is returned instead.

    Raises
    ------
    ValueError
        If both inputs are non-empty and one of them lacks a required
        column.

    Notes
    -----
    Overlaps are found per chromosome with a sorted search instead of a
    join on 'chrom', so memory stays linear in the input sizes. Rows whose
    'chrom' is missing are never treated as overlapping.

    Examples
    --------
    >>> windows = pd.DataFrame(
    ...     {"chrom": ["chr1", "chr1"], "start": [0, 100], "end": [100, 200]}
    ... )
    >>> blacklist = pd.DataFrame(
    ...     {"chrom": ["chr1"], "start": [150], "end": [160]}
    ... )
    >>> subtract(windows, blacklist).values.tolist()
    [['chr1', 0, 100]]
    """
    if df1.empty or df2.empty:
        return df1.copy()

    required = ("chrom", "start", "end")
    for frame in (df1, df2):
        for col in required:
            if col not in frame.columns:
                raise ValueError(f"Required column '{col}' missing from DataFrame.")

    targets = df1.sort_values(["chrom", "start"]).reset_index(drop=True)
    target_chrom = targets["chrom"]
    target_start = targets["start"].to_numpy()
    target_end = targets["end"].to_numpy()
    overlapping = np.zeros(len(targets), dtype=bool)

    # Only well-formed exclusions (start < end) can overlap anything; this
    # also discards rows with missing coordinates.
    usable = df2.loc[df2["start"] < df2["end"], list(required)]

    for chrom, excluded in usable.groupby("chrom", sort=False, observed=True):
        rows = np.flatnonzero(
            (target_chrom == chrom).to_numpy(dtype=bool, na_value=False)
        )
        if rows.size == 0:
            continue

        excluded = excluded.sort_values("start")
        excl_start = excluded["start"].to_numpy()
        # furthest_end[k] = largest end among the k+1 left-most exclusions.
        furthest_end = np.maximum.accumulate(excluded["end"].to_numpy())

        starts = target_start[rows]
        ends = target_end[rows]
        # Number of exclusions that begin strictly before each target ends.
        n_candidates = np.searchsorted(excl_start, ends, side="left")
        # Clamp the lookup index so targets without candidates stay in
        # bounds; they are masked out by the n_candidates > 0 term below.
        reach = furthest_end[np.maximum(n_candidates, 1) - 1]
        # Overlap <=> some candidate also ends strictly after the target
        # starts, and the target itself is non-empty.
        overlapping[rows] = (n_candidates > 0) & (reach > starts) & (starts < ends)

    return targets.loc[~overlapping].reset_index(drop=True)