# -*- coding: utf-8 -*-
"""
Genomic interval utilities for binnification and set operations.
Implemented to minimize dependencies on bioframe for core dataloading tasks.
Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"
import numpy as np
import pandas as pd
import typing as t
from pydantic import validate_call, ConfigDict
[docs]
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def binnify(
    chromsizes: t.Dict[str, int],
    binsize: int,
) -> pd.DataFrame:
    """
    Divide a genome into evenly sized bins. Matches bioframe.binnify logic.

    Every chromosome is tiled from position 0 in steps of ``binsize``; the
    last bin of each chromosome is truncated so that it ends exactly at the
    chromosome length. Chromosomes are emitted in the iteration order of
    ``chromsizes``.

    Parameters
    ----------
    chromsizes : dict
        Dictionary mapping chromosome names to lengths in bp.
    binsize : int
        Size of bins in bp. Must be positive.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: 'chrom', 'start', 'end' and a fresh
        0..n-1 index. Empty (but with the three columns present) when
        ``chromsizes`` is empty or all lengths are zero.

    Raises
    ------
    ValueError
        If ``binsize`` is not positive or a chromosome length is negative.

    Examples
    --------
    >>> binnify({"chr1": 25, "chr2": 10}, binsize=10)
      chrom  start  end
    0  chr1      0   10
    1  chr1     10   20
    2  chr1     20   25
    3  chr2      0   10
    """
    if binsize <= 0:
        raise ValueError(f"binsize must be a positive integer, got {binsize}.")

    chrom_labels = []
    # Seed with empty int64 arrays so np.concatenate also works (and keeps an
    # integer dtype) when there are no chromosomes at all.
    start_parts = [np.empty(0, dtype=np.int64)]
    end_parts = [np.empty(0, dtype=np.int64)]

    for chrom, clen in chromsizes.items():
        if clen < 0:
            raise ValueError(
                f"Chromosome '{chrom}' has a negative length ({clen})."
            )
        # Integer ceil division: exact, no float round-trip.
        n_bins = int(-(-clen // binsize))
        edges = np.arange(n_bins + 1, dtype=np.int64) * binsize
        # Clip the final edge so the last bin stops at the chromosome end.
        edges[-1] = clen

        chrom_labels.extend([chrom] * n_bins)
        start_parts.append(edges[:-1])
        end_parts.append(edges[1:])

    return pd.DataFrame({
        "chrom": chrom_labels,
        "start": np.concatenate(start_parts),
        "end": np.concatenate(end_parts),
    })
[docs]
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def subtract(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
) -> pd.DataFrame:
    """
    Remove intervals from df1 that overlap with any interval in df2.

    Simplified implementation of interval subtraction: an interval of df1 is
    either kept whole or dropped whole, it is never trimmed. Two intervals on
    the same chromosome overlap when ``max(start1, start2) < min(end1, end2)``,
    so intervals that merely touch (end == start) do not overlap, and
    zero-length or inverted intervals never overlap anything.

    Parameters
    ----------
    df1 : pd.DataFrame
        Target intervals (e.g., training windows). Needs the columns
        'chrom', 'start' and 'end'; any further columns are carried through.
    df2 : pd.DataFrame
        Excluded intervals (e.g., centromeres, blacklisted regions). Needs
        the columns 'chrom', 'start' and 'end'.

    Returns
    -------
    pd.DataFrame
        Filtered df1 containing only intervals that do NOT overlap with df2,
        sorted by ('chrom', 'start') with a fresh 0..n-1 index. If either
        input is empty, an unmodified copy of df1 is returned instead
        (original order and index, no column check).

    Raises
    ------
    ValueError
        If both frames are non-empty and one of them lacks a required column.

    Notes
    -----
    The overlap test is done per chromosome with a sorted-start /
    running-maximum-end lookup instead of a chromosome-wide join, so memory
    stays linear in the input size. Rows whose 'chrom' is missing are never
    treated as overlapping.

    Examples
    --------
    >>> windows = pd.DataFrame(
    ...     {"chrom": ["chr1", "chr1"], "start": [0, 10], "end": [10, 20]}
    ... )
    >>> blacklist = pd.DataFrame({"chrom": ["chr1"], "start": [12], "end": [15]})
    >>> subtract(windows, blacklist)
      chrom  start  end
    0  chr1      0   10
    """
    # Nothing to subtract (or nothing to subtract from): return a plain copy.
    # This deliberately precedes the column check so that a bare
    # ``pd.DataFrame()`` can be passed as "no exclusions".
    if df1.empty or df2.empty:
        return df1.copy()

    for frame in (df1, df2):
        for col in ("chrom", "start", "end"):
            if col not in frame.columns:
                raise ValueError(f"Required column '{col}' missing from DataFrame.")

    df1 = df1.sort_values(["chrom", "start"]).reset_index(drop=True)

    def _coords(frame, col):
        # Float view of a coordinate column; missing values become NaN so
        # every comparison involving them evaluates to False.
        return frame[col].to_numpy(dtype="float64", na_value=np.nan)

    starts = _coords(df1, "start")
    ends = _coords(df1, "end")
    # Positional row numbers of df1, grouped by chromosome.
    rows_by_chrom = df1.groupby("chrom", sort=False, observed=True).indices
    overlaps = np.zeros(len(df1), dtype=bool)

    for chrom, excluded in df2.groupby("chrom", sort=False, observed=True):
        rows = rows_by_chrom.get(chrom)
        if rows is None:
            continue

        ex_start = _coords(excluded, "start")
        ex_end = _coords(excluded, "end")
        # Only proper intervals (start < end) can overlap anything.
        proper = ex_start < ex_end
        if not proper.any():
            continue
        ex_start = ex_start[proper]
        ex_end = ex_end[proper]

        order = np.argsort(ex_start, kind="stable")
        ex_start = ex_start[order]
        # furthest_end[i]: largest end among the i+1 left-most exclusions.
        furthest_end = np.maximum.accumulate(ex_end[order])

        win_start = starts[rows]
        win_end = ends[rows]
        # Number of exclusions that begin strictly before each window ends.
        n_before = np.searchsorted(ex_start, win_end, side="left")
        # Clamp the lookup index to 0; windows with n_before == 0 are
        # masked out below.
        reach = furthest_end[np.maximum(n_before, 1) - 1]
        overlaps[rows] = (
            (n_before > 0) & (reach > win_start) & (win_start < win_end)
        )

    return df1.loc[~overlaps].reset_index(drop=True)