Source code for gunz_cm.utils.resources

# -*- coding: utf-8 -*-
"""
Utilities for fetching genomic resources (centromeres, blacklists).


Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"

import pathlib
import pandas as pd
import urllib.request
from pydantic import validate_call

# Default cache directory: ``.gunz_cm/resources`` under the current user's
# home directory. Used as the default ``cache_dir`` of ``fetch_centromeres``.
CACHE_DIR = pathlib.Path.home() / ".gunz_cm" / "resources"

# URL template of the gzipped UCSC cytoBand table. The ``{genome}`` placeholder
# is filled with the assembly name via ``str.format`` before downloading.
UCSC_URL_TEMPLATE = "http://hgdownload.cse.ucsc.edu/goldenPath/{genome}/database/cytoBand.txt.gz"

@validate_call
def fetch_centromeres(
    genome: str,
    cache: bool = True,
    cache_dir: pathlib.Path = CACHE_DIR
) -> pd.DataFrame:
    """
    Fetch centromere coordinates for a given genome assembly from UCSC.

    The cytoBand table of the assembly is downloaded (or read from the local
    cache) and reduced to its centromere rows by ``_parse_cytoband``.

    Parameters
    ----------
    genome : str
        Genome assembly name (e.g., 'hg19', 'hg38', 'mm10').
    cache : bool, optional
        Whether to cache the downloaded data. Defaults to True.
    cache_dir : pathlib.Path, optional
        Directory to store cached files. Defaults to ~/.gunz_cm/resources.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: ['chrom', 'start', 'end', 'name', 'gieStain'].

    Raises
    ------
    RuntimeError
        If downloading or parsing the remote table fails. The original
        exception is attached as ``__cause__``.

    Examples
    --------
    >>> centromeres = fetch_centromeres("hg38")  # doctest: +SKIP
    >>> list(centromeres.columns)  # doctest: +SKIP
    ['chrom', 'start', 'end', 'name', 'gieStain']
    """
    url = UCSC_URL_TEMPLATE.format(genome=genome)

    if not cache:
        # No local copy wanted: let pandas stream the table from the URL.
        try:
            return _parse_cytoband(url)
        except Exception as e:
            raise RuntimeError(f"Failed to fetch centromeres for {genome}: {e}") from e

    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / f"{genome}_cytoBand.txt.gz"
    if cache_path.exists():
        return _parse_cytoband(cache_path)

    # Download next to the final location and only promote the file once it
    # has been parsed successfully. Writing straight to ``cache_path`` would
    # leave a truncated or invalid file behind on failure, which every later
    # call would then pick up as a cache hit.
    tmp_path = cache_path.with_name(cache_path.name + ".part")
    try:
        urllib.request.urlretrieve(url, tmp_path)
        centromeres = _parse_cytoband(tmp_path)
        tmp_path.replace(cache_path)
        return centromeres
    except Exception as e:
        raise RuntimeError(f"Failed to fetch centromeres for {genome}: {e}") from e
    finally:
        # After a successful ``replace`` the partial file no longer exists,
        # so this only removes leftovers of a failed attempt.
        if tmp_path.exists():
            tmp_path.unlink()
def _parse_cytoband(fpath_or_url) -> pd.DataFrame:
    """
    Read a gzipped UCSC cytoBand table and keep only its centromere rows.

    Parameters
    ----------
    fpath_or_url
        Location of the gzip-compressed, tab-separated cytoBand table; it is
        handed to ``pandas.read_csv`` unchanged (callers in this module pass
        a local path or a URL string).

    Returns
    -------
    pd.DataFrame
        Rows whose ``gieStain`` equals ``'acen'``, with columns
        ['chrom', 'start', 'end', 'name', 'gieStain'] and a fresh 0-based
        index.
    """
    # The table has no header row, so the column names are supplied here.
    table = pd.read_csv(
        fpath_or_url,
        sep='\t',
        names=['chrom', 'start', 'end', 'name', 'gieStain'],
        compression='gzip',
    )

    # 'acen' is the stain label used for centromeric bands.
    is_centromere = table['gieStain'].eq('acen')
    return table.loc[is_centromere].reset_index(drop=True)