# -*- coding: utf-8 -*-
"""
Utilities for fetching genomic resources (centromeres, blacklists).
Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"
import pathlib
import pandas as pd
import urllib.request
from pydantic import validate_call
# Default cache directory: ``.gunz_cm/resources`` under the current user's
# home directory. Used as the default ``cache_dir`` of ``fetch_centromeres``.
CACHE_DIR = pathlib.Path.home() / ".gunz_cm" / "resources"
# URL template of the gzipped UCSC cytoBand table; the ``{genome}`` placeholder
# is filled with the assembly name via ``str.format``.
UCSC_URL_TEMPLATE = "http://hgdownload.cse.ucsc.edu/goldenPath/{genome}/database/cytoBand.txt.gz"
# --- Public API ---
@validate_call
def fetch_centromeres(
    genome: str,
    cache: bool = True,
    cache_dir: pathlib.Path = CACHE_DIR
) -> pd.DataFrame:
    """
    Fetch centromere coordinates for a given genome assembly from UCSC.

    The gzipped cytoBand table of the assembly is downloaded (or read from
    the local cache) and reduced to the rows whose stain is ``'acen'``.

    Parameters
    ----------
    genome : str
        Genome assembly name (e.g., 'hg19', 'hg38', 'mm10').
    cache : bool, optional
        Whether to cache the downloaded data. Defaults to True. When False,
        the table is read directly from the URL and nothing is written to
        disk.
    cache_dir : pathlib.Path, optional
        Directory to store cached files. Defaults to ~/.gunz_cm/resources.
        It is created (including parents) when ``cache`` is True.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: ['chrom', 'start', 'end', 'name', 'gieStain'].

    Raises
    ------
    RuntimeError
        If downloading or parsing the freshly downloaded table fails. The
        original exception is attached as ``__cause__``.

    Notes
    -----
    A download is first written to a sibling ``*.part`` file and only moved
    to its final cache location after it has been parsed successfully, so an
    interrupted or invalid download never ends up in the cache.

    Examples
    --------
    >>> centromeres = fetch_centromeres("hg38")  # doctest: +SKIP
    >>> list(centromeres.columns)  # doctest: +SKIP
    ['chrom', 'start', 'end', 'name', 'gieStain']
    """
    cache_path = None
    if cache:
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_path = cache_dir / f"{genome}_cytoBand.txt.gz"

        # Cache hit: no network access needed.
        if cache_path.exists():
            return _parse_cytoband(cache_path)

    url = UCSC_URL_TEMPLATE.format(genome=genome)

    try:
        if cache_path is None:
            # Caching disabled: read straight from the URL.
            return _parse_cytoband(url)

        # Download next to the final location so the rename below stays on
        # the same filesystem.
        tmp_path = cache_path.with_name(cache_path.name + ".part")
        try:
            urllib.request.urlretrieve(url, tmp_path)
            # Parse BEFORE promoting the file: a truncated or non-gzip
            # payload must not be cached, otherwise every later call would
            # hit the broken cache entry.
            centromeres = _parse_cytoband(tmp_path)
            tmp_path.replace(cache_path)
        finally:
            # Best-effort cleanup of a leftover partial download. After a
            # successful rename the file is already gone.
            try:
                tmp_path.unlink()
            except OSError:
                pass
        return centromeres
    except Exception as e:
        raise RuntimeError(f"Failed to fetch centromeres for {genome}: {e}") from e
def _parse_cytoband(fpath_or_url) -> pd.DataFrame:
"""
Parse UCSC cytoBand table and filter for centromeres ('acen').
Examples
--------
"""
col_names = ['chrom', 'start', 'end', 'name', 'gieStain']
df = pd.read_csv(fpath_or_url, sep='\t', names=col_names, compression='gzip')
# Filter for centromeres
centromeres = df[df['gieStain'] == 'acen'].reset_index(drop=True)
return centromeres