Source code for gunz_cm.utils.resources

# -*- coding: utf-8 -*-
"""
Utilities for fetching genomic resources (centromeres, blacklists).


Examples
--------
"""
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"

import pathlib
import pandas as pd
import urllib.request
from pydantic import validate_call

# Default cache directory: ``~/.gunz_cm/resources`` under the current user's
# home directory. Used as the default ``cache_dir`` of ``fetch_centromeres``.
CACHE_DIR = pathlib.Path.home() / ".gunz_cm" / "resources"

# URL template of the gzipped UCSC cytoBand table. ``{genome}`` is replaced by
# the assembly name (e.g. 'hg38') via ``str.format`` in ``fetch_centromeres``.
# NOTE(review): this is a plain-HTTP URL, so the download is not protected in
# transit -- consider switching to HTTPS after confirming the host serves it.
UCSC_URL_TEMPLATE = "http://hgdownload.cse.ucsc.edu/goldenPath/{genome}/database/cytoBand.txt.gz"


[docs]
@validate_call
def fetch_centromeres(
    genome: str,
    cache: bool = True,
    cache_dir: pathlib.Path = CACHE_DIR
) -> pd.DataFrame:
    """
    Fetch centromere coordinates for a given genome assembly from UCSC.

    The gzipped cytoBand table of the assembly is downloaded from UCSC and
    reduced to its centromere bands (rows whose ``gieStain`` is ``'acen'``).
    With caching enabled, a previously downloaded table is reused, and a new
    download is only promoted into the cache after it has been parsed
    successfully, so a broken or partial download never poisons the cache.

    Parameters
    ----------
    genome : str
        Genome assembly name (e.g., 'hg19', 'hg38', 'mm10').
    cache : bool, optional
        Whether to cache the downloaded data. Defaults to True.
    cache_dir : pathlib.Path, optional
        Directory to store cached files. Defaults to ~/.gunz_cm/resources.
        It is created (including parents) if it does not exist.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: ['chrom', 'start', 'end', 'name', 'gieStain'].

    Raises
    ------
    RuntimeError
        If downloading or parsing the freshly downloaded table fails. The
        underlying exception is chained as ``__cause__``. Errors raised while
        parsing an already cached file propagate unchanged.

    Examples
    --------
    >>> centromeres = fetch_centromeres("hg38")  # doctest: +SKIP
    >>> list(centromeres.columns)  # doctest: +SKIP
    ['chrom', 'start', 'end', 'name', 'gieStain']
    """
    cache_path = None
    if cache:
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_path = cache_dir / f"{genome}_cytoBand.txt.gz"

        if cache_path.exists():
            return _parse_cytoband(cache_path)

    url = UCSC_URL_TEMPLATE.format(genome=genome)

    try:
        if cache_path is None:
            # No caching requested: let pandas stream the table from the URL.
            return _parse_cytoband(url)

        # Download next to the final location so the rename below stays on
        # the same filesystem, and only publish the file once it parsed.
        tmp_path = cache_path.with_name(cache_path.name + ".part")
        try:
            urllib.request.urlretrieve(url, tmp_path)
            centromeres = _parse_cytoband(tmp_path)
            tmp_path.replace(cache_path)
        finally:
            # After a successful replace the temp file is gone; on any
            # failure this removes the partial/invalid download.
            if tmp_path.exists():
                tmp_path.unlink()
        return centromeres
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Failed to fetch centromeres for {genome}: {e}") from e


def _parse_cytoband(fpath_or_url) -> pd.DataFrame:
    """
    Read a gzipped UCSC cytoBand table and keep only its centromere bands.

    Parameters
    ----------
    fpath_or_url
        Path or URL of the gzip-compressed, tab-separated, header-less
        cytoBand table; handed to ``pandas.read_csv`` as-is.

    Returns
    -------
    pd.DataFrame
        Rows whose ``gieStain`` equals ``'acen'``, with columns
        ['chrom', 'start', 'end', 'name', 'gieStain'] and a fresh 0-based index.
    """
    table = pd.read_csv(
        fpath_or_url,
        sep='\t',
        names=['chrom', 'start', 'end', 'name', 'gieStain'],
        compression='gzip',
    )

    # UCSC marks centromeric bands with the 'acen' stain.
    is_centromere = table['gieStain'] == 'acen'
    return table.loc[is_centromere].reset_index(drop=True)