Source code for gunz_cm.loaders.narrowpeaks

from __future__ import annotations
# -*- coding: utf-8 -*-
"""
This module provides functionality to load and process genomic peak data from
narrowPeak format files.


Examples
--------
"""

# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"


# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import pathlib

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import pandas as pd
from pydantic import validate_call

# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..exceptions import LoaderError
from ..consts import NARROWPEAK_COLUMNS

# =============================================================================
# PUBLIC FUNCTIONS
# =============================================================================

@validate_call(config=dict(arbitrary_types_allowed=True))
def get_chrom_infos(
    fpath: str | pathlib.Path,
) -> dict[str, dict[str, str | None]]:
    """
    Collect the chromosomes that occur in a narrowPeak file.

    The file is loaded with ``load_narrowpeak`` (no chromosome filter, no
    binning) and one record is created for every distinct value found in the
    ``chromosome`` column. A narrowPeak file carries no chromosome sizes, so
    every ``'length'`` entry is ``None``.

    Parameters
    ----------
    fpath : str | pathlib.Path
        The file path to the narrowPeak file.

    Returns
    -------
    dict[str, dict[str, str | None]]
        Mapping from chromosome name to a record of the form
        ``{'name': <chromosome name>, 'length': None}``. Keys follow the
        order in which the chromosomes first appear in the file.

    Raises
    ------
    FileNotFoundError, LoaderError
        Propagated unchanged from ``load_narrowpeak``.
    """
    peaks = load_narrowpeak(fpath)
    unique_names = peaks['chromosome'].unique().tolist()

    infos = {}
    for name in unique_names:
        # Length is unknown at this point; only the name can be filled in.
        infos[name] = {'name': name, 'length': None}
    return infos
@validate_call(config=dict(arbitrary_types_allowed=True))
def load_narrowpeak(
    fpath: str | pathlib.Path,
    chromosome: str | None = None,
    resolution: int | None = None,
) -> pd.DataFrame:
    """
    Read and process a narrowPeak file.

    The file is parsed as whitespace-separated text without a header row,
    columns are named after ``NARROWPEAK_COLUMNS``, ``start``/``end`` are cast
    to ``int`` and ``score`` to ``float``. Peak coordinates are validated,
    optionally binned, and a ``region_id`` column of the form
    ``"<chromosome>:<start>-<end>"`` is added. The result can be restricted
    to a single chromosome.

    Parameters
    ----------
    fpath : str | pathlib.Path
        The file path to the narrowPeak file.
    chromosome : str | None, optional
        If given (and non-empty), only rows whose ``chromosome`` column equals
        this value are returned (default is None, i.e. no filtering).
    resolution : int | None, optional
        If given, ``start`` and ``end`` are floor-divided by this value, i.e.
        converted to bin indices. Must be positive. ``region_id`` is built
        from the binned coordinates.

    Returns
    -------
    pd.DataFrame
        The processed narrowPeak data with a fresh 0-based index.

    Raises
    ------
    FileNotFoundError
        If ``fpath`` does not exist.
    LoaderError
        If ``resolution`` is not positive, the file is empty, any peak has
        ``end <= start``, or any other error occurs while reading or
        converting the data.

    Examples
    --------
    >>> df = load_narrowpeak("peaks.narrowPeak", chromosome="chr1")  # doctest: +SKIP
    """
    fpath = pathlib.Path(fpath)

    # Floor division by zero or a negative bin size would silently produce
    # meaningless coordinates, so reject it up front.
    if resolution is not None and resolution <= 0:
        raise LoaderError(f"Resolution must be a positive integer, got {resolution}.")

    try:
        df = pd.read_csv(
            fpath,
            sep=r'\s+',  # raw string: '\s' is an invalid escape in a plain literal
            header=None,
            names=NARROWPEAK_COLUMNS,
            # Keep chromosome names such as "1" or "22" as text; otherwise
            # they are inferred as integers and both the region_id
            # concatenation and the string comparison below break.
            dtype={'chromosome': str},
        )

        df['start'] = df['start'].astype(int)
        df['end'] = df['end'].astype(int)
        df['score'] = df['score'].astype(float)

        # Validate the raw coordinates. After binning, a peak narrower than
        # one bin legitimately has start == end and must not be rejected.
        if (df['end'] <= df['start']).any():
            raise LoaderError("End position must be greater than start position for all peaks.")

        if resolution is not None:
            df['start'] //= resolution
            df['end'] //= resolution

        df['region_id'] = df['chromosome'] + ':' + df['start'].astype(str) + '-' + df['end'].astype(str)

        if chromosome:
            df = df[df['chromosome'] == chromosome]

        return df.reset_index(drop=True)

    except LoaderError:
        # Already a domain error with a precise message; do not let the
        # generic handler below re-wrap it as a "reading" failure.
        raise
    except FileNotFoundError as err:
        raise FileNotFoundError(f"The file {fpath} was not found.") from err
    except pd.errors.EmptyDataError as err:
        raise LoaderError(f"The narrowPeak file is empty: {fpath}") from err
    except Exception as e:
        raise LoaderError(f"An error occurred while reading the narrowPeak file {fpath}: {e}") from e