from __future__ import annotations
# -*- coding: utf-8 -*-
"""
This module provides functionality to load and process genomic peak data from
narrowPeak format files.
Examples
--------
"""
# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import pathlib
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import pandas as pd
from pydantic import validate_call
# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..exceptions import LoaderError
from ..consts import NARROWPEAK_COLUMNS
# =============================================================================
# PUBLIC FUNCTIONS
# =============================================================================
@validate_call(config=dict(arbitrary_types_allowed=True))
def get_chrom_infos(
    fpath: str | pathlib.Path,
) -> dict[str, dict[str, str | None]]:
    """
    Collect the chromosomes that appear in a narrowPeak file.

    The file is read with :func:`load_narrowpeak` (no chromosome filter, no
    binning) and one entry is created per distinct value of the
    ``chromosome`` column, in order of first appearance.

    Parameters
    ----------
    fpath : str | pathlib.Path
        The file path to the narrowPeak file.

    Returns
    -------
    dict[str, dict[str, str | None]]
        A mapping from chromosome name to a record of the form
        ``{'name': <chromosome name>, 'length': None}``. The length is always
        ``None`` here because it is not derived from the peak data.

    Raises
    ------
    FileNotFoundError
        Propagated from :func:`load_narrowpeak` when ``fpath`` does not exist.
    LoaderError
        Propagated from :func:`load_narrowpeak` when the file cannot be
        loaded or fails validation.
    """
    df = load_narrowpeak(fpath)

    # ``unique`` keeps the order in which chromosomes first occur in the file.
    chroms = df['chromosome'].unique().tolist()

    chr_infos = {
        chr_name: {'name': chr_name, 'length': None}
        for chr_name in chroms
    }
    return chr_infos
@validate_call(config=dict(arbitrary_types_allowed=True))
def load_narrowpeak(
    fpath: str | pathlib.Path,
    chromosome: str | None = None,
    resolution: int | None = None,
) -> pd.DataFrame:
    """
    Read and process a narrowPeak file.

    The file is parsed as whitespace-separated text without a header row and
    the columns are named after ``NARROWPEAK_COLUMNS``. ``start`` and ``end``
    are cast to ``int`` and ``score`` to ``float``; coordinates are optionally
    binned, a ``region_id`` column is added, the coordinates are validated,
    and the rows are optionally restricted to a single chromosome.

    Parameters
    ----------
    fpath : str | pathlib.Path
        The file path to the narrowPeak file.
    chromosome : str | None, optional
        If given (and non-empty), only rows whose ``chromosome`` column equals
        this value are returned (default is None, i.e. no filtering).
    resolution : int | None, optional
        If given, ``start`` and ``end`` are floor-divided by this value, i.e.
        converted to bin indices. Must be a positive integer.

    Returns
    -------
    pd.DataFrame
        The processed peaks with a fresh ``0..n-1`` index. In addition to the
        columns in ``NARROWPEAK_COLUMNS`` it contains ``region_id``, formatted
        as ``"<chromosome>:<start>-<end>"`` using the (possibly binned)
        coordinates.

    Raises
    ------
    FileNotFoundError
        If ``fpath`` does not exist.
    LoaderError
        If ``resolution`` is not positive, the file is empty, any peak has
        ``end <= start``, or any other error occurs while reading or
        converting the data.
    """
    fpath = pathlib.Path(fpath)

    # Floor-dividing by zero or a negative number would yield meaningless
    # (inf/NaN or sign-flipped) coordinates, so reject it up front.
    if resolution is not None and resolution <= 0:
        raise LoaderError(
            f"Resolution must be a positive integer, got {resolution}."
        )

    try:
        df = pd.read_csv(
            fpath,
            sep=r'\s+',
            header=None,
            names=NARROWPEAK_COLUMNS,
            # Keep purely numeric chromosome names ("1", "2", ...) as strings;
            # otherwise they are inferred as integers, which breaks both the
            # region_id concatenation and the string comparison below.
            dtype={'chromosome': str},
        )

        df['start'] = df['start'].astype(int)
        df['end'] = df['end'].astype(int)
        df['score'] = df['score'].astype(float)

        if resolution is not None:
            df['start'] //= resolution
            df['end'] //= resolution

        df['region_id'] = (
            df['chromosome']
            + ':' + df['start'].astype(str)
            + '-' + df['end'].astype(str)
        )

        # NOTE(review): this check runs on the binned coordinates, so a peak
        # whose start and end fall into the same bin is rejected when a
        # resolution is given — confirm that this is intended.
        if (df['end'] <= df['start']).any():
            raise LoaderError("End position must be greater than start position for all peaks.")

        if chromosome:
            df = df[df['chromosome'] == chromosome]

        return df.reset_index(drop=True)

    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file {fpath} was not found.") from e
    except pd.errors.EmptyDataError as e:
        raise LoaderError(f"The narrowPeak file is empty: {fpath}") from e
    except LoaderError:
        # Validation errors raised above already carry a precise message;
        # do not re-wrap them as a generic read failure.
        raise
    except Exception as e:
        raise LoaderError(f"An error occurred while reading the narrowPeak file {fpath}: {e}") from e