Source code for gunz_cm.loaders.narrowpeaks

from __future__ import annotations
# -*- coding: utf-8 -*-
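"""
This module provides functionality to load and process genomic peak data from
narrowPeak format files.


Examples
--------
>>> from gunz_cm.loaders.narrowpeaks import load_narrowpeak  # doctest: +SKIP
>>> peaks = load_narrowpeak("peaks.narrowPeak", chromosome="chr1")  # doctest: +SKIP
"""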

# =============================================================================
# METADATA
# =============================================================================
__author__ = "Yeremia Gunawan Adhisantoso"
__email__ = "adhisant@tnt.uni-hannover.de"
__license__ = "Clear BSD"
__version__ = "1.0.0"


# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import pathlib

# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
import pandas as pd
from pydantic import validate_call

# =============================================================================
# LOCAL APPLICATION IMPORTS
# =============================================================================
from ..exceptions import LoaderError
from ..consts import NARROWPEAK_COLUMNS

# =============================================================================
# PUBLIC FUNCTIONS
# =============================================================================


[docs]
@validate_call(config=dict(arbitrary_types_allowed=True))
def get_chrom_infos(
    fpath: str | pathlib.Path,
) -> dict[str, dict[str, str | None]]:
    """
    Collects the chromosomes that appear in a narrowPeak file.

    The file is loaded with ``load_narrowpeak`` and every distinct value of
    its ``chromosome`` column becomes one entry of the result. narrowPeak
    records carry no chromosome sizes, so each ``length`` is left as None.

    Parameters
    ----------
    fpath : str | pathlib.Path
        The file path to the narrowPeak file.

    Returns
    -------
    dict[str, dict[str, str | None]]
        Maps each chromosome name to ``{'name': <chromosome>, 'length': None}``,
        in order of first appearance in the file.

    Examples
    --------
    >>> infos = get_chrom_infos("peaks.narrowPeak")  # doctest: +SKIP
    >>> infos["chr1"]  # doctest: +SKIP
    {'name': 'chr1', 'length': None}
    """
    peaks = load_narrowpeak(fpath)

    # unique() keeps first-seen order; tolist() yields plain Python objects
    # to use as dictionary keys.
    chrom_names = peaks['chromosome'].unique().tolist()

    chrom_infos = {}
    for chrom_name in chrom_names:
        chrom_infos[chrom_name] = {'name': chrom_name, 'length': None}
    return chrom_infos




[docs]
@validate_call(config=dict(arbitrary_types_allowed=True))
def load_narrowpeak(
    fpath: str | pathlib.Path,
    chromosome: str | None = None,
    resolution: int | None = None,
) -> pd.DataFrame:
    """
    Reads and processes a narrowPeak file.

    The file is parsed as a header-less, whitespace-delimited table whose
    columns are named after ``NARROWPEAK_COLUMNS``. The ``start``/``end``
    columns are cast to int and ``score`` to float, the peak coordinates are
    validated, optionally binned to ``resolution``, and the result is
    optionally restricted to a single chromosome.

    Parameters
    ----------
    fpath : str | pathlib.Path
        The file path to the narrowPeak file.
    chromosome : str | None, optional
        The chromosome to filter by (default is None, i.e. keep all rows).
    resolution : int | None, optional
        The resolution parameter. If provided, the start and end coordinates
        are floor-divided by this value (binned). Must be a positive integer.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the processed narrowPeak data, with an extra
        ``region_id`` column of the form ``<chromosome>:<start>-<end>`` built
        from the (possibly binned) coordinates. The index is reset to
        0..n-1.

    Raises
    ------
    FileNotFoundError
        If ``fpath`` does not exist.
    LoaderError
        If the file is empty, if ``resolution`` is not positive, if any peak
        has ``end <= start``, or if any other error occurs while reading or
        processing the file.

    Examples
    --------
    >>> peaks = load_narrowpeak("peaks.narrowPeak", chromosome="chr1")  # doctest: +SKIP
    >>> binned = load_narrowpeak("peaks.narrowPeak", resolution=1000)  # doctest: +SKIP
    """
    fpath = pathlib.Path(fpath)

    # Floor division by zero or a negative bin size would yield inf/negative
    # bins; reject it up front with a clear message.
    if resolution is not None and resolution <= 0:
        raise LoaderError(f"Resolution must be a positive integer, got {resolution}.")

    try:
        # Raw string: '\s' is not a valid Python escape sequence and triggers
        # a SyntaxWarning on recent interpreters when written as '\s+'.
        df = pd.read_csv(fpath, sep=r'\s+', header=None, names=NARROWPEAK_COLUMNS)

        df['start'] = df['start'].astype(int)
        df['end'] = df['end'].astype(int)
        df['score'] = df['score'].astype(float)

        # Validate on the raw coordinates: after binning, a perfectly valid
        # peak that lies within a single bin has start == end.
        if (df['end'] <= df['start']).any():
            raise LoaderError("End position must be greater than start position for all peaks.")

        if resolution is not None:
            df['start'] //= resolution
            df['end'] //= resolution

        df['region_id'] = df['chromosome'] + ':' + df['start'].astype(str) + '-' + df['end'].astype(str)

        if chromosome:
            df = df[df['chromosome'] == chromosome]

        return df.reset_index(drop=True)

    except LoaderError:
        # Already a domain error with a precise message; do not re-wrap it in
        # the generic handler below.
        raise
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file {fpath} was not found.") from e
    except pd.errors.EmptyDataError as e:
        raise LoaderError(f"The narrowPeak file is empty: {fpath}") from e
    except Exception as e:
        raise LoaderError(f"An error occurred while reading the narrowPeak file {fpath}: {e}") from e