# Source code for tseda.core.validator

"""
Input validation utilities for tseda.

Every public function in this module raises a descriptive :class:`TypeError`
or :class:`ValueError` on bad input and returns the canonicalised value on
success.  All heavy lifting of data coercion lives here so that
:class:`~tseda.core.TimeSeries` and analysis modules stay clean.

Functions
---------
validate_data_array
    Coerce arbitrary numeric input to a 1-D ``float64`` :class:`numpy.ndarray`.
validate_datetime_index
    Coerce arbitrary input to a sorted, duplicate-free
    :class:`pandas.DatetimeIndex`.
validate_positive_int
    Assert that a value is a positive integer.
validate_lags
    Assert that the requested lag count is sensible relative to series length.
validate_freq_string
    Assert that a string is a recognised pandas offset alias.
"""
from __future__ import annotations

from typing import Any, Optional

import numpy as np
import pandas as pd

__all__ = [
    "validate_data_array",
    "validate_datetime_index",
    "validate_positive_int",
    "validate_lags",
    "validate_freq_string",
]

# ---------------------------------------------------------------------------
# Public validators
# ---------------------------------------------------------------------------



[docs]
def validate_data_array(data: Any, *, name: str = "data") -> np.ndarray:
    """Coerce *data* to a 1-D ``float64`` :class:`numpy.ndarray`.

    The returned array is always a fresh, writable copy: mutating it never
    affects the object that was passed in, whichever input type was used.

    Parameters
    ----------
    data:
        Numeric input.  Accepted types:

        * :class:`numpy.ndarray` — must be 1-D.
        * :class:`pandas.Series` — values extracted; index ignored.
        * :class:`list` or :class:`tuple` — must be flat and numeric.

    name:
        Variable name used in error messages (default ``"data"``).

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``float64``.  NaN values are preserved.

    Raises
    ------
    TypeError
        If *data* is not a recognised type.
    ValueError
        If *data* is not 1-D, is empty, or contains elements that cannot be
        converted to ``float``.

    Examples
    --------
    >>> validate_data_array([1.0, 2.0, 3.0])
    array([1., 2., 3.])
    >>> validate_data_array(pd.Series([1, 2, 3]))
    array([1., 2., 3.])
    """
    if isinstance(data, pd.Series):
        try:
            # copy=True matters: for a float64 Series ``to_numpy`` otherwise
            # hands back a view of the Series' own buffer (or a read-only
            # view under copy-on-write), unlike the ndarray branch below.
            arr = data.to_numpy(dtype=float, na_value=np.nan, copy=True)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"'{name}' could not be converted to a numeric array: {exc}"
            ) from exc
    elif isinstance(data, np.ndarray):
        # Check dimensionality first so the shape error wins over a
        # conversion error for multi-dimensional non-numeric arrays.
        if data.ndim != 1:
            raise ValueError(
                f"'{name}' must be 1-D, got shape {data.shape}. "
                "Use MultiTimeSeries for multivariate data."
            )
        try:
            arr = data.astype(float, copy=True)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"'{name}' could not be converted to a numeric array: {exc}"
            ) from exc
    elif isinstance(data, (list, tuple)):
        try:
            arr = np.asarray(data, dtype=float)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"'{name}' could not be converted to a numeric array: {exc}"
            ) from exc
        # Nested sequences convert fine but yield ndim > 1.
        if arr.ndim != 1:
            raise ValueError(
                f"'{name}' must be a flat 1-D sequence, got shape {arr.shape}."
            )
    else:
        raise TypeError(
            f"'{name}' must be array-like (ndarray, Series, list, or tuple), "
            f"got {type(data).__name__!r}."
        )

    if arr.size == 0:
        raise ValueError(f"'{name}' must contain at least one element.")

    return arr




[docs]
def validate_datetime_index(index: Any, *, name: str = "index") -> pd.DatetimeIndex:
    """Coerce *index* to a sorted, duplicate-free :class:`pandas.DatetimeIndex`.

    The input is converted but never reordered or de-duplicated: an index
    that is unsorted, has repeated timestamps, or contains missing values
    (``NaT``) is rejected with a :class:`ValueError`.

    Parameters
    ----------
    index:
        Datetime-like input.  Accepted types:

        * :class:`pandas.DatetimeIndex`
        * :class:`pandas.Series` with datetime dtype
        * :class:`list`, :class:`tuple` or :class:`numpy.ndarray` of
          datetime-like strings or :class:`numpy.datetime64` values

    name:
        Variable name used in error messages (default ``"index"``).

    Returns
    -------
    pandas.DatetimeIndex
        Validated, monotonically increasing, duplicate-free index.  A
        :class:`pandas.DatetimeIndex` input is returned as-is (no copy).

    Raises
    ------
    TypeError
        If *index* is not a recognised type or cannot be converted to a
        :class:`pandas.DatetimeIndex`.
    ValueError
        If *index* is empty, contains ``NaT``, is not monotonically
        increasing, or contains duplicates.

    Examples
    --------
    >>> idx = pd.date_range("2020-01-01", periods=5, freq="D")
    >>> validate_datetime_index(idx)  # doctest: +ELLIPSIS
    DatetimeIndex(['2020-01-01', ..., '2020-01-05'], dtype='datetime64[ns]', freq='D')
    """
    if isinstance(index, pd.DatetimeIndex):
        dti = index
    elif isinstance(index, pd.Series):
        try:
            dti = pd.DatetimeIndex(index)
        except Exception as exc:
            # Broad on purpose: any conversion failure is reported uniformly
            # as a TypeError, with the original exception chained.
            raise TypeError(
                f"'{name}' Series could not be converted to DatetimeIndex: {exc}"
            ) from exc
    elif isinstance(index, (list, tuple, np.ndarray)):
        try:
            dti = pd.DatetimeIndex(index)
        except Exception as exc:
            raise TypeError(
                f"'{name}' sequence could not be parsed as datetimes: {exc}"
            ) from exc
    else:
        raise TypeError(
            f"'{name}' must be a DatetimeIndex or datetime-like sequence, "
            f"got {type(index).__name__!r}."
        )

    if len(dti) == 0:
        raise ValueError(f"'{name}' must contain at least one timestamp.")

    # Check for NaT explicitly, before the ordering check: an index with NaT
    # is reported as non-monotonic by pandas, which would otherwise produce a
    # misleading "sort your data" message.
    if dti.hasnans:
        n_missing = int(dti.isna().sum())
        raise ValueError(
            f"'{name}' contains {n_missing} missing timestamp(s) (NaT). "
            "Drop or fill missing timestamps before constructing a TimeSeries."
        )

    # Non-strict monotonicity: repeated timestamps pass here and are caught
    # by the duplicate check below.
    if not dti.is_monotonic_increasing:
        raise ValueError(
            f"'{name}' must be monotonically increasing (time-sorted). "
            "Sort your data before constructing a TimeSeries."
        )

    if dti.has_duplicates:
        n_dupes = int(dti.duplicated().sum())
        raise ValueError(
            f"'{name}' contains {n_dupes} duplicate timestamp(s). "
            "Aggregate or drop duplicates before constructing a TimeSeries."
        )

    return dti




[docs]
def validate_positive_int(value: Any, *, name: str = "value") -> int:
    """Assert that *value* is a positive integer.

    Parameters
    ----------
    value:
        The candidate value.  Python :class:`int` and NumPy integer scalars
        are accepted; :class:`bool` is rejected even though it is a subclass
        of :class:`int`.
    name:
        Variable name used in error messages.

    Returns
    -------
    int
        The validated value as a built-in :class:`int`.

    Raises
    ------
    TypeError
        If *value* is not an integer type (including when it is a ``bool``).
    ValueError
        If *value* is less than 1.

    Examples
    --------
    >>> validate_positive_int(5)
    5
    """
    # ``bool`` passes ``isinstance(..., int)``; without the explicit check
    # ``True`` would be silently accepted as the integer 1.
    if isinstance(value, bool) or not isinstance(value, (int, np.integer)):
        raise TypeError(
            f"'{name}' must be an integer, got {type(value).__name__!r}."
        )
    v = int(value)  # normalise NumPy integer scalars to built-in int
    if v < 1:
        raise ValueError(f"'{name}' must be >= 1, got {v}.")
    return v




[docs]
def validate_lags(lags: int, n: int, *, name: str = "lags") -> int:
    """Check that *lags* is a usable lag count for a series of length *n*.

    Autocorrelation estimates become unreliable as the lag approaches the
    series length, so the largest accepted value is ``n // 2``.

    Parameters
    ----------
    lags:
        Requested number of lags.  Checked with
        :func:`validate_positive_int` first.
    n:
        Length of the time series.
    name:
        Variable name used in error messages.

    Returns
    -------
    int
        The validated lag count.

    Raises
    ------
    TypeError
        Propagated from :func:`validate_positive_int` when *lags* is not an
        integer.
    ValueError
        If *lags* is not in ``[1, n // 2]``.

    Examples
    --------
    >>> validate_lags(40, 100)
    40
    """
    requested = validate_positive_int(lags, name=name)
    ceiling = n // 2

    if requested > ceiling:
        message = (
            f"'{name}' ({requested}) exceeds the maximum allowed value of "
            f"n // 2 = {ceiling} for a series of length {n}."
        )
        raise ValueError(message)

    return requested




[docs]
def validate_freq_string(freq: Any, *, name: str = "freq") -> str:
    """Check that *freq* is a non-empty pandas offset alias.

    The string is stripped of surrounding whitespace and then handed to
    :func:`pandas.tseries.frequencies.to_offset`; it is accepted when that
    call succeeds and yields an offset.

    Parameters
    ----------
    freq:
        Candidate frequency string (e.g., ``"D"``, ``"h"``, ``"MS"``).
    name:
        Variable name used in error messages.

    Returns
    -------
    str
        The frequency string with surrounding whitespace removed.

    Raises
    ------
    TypeError
        If *freq* is not a string.
    ValueError
        If *freq* is empty (after stripping) or not recognised by pandas.

    Examples
    --------
    >>> validate_freq_string("D")
    'D'
    >>> validate_freq_string("15min")
    '15min'
    """
    if not isinstance(freq, str):
        raise TypeError(
            f"'{name}' must be a string (e.g., 'D', 'h', 'MS'), "
            f"got {type(freq).__name__!r}."
        )

    alias = freq.strip()
    if alias == "":
        raise ValueError(f"'{name}' must not be empty.")

    # Both rejection paths below report the same message.
    unrecognised = (
        f"'{name}' = {alias!r} is not a recognised pandas offset alias. "
        "See https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases"
    )

    try:
        parsed = pd.tseries.frequencies.to_offset(alias)
    except (ValueError, KeyError) as exc:
        raise ValueError(unrecognised) from exc

    # Defensive: treat a None result the same as a parse failure.
    if parsed is None:
        raise ValueError(unrecognised)

    return alias