# Source code for tseda.statistics.descriptive

"""
Descriptive statistics for time series.

Provides a single :class:`DescriptiveStats` result object and a stateless
:class:`DescriptiveAnalyzer` that computes it.  All arithmetic uses numpy
so there are no extra dependencies beyond the core stack.

The statistics reported go beyond what :func:`pandas.Series.describe` offers:

* Robust location / spread (median, MAD, trimmed mean).
* Shape (skewness, excess kurtosis).
* Quantiles at multiple probability levels.
* First/last value, range, coefficient of variation.
* Counts of exact zeros and of strictly positive / strictly negative values.

Classes
-------
DescriptiveStats
    Frozen dataclass containing every computed statistic.
DescriptiveAnalyzer
    Stateless analyzer that produces :class:`DescriptiveStats`.

Examples
--------
>>> import numpy as np, pandas as pd
>>> from tseda import TimeSeries
>>> from tseda.statistics.descriptive import DescriptiveAnalyzer

>>> rng = np.random.default_rng(0)
>>> idx = pd.date_range("2020-01-01", periods=200, freq="D")
>>> ts  = TimeSeries(rng.standard_normal(200), index=idx, name="returns")
>>> r   = DescriptiveAnalyzer().analyze(ts)
>>> round(r.mean, 3)
0.024
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict

import numpy as np

from tseda.core.timeseries import TimeSeries

# Names exported by a star-import of this module; the underscore-prefixed
# helpers below are intentionally left out.
__all__ = ["DescriptiveStats", "DescriptiveAnalyzer"]

# ---------------------------------------------------------------------------
# Result dataclass
# ---------------------------------------------------------------------------



@dataclass(frozen=True)
class DescriptiveStats:
    """Comprehensive descriptive statistics for a :class:`~tseda.core.TimeSeries`.

    All statistics are computed on the **non-NaN** subset unless otherwise
    noted.

    Instances are immutable and hashable.  The ``quantiles`` mapping is a
    plain ``dict`` and therefore excluded from the hash (it still takes part
    in equality comparison); without that exclusion ``hash()`` on an instance
    would raise ``TypeError``.

    Attributes
    ----------
    n_total : int
        Total number of observations (including NaN).
    n_valid : int
        Number of non-NaN observations.
    n_nan : int
        Number of NaN observations.
    pct_nan : float
        Percentage of NaN observations (0–100).
    mean : float
        Arithmetic mean.
    median : float
        50th percentile.
    trimmed_mean : float
        Mean with the top and bottom 5 % of values removed.
    std : float
        Sample standard deviation (ddof=1).  ``nan`` when fewer than two
        valid observations are available.
    var : float
        Sample variance (ddof=1).  ``nan`` when fewer than two valid
        observations are available.
    mad : float
        Median absolute deviation: ``median(|x - median(x)|)``.
    cv : float
        Coefficient of variation: ``std / |mean|``.  ``nan`` when
        ``mean == 0`` or when ``std`` is ``nan``.
    min : float
        Minimum value.
    max : float
        Maximum value.
    range : float
        ``max - min``.
    first : float
        First (earliest) non-NaN value.
    last : float
        Last (most recent) non-NaN value.
    skewness : float
        Fisher's moment coefficient of skewness (bias-corrected).  ``nan``
        for fewer than three values or zero variance.
    kurtosis : float
        Excess kurtosis (Fisher definition, bias-corrected).  0 for a
        normal distribution.  ``nan`` for fewer than four values or zero
        variance.
    quantiles : dict of float → float
        Mapping from probability level to quantile value.
        Keys: ``[0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]``.
    n_zeros : int
        Number of exact zeros.
    n_positive : int
        Number of strictly positive values.
    n_negative : int
        Number of strictly negative values.
    """

    # Sample size
    n_total: int
    n_valid: int
    n_nan: int
    pct_nan: float

    # Central tendency
    mean: float
    median: float
    trimmed_mean: float

    # Spread
    std: float
    var: float
    mad: float
    cv: float

    # Range
    min: float
    max: float
    range: float
    first: float
    last: float

    # Shape
    skewness: float
    kurtosis: float

    # Quantiles -- a dict is unhashable, so keep it out of the generated
    # __hash__ while still comparing it in __eq__.
    quantiles: Dict[float, float] = field(hash=False)

    # Value-type counts
    n_zeros: int
    n_positive: int
    n_negative: int

    def __repr__(self) -> str:  # pragma: no cover
        """Return a compact multi-line summary of the headline statistics."""
        return (
            f"DescriptiveStats(\n"
            f"  n_valid    : {self.n_valid:,} / {self.n_total:,}  "
            f"({self.pct_nan:.1f}% NaN)\n"
            f"  mean       : {self.mean:.6g}\n"
            f"  median     : {self.median:.6g}\n"
            f"  std        : {self.std:.6g}\n"
            f"  [min, max] : [{self.min:.6g}, {self.max:.6g}]\n"
            f"  skewness   : {self.skewness:.4f}\n"
            f"  kurtosis   : {self.kurtosis:.4f}\n"
            f")"
        )




# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------

# Probability levels used as the keys of ``DescriptiveStats.quantiles``.
# Kept as a tuple so the shared module-level value cannot be mutated in place.
_QUANTILE_LEVELS = (0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99)


def _trimmed_mean(x: np.ndarray, trim: float = 0.05) -> float:
    """Return mean of *x* after removing the *trim* fraction from each tail."""
    n = len(x)
    k = int(np.floor(n * trim))
    if k == 0:
        return float(np.mean(x))
    xs = np.sort(x)
    return float(np.mean(xs[k : n - k]))


def _skewness(x: np.ndarray) -> float:
    """Bias-corrected sample skewness (Fisher's g1)."""
    n = len(x)
    if n < 3:
        return float("nan")
    m = x - x.mean()
    m2 = float(np.mean(m ** 2))
    m3 = float(np.mean(m ** 3))
    if m2 == 0:
        return float("nan")
    g1 = m3 / m2 ** 1.5
    # bias correction
    return float(g1 * np.sqrt(n * (n - 1)) / (n - 2))


def _kurtosis(x: np.ndarray) -> float:
    """Bias-corrected excess kurtosis (Fisher's g2)."""
    n = len(x)
    if n < 4:
        return float("nan")
    m = x - x.mean()
    m2 = float(np.mean(m ** 2))
    m4 = float(np.mean(m ** 4))
    if m2 == 0:
        return float("nan")
    # Fisher's excess kurtosis (normal = 0)
    g2 = m4 / m2 ** 2 - 3.0
    # Bias correction (excess kurtosis)
    correction = (n - 1) / ((n - 2) * (n - 3)) * ((n + 1) * g2 + 6)
    return float(correction)


# ---------------------------------------------------------------------------
# Analyzer
# ---------------------------------------------------------------------------



class DescriptiveAnalyzer:
    """Compute comprehensive descriptive statistics for a
    :class:`~tseda.core.TimeSeries`.

    This class is **stateless** — one instance, many series.

    Methods
    -------
    analyze(ts)
        Return a :class:`DescriptiveStats` for *ts*.

    Examples
    --------
    >>> import numpy as np, pandas as pd
    >>> from tseda import TimeSeries
    >>> from tseda.statistics.descriptive import DescriptiveAnalyzer

    >>> idx = pd.date_range("2020", periods=5, freq="D")
    >>> ts  = TimeSeries([2.0, 4.0, 4.0, 4.0, 5.0], index=idx)
    >>> r   = DescriptiveAnalyzer().analyze(ts)
    >>> r.mean
    3.8
    >>> r.std  # doctest: +ELLIPSIS
    1.09...
    """

    def analyze(self, ts: TimeSeries) -> DescriptiveStats:
        """Compute descriptive statistics for *ts*.

        NaN observations are counted and then dropped; every statistic
        other than the sample-size fields is computed on the remaining
        values.

        Parameters
        ----------
        ts : TimeSeries
            Input series.

        Returns
        -------
        DescriptiveStats

        Raises
        ------
        TypeError
            If *ts* is not a :class:`~tseda.core.TimeSeries`.
        ValueError
            If *ts* has no non-NaN values.

        Examples
        --------
        >>> import numpy as np, pandas as pd
        >>> from tseda import TimeSeries
        >>> from tseda.statistics.descriptive import DescriptiveAnalyzer

        >>> idx = pd.date_range("2020", periods=4, freq="D")
        >>> ts  = TimeSeries([1.0, 2.0, 3.0, 4.0], index=idx)
        >>> r   = DescriptiveAnalyzer().analyze(ts)
        >>> r.median
        2.5
        >>> r.n_positive
        4
        """
        if not isinstance(ts, TimeSeries):
            raise TypeError(
                f"'ts' must be a TimeSeries, got {type(ts).__name__!r}."
            )

        vals = ts.values
        not_nan = ~np.isnan(vals)
        # Boolean masking keeps the original (time) order of the valid values.
        x = vals[not_nan]
        n_valid = int(x.size)

        if n_valid == 0:
            raise ValueError("'ts' has no non-NaN values; cannot compute statistics.")

        n_total = ts.n
        n_nan = n_total - n_valid
        pct_nan = 100.0 * n_nan / max(n_total, 1)

        # Central tendency
        mean = float(np.mean(x))
        median = float(np.median(x))
        tr_mean = _trimmed_mean(x, trim=0.05)

        # Spread -- the ddof=1 estimators are undefined for a single value.
        std = float(np.std(x, ddof=1)) if n_valid > 1 else float("nan")
        var = float(np.var(x, ddof=1)) if n_valid > 1 else float("nan")
        mad = float(np.median(np.abs(x - median)))
        cv = (std / abs(mean)) if (mean != 0 and not np.isnan(std)) else float("nan")

        # Range
        mn = float(np.min(x))
        mx = float(np.max(x))
        rng = mx - mn

        # First / last non-NaN values: since x preserves order, these are
        # simply its end points (no second index search required).
        first = float(x[0])
        last = float(x[-1])

        # Shape
        skew = _skewness(x)
        kurt = _kurtosis(x)

        # Quantiles -- one vectorized call instead of one call per level.
        q_values = np.quantile(x, _QUANTILE_LEVELS)
        quantiles = {
            level: float(value)
            for level, value in zip(_QUANTILE_LEVELS, q_values)
        }

        # Value-type counts
        n_zeros = int(np.sum(x == 0.0))
        n_positive = int(np.sum(x > 0.0))
        n_negative = int(np.sum(x < 0.0))

        return DescriptiveStats(
            n_total=n_total,
            n_valid=n_valid,
            n_nan=n_nan,
            pct_nan=round(pct_nan, 4),
            mean=mean,
            median=median,
            trimmed_mean=tr_mean,
            std=std,
            var=var,
            mad=mad,
            cv=cv,
            min=mn,
            max=mx,
            range=rng,
            first=first,
            last=last,
            skewness=skew,
            kurtosis=kurt,
            quantiles=quantiles,
            n_zeros=n_zeros,
            n_positive=n_positive,
            n_negative=n_negative,
        )