"""
Descriptive statistics for time series.
Provides a single :class:`DescriptiveStats` result object and a stateless
:class:`DescriptiveAnalyzer` that computes it. All arithmetic uses numpy
so there are no extra dependencies beyond the core stack.
The statistics reported go beyond what :func:`pandas.Series.describe` offers:
* Robust location / spread (median, MAD, trimmed mean).
* Shape (skewness, excess kurtosis).
* Quantiles at multiple probability levels.
* First/last value, range, coefficient of variation.
* Count of zeros and near-zero values.
Classes
-------
DescriptiveStats
Frozen dataclass containing every computed statistic.
DescriptiveAnalyzer
Stateless analyzer that produces :class:`DescriptiveStats`.
Examples
--------
>>> import numpy as np, pandas as pd
>>> from tseda import TimeSeries
>>> from tseda.statistics.descriptive import DescriptiveAnalyzer
>>> rng = np.random.default_rng(0)
>>> idx = pd.date_range("2020-01-01", periods=200, freq="D")
>>> ts = TimeSeries(rng.standard_normal(200), index=idx, name="returns")
>>> r = DescriptiveAnalyzer().analyze(ts)
>>> round(r.mean, 3)
0.024
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict
import numpy as np
from tseda.core.timeseries import TimeSeries
__all__ = ["DescriptiveStats", "DescriptiveAnalyzer"]
# ---------------------------------------------------------------------------
# Result dataclass
# ---------------------------------------------------------------------------
[docs]
@dataclass(frozen=True)
class DescriptiveStats:
"""Comprehensive descriptive statistics for a :class:`~tseda.core.TimeSeries`.
All statistics are computed on the **non-NaN** subset unless otherwise
noted.
Attributes
----------
n_total : int
Total number of observations (including NaN).
n_valid : int
Number of non-NaN observations.
n_nan : int
Number of NaN observations.
pct_nan : float
Percentage of NaN observations (0–100).
mean : float
Arithmetic mean.
median : float
50th percentile.
std : float
Sample standard deviation (ddof=1).
var : float
Sample variance (ddof=1).
mad : float
Median absolute deviation: ``median(|x - median(x)|)``.
trimmed_mean : float
Mean with the top and bottom 5 % of values removed.
min : float
Minimum value.
max : float
Maximum value.
range : float
``max - min``.
first : float
First (earliest) non-NaN value.
last : float
Last (most recent) non-NaN value.
cv : float
Coefficient of variation: ``std / |mean|``. ``nan`` when mean == 0.
skewness : float
Fisher's moment coefficient of skewness (bias-corrected).
kurtosis : float
Excess kurtosis (Fisher definition, bias-corrected). 0 for a
normal distribution.
quantiles : dict of float → float
Mapping from probability level to quantile value.
Keys: ``[0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]``.
n_zeros : int
Number of exact zeros.
n_positive : int
Number of strictly positive values.
n_negative : int
Number of strictly negative values.
"""
# Sample size
n_total: int
n_valid: int
n_nan: int
pct_nan: float
# Central tendency
mean: float
median: float
trimmed_mean: float
# Spread
std: float
var: float
mad: float
cv: float
# Range
min: float
max: float
range: float
first: float
last: float
# Shape
skewness: float
kurtosis: float
# Quantiles
quantiles: Dict[float, float]
# Value-type counts
n_zeros: int
n_positive: int
n_negative: int
[docs]
def __repr__(self) -> str: # pragma: no cover
return (
f"DescriptiveStats(\n"
f" n_valid : {self.n_valid:,} / {self.n_total:,} "
f"({self.pct_nan:.1f}% NaN)\n"
f" mean : {self.mean:.6g}\n"
f" median : {self.median:.6g}\n"
f" std : {self.std:.6g}\n"
f" [min, max] : [{self.min:.6g}, {self.max:.6g}]\n"
f" skewness : {self.skewness:.4f}\n"
f" kurtosis : {self.kurtosis:.4f}\n"
f")"
)
# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------
_QUANTILE_LEVELS = (0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99)
def _trimmed_mean(x: np.ndarray, trim: float = 0.05) -> float:
"""Return mean of *x* after removing the *trim* fraction from each tail."""
n = len(x)
k = int(np.floor(n * trim))
if k == 0:
return float(np.mean(x))
xs = np.sort(x)
return float(np.mean(xs[k : n - k]))
def _skewness(x: np.ndarray) -> float:
"""Bias-corrected sample skewness (Fisher's g1)."""
n = len(x)
if n < 3:
return float("nan")
m = x - x.mean()
m2 = float(np.mean(m ** 2))
m3 = float(np.mean(m ** 3))
if m2 == 0:
return float("nan")
g1 = m3 / m2 ** 1.5
# bias correction
return float(g1 * np.sqrt(n * (n - 1)) / (n - 2))
def _kurtosis(x: np.ndarray) -> float:
"""Bias-corrected excess kurtosis (Fisher's g2)."""
n = len(x)
if n < 4:
return float("nan")
m = x - x.mean()
m2 = float(np.mean(m ** 2))
m4 = float(np.mean(m ** 4))
if m2 == 0:
return float("nan")
# Fisher's excess kurtosis (normal = 0)
g2 = m4 / m2 ** 2 - 3.0
# Bias correction (excess kurtosis)
correction = (n - 1) / ((n - 2) * (n - 3)) * ((n + 1) * g2 + 6)
return float(correction)
# ---------------------------------------------------------------------------
# Analyzer
# ---------------------------------------------------------------------------
[docs]
class DescriptiveAnalyzer:
"""Compute comprehensive descriptive statistics for a
:class:`~tseda.core.TimeSeries`.
This class is **stateless** — one instance, many series.
Methods
-------
analyze(ts)
Return a :class:`DescriptiveStats` for *ts*.
Examples
--------
>>> import numpy as np, pandas as pd
>>> from tseda import TimeSeries
>>> from tseda.statistics.descriptive import DescriptiveAnalyzer
>>> idx = pd.date_range("2020", periods=5, freq="D")
>>> ts = TimeSeries([2.0, 4.0, 4.0, 4.0, 5.0], index=idx)
>>> r = DescriptiveAnalyzer().analyze(ts)
>>> r.mean
3.8
>>> r.std # doctest: +ELLIPSIS
1.09...
"""
[docs]
def analyze(self, ts: TimeSeries) -> DescriptiveStats:
"""Compute descriptive statistics for *ts*.
Parameters
----------
ts : TimeSeries
Input series.
Returns
-------
DescriptiveStats
Raises
------
TypeError
If *ts* is not a :class:`~tseda.core.TimeSeries`.
ValueError
If *ts* has no non-NaN values.
Examples
--------
>>> import numpy as np, pandas as pd
>>> from tseda import TimeSeries
>>> from tseda.statistics.descriptive import DescriptiveAnalyzer
>>> idx = pd.date_range("2020", periods=4, freq="D")
>>> ts = TimeSeries([1.0, 2.0, 3.0, 4.0], index=idx)
>>> r = DescriptiveAnalyzer().analyze(ts)
>>> r.median
2.5
>>> r.n_positive
4
"""
if not isinstance(ts, TimeSeries):
raise TypeError(
f"'ts' must be a TimeSeries, got {type(ts).__name__!r}."
)
vals = ts.values
not_nan = ~np.isnan(vals)
x = vals[not_nan]
n_valid = int(x.size)
if n_valid == 0:
raise ValueError("'ts' has no non-NaN values; cannot compute statistics.")
n_total = ts.n
n_nan = n_total - n_valid
pct_nan = 100.0 * n_nan / max(n_total, 1)
# Central tendency
mean = float(np.mean(x))
median = float(np.median(x))
tr_mean = _trimmed_mean(x, trim=0.05)
# Spread
std = float(np.std(x, ddof=1)) if n_valid > 1 else float("nan")
var = float(np.var(x, ddof=1)) if n_valid > 1 else float("nan")
mad = float(np.median(np.abs(x - median)))
cv = (std / abs(mean)) if (mean != 0 and not np.isnan(std)) else float("nan")
# Range
mn = float(np.min(x))
mx = float(np.max(x))
rng = mx - mn
# First / last non-NaN values (positional)
not_nan_idx = np.where(not_nan)[0]
first = float(vals[not_nan_idx[0]])
last = float(vals[not_nan_idx[-1]])
# Shape
skew = _skewness(x)
kurt = _kurtosis(x)
# Quantiles
quantiles = {
q: float(np.quantile(x, q)) for q in _QUANTILE_LEVELS
}
# Value-type counts
n_zeros = int(np.sum(x == 0.0))
n_positive = int(np.sum(x > 0.0))
n_negative = int(np.sum(x < 0.0))
return DescriptiveStats(
n_total=n_total,
n_valid=n_valid,
n_nan=n_nan,
pct_nan=round(pct_nan, 4),
mean=mean,
median=median,
trimmed_mean=tr_mean,
std=std,
var=var,
mad=mad,
cv=cv,
min=mn,
max=mx,
range=rng,
first=first,
last=last,
skewness=skew,
kurtosis=kurt,
quantiles=quantiles,
n_zeros=n_zeros,
n_positive=n_positive,
n_negative=n_negative,
)