Implement two new scipy-compatible distributions: Log-Nakagami (lognakagami) and Log-Gamma (loggamma_dist), with complete logpdf/cdf/ppf/stats/entropy/rvs methods derived from the change of variable Y = ln(X). Add kl_statistic, a KDE-based KL-divergence goodness-of-fit callable compatible with the Fitter class. Extend k_gen with _stats (for speed), _cdf, and a fit guard, and switch kv → kve for better numerical stability at large arguments. Add unit tests for all three additions covering normalization, monotonicity, ppf inversion, moment formulas, and Fitter integration.
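The change of variable behind both new distributions is the standard one: if Y = ln(X) with X supported on (0, ∞), then f_Y(y) = f_X(e^y) · e^y, so logpdf_Y(y) = logpdf_X(e^y) + y, F_Y(y) = F_X(e^y), and ppf_Y(q) = ln(ppf_X(q)). The kv → kve switch in k_gen uses the identity kve(v, x) = exp(x) · kv(v, x), so log kv(v, x) = log kve(v, x) - x, which stays finite where kv itself underflows. As a minimal sketch of the change-of-variable mechanics only (the class layout and parameter name nu are illustrative; per the summary above, the actual lognakagami also provides _stats and _entropy):

import numpy as np
from scipy.stats import rv_continuous, nakagami

class lognakagami_gen(rv_continuous):
    """Distribution of Y = ln(X) for X ~ Nakagami(nu); support is the whole real line."""

    def _logpdf(self, y, nu):
        # f_Y(y) = f_X(e^y) * e^y  =>  logpdf_Y(y) = nakagami.logpdf(e^y, nu) + y
        return nakagami.logpdf(np.exp(y), nu) + y

    def _cdf(self, y, nu):
        # F_Y(y) = F_X(e^y)
        return nakagami.cdf(np.exp(y), nu)

    def _ppf(self, q, nu):
        # ppf_Y(q) = ln(ppf_X(q)), the exact inverse of _cdf
        return np.log(nakagami.ppf(q, nu))

    def _rvs(self, nu, size=None, random_state=None):
        # Sampling reduces to taking logs of Nakagami draws
        return np.log(nakagami.rvs(nu, size=size, random_state=random_state))

lognakagami = lognakagami_gen(name="lognakagami")

Log-Gamma (loggamma_dist) follows the same pattern with gamma as the base distribution.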
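The contract of kl_statistic is pinned down by test_matches_manual_formula in the test file below. A minimal sketch consistent with those tests (the epsilon value 1e-10 and the unnormalized finite sum come straight from the test body; the real implementation in tools.statistics may differ in structure):

import numpy as np
from scipy.stats import gaussian_kde

def kl_statistic(frozen_dist, data, axis=None):
    """KDE-based KL-divergence goodness-of-fit statistic (sketch).

    p is a Gaussian-KDE estimate of the data density at the sample points;
    q is the candidate distribution's PDF at the same points. The epsilon
    guard keeps the log ratio finite when q is effectively zero.
    """
    data = np.asarray(data)
    data_pdf = gaussian_kde(data)(data)  # p: KDE density at each sample point
    dist_pdf = frozen_dist.pdf(data)     # q: candidate density at each sample point
    epsilon = 1e-10                      # guard value pinned by the tests
    return np.sum(data_pdf * np.log((data_pdf + epsilon) / (dist_pdf + epsilon)), axis=axis)

Because this is a finite sum rather than a numerical integral, the statistic is not guaranteed non-negative, which is why the tests assert only that it is real and finite.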
import os
import sys

import numpy as np
import pytest
from scipy.stats import gamma, expon, norm

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from tools.statistics import aic_statistic, bic_statistic, kl_statistic
from fitting.fitter import Fitter

RNG = np.random.default_rng(42)
GAMMA_DATA = RNG.gamma(shape=2.0, scale=1.5, size=200)


# ── aic_statistic unit tests ──────────────────────────────────────────────────


class TestAicStatistic:
    def _fitted_dist(self, dist, data, **kwargs):
        """Return a frozen distribution fitted to data."""
        params = dist.fit(data, **kwargs)
        return dist(*params)

    def test_returns_float(self):
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        result = aic_statistic(frozen, GAMMA_DATA, axis=0)
        assert isinstance(float(result), float)

    def test_formula_correct(self):
        """AIC = 2k - 2*log_likelihood."""
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        k = len(frozen.args)
        log_likelihood = np.sum(frozen.logpdf(GAMMA_DATA), axis=0)
        expected = 2 * k - 2 * log_likelihood
        assert aic_statistic(frozen, GAMMA_DATA, axis=0) == pytest.approx(expected)

    def test_penalises_more_parameters(self):
        """gamma (3 params) carries a larger AIC penalty term (2k) than expon (2 params)."""
        gamma_frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        expon_frozen = self._fitted_dist(expon, GAMMA_DATA, floc=0)
        # penalty term alone: 2*k; gamma has more params, so its penalty is larger
        assert 2 * len(gamma_frozen.args) > 2 * len(expon_frozen.args)

    def test_better_fit_has_lower_aic(self):
        """Gamma fitted to gamma data should have lower AIC than normal fitted to gamma data."""
        gamma_frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        norm_frozen = self._fitted_dist(norm, GAMMA_DATA)
        aic_gamma = aic_statistic(gamma_frozen, GAMMA_DATA, axis=0)
        aic_norm = aic_statistic(norm_frozen, GAMMA_DATA, axis=0)
        assert aic_gamma < aic_norm

    def test_works_with_axis_none(self):
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        result = aic_statistic(frozen, GAMMA_DATA, axis=None)
        assert np.isfinite(result)

    def test_result_is_finite(self):
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        assert np.isfinite(aic_statistic(frozen, GAMMA_DATA, axis=0))


# ── Integration: aic_statistic as callable in Fitter ─────────────────────────


class TestAicStatisticInFitter:
    def test_fitter_accepts_aic_callable(self):
        f = Fitter([gamma], statistic_method=aic_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert f["gamma"].test_result is not None

    def test_fitter_aic_statistic_is_finite(self):
        f = Fitter([gamma], statistic_method=aic_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert np.isfinite(f["gamma"].gof_statistic)

    def test_fitter_aic_pvalue_in_range(self):
        f = Fitter([gamma], statistic_method=aic_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        pval = f["gamma"].pvalue
        assert 0.0 <= pval <= 1.0

    def test_fitter_aic_vs_ad_different_statistic_values(self):
        """AIC and AD statistics should differ numerically."""
        f_aic = Fitter([gamma], statistic_method=aic_statistic, gamma_params={"floc": 0})
        f_ad = Fitter([gamma], statistic_method="ad", gamma_params={"floc": 0})
        f_aic.fit(GAMMA_DATA)
        f_ad.fit(GAMMA_DATA)
        f_aic.validate(n_mc_samples=99)
        f_ad.validate(n_mc_samples=99)
        assert f_aic["gamma"].gof_statistic != pytest.approx(f_ad["gamma"].gof_statistic)

    def test_fitter_aic_multiple_distributions(self):
        f = Fitter(
            [gamma, expon],
            statistic_method=aic_statistic,
            gamma_params={"floc": 0},
            expon_params={"floc": 0},
        )
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert f["gamma"].test_result is not None
        assert f["expon"].test_result is not None


# ── bic_statistic unit tests ──────────────────────────────────────────────────


class TestBicStatistic:
    def _fitted_dist(self, dist, data, **kwargs):
        """Return a frozen distribution fitted to data."""
        params = dist.fit(data, **kwargs)
        return dist(*params)

    def test_returns_float(self):
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        result = bic_statistic(frozen, GAMMA_DATA, axis=0)
        assert isinstance(float(result), float)

    def test_formula_correct(self):
        """BIC = ln(n)*k - 2*log_likelihood."""
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        n = len(GAMMA_DATA)
        k = len(frozen.args)
        log_likelihood = np.sum(frozen.logpdf(GAMMA_DATA), axis=0)
        expected = np.log(n) * k - 2 * log_likelihood
        assert bic_statistic(frozen, GAMMA_DATA, axis=0) == pytest.approx(expected)

    def test_penalises_more_parameters(self):
        """gamma (3 params) carries a larger BIC penalty term (ln(n)*k) than expon (2 params)."""
        gamma_frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        expon_frozen = self._fitted_dist(expon, GAMMA_DATA, floc=0)
        n = len(GAMMA_DATA)
        assert np.log(n) * len(gamma_frozen.args) > np.log(n) * len(expon_frozen.args)

    def test_better_fit_has_lower_bic(self):
        """Gamma fitted to gamma data should have lower BIC than normal fitted to gamma data."""
        gamma_frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        norm_frozen = self._fitted_dist(norm, GAMMA_DATA)
        bic_gamma = bic_statistic(gamma_frozen, GAMMA_DATA, axis=0)
        bic_norm = bic_statistic(norm_frozen, GAMMA_DATA, axis=0)
        assert bic_gamma < bic_norm

    def test_works_with_axis_none(self):
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        result = bic_statistic(frozen, GAMMA_DATA, axis=None)
        assert np.isfinite(result)

    def test_result_is_finite(self):
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        assert np.isfinite(bic_statistic(frozen, GAMMA_DATA, axis=0))


# ── Integration: bic_statistic as callable in Fitter ─────────────────────────


class TestBicStatisticInFitter:
    def test_fitter_accepts_bic_callable(self):
        f = Fitter([gamma], statistic_method=bic_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert f["gamma"].test_result is not None

    def test_fitter_bic_statistic_is_finite(self):
        f = Fitter([gamma], statistic_method=bic_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert np.isfinite(f["gamma"].gof_statistic)

    def test_fitter_bic_pvalue_in_range(self):
        f = Fitter([gamma], statistic_method=bic_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        pval = f["gamma"].pvalue
        assert 0.0 <= pval <= 1.0

    def test_fitter_bic_vs_ad_different_statistic_values(self):
        """BIC and AD statistics should differ numerically."""
        f_bic = Fitter([gamma], statistic_method=bic_statistic, gamma_params={"floc": 0})
        f_ad = Fitter([gamma], statistic_method="ad", gamma_params={"floc": 0})
        f_bic.fit(GAMMA_DATA)
        f_ad.fit(GAMMA_DATA)
        f_bic.validate(n_mc_samples=99)
        f_ad.validate(n_mc_samples=99)
        assert f_bic["gamma"].gof_statistic != pytest.approx(f_ad["gamma"].gof_statistic)

    def test_fitter_bic_multiple_distributions(self):
        f = Fitter(
            [gamma, expon],
            statistic_method=bic_statistic,
            gamma_params={"floc": 0},
            expon_params={"floc": 0},
        )
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert f["gamma"].test_result is not None
        assert f["expon"].test_result is not None


# ── kl_statistic unit tests ───────────────────────────────────────────────────


class TestKlStatistic:
    def _fitted_dist(self, dist, data, **kwargs):
        """Return a frozen distribution fitted to data."""
        params = dist.fit(data, **kwargs)
        return dist(*params)

    def test_returns_float(self):
        """kl_statistic must return a numeric scalar."""
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        result = kl_statistic(frozen, GAMMA_DATA, axis=None)
        assert isinstance(float(result), float)

    def test_returns_real_value(self):
        """kl_statistic returns a real number (the KDE finite-sum approximation can be negative)."""
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        result = kl_statistic(frozen, GAMMA_DATA, axis=None)
        assert np.isreal(result)

    def test_result_is_finite(self):
        """kl_statistic must return a finite value for valid input."""
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        assert np.isfinite(kl_statistic(frozen, GAMMA_DATA, axis=None))

    def test_works_with_axis_zero(self):
        """kl_statistic must return a finite value when axis=0."""
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        result = kl_statistic(frozen, GAMMA_DATA, axis=0)
        assert np.isfinite(result)

    def test_axis_zero_same_as_axis_none_for_1d(self):
        """For 1-D data, axis=0 and axis=None must return the same value."""
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        result_none = kl_statistic(frozen, GAMMA_DATA, axis=None)
        result_axis0 = kl_statistic(frozen, GAMMA_DATA, axis=0)
        assert result_none == pytest.approx(result_axis0)

    def test_better_fit_has_lower_kl(self):
        """Gamma fitted to gamma data should have lower KL than normal fitted to gamma data."""
        gamma_frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        norm_frozen = self._fitted_dist(norm, GAMMA_DATA)
        kl_gamma = kl_statistic(gamma_frozen, GAMMA_DATA, axis=None)
        kl_norm = kl_statistic(norm_frozen, GAMMA_DATA, axis=None)
        assert kl_gamma < kl_norm

    def test_matches_manual_formula(self):
        """kl_statistic result must match the KDE-based KL formula computed manually."""
        from scipy.stats import gaussian_kde

        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        kde = gaussian_kde(GAMMA_DATA)
        data_pdf = kde(GAMMA_DATA)
        dist_pdf = frozen.pdf(GAMMA_DATA)
        epsilon = 1e-10
        expected = np.sum(data_pdf * np.log((data_pdf + epsilon) / (dist_pdf + epsilon)))
        assert kl_statistic(frozen, GAMMA_DATA, axis=None) == pytest.approx(expected)

    def test_no_nan_when_dist_pdf_near_zero(self):
        """epsilon guard must prevent NaN when dist PDF is effectively zero over data."""
        # expon shifted to loc=100 has zero PDF over the gamma data, which lies well below 100
        far_dist = expon(loc=100.0, scale=1.0)
        result = kl_statistic(far_dist, GAMMA_DATA, axis=None)
        assert not np.isnan(result)

    def test_result_is_consistent_across_calls(self):
        """Two calls with identical inputs must return the same value."""
        frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0)
        r1 = kl_statistic(frozen, GAMMA_DATA, axis=None)
        r2 = kl_statistic(frozen, GAMMA_DATA, axis=None)
        assert r1 == r2


# ── Integration: kl_statistic as callable in Fitter ──────────────────────────


class TestKlStatisticInFitter:
    def test_fitter_accepts_kl_callable(self):
        f = Fitter([gamma], statistic_method=kl_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert f["gamma"].test_result is not None

    def test_fitter_kl_statistic_is_finite(self):
        f = Fitter([gamma], statistic_method=kl_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert np.isfinite(f["gamma"].gof_statistic)

    def test_fitter_kl_pvalue_in_range(self):
        f = Fitter([gamma], statistic_method=kl_statistic, gamma_params={"floc": 0})
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        pval = f["gamma"].pvalue
        assert 0.0 <= pval <= 1.0

    def test_fitter_kl_vs_ad_different_statistic_values(self):
        """KL and AD statistics should differ numerically."""
        f_kl = Fitter([gamma], statistic_method=kl_statistic, gamma_params={"floc": 0})
        f_ad = Fitter([gamma], statistic_method="ad", gamma_params={"floc": 0})
        f_kl.fit(GAMMA_DATA)
        f_ad.fit(GAMMA_DATA)
        f_kl.validate(n_mc_samples=99)
        f_ad.validate(n_mc_samples=99)
        assert f_kl["gamma"].gof_statistic != pytest.approx(f_ad["gamma"].gof_statistic)

    def test_fitter_kl_multiple_distributions(self):
        f = Fitter(
            [gamma, expon],
            statistic_method=kl_statistic,
            gamma_params={"floc": 0},
            expon_params={"floc": 0},
        )
        f.fit(GAMMA_DATA)
        f.validate(n_mc_samples=99)
        assert f["gamma"].test_result is not None
        assert f["expon"].test_result is not None