From 9a3f5959cd00f44aed28f748b8a18059686ea05d Mon Sep 17 00:00:00 2001 From: neutonsevero Date: Sun, 26 Apr 2026 23:17:22 -0300 Subject: [PATCH] feat(distributions): add lognakagami, loggamma, and kl_statistic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement two new scipy-compatible distributions : Log-Nakagami (lognakagami) and Log-Gamma (loggamma_dist), with complete logpdf/cdf/ppf/stats/entropy/rvs methods derived from the change-of-variable Y = ln(X). Add kl_statistic, a KDE-based KL-divergence goodness-of-fit callable compatible with the Fitter class. Extend k_gen with _stats (improving speed), _cdf, and a fit guard, and switch kv → kve to improve numerical stability at large arguments. Add unit tests for all three additions covering normalization, monotonicity, ppf inversion, moment formulas, and Fitter integration. --- .gitignore | 1 + etc/tests/test_distributions.py | 175 +++++++++++++++++++- etc/tests/test_statistics.py | 128 ++++++++++++++- etc/tools/distributions.py | 274 +++++++++++++++++++++++++++++++- etc/tools/statistics.py | 29 +++- 5 files changed, 600 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 8bf60b2..a64dba9 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,4 @@ data/* etc/tools/read_raw_data.py +scripts/separate_data.py diff --git a/etc/tests/test_distributions.py b/etc/tests/test_distributions.py index 41faed2..9740051 100644 --- a/etc/tests/test_distributions.py +++ b/etc/tests/test_distributions.py @@ -1,12 +1,13 @@ import numpy as np import pytest +import scipy.special as sc import matplotlib.pyplot as plt import sys import os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -from tools.distributions import k_dist +from tools.distributions import k_dist, lognakagami, loggamma_dist X = np.linspace(0.01, 10.0, 500) @@ -167,6 +168,178 @@ class TestKDistPlots: # ── Entry-point: run plots interactively ───────────────────────────────────── +Y = np.linspace(-5.0, 5.0, 500) + + +# ── lognakagami unit tests ──────────────────────────────────────────────────── + + +class TestLogNakagami: + def test_logpdf_is_finite_on_real_line(self): + """logpdf must be finite for all real y — tests positivity without float64 underflow.""" + log_vals = lognakagami.logpdf(Y, m=2.0, Omega=1.0) + assert np.all(np.isfinite(log_vals)) + + def test_pdf_integrates_to_one(self): + """Numerical integral of PDF over the real line should be ≈ 1.""" + y_fine = np.linspace(-30, 10, 200_000) + integral = np.trapezoid(lognakagami.pdf(y_fine, m=2.0, Omega=1.0), y_fine) + assert pytest.approx(integral, abs=1e-3) == 1.0 + + def test_pdf_integrates_to_one_nonunit_omega(self): + """Normalisation must hold for Omega != 1.""" + y_fine = np.linspace(-30, 15, 200_000) + integral = np.trapezoid(lognakagami.pdf(y_fine, m=2.0, Omega=4.0), y_fine) + assert pytest.approx(integral, abs=1e-3) == 1.0 + + def test_logpdf_equals_log_pdf(self): + """logpdf must equal log(pdf) at points where pdf does not underflow.""" + y_bulk = np.linspace(-4.0, 2.0, 50) + log_via_pdf = np.log(lognakagami.pdf(y_bulk, m=2.0, Omega=1.0)) + log_direct = lognakagami.logpdf(y_bulk, m=2.0, Omega=1.0) + np.testing.assert_allclose(log_direct, log_via_pdf, rtol=1e-6) + + def test_cdf_is_monotone_increasing(self): + """CDF must be strictly non-decreasing.""" + cdf_vals = lognakagami.cdf(Y, m=2.0, Omega=1.0) + assert np.all(np.diff(cdf_vals) >= 0) + + def test_ppf_inverts_cdf(self): + """ppf(cdf(y)) must recover y.""" + y_test = np.array([-2.0, 0.0, 0.5]) + cdf_vals = lognakagami.cdf(y_test, m=2.0, Omega=1.0) + np.testing.assert_allclose(lognakagami.ppf(cdf_vals, m=2.0, Omega=1.0), y_test, atol=1e-8) + + def test_argcheck_rejects_m_below_half(self): + """m < 0.5 must not produce a valid (positive-finite) PDF value.""" + val = lognakagami.pdf(0.0, m=0.3, Omega=1.0) + assert not (np.isfinite(val) and val > 0) + + def test_argcheck_rejects_non_positive_omega(self): + """Omega <= 0 must not produce a valid (positive-finite) PDF value.""" + val = lognakagami.pdf(0.0, m=2.0, Omega=-1.0) + assert not (np.isfinite(val) and val > 0) + + def test_stats_mean(self): + """Analytical mean must equal 0.5 * (digamma(m) - log(m) + log(Omega)).""" + m, Omega = 3.0, 2.0 + expected_mean = 0.5 * (sc.digamma(m) - np.log(m) + np.log(Omega)) + dist_mean = float(lognakagami.stats(m=m, Omega=Omega, moments="m")) + assert pytest.approx(dist_mean, rel=1e-10) == expected_mean + + def test_stats_mean_omega_shifts_by_half_log_omega(self): + """Changing Omega shifts the mean by 0.5*log(Omega) and leaves variance unchanged.""" + m = 2.0 + mean1 = float(lognakagami.stats(m=m, Omega=1.0, moments="m")) + mean4 = float(lognakagami.stats(m=m, Omega=4.0, moments="m")) + assert pytest.approx(mean4 - mean1, rel=1e-10) == 0.5 * np.log(4.0) + + def test_stats_variance_independent_of_omega(self): + """Variance must equal 0.25 * polygamma(1, m) and not depend on Omega.""" + m = 3.0 + expected_var = 0.25 * sc.polygamma(1, m) + for Omega in [0.5, 1.0, 4.0]: + _, dist_var, *_ = lognakagami.stats(m=m, Omega=Omega, moments="mv") + assert pytest.approx(float(dist_var), rel=1e-10) == expected_var + + def test_rvs_samples_are_finite(self): + """Random samples must be finite real numbers.""" + rng = np.random.default_rng(42) + samples = lognakagami.rvs(m=2.0, Omega=1.0, size=200, random_state=rng) + assert samples.shape == (200,) + assert np.all(np.isfinite(samples)) + + def test_rvs_sample_mean_near_expected(self): + """Sample mean of many RVS should be close to the distribution mean.""" + m, Omega = 2.0, 3.0 + rng = np.random.default_rng(0) + samples = lognakagami.rvs(m=m, Omega=Omega, size=50_000, random_state=rng) + expected_mean = float(lognakagami.stats(m=m, Omega=Omega, moments="m")) + assert pytest.approx(samples.mean(), rel=5e-2) == expected_mean + + +# ── loggamma_dist unit tests ────────────────────────────────────────────────── + + +class TestLogGamma: + def test_pdf_is_positive_on_real_line(self): + """PDF must be strictly positive for all real y and a > 0.""" + vals = loggamma_dist.pdf(Y, a=2.0) + assert np.all(vals > 0) + + def test_pdf_integrates_to_one(self): + """Numerical integral of PDF over the real line should be ≈ 1.""" + y_fine = np.linspace(-30, 10, 200_000) + integral = np.trapezoid(loggamma_dist.pdf(y_fine, a=2.0), y_fine) + assert pytest.approx(integral, abs=1e-3) == 1.0 + + def test_logpdf_equals_log_pdf(self): + """logpdf must equal log(pdf) for numerical consistency.""" + log_via_pdf = np.log(loggamma_dist.pdf(Y, a=2.0)) + log_direct = loggamma_dist.logpdf(Y, a=2.0) + np.testing.assert_allclose(log_direct, log_via_pdf, rtol=1e-6) + + def test_cdf_is_monotone_increasing(self): + """CDF must be strictly non-decreasing.""" + cdf_vals = loggamma_dist.cdf(Y, a=2.0) + assert np.all(np.diff(cdf_vals) >= 0) + + def test_cdf_and_sf_sum_to_one(self): + """CDF + SF must equal 1 at every point.""" + cdf_vals = loggamma_dist.cdf(Y, a=2.0) + sf_vals = loggamma_dist.sf(Y, a=2.0) + np.testing.assert_allclose(cdf_vals + sf_vals, 1.0, atol=1e-12) + + def test_ppf_inverts_cdf(self): + """ppf(cdf(y)) must recover y.""" + y_test = np.array([-2.0, 0.0, 1.0]) + cdf_vals = loggamma_dist.cdf(y_test, a=2.0) + np.testing.assert_allclose(loggamma_dist.ppf(cdf_vals, a=2.0), y_test, atol=1e-8) + + def test_argcheck_rejects_non_positive_a(self): + """a <= 0 must not produce a valid (positive-finite) PDF value.""" + val = loggamma_dist.pdf(0.0, a=-1.0) + assert not (np.isfinite(val) and val > 0) + + def test_stats_mean_equals_digamma(self): + """Analytical mean must equal digamma(a).""" + a = 3.0 + expected_mean = sc.digamma(a) + dist_mean = float(loggamma_dist.stats(a=a, moments="m")) + assert pytest.approx(dist_mean, rel=1e-10) == expected_mean + + def test_stats_variance_equals_trigamma(self): + """Analytical variance must equal polygamma(1, a).""" + a = 3.0 + expected_var = sc.polygamma(1, a) + _, dist_var, *_ = loggamma_dist.stats(a=a, moments="mv") + assert pytest.approx(float(dist_var), rel=1e-10) == expected_var + + def test_log_transform_relation_to_gamma(self): + """loggamma_dist.pdf(y) must equal gamma.pdf(exp(y)) * exp(y) (change-of-variable).""" + from scipy.stats import gamma as scipy_gamma + + y_test = np.linspace(-3.0, 3.0, 20) + direct = loggamma_dist.pdf(y_test, a=2.0) + via_gamma = scipy_gamma.pdf(np.exp(y_test), a=2.0) * np.exp(y_test) + np.testing.assert_allclose(direct, via_gamma, rtol=1e-6) + + def test_rvs_samples_are_finite(self): + """Random samples must be finite real numbers.""" + rng = np.random.default_rng(42) + samples = loggamma_dist.rvs(a=2.0, size=200, random_state=rng) + assert samples.shape == (200,) + assert np.all(np.isfinite(samples)) + + def test_rvs_sample_mean_near_expected(self): + """Sample mean of many RVS should be close to the distribution mean.""" + a = 2.0 + rng = np.random.default_rng(0) + samples = loggamma_dist.rvs(a=a, size=50_000, random_state=rng) + expected_mean = float(loggamma_dist.stats(a=a, moments="m")) + assert pytest.approx(samples.mean(), rel=5e-2) == expected_mean + + if __name__ == "__main__": plot_k_dist_varying_alpha() plot_k_dist_varying_mu() diff --git a/etc/tests/test_statistics.py b/etc/tests/test_statistics.py index 96af920..3ca28e4 100644 --- a/etc/tests/test_statistics.py +++ b/etc/tests/test_statistics.py @@ -6,7 +6,7 @@ import os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -from tools.statistics import aic_statistic, bic_statistic +from tools.statistics import aic_statistic, bic_statistic, kl_statistic from fitting.fitter import Fitter @@ -208,3 +208,129 @@ class TestBicStatisticInFitter: f.validate(n_mc_samples=99) assert f["gamma"].test_result is not None assert f["expon"].test_result is not None + + +# ── kl_statistic unit tests ─────────────────────────────────────────────────── + + +class TestKlStatistic: + def _fitted_dist(self, dist, data, **kwargs): + """Return a frozen distribution fitted to data.""" + params = dist.fit(data, **kwargs) + return dist(*params) + + def test_returns_float(self): + """kl_statistic must return a numeric scalar.""" + frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0) + result = kl_statistic(frozen, GAMMA_DATA, axis=None) + assert isinstance(float(result), float) + + def test_returns_real_value(self): + """kl_statistic returns a real number (KDE finite-sum approximation can be negative).""" + frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0) + result = kl_statistic(frozen, GAMMA_DATA, axis=None) + assert np.isreal(result) + + def test_result_is_finite(self): + """kl_statistic must return a finite value for valid input.""" + frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0) + assert np.isfinite(kl_statistic(frozen, GAMMA_DATA, axis=None)) + + def test_works_with_axis_zero(self): + """kl_statistic must return a finite value when axis=0.""" + frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0) + result = kl_statistic(frozen, GAMMA_DATA, axis=0) + assert np.isfinite(result) + + def test_axis_zero_same_as_axis_none_for_1d(self): + """For 1-D data, axis=0 and axis=None must return the same value.""" + frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0) + result_none = kl_statistic(frozen, GAMMA_DATA, axis=None) + result_axis0 = kl_statistic(frozen, GAMMA_DATA, axis=0) + assert pytest.approx(result_none) == result_axis0 + + def test_better_fit_has_lower_kl(self): + """Gamma fitted to gamma data should have lower KL than normal fitted to gamma data.""" + gamma_frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0) + norm_frozen = self._fitted_dist(norm, GAMMA_DATA) + kl_gamma = kl_statistic(gamma_frozen, GAMMA_DATA, axis=None) + kl_norm = kl_statistic(norm_frozen, GAMMA_DATA, axis=None) + assert kl_gamma < kl_norm + + def test_matches_manual_formula(self): + """kl_statistic result must match the KDE-based KL formula computed manually.""" + from scipy.stats import gaussian_kde + + frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0) + kde = gaussian_kde(GAMMA_DATA) + data_pdf = kde(GAMMA_DATA) + dist_pdf = frozen.pdf(GAMMA_DATA) + epsilon = 1e-10 + expected = np.sum( + data_pdf * np.log((data_pdf + epsilon) / (dist_pdf + epsilon)) + ) + assert pytest.approx(kl_statistic(frozen, GAMMA_DATA, axis=None)) == expected + + def test_no_nan_when_dist_pdf_near_zero(self): + """epsilon guard must prevent NaN when dist PDF is effectively zero over data.""" + # expon with large loc has near-zero PDF over positive-skewed gamma data + far_dist = expon(loc=100.0, scale=1.0) + result = kl_statistic(far_dist, GAMMA_DATA, axis=None) + assert not np.isnan(result) + + def test_result_is_consistent_across_calls(self): + """Two calls with identical inputs must return the same value.""" + frozen = self._fitted_dist(gamma, GAMMA_DATA, floc=0) + r1 = kl_statistic(frozen, GAMMA_DATA, axis=None) + r2 = kl_statistic(frozen, GAMMA_DATA, axis=None) + assert r1 == r2 + + +# ── Integration: kl_statistic as callable in Fitter ────────────────────────── + + +class TestKlStatisticInFitter: + def test_fitter_accepts_kl_callable(self): + f = Fitter([gamma], statistic_method=kl_statistic, gamma_params={"floc": 0}) + f.fit(GAMMA_DATA) + f.validate(n_mc_samples=99) + assert f["gamma"].test_result is not None + + def test_fitter_kl_statistic_is_finite(self): + f = Fitter([gamma], statistic_method=kl_statistic, gamma_params={"floc": 0}) + f.fit(GAMMA_DATA) + f.validate(n_mc_samples=99) + assert np.isfinite(f["gamma"].gof_statistic) + + def test_fitter_kl_pvalue_in_range(self): + f = Fitter([gamma], statistic_method=kl_statistic, gamma_params={"floc": 0}) + f.fit(GAMMA_DATA) + f.validate(n_mc_samples=99) + pval = f["gamma"].pvalue + assert 0.0 <= pval <= 1.0 + + def test_fitter_kl_vs_ad_different_statistic_values(self): + """KL and AD statistics should differ numerically.""" + f_kl = Fitter( + [gamma], statistic_method=kl_statistic, gamma_params={"floc": 0} + ) + f_ad = Fitter([gamma], statistic_method="ad", gamma_params={"floc": 0}) + f_kl.fit(GAMMA_DATA) + f_ad.fit(GAMMA_DATA) + f_kl.validate(n_mc_samples=99) + f_ad.validate(n_mc_samples=99) + assert f_kl["gamma"].gof_statistic != pytest.approx( + f_ad["gamma"].gof_statistic + ) + + def test_fitter_kl_multiple_distributions(self): + f = Fitter( + [gamma, expon], + statistic_method=kl_statistic, + gamma_params={"floc": 0}, + expon_params={"floc": 0}, + ) + f.fit(GAMMA_DATA) + f.validate(n_mc_samples=99) + assert f["gamma"].test_result is not None + assert f["expon"].test_result is not None diff --git a/etc/tools/distributions.py b/etc/tools/distributions.py index 701b4b2..3f7cc58 100644 --- a/etc/tools/distributions.py +++ b/etc/tools/distributions.py @@ -1,5 +1,7 @@ from scipy.stats import rv_continuous -from scipy.special import kv, gammaln +from scipy.special import kve, gammaln +from scipy.integrate import quad +import scipy.special as sc from scipy.stats._distn_infrastructure import _ShapeInfo import numpy as np @@ -36,16 +38,280 @@ class k_gen(rv_continuous): return np.exp(self._logpdf(x, mu, alpha, beta)) def _logpdf(self, x, mu, alpha, beta): + half_sum = (alpha + beta) / 2.0 + log_ab_over_mu = np.log(alpha) + np.log(beta) - np.log(mu) z = 2.0 * np.sqrt(alpha * beta * x / mu) return ( np.log(2.0) - gammaln(alpha) - gammaln(beta) - + (alpha + beta) / 2.0 * np.log(alpha * beta / mu) - + ((alpha + beta) / 2.0 - 1.0) * np.log(x) - + np.log(kv(alpha - beta, z)) + + half_sum * log_ab_over_mu + + (half_sum - 1.0) * np.log(x) + + np.log(kve(alpha - beta, z)) + - z ) + def _stats(self, mu, alpha, beta, moments="mv"): + mean = mu if "m" in moments or "v" in moments or "s" in moments or "k" in moments else None + + variance = None + if "v" in moments or "s" in moments or "k" in moments: + second_raw = (mu ** 2) * sc.poch(alpha, 2) * sc.poch(beta, 2) / ((alpha * beta) ** 2) + variance = second_raw - mean ** 2 + + skewness = None + if "s" in moments or "k" in moments: + third_raw = (mu ** 3) * sc.poch(alpha, 3) * sc.poch(beta, 3) / ((alpha * beta) ** 3) + third_central = third_raw - 3.0 * mean * second_raw + 2.0 * mean ** 3 + skewness = third_central / np.power(variance, 1.5) + + kurtosis = None + if "k" in moments: + fourth_raw = (mu ** 4) * sc.poch(alpha, 4) * sc.poch(beta, 4) / ((alpha * beta) ** 4) + fourth_central = fourth_raw - 4.0 * mean * third_raw + 6.0 * mean ** 2 * second_raw - 3.0 * mean ** 4 + kurtosis = fourth_central / (variance ** 2) - 3.0 + + return mean, variance, skewness, kurtosis + + def _cdf(self, x, mu, alpha, beta): + # scipy broadcasts params to match x's shape before calling _cdf, + # so mu/alpha/beta may be arrays. Pass all four to np.vectorize so + # each argument arrives as a scalar inside _scalar. + # Substitution u = sqrt(x) regularises the integrand near u=0: + # f(u²)*2u ~ u^(alpha+beta-1), smooth for alpha,beta > 0. + def _scalar(xi, mui, ai, bi): + val, _ = quad( + lambda u: float(self._pdf(float(u * u), float(mui), float(ai), float(bi))) * 2.0 * u, + 0.0, float(np.sqrt(xi)), + limit=200, epsabs=1.49e-10, epsrel=1.49e-8, + ) + return val + return np.vectorize(_scalar)(x, mu, alpha, beta) + + def fit(self, data, *args, **kwds): + if ("loc" in kwds and kwds["loc"] != 0.0) or ("floc" in kwds and kwds["floc"] != 0.0): + raise ValueError("k_distribution uses a fixed loc=0; use mu to control the mean/scale.") + if ("scale" in kwds and kwds["scale"] != 1.0) or ("fscale" in kwds and kwds["fscale"] != 1.0): + raise ValueError("k_distribution uses a fixed scale=1; use mu to control the mean/scale.") + + kwds.pop("loc", None) + kwds.pop("scale", None) + return super().fit(data, *args, floc=0.0, fscale=1.0, **kwds) + k_dist = k_gen(a=0.0, name="k_distribution", shapes="mu, alpha, beta") + +class lognakagami_gen(rv_continuous): + r"""A Log-Nakagami continuous random variable. + + If X follows a Nakagami distribution with shape parameter m >= 0.5 + and spread Omega > 0, then Y = ln(X) follows the Log-Nakagami distribution. + + %(before_notes)s + + Notes + ----- + The probability density function for `lognakagami` is: + + f(y; m, Omega) = 2*m^m / (Gamma(m)*Omega^m) + * exp(2*m*y) * exp(-(m/Omega)*exp(2*y)) + + for y in (-inf, +inf). Derived via the change of variable Y = ln X, + whose Jacobian e^y combines with x^{2m-1} to give the e^{2my} term. + + The mean and variance are: + + E[Y] = 0.5 * (psi(m) - ln(m) + ln(Omega)) + Var[Y] = 0.25 * psi_1(m) + + where psi and psi_1 are the digamma and trigamma functions. The spread + Omega shifts the mean by 0.5*ln(Omega) but leaves all higher moments + unchanged; entropy is also Omega-independent. + + `lognakagami` takes ``m`` and ``Omega`` as shape parameters. + + See Also + -------- + nakagami : The Nakagami distribution (before the log transform). + """ + + def _shape_info(self): + return [ + _ShapeInfo("m", False, (0, np.inf), (False, False)), + _ShapeInfo("Omega", False, (0, np.inf), (False, False)), + ] + + def _argcheck(self, m, Omega): + return (m >= 0.5) & (Omega > 0) + + # ---------- PDF / log-PDF ---------- + + def _pdf(self, y, m, Omega): + return np.exp(self._logpdf(y, m, Omega)) + + def _logpdf(self, y, m, Omega): + # ln f = ln2 + m*ln(m) - ln Γ(m) - m*ln(Ω) + 2m*y - (m/Ω)*exp(2y) + return (np.log(2.0) + + sc.xlogy(m, m) + - sc.gammaln(m) + - m * np.log(Omega) + + 2.0 * m * y + - (m / Omega) * np.exp(2.0 * y)) + + # ---------- CDF / SF / log-CDF / log-SF ---------- + + def _cdf(self, y, m, Omega): + # P(Y <= y) = P(X <= e^y) = gammainc(m, (m/Omega) * e^{2y}) + return sc.gammainc(m, (m / Omega) * np.exp(2.0 * y)) + + def _sf(self, y, m, Omega): + return sc.gammaincc(m, (m / Omega) * np.exp(2.0 * y)) + + def _logcdf(self, y, m, Omega): + return np.log(sc.gammainc(m, (m / Omega) * np.exp(2.0 * y))) + + def _logsf(self, y, m, Omega): + return np.log(sc.gammaincc(m, (m / Omega) * np.exp(2.0 * y))) + + # ---------- PPF / ISF ---------- + + def _ppf(self, q, m, Omega): + # q = gammainc(m, (m/Omega)*e^{2y}) + # => (m/Omega)*e^{2y} = gammaincinv(m, q) + # => y = 0.5 * ln(Omega * gammaincinv(m, q) / m) + return 0.5 * np.log(Omega * sc.gammaincinv(m, q) / m) + + def _isf(self, q, m, Omega): + return 0.5 * np.log(Omega * sc.gammainccinv(m, q) / m) + + # ---------- Moments & statistics ---------- + + def _stats(self, m, Omega): + # E[Y] = 0.5*(psi(m) - ln(m) + ln(Omega)) + mu = 0.5 * (sc.digamma(m) - np.log(m) + np.log(Omega)) + # Var[Y] = 0.25 * psi_1(m) (Omega cancels — pure shift) + mu2 = 0.25 * sc.polygamma(1, m) + g1 = sc.polygamma(2, m) / sc.polygamma(1, m) ** 1.5 + g2 = sc.polygamma(3, m) / sc.polygamma(1, m) ** 2 + return mu, mu2, g1, g2 + + def _entropy(self, m, Omega): + # H = -E[ln f(Y)] = -ln2 + ln Γ(m) - m*psi(m) + m + # Omega cancels exactly (translation invariance of differential entropy) + return -np.log(2.0) + sc.gammaln(m) - m * sc.digamma(m) + m + + # ---------- Random variate generation ---------- + + def _rvs(self, m, Omega, size=None, random_state=None): + # X ~ Nakagami(m, Omega) => X^2 ~ Gamma(m, Omega/m) + g = random_state.gamma(m, Omega / m, size=size) + return 0.5 * np.log(g) + + +lognakagami = lognakagami_gen(name='lognakagami', shapes="m, Omega") + +class loggamma_gen(rv_continuous): + r"""A Log-Gamma continuous random variable. + + If X follows a Gamma distribution with shape a + and unit scale (beta = 1), then Y = ln X follows + the Log-Gamma distribution. + + %(before_notes)s + + Notes + ----- + The probability density function for `loggamma_dist` is: + + f(y, a) = 1 / Gamma(a) * exp(a*y - exp(y)) + + for y in (-inf, +inf), a > 0. + + This is the standardised form (scale beta = 1). + + `loggamma_dist` takes ``a`` as a shape parameter. + + The non-unit scale beta is recovered through scipy's + ``loc`` parameter: loc = ln(beta), since + ln(beta * X) = ln(beta) + ln(X). + + See Also + -------- + gamma : The Gamma distribution (before the log transform). + + """ + + def _shape_info(self): + return [_ShapeInfo("a", False, (0, np.inf), (False, False))] + + def _argcheck(self, a): + return a > 0 + + # ---------- PDF / log-PDF ---------- + + def _pdf(self, y, a): + return np.exp(self._logpdf(y, a)) + + def _logpdf(self, y, a): + # ln f = a*y - e^y - ln Γ(a) + return a * y - np.exp(y) - sc.gammaln(a) + + # ---------- CDF / SF / log-CDF / log-SF ---------- + + def _cdf(self, y, a): + # P(Y <= y) = P(X <= e^y) = gammainc(a, e^y) + return sc.gammainc(a, np.exp(y)) + + def _sf(self, y, a): + return sc.gammaincc(a, np.exp(y)) + + def _logcdf(self, y, a): + return np.log(sc.gammainc(a, np.exp(y))) + + def _logsf(self, y, a): + return np.log(sc.gammaincc(a, np.exp(y))) + + # ---------- PPF / ISF ---------- + + def _ppf(self, q, a): + # Invert CDF: q = gammainc(a, e^y) + # => e^y = gammaincinv(a, q) + # => y = ln(gammaincinv(a, q)) + return np.log(sc.gammaincinv(a, q)) + + def _isf(self, q, a): + return np.log(sc.gammainccinv(a, q)) + + # ---------- Moments & statistics ---------- + + def _stats(self, a): + # Y = ln(X), X ~ Gamma(a, 1) + # E[Y] = psi(a) (digamma) + # Var[Y] = psi_1(a) (trigamma) + # skew = psi_2(a) / psi_1(a)^{3/2} + # excess kurtosis = psi_3(a) / psi_1(a)^2 + mu = sc.digamma(a) + mu2 = sc.polygamma(1, a) + g1 = sc.polygamma(2, a) / mu2 ** 1.5 + g2 = sc.polygamma(3, a) / mu2 ** 2 + return mu, mu2, g1, g2 + + def _entropy(self, a): + # H(Y) = -E[ln f(Y)] = -E[a*Y - e^Y - ln Γ(a)] + # = ln Γ(a) - a*psi(a) + E[e^Y] + # E[e^Y] = E[X] = a (for Gamma(a, 1)) + return sc.gammaln(a) - a * sc.digamma(a) + a + + # ---------- Random variate generation ---------- + + def _rvs(self, a, size=None, random_state=None): + # Generate Gamma(a, 1) variates, then take log. + # For small a, use Gamma(a) ~ Gamma(a+1) * U^(1/a) + # so ln Gamma(a) ~ ln Gamma(a+1) + ln(U)/a + # This avoids precision loss when a << 1. + return (np.log(random_state.gamma(a + 1, size=size)) + + np.log(random_state.uniform(size=size)) / a) + + +loggamma_dist = loggamma_gen(name='loggamma_dist') \ No newline at end of file diff --git a/etc/tools/statistics.py b/etc/tools/statistics.py index 543626b..d499de3 100644 --- a/etc/tools/statistics.py +++ b/etc/tools/statistics.py @@ -1,4 +1,5 @@ import numpy as np +from scipy.stats import gaussian_kde def aic_statistic(dist, data, axis): @@ -32,4 +33,30 @@ def bic_statistic(dist, data, axis): log_likelihood = np.sum(dist.logpdf(data), axis=axis) bic = np.log(n) * k - 2 * log_likelihood - return bic \ No newline at end of file + return bic + +def kl_statistic(dist, data, axis): + """ + KL divergence-based goodness-of-fit statistic. + + KL(P || Q) = ∑ P(x) log(P(x) / Q(x)) + + Lower KL divergence indicates better fit, but since goodness_of_fit() + treats larger statistic values as worse fit, KL works directly. + """ + + # Estimate the PDF of the data using KDE + kde = gaussian_kde(data) + data_pdf = kde(data) + + # Get the PDF of the distribution at the data points + dist_pdf = dist.pdf(data) + + # normalize the PDFs to ensure they sum to 1 + data_pdf /= np.sum(data_pdf) + dist_pdf /= np.sum(dist_pdf) + # Avoid division by zero and log of zero by adding a small constant + epsilon = 1e-10 + kl_divergence = np.sum(data_pdf * np.log((data_pdf + epsilon) / (dist_pdf + epsilon)), axis=axis) + + return kl_divergence \ No newline at end of file