import numpy as np
from scipy.stats import gaussian_kde


def aic_statistic(dist, data, axis):
    """
    AIC-based goodness-of-fit statistic.

    AIC = 2k - 2 ln(L)

    Lower AIC indicates better fit, and since goodness_of_fit() treats larger
    statistic values as worse fit, AIC can be used directly.
    """
    # Number of fitted parameters (the frozen distribution's positional args)
    k = len(dist.args)
    log_likelihood = np.sum(dist.logpdf(data), axis=axis)
    aic = 2 * k - 2 * log_likelihood
    return aic


def bic_statistic(dist, data, axis):
    """
    BIC-based goodness-of-fit statistic.

    BIC = ln(n) * k - 2 ln(L)

    Lower BIC indicates better fit, and since goodness_of_fit() treats larger
    statistic values as worse fit, BIC can be used directly.
    """
    # Sample size along the reduction axis
    n = data.size if axis is None else data.shape[axis]
    # Number of fitted parameters (the frozen distribution's positional args)
    k = len(dist.args)
    log_likelihood = np.sum(dist.logpdf(data), axis=axis)
    bic = np.log(n) * k - 2 * log_likelihood
    return bic


def kl_statistic(dist, data, axis):
    """
    KL divergence-based goodness-of-fit statistic.

    KL(P || Q) = ∑ P(x) log(P(x) / Q(x))

    Lower KL divergence indicates better fit, and since goodness_of_fit()
    treats larger statistic values as worse fit, KL can be used directly.
    """
    # Estimate the PDF of the data with a Gaussian KDE (assumes 1-D data)
    kde = gaussian_kde(data)
    data_pdf = kde(data)

    # Evaluate the candidate distribution's PDF at the same data points
    dist_pdf = dist.pdf(data)

    # Normalize both so they sum to 1 over the sample points
    data_pdf /= np.sum(data_pdf)
    dist_pdf /= np.sum(dist_pdf)

    # Avoid division by zero and log of zero by adding a small constant
    epsilon = 1e-10
    kl_divergence = np.sum(
        data_pdf * np.log((data_pdf + epsilon) / (dist_pdf + epsilon)),
        axis=axis,
    )
    return kl_divergence
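

# -- Example usage -----------------------------------------------------------
# A minimal sketch of how these callables plug into scipy.stats.goodness_of_fit,
# assuming a SciPy version recent enough to accept a callable `statistic` with
# the signature statistic(dist, data, axis). The normal sample and the choice of
# scipy.stats.norm below are illustrative assumptions, not part of the code
# above; bic_statistic can be swapped in the same way, while kl_statistic relies
# on gaussian_kde and assumes 1-D input.
if __name__ == "__main__":
    from scipy import stats

    rng = np.random.default_rng(0)
    data = rng.normal(loc=1.0, scale=2.0, size=500)

    result = stats.goodness_of_fit(
        stats.norm,
        data,
        statistic=aic_statistic,  # or bic_statistic
        n_mc_samples=200,         # kept small so the example runs quickly
        random_state=rng,
    )
    print(f"AIC statistic: {result.statistic:.3f}, p-value: {result.pvalue:.3f}")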