from pathlib import Path
import sys

# Make the project root importable before pulling in project-local modules.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from etc.fitting import Fitter
from etc.tools.distributions import (
    logweibull,
    lognakagami,
    loggamma_dist,
    k_dist,
    lograyleigh,
    logrice,
    logk,
)
from etc.tools.statistics import aic_statistic, bic_statistic
import pandas as pd
import plotly.io as pio
import numpy as np
import itertools
import os
from scipy.stats import weibull_min, nakagami, gamma, rayleigh, rice
from dotenv import load_dotenv

pio.renderers.default = "browser"

# LOAD env variables from the .env file in the directory above.
load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"))

DATA_PATH = os.path.join(os.path.dirname(__file__), "..", "data")
DATA_FOLDER = os.path.join(DATA_PATH, "rain_statistics_04may2026")
RAIN_DATA_FOLDER = os.path.join(DATA_PATH, "processed")

# (csv/column prefix, key into Fitter._dist, ordered fitted-parameter names)
# for every log-domain distribution whose parameters are exported.
# NOTE: the 'loggamma' prefix intentionally differs from its dist key
# 'loggamma_dist' to keep the historical csv/column naming.
_PARAM_SPECS = [
    ("logweibull", "logweibull", ["k", "lam", "loc", "scale"]),
    ("lognakagami", "lognakagami", ["m", "Omega", "loc", "scale"]),
    ("loggamma", "loggamma_dist", ["a", "loc", "scale"]),
    ("logrice", "logrice", ["nu", "sigma", "loc", "scale"]),
    ("lograyleigh", "lograyleigh", ["sigma", "loc", "scale"]),
    ("logk", "logk", ["mu", "alpha", "beta", "loc", "scale"]),
]


def _params_row(fitter, dist_key, prefix, param_names):
    """Return a one-row DataFrame with the fitted parameters of one distribution.

    Columns are named ``{prefix}_{param}`` in *param_names* order; values are
    taken positionally from ``fitter._dist[dist_key].fit_result_params``.
    """
    params = fitter._dist[dist_key].fit_result_params
    row = {f"{prefix}_{name}": params[idx] for idx, name in enumerate(param_names)}
    return pd.DataFrame(row, index=[0])


if __name__ == "__main__":
    aberturas = ["12", "13", "14", "15"]
    voltas = ["0", "2", "4", "6", "8", "10"]
    ads = ["0", "1", "2", "3"]
    bursts = ["0", "1", "2", "3"]

    # Create the output directories if they do not exist yet.
    os.makedirs(DATA_PATH, exist_ok=True)
    os.makedirs(DATA_FOLDER, exist_ok=True)

    # NOTE(review): dist_list is currently unused in this script; kept for parity
    # with the log-domain list below — confirm before removing.
    dist_list = [weibull_min, nakagami, gamma, rice, rayleigh, k_dist]
    dist_list_log = [logweibull, lognakagami, loggamma_dist, logrice, lograyleigh, logk]

    # One goodness-of-fit table per statistic; each row is one
    # (ad, burst, abertura, volta) combination, each column one distribution.
    gof_columns = [dist.name for dist in dist_list_log]
    gof_tables = {
        "aic": pd.DataFrame(columns=gof_columns),
        "bic": pd.DataFrame(columns=gof_columns),
        "ks": pd.DataFrame(columns=gof_columns),
        "cvm": pd.DataFrame(columns=gof_columns),
    }

    # One fitted-parameters table per distribution, keyed by csv prefix.
    param_tables = {
        prefix: pd.DataFrame(columns=[f"{prefix}_{name}" for name in names])
        for prefix, _key, names in _PARAM_SPECS
    }

    # enumerate keeps the iteration counter consistent on every skip path
    # (the original manual counter was not incremented on the empty-data skip).
    for i, (abertura, volta, ad, burst) in enumerate(
        itertools.product(aberturas, voltas, ads, bursts), start=1
    ):
        print(f"########### ITERATION {i} ###########")
        print(f"Processing ad{ad} burst{burst} abertura{abertura} volta{volta}")

        csv_path = f"{RAIN_DATA_FOLDER}/ad{ad}_burst{burst}_abertura{abertura}_volta{volta}.csv"
        try:
            # Header-only read (nrows=0): we only need the column names here,
            # the full parse below converts every column to complex.
            header = pd.read_csv(csv_path, nrows=0)
        except FileNotFoundError:
            print(f"File not found for ad{ad} burst{burst} abertura{abertura} volta{volta}, skipping iteration.")
            continue

        converters = {col: complex for col in header.columns}
        df = pd.read_csv(csv_path, converters=converters)
        _rain_data = df["rain_data"].values

        # Skip combinations with no rain samples.
        if _rain_data.size == 0:
            continue

        ### LOG DATA — work on the log-magnitude, dropping -inf from log(0).
        _rain_data_log = np.log(np.abs(_rain_data))
        _rain_data_log = _rain_data_log[np.isfinite(_rain_data_log)]

        fitters = {
            "aic": Fitter(dist_list_log, statistic_method=aic_statistic),
            "bic": Fitter(dist_list_log, statistic_method=bic_statistic),
            "ks": Fitter(dist_list_log, statistic_method="ks"),
            "cvm": Fitter(dist_list_log, statistic_method="cvm"),
        }

        print(f"Fitting distributions for ad{ad} burst{burst} abertura{abertura} volta{volta}")
        for fitter in fitters.values():
            fitter.fit(_rain_data_log)

        print(f"Validating fits for ad{ad} burst{burst} abertura{abertura} volta{volta}")
        for fitter in fitters.values():
            fitter.validate(n_mc_samples=1)

        # Append one goodness-of-fit row per statistic.
        for key, fitter in fitters.items():
            row = {dist.name: fitter[dist.name].gof_statistic for dist in dist_list_log}
            gof_tables[key] = pd.concat(
                [gof_tables[key], pd.DataFrame(row, index=[0])], ignore_index=True
            )

        fitter_aic = fitters["aic"]

        # Append one fitted-parameters row per distribution (AIC fitter's params).
        for prefix, dist_key, names in _PARAM_SPECS:
            param_tables[prefix] = pd.concat(
                [param_tables[prefix], _params_row(fitter_aic, dist_key, prefix, names)],
                ignore_index=True,
            )

        # Checkpoint: overwrite the per-distribution parameter csvs every iteration.
        for prefix, table in param_tables.items():
            table.to_csv(f"{DATA_FOLDER}/{prefix}_params.csv", index=False)

        # Save diagnostic figures only for ad 0 / burst 0, one per abertura/volta.
        # (The original built and wrote the same histogram HTML twice; once is enough.)
        if ad == "0" and burst == "0":
            fig_hist = fitter_aic.histogram_with_fits()
            fig_hist.write_html(
                f"{DATA_FOLDER}/histogram_with_fits_ad{ad}_burst{burst}_abertura{abertura}_volta{volta}.html"
            )
            fig_qq = fitter_aic.plot_qq_plots_sns(method="filliben")
            fig_qq.savefig(
                f"{DATA_FOLDER}/qq_plots_sns_ad{ad}_burst{burst}_abertura{abertura}_volta{volta}.png"
            )

        # Checkpoint: overwrite the goodness-of-fit csvs every iteration.
        for key, table in gof_tables.items():
            table.to_csv(f"{DATA_FOLDER}/distribution_fit_{key}_statistics.csv", index=False)

    # Final save (also covers the case where every iteration was skipped).
    for key, table in gof_tables.items():
        table.to_csv(f"{DATA_FOLDER}/distribution_fit_{key}_statistics.csv", index=False)