From fde0e11a1d586154bcfe84fa38f811674b83a273 Mon Sep 17 00:00:00 2001 From: wilsonm <michael.wilson@ed.ac.uk> Date: Tue, 22 Sep 2020 15:12:09 +0100 Subject: [PATCH] leverage experiment grouping for calculating totals of estimators where appropriate --- .../validphys/closuretest/closure_results.py | 2 +- validphys2/src/validphys/config.py | 22 +++++ validphys2/src/validphys/dataplots.py | 8 +- validphys2/src/validphys/fitdata.py | 10 +-- validphys2/src/validphys/results.py | 82 ++++++++++++++++--- 5 files changed, 102 insertions(+), 22 deletions(-) diff --git a/validphys2/src/validphys/closuretest/closure_results.py b/validphys2/src/validphys/closuretest/closure_results.py index 50e39de6bd..7058f29c9f 100644 --- a/validphys2/src/validphys/closuretest/closure_results.py +++ b/validphys2/src/validphys/closuretest/closure_results.py @@ -273,7 +273,7 @@ fits_exps_bootstrap_chi2_central = collect( "experiments_bootstrap_chi2_central", ("fits", "fitcontext") ) fits_level_1_noise = collect( - "total_experiments_chi2data", ("fits", "fitinputcontext", "fitunderlyinglaw") + "total_chi2_data", ("fits", "fitinputcontext", "fitunderlyinglaw") ) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 78470a46d3..f81ff2acf5 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -1421,6 +1421,28 @@ class CoreConfig(configparser.Config): ) return validphys.filters.filter_closure_data_by_experiment + @configparser.explicit_node + def produce_total_chi2_data(self, fitthcovmat): + """If there is no theory covmat for the fit, then calculate the + total chi2 by summing the chi2 from each experiment. + """ + import validphys.results + + if fitthcovmat is None: + return validphys.results.total_chi2_data_from_experiments + return validphys.results.dataset_inputs_abs_chi2_data + + @configparser.explicit_node + def produce_total_phi_data(self, fitthcovmat): + """If there is no theory covmat for the fit, then calculate the total + phi using contributions from each experiment. + """ + import validphys.results + + if fitthcovmat is None: + return validphys.results.total_phi_data_from_experiments + return validphys.results.dataset_inputs_phi_data + class Config(report.Config, CoreConfig, ParamfitsConfig): diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py index de65ef956d..843dd4aeec 100644 --- a/validphys2/src/validphys/dataplots.py +++ b/validphys2/src/validphys/dataplots.py @@ -30,17 +30,17 @@ from validphys.utils import sane_groupby_iter, split_ranges, scale_from_grid log = logging.getLogger(__name__) @figure -def plot_chi2dist_experiments(total_experiments_chi2data, experiments_chi2_stats, pdf): +def plot_chi2dist_experiments(total_chi2_data, experiments_chi2_stats, pdf): """Plot the distribution of chi²s of the members of the pdfset.""" - fig, ax = _chi2_distribution_plots(total_experiments_chi2data, experiments_chi2_stats, pdf, "hist") + fig, ax = _chi2_distribution_plots(total_chi2_data, experiments_chi2_stats, pdf, "hist") ax.set_title(r"Experiments $\chi^2$ distribution") return fig @figure -def kde_chi2dist_experiments(total_experiments_chi2data, experiments_chi2_stats, pdf): +def kde_chi2dist_experiments(total_chi2_data, experiments_chi2_stats, pdf): """KDE plot for experiments chi2.""" - fig, ax = _chi2_distribution_plots(total_experiments_chi2data, experiments_chi2_stats, pdf, "kde") + fig, ax = _chi2_distribution_plots(total_chi2_data, experiments_chi2_stats, pdf, "kde") ax.set_ylabel(r"Density") ax.set_title(r"Experiments $\chi^2 KDE plot$") return fig diff --git a/validphys2/src/validphys/fitdata.py b/validphys2/src/validphys/fitdata.py index cb8688ae31..865c786f18 100644 --- a/validphys2/src/validphys/fitdata.py +++ b/validphys2/src/validphys/fitdata.py @@ -131,7 +131,7 @@ def replica_data(fit, replica_paths): @table -def fit_summary(fit_name_with_covmat_label, replica_data, dataset_inputs_abs_chi2_data, dataset_inputs_phi_data): +def fit_summary(fit_name_with_covmat_label, replica_data, total_chi2_data, total_phi_data): """ Summary table of fit properties - Central chi-squared - Average chi-squared @@ -149,15 +149,15 @@ def fit_summary(fit_name_with_covmat_label, replica_data, dataset_inputs_abs_chi """ nrep = len(replica_data) - ndata = dataset_inputs_abs_chi2_data.ndata - central_chi2 = dataset_inputs_abs_chi2_data.central_result / ndata - member_chi2 = dataset_inputs_abs_chi2_data.replica_result.error_members() / ndata + ndata = total_chi2_data.ndata + central_chi2 = total_chi2_data.central_result / ndata + member_chi2 = total_chi2_data.replica_result.error_members() / ndata nite = [x.nite for x in replica_data] etrain = [x.training for x in replica_data] evalid = [x.validation for x in replica_data] - phi, _ = dataset_inputs_phi_data + phi, _ = total_phi_data phi_err = np.std(member_chi2)/(2.0*phi*np.sqrt(nrep)) VET = ValueErrorTuple diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py index 43ae352318..da0289981c 100644 --- a/validphys2/src/validphys/results.py +++ b/validphys2/src/validphys/results.py @@ -13,6 +13,7 @@ import logging import numpy as np import pandas as pd +import scipy.linalg as la from NNPDF import ThPredictions, CommonData, Experiment from reportengine.checks import require_one, remove_outer, check_not_empty @@ -652,6 +653,33 @@ def dataset_inputs_phi_data(dataset_inputs_abs_chi2_data): return phi_data(dataset_inputs_abs_chi2_data) +experiments_phi_data = collect("dataset_inputs_phi_data", ("group_dataset_inputs_by_experiment",)) + + +def total_phi_data_from_experiments(experiments_phi_data): + """Like :py:func:`dataset_inputs_phi_data` except calculate phi for + each experiment and then sum the contributions. Note that since + the definition of phi is + + phi = sqrt( (<chi2[T_k]> - chi2[<T_k>]) / n_data ), + + where k is the replica index, the total phi is + + sqrt( sum(n_data*phi**2) / sum(n_data) ) + + where the sums run over experiment + + This is only a valid method of calculating total phi provided that there are + no inter-experimental correlations. + + """ + + unnorm_phi_squared, ndata = np.sum( + [(ndata*phi**2, ndata) for phi, ndata in experiments_phi_data], + axis=0 + ) + return np.sqrt(unnorm_phi_squared / ndata), ndata + @check_pdf_is_montecarlo def dataset_inputs_bootstrap_phi_data(dataset_inputs_results, bootstrap_samples=500): """Takes the data result and theory prediction given `dataset_inputs` and @@ -815,7 +843,7 @@ chi2_stat_labels = { } -def experiments_chi2_stats(total_experiments_chi2data): +def experiments_chi2_stats(total_chi2_data): """Compute several estimators from the chi² for an aggregate of experiments: @@ -829,7 +857,7 @@ def experiments_chi2_stats(total_experiments_chi2data): - chi2_per_data """ - rep_data, central_result, npoints = total_experiments_chi2data + rep_data, central_result, npoints = total_chi2_data m = central_result.mean() rep_mean = rep_data.central_value().mean() return OrderedDict( @@ -1040,8 +1068,8 @@ def dataspecs_datasets_chi2_table( ) -fits_total_chi2_data = collect("dataset_inputs_abs_chi2_data", ("fits", "fitcontext")) -dataspecs_total_chi2_data = collect("dataset_inputs_abs_chi2_data", ("dataspecs",)) +fits_total_chi2_data = collect("total_chi2_data", ("fits", "fitcontext")) +dataspecs_total_chi2_data = collect("total_chi2_data", ("dataspecs",)) # TODO: Decide what to do with the horrible totals code. @table @@ -1115,6 +1143,36 @@ def dataspecs_chi2_differences_table(dataspecs, dataspecs_chi2_table): df["difference"] = diff return df +experiments_chi2_data = collect( + "dataset_inputs_abs_chi2_data", ("group_dataset_inputs_by_experiment",) +) + +def total_chi2_data_from_experiments(experiments_chi2_data, pdf): + """Like :py:func:`dataset_inputs_abs_chi2_data`, except sums the contribution + from each experiment which is more efficient in the case that the total + covariance matrix is block diagonal in experiments. + + This is valid as long as there are no cross experiment correlations from + i.e theory covariance matrix + + """ + + central_result = np.sum( + [exp_chi2_data.central_result for exp_chi2_data in experiments_chi2_data], + ) + + error_members = np.sum( + [exp_chi2_data.replica_result.error_members() for exp_chi2_data in experiments_chi2_data], + axis=0 + ) + + ndata = np.sum( + [exp_chi2_data.ndata for exp_chi2_data in experiments_chi2_data], + ) + return Chi2Data( + pdf.stats_class(error_members), central_result, ndata + ) + def dataset_inputs_chi2_per_point_data(dataset_inputs_abs_chi2_data): """Return the total chi²/ndata for all data, specified by dataset_inputs. @@ -1126,13 +1184,13 @@ def dataset_inputs_chi2_per_point_data(dataset_inputs_abs_chi2_data): ) -def total_experiments_chi2(dataset_inputs_chi2_per_point_data): - return dataset_inputs_chi2_per_point_data +def total_chi2_per_point_data(total_chi2_data): + return dataset_inputs_chi2_per_point_data(total_chi2_data) @table @check_not_empty("groups_data") -def perreplica_chi2_table(groups_data, groups_chi2, dataset_inputs_abs_chi2_data): +def perreplica_chi2_table(groups_data, groups_chi2, total_chi2_data): """Chi² per point for each replica for each group. Also outputs the total chi² per replica. The columns come in two levels: The first is the name of the group, @@ -1147,7 +1205,7 @@ def perreplica_chi2_table(groups_data, groups_chi2, dataset_inputs_abs_chi2_data th, central, l = ch total_chis[i] = [central, *th.error_members()] ls.append(l) - total_rep, total_central, total_n = dataset_inputs_abs_chi2_data + total_rep, total_central, total_n = total_chi2_data total_chis[0] = [total_central, *total_rep.error_members()] total_chis[0] /= total_n total_chis[1:, :] /= np.array(ls)[:, np.newaxis] @@ -1225,15 +1283,15 @@ def dataspecs_dataset_chi2_difference_table( each_dataset_chi2 = collect(abs_chi2_data, ("data",)) -pdfs_total_chi2 = collect(dataset_inputs_chi2_per_point_data, ("pdfs",)) +pdfs_total_chi2 = collect(total_chi2_per_point_data, ("pdfs",)) # These are convenient ways to iterate and extract varios data from fits fits_chi2_data = collect(abs_chi2_data, ("fits", "fitcontext", "dataset_inputs")) -fits_total_chi2 = collect("dataset_inputs_chi2_per_point_data", ("fits", "fitcontext")) +fits_total_chi2 = collect("total_chi2_per_point_data", ("fits", "fitcontext")) fits_total_chi2_for_groups = collect( - "dataset_inputs_chi2_per_point_data", ("fits", "fittheoryandpdf") + "total_chi2_per_point_data", ("fits", "fittheoryandpdf") ) fits_pdf = collect("pdf", ("fits", "fitpdf")) @@ -1251,7 +1309,7 @@ dataspecs_groups = collect("groups_data", ("dataspecs",)) dataspecs_groups_chi2_data = collect("groups_chi2", ("dataspecs",)) dataspecs_groups_bootstrap_phi = collect("groups_bootstrap_phi", ("dataspecs",)) dataspecs_results = collect("results", ("dataspecs",)) -dataspecs_total_chi2 = collect("dataset_inputs_chi2_per_point_data", ("dataspecs",)) +dataspecs_total_chi2 = collect("total_chi2_per_point_data", ("dataspecs",)) dataspecs_speclabel = collect("speclabel", ("dataspecs",), element_default=None) dataspecs_cuts = collect("cuts", ("dataspecs",)) -- GitLab