Skip to content

Commit

Permalink
uncertainty definitions and variants - WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
achiefa committed Nov 29, 2024
1 parent 490ba10 commit 183f7a5
Show file tree
Hide file tree
Showing 5 changed files with 1,813 additions and 40 deletions.
20 changes: 20 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/ATLAS_WJ_8TEV/filter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
'''
Filter script for ATLAS_WJ_8TEV
Log:
- Central data and kinematics implemented from hepdata
- Central data and kinematics checked with legacy: passed
- The bin values in `uncertainties_legacy_WX-PT.yaml` and `uncertainties_legacy_WX-PT_sys_ATLAS.yaml`
are exactly the same (checked with notebook). The difference between
these two variants is in the definition of the uncertainties. See notebook for
differences.
- For the treatment of systematic uncertainties, refer to https://arxiv.org/pdf/2112.11266.
There are 50 sources of CORRELATED systematic uncertainty.
29 / 11 / 2024
- Implemented uncertaintiy definitions
- Construction of the uncertainty file is still missing. Remember that
for the symmetrised case you account for the shifts.
- Xq2 is missing (?)
'''

import logging

from filter_utils import Extractor
Expand Down
261 changes: 223 additions & 38 deletions nnpdf_data/nnpdf_data/commondata/ATLAS_WJ_8TEV/filter_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@
import pandas as pd
import yaml

from nnpdf_data.filter_utils.utils import prettify_float
from nnpdf_data.filter_utils.utils import matlist_to_matrix, prettify_float, symmetrize_errors

yaml.add_representer(float, prettify_float)

SQRTS = 8000

# List of systematic uncertainties that shuold
# be considered uncorrelated
UNCORR_SYS_UNC = ['BkgMCstat', 'UnfoldMCstat', 'QCDfitUncert']


class Extractor:
"""
Expand Down Expand Up @@ -54,6 +58,11 @@ def __init__(self, metadata_file, observable, mult_factor=1):
else:
raise Exception(f'{self.observable} is an unknown observable.')

# Collect diagonal absoulute uncertainties
self.diag_unc = self.__collect_diag_unc()
self.unc_labels = list(self.diag_unc[0].keys())
self.unc_labels.pop(0)

def __extract_kinematics(self, table: dict, tab_number: int):
"""
Extracts the kinematic variables of the single differential
Expand Down Expand Up @@ -116,7 +125,10 @@ def __retrieve_table(self, table_id):
table = tab_dict
return table

def generate_kinematics(self):
def get_table(self, table_id):
return self.__retrieve_table(table_id)

def generate_kinematics(self, save_to_yaml=True):
"""
Function that generates the kinematics by looping over all the
tables specified in the metadata file. The resulting kinematics
Expand All @@ -129,11 +141,12 @@ def generate_kinematics(self):
# Initialise kinematics list
kinematics = []
ndata = 0
for table in self.metadata["tables"]:
tab_dict = self.__retrieve_table(table)
kin = self.__extract_kinematics(tab_dict, table)
kinematics = np.concatenate([kinematics, kin])
ndata += len(kin)
table = self.metadata["tables"][0]
tab_dict = self.__retrieve_table(table)
kin = self.__extract_kinematics(tab_dict, table)
kinematics = np.concatenate([kinematics, kin])
ndata += len(kin)
self.kinematics = kinematics.tolist()
kinematics_yaml = {'bins': kinematics.tolist()}

# Check number of data agrees with metadata
Expand All @@ -146,45 +159,217 @@ def generate_kinematics(self):
f" The correct number is {ndata}. Please, update the metafile."
)

# Dump into file
logging.info("Dumping kinematics to file...")
with open(self.metadata['kinematics']['file'], 'w') as kin_out_file:
yaml.dump(kinematics_yaml, kin_out_file, sort_keys=False)
logging.info("Done!")
if save_to_yaml:
# Dump into file
logging.info("Dumping kinematics to file...")
with open(self.metadata['kinematics']['file'], 'w') as kin_out_file:
yaml.dump(kinematics_yaml, kin_out_file, sort_keys=False)
logging.info("Done!")
else:
return 1

def generate_data_central(self):
def generate_data_central(self, save_to_yaml=True):
"""
Same as `generate_kinematics`, but for central data points.
"""
logging.info(f"Generating central data for ATLAS_{self.observable}...")
dat_central = []
for table in self.metadata['tables']:
tab_dict = self.__retrieve_table(table)
table = self.metadata['tables'][0]
tab_dict = self.__retrieve_table(table)

# Select the chosen combination
values = next(
(
head['values']
for head in tab_dict["dependent_variables"]
if self.observable_latex in head['header']['name']
),
1,
)
if values == 1:
logging.error(
f"{self.observable} not found in table {table} under the LaTeX name {self.observable_latex}. The available options are:"
)
for head in tab_dict["dependent_variables"]:
print(f" - {head['header']['name']}")
exit(-1)
# Select the chosen combination
values = self.__select_bins_by_observable(tab_dict)

data = [dat['value'] * self.mult_factor for dat in values]
dat_central = np.concatenate([dat_central, data])
data = [dat['value'] * self.mult_factor for dat in values]
dat_central = np.concatenate([dat_central, data])
self.dat_central = dat_central

dat_central_yaml = {'data_central': dat_central.tolist()}

# Dump into file
logging.info("Dumping kinematics to file...")
with open(self.metadata['data_central'], 'w') as dat_out_file:
yaml.dump(dat_central_yaml, dat_out_file, sort_keys=False)
logging.info("Done!")
if save_to_yaml:
# Dump into file
logging.info("Dumping kinematics to file...")
with open(self.metadata['data_central'], 'w') as dat_out_file:
yaml.dump(dat_central_yaml, dat_out_file, sort_keys=False)
logging.info("Done!")
else:
return 1

def __build_abs_stat_covmat(self):
"""Builds the statistical covmat given the diagonal statistical uncertainties."""
ndata = self.metadata['ndata']
table_id = self.metadata['tables'][1]
with open(f'./rawdata/data{table_id}.yaml', 'r') as tab:
stat = yaml.load(tab, yaml.Loader)
matlist = [val['value'] for val in stat['dependent_variables'][0]['values']]
statmat_rel = matlist_to_matrix(ndata, ndata, matlist)

# Retrieve statistical diagonal entries
abs_stat_diag = [float(d['Stat']['error']) for d in self.diag_unc]
statmat_abs = np.zeros_like(statmat_rel)
for i in range(statmat_rel.shape[0]):
for j in range(statmat_rel.shape[1]):
statmat_abs[i, j] = statmat_rel[i, j] * abs_stat_diag[i] * abs_stat_diag[j]

return statmat_abs

def __collect_diag_unc(self):
"""Collect the absolute values of the diagonal uncertainties"""
table_idx = self.metadata['tables'][0]
tab_dict = self.__retrieve_table(table_idx)

# Select the chosen combination
bins = self.__select_bins_by_observable(tab_dict)

abs_unc_by_bin = []
for bin in bins:
bin_err = bin['errors']
unc_dict = {
unc['label']: {'type': list(unc.keys())[1], 'error': list(unc.values())[1]}
for unc in bin_err
}
abs_unc_by_bin.append(unc_dict)
return abs_unc_by_bin

# TODO and TOCHECK
# Some plus uncertainties are actually lower than minus uncertainties
# Also, some symmetric uncertainties are negative.
def symmetrized_sys_unc(self):
symmetrized_uncs = []
for bin in self.diag_unc:
unc_dict = {}
for source in self.unc_labels:
if bin[source]['type'] == 'asymerror':
error = bin[source]['error']
plus = error['plus']
minus = error['minus']
if plus < minus:
aux = minus
minus = plus
plus = aux
data_delta, sym_error = symmetrize_errors(error['plus'], error['minus'])
unc_dict[source] = {'shift': data_delta, 'sym_error': sym_error}
elif bin[source]['type'] == 'symerror':
# TODO
# I'm not sure I need the abs value here
unc_dict[source] = {'shift': 0, 'sym_error': abs(bin[source]['error'])}
symmetrized_uncs.append(unc_dict)
return symmetrized_uncs

# def __cms_treatment_sys_unc(self):
# """In the legacy implementation, all systematic uncertainties (even for symmetric uncertanties),
# are treated using the prescription outlined in eq. (6) of https://arxiv.org/pdf/1703.01630.
# Specifically, positive and negative bounds are treated as follows
#
# C_{ij}^{sys} = \frac{1}{2} \sum_{k'}( C_{jk'}^{+} C_{ik'}^{+} + C_{jk'}^{-} C_{ik'}^{-})
#
# With this prescription, asymmetric uncertainties are not symmetrized.
# """

def get_diag_unc(self):
if hasattr(self, 'diag_unc'):
return self.diag_unc
else:
self.diag_unc = self.__collect_diag_unc()
return self.diag_unc

def get_abs_stat_covmat(self):
if hasattr(self, 'abs_stat_covmat'):
return self.abs_stat_covmat
else:
self.abs_stat_covmat = self.__build_abs_stat_covmat()
return self.abs_stat_covmat

def __select_bins_by_observable(self, tab_dict):
"""This dataset colelcts differential xsecs for either W+ and W- in the
same yaml file. This function returns the part of this yaml file relevant
for the selected boson sign."""
values = next(
(
head['values']
for head in tab_dict["dependent_variables"]
if self.observable_latex in head['header']['name']
),
1,
)
if values == 1:
logging.error(
f"{self.observable} not found in table under the LaTeX name {self.observable_latex}. The available options are:"
)
for head in tab_dict["dependent_variables"]:
print(f" - {head['header']['name']}")
exit(-1)
else:
return values

def build_definitions(self, variant='default'):
unc_definitions = {}

# Statistical uncertainties are always the same
for idx in range(self.dat_central.size):
unc_definitions[f'art_stat_corr_{idx + 1}'] = {
'description': f'Artificial statistical uncertainty {idx + 1}',
'treatment': 'ADD',
'type': 'CORR',
}

if variant == 'default':
i = 1
for label in self.unc_labels:
if label == 'LumiUncert':
unc_type = 'ATLASLUMI12'
elif label in UNCORR_SYS_UNC:
unc_type = 'UNCORR'
else:
unc_type = f'SYSATLASW{i}'
i += 1

unc_definitions[f'{label}'] = {
'description': f'Systematic: {label}',
'treatment': 'MULT',
'type': unc_type,
}

elif variant == 'cms':
i = 1
for label in self.unc_labels:
if label == 'LumiUncert':
unc_definitions[f'{label}'] = {
'description': f'Systematic: {label}',
'treatment': 'MULT',
'type': 'ATLASLUMI12',
}
else:
if label in UNCORR_SYS_UNC:
unc_definitions[f'{label}'] = {
'description': f'Systematic: {label}',
'treatment': 'MULT',
'type': 'UNCORR',
}
else:
unc_definitions[f'{label}_plus'] = {
'description': f'Systematic upper unc.: {label}',
'treatment': 'MULT',
'type': f'SYSATLASW{i}',
}
unc_definitions[f'{label}_minus'] = {
'description': f'Systematic lower unc.: {label}',
'treatment': 'MULT',
'type': f'SYSATLASW{i+1}',
}
i += 2

elif variant == 'inter_sys':
raise ValueError(f'Not yet implemented')
else:
raise ValueError(f'The variant {variant} is not yet implemented or wrong.')

return unc_definitions

@classmethod
def generate_artifical_unc(matrix):
eigvals, eigvecs = np.linalg.eig(matrix)
artificial_unc = np.zeros_like(eigvecs)
for i in range(artificial_unc.shape[0]):
for j in range(artificial_unc.shape[1]):
artificial_unc[i, j] = np.sqrt(eigvals[j]) * eigvecs[i, j]
4 changes: 2 additions & 2 deletions nnpdf_data/nnpdf_data/commondata/ATLAS_WJ_8TEV/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ implemented_observables:
label: ATLAS $W^-$+jet 8 TeV
units: '[fb]'
process_type: DY_CC_PT
tables: [13]
tables: [13, 14]
ndata: 16
plotting:
kinematics_override: identity
Expand Down Expand Up @@ -68,7 +68,7 @@ implemented_observables:
label: ATLAS $W^+$+jet 8 TeV
units: '[fb]'
process_type: DY_CC_PT
tables: [13]
tables: [13, 15]
ndata: 16
plotting:
kinematics_override: identity
Expand Down
Loading

0 comments on commit 183f7a5

Please sign in to comment.