Skip to content

Commit

Permalink
Implementation in the new format - WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
achiefa committed Dec 9, 2024
1 parent d4f991e commit af65895
Show file tree
Hide file tree
Showing 9 changed files with 823 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
data_central:
- 5.68699350e+05
- 4.66912350e+05
- 4.79309450e+05
- 3.95159750e+05
- 282824.0
17 changes: 17 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_13TEV/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
'''
Filter script for CMS_WCHARM_13TEV
'''

import logging

from filter_utils import Extractor
import numpy as np
import yaml

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')


if __name__ == "__main__":
CMS_WCHARM = Extractor("./metadata.yaml", "WPWM-TOT-UNNORM", mult_factor=1000)
_, _, _ = CMS_WCHARM.generate_data(variant='default', save_to_yaml=True)
_, _, _ = CMS_WCHARM.generate_data(variant='sys_10', save_to_yaml=True)
313 changes: 313 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_13TEV/filter_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
import logging
import os

import numpy as np
import pandas as pd
from sys_uncertainties import SYS_DEFINITIONS, SYS_UNC_BY_BIN
import yaml

from nnpdf_data.filter_utils.utils import prettify_float, symmetrize_errors

yaml.add_representer(float, prettify_float)

SQRTS = 8000
MW2 = 80.385**2
CMSLUMI13 = 2.5 # %

# List of systematic uncertainties that shuold
# be considered uncorrelated
UNCORR_SYS_UNC = ['UnfoldMCstat', 'UnfoldOtherGen', 'UnfoldReweight']
ART_LABEL = 'art_corr_unc'
STAT_LABEL = 'stat_uncorr_unc'
TABLE = ''

# From Table 1 of the paper
SYS_UNC_by_bin = [{}]


class Extractor:
"""
Extracts kinematics, central data, and uncertainties for a given dataset
Parameters
----------
metadata_file: str
Path to the metadata file
observable: str
The name of the observable for which the data is extracted. The name must
be listed in the metadata file.
"""

def __init__(self, metadata_file, observable, mult_factor=1):

# Open metadata and select process
with open(metadata_file, 'r') as file:
metadata = yaml.safe_load(file)
self.metadata = next(
(
md
for md in metadata["implemented_observables"]
if md['observable_name'] == observable
),
None,
)
if self.metadata is None:
raise Exception(f"{observable} is not listed in the metadata file.")

# Initialise dict of tables
self.tables = {}
self.observable = observable
self.mult_factor = mult_factor
self.kin_labels = self.metadata['kinematic_coverage']
self.ndata = self.metadata['ndata']

# Collect diagonal absoulute uncertainties
# self.diag_unc = self.__collect_diag_unc()
# self.unc_labels = list(self.diag_unc[0].keys())
# self.unc_labels.pop(0)

def __retrieve_table(self, table_id):
"""
Implementation of the lazy loading for the tables. If the table
is loaded for the first time, it is stored into an internal
container of the class, so that it will not be loaded each time.
When called, this functions checks if the table has already been stored
and, if that is the case, returns the stored table.
Parameters
----------
table_id: int
Index that specifies the table
Return
------
The table specified by `table_id`. If not previously loaded, it is also
stored into the internal container for future use.
"""
try:
table = self.tables[str(table_id)]
except KeyError:
logging.debug(
f'Table {table_id} has not already been used or stored.' f' Storing the table...'
)
with open(f'./rawdata/{TABLE}{table_id}.yaml', 'r') as tab:
tab_dict = yaml.safe_load(tab)
self.tables[str(table_id)] = tab_dict
table = tab_dict
return table

def __extract_kinematics(self, table: dict):
"""
Extracts the kinematic variables of the single differential
distribution given a table.
For each bin, it computes the max, min, and mid value of the transverse
momentum of the boson.
Parameters
----------
table: dict
Dictionary containing the bins in the transverse momentum
tab_number: int
Index to select the range of the second kinematic variable
Return
------
List of bins containing min, max, and mid values for each of the kinematic
observables listed in the `kinematic_coverage` of the metadata file.
"""
data = table['independent_variables'][0]
label = self.kin_labels
kinematics = []
for bin in data['values']:
abs_eta_min = bin['low']
abs_eta_max = bin['high']
kin_bin = {
label[0]: {
'min': abs_eta_min,
'mid': (abs_eta_max + abs_eta_min) / 2,
'max': abs_eta_max,
},
label[1]: {'min': None, 'mid': MW2, 'max': None},
}
kinematics.append(kin_bin)
return kinematics

def generate_kinematics(self):
"""
Function that generates the kinematics by looping over all the
tables specified in the metadata file. The resulting kinematics
is then saved to a yaml file. It relies on the method
`__extract_kinematics`.
"""

logging.info(f"Generating kinematics for ATLAS_{self.observable}...")

# Initialise kinematics list
kinematics = []
ndata = 0
table = self.metadata["tables"][0]
tab_dict = self.__retrieve_table(table)
kin = self.__extract_kinematics(tab_dict)
kinematics = np.concatenate([kinematics, kin])
ndata += len(kin)

# Check number of data agrees with metadata
try:
assert self.metadata['ndata'] is not None
assert self.metadata['ndata'] == ndata
except AssertionError as e:
logging.warning(
f"The number of data in the metafile is either wrong or unspecified."
f" The correct number is {ndata}. Please, update the metafile."
)
return
return kinematics.tolist()

def generate_data_and_unc(self, mult_factor=1.0):
"""
Same as `generate_kinematics`, but for central data points.
"""
logging.info(f"Generating central data for CMS_{self.observable}...")
dat_central = []
stat_unc = []
asy_sys_unc = []
table = self.metadata['tables'][0]
tab_dict = self.__retrieve_table(table)
tab_dict = tab_dict['dependent_variables'][0]['values']

# Loop over bins
for rap_bin in tab_dict:
dat_central.append(rap_bin['value'] * mult_factor)
stat_unc.append(rap_bin['errors'][0]['symerror'] * mult_factor)
asy_sys_unc.append(
{
key: value * mult_factor
for key, value in rap_bin['errors'][1]['asymerror'].items()
}
)
return dat_central, stat_unc, asy_sys_unc

def symmetrized_sys_unc(self):
"""Symmetrise systematic uncertainties. Returns the symmetrized uncertainty
and the shift to the central data
"""
symmetrized_uncs = []
for bin in SYS_UNC_BY_BIN:
unc_dict = {}
for source in bin:
if 'asyserror' in source.keys():
error = source['asyserror']
plus = error['high']
minus = error['low']
data_delta, sym_error = symmetrize_errors(plus, minus)
unc_dict[source['label']] = {'shift': data_delta, 'sym_error': sym_error}
elif 'syserror' in source.keys():
unc_dict[source['label']] = {'shift': 0.0, 'sym_error': source['syserror']}
symmetrized_uncs.append(unc_dict)
return symmetrized_uncs

def __build_unc_definitions(self, variant='default'):
unc_definitions = {}

# Statistical uncertainty
unc_definitions[STAT_LABEL] = {
'description': f'Statistical uncertainty',
'treatment': 'ADD',
'type': 'UNCORR',
}

# Add lumi uncertainty
unc_definitions['corr_lumi_unc'] = {
'description': f'Luminosity uncertainty 2.6%',
'treatment': 'MULT',
'type': 'CMSLUMI13',
}

# Add systematic uncertainty
unc_definitions = unc_definitions | SYS_DEFINITIONS

if variant == 'sys_10':
unc_definitions['uncorr_mc_unc'] = {
'description': f'MC uncertainty',
'treatment': 'MULT',
'type': 'UNCORR',
}
elif variant != 'default':
raise ValueError(f'The variant {variant} is not implemented yet.')

return unc_definitions

def generate_data(self, variant='default', save_to_yaml=False, path='./'):
# Get central data and kinematics
central_data, stat_unc, _ = self.generate_data_and_unc(self.mult_factor)
kinematics = self.generate_kinematics()

# Uncertainty definitions
unc_definitions = self.__build_unc_definitions(variant=variant)

sys_artificial = [] # Initialize vector of artificial uncertainties

symmetrized_sys_uncs = self.symmetrized_sys_unc()
for data_idx, data in enumerate(central_data):
shift = 0
sys_unc_bin = symmetrized_sys_uncs[data_idx]

# Add shift from symmetrization
tmp = {}
for key, value in sys_unc_bin.items():
shift += value['shift']
tmp[key] = value['sym_error']

# Shift central data
central_data[data_idx] = central_data[data_idx] + shift

# Statistical uncertainty
unc_dict = {STAT_LABEL: stat_unc[data_idx]}

# Lumi uncertainty
unc_dict = {'corr_lumi_unc': central_data[data_idx] * CMSLUMI13 * 0.01}

# Add systematic uncertainties
unc_dict = unc_dict | tmp

if variant == 'sys_10':
unc_dict['uncorr_mc_unc'] = central_data[data_idx] * 0.01

sys_artificial.append(unc_dict)

if save_to_yaml:
# Save kinematics into file
logging.info("Dumping kinematics to file...")
kinematics_yaml = {'bins': kinematics}
with open(path + self.metadata['kinematics']['file'], 'w') as kin_out_file:
yaml.dump(kinematics_yaml, kin_out_file, sort_keys=False)
logging.info("Done!")

# Save central data into file
logging.info("Dumping kinematics to file...")
dat_central_yaml = {'data_central': central_data}
file_name = self.metadata['data_central']
with open(path + file_name, 'w') as dat_out_file:
yaml.dump(dat_central_yaml, dat_out_file, sort_keys=False)
logging.info("Done!")

# Save unertainties
logging.info("Dumping kinematics to file...")
uncertainties_yaml = {'definitions': unc_definitions, 'bins': sys_artificial}
file_name = (
self.metadata['data_uncertainties'][0]
if variant == 'default'
else self.metadata['variants'][variant]['data_uncertainties'][0]
)
with open(path + file_name, 'w') as dat_out_file:
yaml.dump(uncertainties_yaml, dat_out_file, sort_keys=False)
logging.info("Done!")
return kinematics, central_data, sys_artificial
else:
return kinematics, central_data, sys_artificial

def get_table(self, table_id):
return self.__retrieve_table(table_id)
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
bins:
- abs_eta:
min: 0.0
mid: 0.2
max: 0.4
m_W2:
min: null
mid: 6.46174823e+03
max: null
- abs_eta:
min: 0.4
mid: 6.00000000e-01
max: 0.8
m_W2:
min: null
mid: 6.46174823e+03
max: null
- abs_eta:
min: 0.8
mid: 1.05
max: 1.3
m_W2:
min: null
mid: 6.46174823e+03
max: null
- abs_eta:
min: 1.3
mid: 1.55
max: 1.8
m_W2:
min: null
mid: 6.46174823e+03
max: null
- abs_eta:
min: 1.8
mid: 2.1
max: 2.4
m_W2:
min: null
mid: 6.46174823e+03
max: null
Loading

0 comments on commit af65895

Please sign in to comment.