from coffea import processor
from coffea.nanoevents import BaseSchema
import numpy as np
import hist
import re
import matplotlib.pyplot as plt
%matplotlib inline
import infofile # Same infofile used in https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata
#prefix = '4lep/' # Use this prefix for local data
prefix = 'https://atlas-opendata.web.cern.ch/atlas-opendata/samples/2020/4lep/' # Use this prefix to get data via https
fileset = {'Data': [prefix + 'Data/data_A.4lep.root',
prefix + 'Data/data_B.4lep.root',
prefix + 'Data/data_C.4lep.root',
prefix + 'Data/data_D.4lep.root'],
'Background $Z,tt^{bar}$': [prefix + 'MC/mc_361106.Zee.4lep.root',
prefix + 'MC/mc_361107.Zmumu.4lep.root',
prefix + 'MC/mc_410000.ttbar_lep.4lep.root'],
'Background $ZZ^{star}$': [prefix + 'MC/mc_363490.llll.4lep.root'],
'Signal ($m_H$ = 125 GeV)': [prefix + 'MC/mc_345060.ggH125_ZZ4lep.4lep.root',
prefix + 'MC/mc_344235.VBFH125_ZZ4lep.4lep.root',
prefix + 'MC/mc_341964.WH125_ZZ4lep.4lep.root',
prefix + 'MC/mc_341947.ZH125_ZZ4lep.4lep.root']}
# If using local data, we want to make sure we have all the necessary files
if prefix == '4lep/':
import os
import requests
# If any of the necessary folders don't exist, make them
if not os.path.isdir('4lep'):
os.mkdir('4lep')
if not os.path.isdir('4lep/Data'):
os.mkdir('4lep/Data')
if not os.path.isdir('4lep/MC'):
os.mkdir('4lep/MC')
# For each file in each of our datasets
for file_list in fileset.values():
for filepath in file_list:
# Check if the file exists locally
if not os.path.isfile(filepath):
#If the file doesn't exist, download it
data_download = requests.get('https://atlas-opendata.web.cern.ch/atlas-opendata/samples/2020/' + filepath)
new_file = open(filepath, 'wb')
new_file.write(data_download.content)
new_file.close()
# Method from https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata
def calc_weight(xsec_weight, mcWeight, scaleFactor_PILEUP,
scaleFactor_ELE, scaleFactor_MUON,
scaleFactor_LepTRIGGER ):
return xsec_weight*mcWeight*scaleFactor_PILEUP*scaleFactor_ELE*scaleFactor_MUON*scaleFactor_LepTRIGGER
# Method from https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata
lumi = 10 # fb-1 # data_A,data_B,data_C,data_D
def get_xsec_weight(sample):
info = infofile.infos[sample] # open infofile
xsec_weight = (lumi*1000*info["xsec"])/(info["sumw"]*info["red_eff"]) #*1000 to go from fb-1 to pb-1
return xsec_weight # return cross-section weight
# Adapted from method from https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata
def calc_mllll(lep_pt,lep_eta,lep_phi,lep_E):
# first lepton is [0], 2nd lepton is [1] etc
px_0 = lep_pt[:,0]*np.cos(lep_phi[:,0]) # x-component of lep[0] momentum
py_0 = lep_pt[:,0]*np.sin(lep_phi[:,0]) # y-component of lep[0] momentum
pz_0 = lep_pt[:,0]*np.sinh(lep_eta[:,0]) # z-component of lep[0] momentum
px_1 = lep_pt[:,1]*np.cos(lep_phi[:,1]) # x-component of lep[1] momentum
py_1 = lep_pt[:,1]*np.sin(lep_phi[:,1]) # y-component of lep[1] momentum
pz_1 = lep_pt[:,1]*np.sinh(lep_eta[:,1]) # z-component of lep[1] momentum
px_2 = lep_pt[:,2]*np.cos(lep_phi[:,2]) # x-component of lep[2] momentum
py_2 = lep_pt[:,2]*np.sin(lep_phi[:,2]) # y-component of lep[2] momentum
pz_2 = lep_pt[:,2]*np.sinh(lep_eta[:,2]) # z-component of lep[3] momentum
px_3 = lep_pt[:,3]*np.cos(lep_phi[:,3]) # x-component of lep[3] momentum
py_3 = lep_pt[:,3]*np.sin(lep_phi[:,3]) # y-component of lep[3] momentum
pz_3 = lep_pt[:,3]*np.sinh(lep_eta[:,3]) # z-component of lep[3] momentum
sumpx = px_0 + px_1 + px_2 + px_3 # x-component of 4-lepton momentum
sumpy = py_0 + py_1 + py_2 + py_3 # y-component of 4-lepton momentum
sumpz = pz_0 + pz_1 + pz_2 + pz_3 # z-component of 4-lepton momentum
sumE = lep_E[:,0] + lep_E[:,1] + lep_E[:,2] + lep_E[:,3] # energy of 4-lepton system
return np.sqrt(sumE**2 - sumpx**2 - sumpy**2 - sumpz**2)/1000 #/1000 to go from MeV to GeV
# Adapted from methods from https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata
# cut on lepton charge
# paper: "selecting two pairs of isolated leptons, each of which is comprised of two leptons with the same flavour and opposite charge"
def cut_lep_charge(lep_charge):
# throw away when sum of lepton charges is not equal to 0
# first lepton is [0], 2nd lepton is [1] etc
return ((lep_charge[:,0] + lep_charge[:,1] + lep_charge[:,2] + lep_charge[:,3]) != 0)
# cut on lepton type
# paper: "selecting two pairs of isolated leptons, each of which is comprised of two leptons with the same flavour and opposite charge"
def cut_lep_type(lep_type):
# for an electron lep_type is 11
# for a muon lep_type is 13
# throw away when none of eeee, mumumumu, eemumu
sum_lep_type = lep_type[:,0] + lep_type[:,1] + lep_type[:,2] + lep_type[:,3]
return ((sum_lep_type != 44) & (sum_lep_type != 48) & (sum_lep_type != 52))
class Processor(processor.ProcessorABC):
def process(self, events):
dataset = events.metadata['dataset']
# Cut events based on lepton charge
events = events[~cut_lep_charge(events.lep_charge)]
# Cut events based on lepton type
events = events[~cut_lep_type(events.lep_type)]
# Calculate the mllll for each event
mllll = calc_mllll(events.lep_pt, events.lep_eta, events.lep_phi, events.lep_E)
if (dataset == 'Data'):
# Create and fill a histogram for mllll
mllllhist = (
hist.Hist.new
.Reg(34, 80, 250, name='mllll', label='4-lepton invariant mass $\mathrm{m_{4l}}$ [GeV]')
.StrCat(['Data'], name='dataset', label='Dataset')
.Weight()
.fill(mllll=mllll, dataset=dataset)
)
output = {
'entries': {dataset: len(events)},
'mllll': {'Data': mllllhist}
}
else:
# Extract the sample name from the filename metadata with regex
sample = re.findall(r'mc_\d+\.(.+)\.4lep', events.metadata['filename'])[0]
# Calculate the event weights
xsec_weight = get_xsec_weight(sample)
totalWeights = calc_weight(xsec_weight, events.mcWeight, events.scaleFactor_PILEUP, events.scaleFactor_ELE, events.scaleFactor_MUON, events.scaleFactor_LepTRIGGER)
# Create and fill a histogram for mllll
mllllhist = (
hist.Hist.new
.Reg(34, 80, 250, name='mllll', label='4-lepton invariant mass $\mathrm{m_{4l}}$ [GeV]')
.StrCat(['Background $Z,tt^{bar}$', 'Background $ZZ^{star}$', 'Signal ($m_H$ = 125 GeV)'], name='dataset', label='Dataset')
.Weight()
.fill(mllll=mllll, dataset=dataset, weight=totalWeights)
)
output = {
'entries': {dataset: len(events)},
'mllll': {'MC': mllllhist}
}
return output
def postprocess(self, accumulator):
return accumulator
from dask.distributed import Client
import time, os
if os.environ["LABEXTENTION_FACTORY_MODULE"] == "coffea_casa":
# We are using coffea-casa here, but we need to find beter handle to detect difference (TBD)
client = Client("tls://localhost:8786")
import zipfile
zipfile.ZipFile("infofile.zip", "w").write("infofile.py")
client.upload_file("infofile.zip")
else:
client = Client()
start_time = time.time()
output = processor.run_uproot_job(fileset,
treename = 'mini',
processor_instance = Processor(),
executor = processor.dask_executor,
executor_args = {'schema': BaseSchema, 'client': client}
)
finish_time = time.time()
print("Total runtime in seconds: " + str(finish_time - start_time))
Total runtime in seconds: 90.65940117835999| 100% Completed | 1min 20.4s
# Plot the histograms
hist.Hist.plot1d(output['mllll']['Data'], histtype='errorbar', color='black')
hist.Hist.plot1d(output['mllll']['MC'], stack=True, histtype='fill', color=['purple', 'red', 'cyan'])
# Basic bin parameters
xmin = 80
xmax = 250
step_size = 5
bin_centers = np.arange(start=xmin+step_size/2, # The interval includes this value
stop=xmax+step_size/2, # The interval doesn't include this value
step=step_size ) # Spacing between values
# Calculate background statistical uncertainty
# Remove the "[:,:2]" expressions to use all MC datasets with background and signal combined
mc_tot_var = np.sum(output['mllll']['MC'].variances()[:,:2], axis=1)
mc_err = np.sqrt(mc_tot_var)
mc_tot_height = np.sum(output['mllll']['MC'].values()[:,:2], axis=1)
# Plot background statistical uncertainty
plt.bar(bin_centers, # x
2*mc_err, # heights
alpha=0.5, # half transparency
bottom=mc_tot_height-mc_err, color='none',
hatch="////", width=step_size, label='Stat. Unc.')
# Tune plot appearance
main_axes = plt.gca()
main_axes.set_xlim(left=xmin, right=xmax)
main_axes.set_ylim(bottom=0, top=np.amax(output['mllll']['Data'].values())*1.6)
main_axes.set_ylabel('Events / '+str(step_size)+' GeV')
main_axes.legend(frameon=False)
<matplotlib.legend.Legend at 0x7f1f33bc7700>