Notebook

In [1]:

from coffea import processor
from coffea.nanoevents import BaseSchema
import numpy as np
import hist
import re
import matplotlib.pyplot as plt
%matplotlib inline

import infofile # Same infofile used in https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata

In [2]:

#prefix = '4lep/' # Use this prefix for local data
prefix = 'https://atlas-opendata.web.cern.ch/atlas-opendata/samples/2020/4lep/' # Use this prefix to get data via https

fileset = {'Data': [prefix + 'Data/data_A.4lep.root',
                    prefix + 'Data/data_B.4lep.root',
                    prefix + 'Data/data_C.4lep.root',
                    prefix + 'Data/data_D.4lep.root'],
           'Background $Z,tt^{bar}$': [prefix + 'MC/mc_361106.Zee.4lep.root',
                                       prefix + 'MC/mc_361107.Zmumu.4lep.root',
                                       prefix + 'MC/mc_410000.ttbar_lep.4lep.root'],
           'Background $ZZ^{star}$': [prefix + 'MC/mc_363490.llll.4lep.root'],
           'Signal ($m_H$ = 125 GeV)': [prefix + 'MC/mc_345060.ggH125_ZZ4lep.4lep.root',
                                        prefix + 'MC/mc_344235.VBFH125_ZZ4lep.4lep.root',
                                        prefix + 'MC/mc_341964.WH125_ZZ4lep.4lep.root',
                                        prefix + 'MC/mc_341947.ZH125_ZZ4lep.4lep.root']}

In [3]:

# If using local data, we want to make sure we have all the necessary files
if prefix == '4lep/':
    import os
    import requests
    
    # If any of the necessary folders don't exist, make them
    if not os.path.isdir('4lep'):
        os.mkdir('4lep')
    if not os.path.isdir('4lep/Data'):
        os.mkdir('4lep/Data')
    if not os.path.isdir('4lep/MC'):
        os.mkdir('4lep/MC')
        
    # For each file in each of our datasets
    for file_list in fileset.values():
        for filepath in file_list:
            
            # Check if the file exists locally
            if not os.path.isfile(filepath):
                
                #If the file doesn't exist, download it
                data_download = requests.get('https://atlas-opendata.web.cern.ch/atlas-opendata/samples/2020/' + filepath)
                new_file = open(filepath, 'wb')
                new_file.write(data_download.content)
                new_file.close()

In [4]:

# Method from https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata

def calc_weight(xsec_weight, mcWeight, scaleFactor_PILEUP,
                scaleFactor_ELE, scaleFactor_MUON, 
                scaleFactor_LepTRIGGER ):
    return xsec_weight*mcWeight*scaleFactor_PILEUP*scaleFactor_ELE*scaleFactor_MUON*scaleFactor_LepTRIGGER

In [5]:

# Method from https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata

lumi = 10 # fb-1 # data_A,data_B,data_C,data_D

def get_xsec_weight(sample):
    info = infofile.infos[sample] # open infofile
    xsec_weight = (lumi*1000*info["xsec"])/(info["sumw"]*info["red_eff"]) #*1000 to go from fb-1 to pb-1
    return xsec_weight # return cross-section weight

In [6]:

# Adapted from method from https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata

def calc_mllll(lep_pt,lep_eta,lep_phi,lep_E):
    # first lepton is [0], 2nd lepton is [1] etc
    px_0 = lep_pt[:,0]*np.cos(lep_phi[:,0]) # x-component of lep[0] momentum
    py_0 = lep_pt[:,0]*np.sin(lep_phi[:,0]) # y-component of lep[0] momentum
    pz_0 = lep_pt[:,0]*np.sinh(lep_eta[:,0]) # z-component of lep[0] momentum
    px_1 = lep_pt[:,1]*np.cos(lep_phi[:,1]) # x-component of lep[1] momentum
    py_1 = lep_pt[:,1]*np.sin(lep_phi[:,1]) # y-component of lep[1] momentum
    pz_1 = lep_pt[:,1]*np.sinh(lep_eta[:,1]) # z-component of lep[1] momentum
    px_2 = lep_pt[:,2]*np.cos(lep_phi[:,2]) # x-component of lep[2] momentum
    py_2 = lep_pt[:,2]*np.sin(lep_phi[:,2]) # y-component of lep[2] momentum
    pz_2 = lep_pt[:,2]*np.sinh(lep_eta[:,2]) # z-component of lep[3] momentum
    px_3 = lep_pt[:,3]*np.cos(lep_phi[:,3]) # x-component of lep[3] momentum
    py_3 = lep_pt[:,3]*np.sin(lep_phi[:,3]) # y-component of lep[3] momentum
    pz_3 = lep_pt[:,3]*np.sinh(lep_eta[:,3]) # z-component of lep[3] momentum
    sumpx = px_0 + px_1 + px_2 + px_3 # x-component of 4-lepton momentum
    sumpy = py_0 + py_1 + py_2 + py_3 # y-component of 4-lepton momentum
    sumpz = pz_0 + pz_1 + pz_2 + pz_3 # z-component of 4-lepton momentum
    sumE = lep_E[:,0] + lep_E[:,1] + lep_E[:,2] + lep_E[:,3] # energy of 4-lepton system
    return np.sqrt(sumE**2 - sumpx**2 - sumpy**2 - sumpz**2)/1000 #/1000 to go from MeV to GeV

In [7]:

# Adapted from methods from https://github.com/atlas-outreach-data-tools/notebooks-collection-opendata

# cut on lepton charge
# paper: "selecting two pairs of isolated leptons, each of which is comprised of two leptons with the same flavour and opposite charge"
def cut_lep_charge(lep_charge):
# throw away when sum of lepton charges is not equal to 0
# first lepton is [0], 2nd lepton is [1] etc
    return ((lep_charge[:,0] + lep_charge[:,1] + lep_charge[:,2] + lep_charge[:,3]) != 0)

# cut on lepton type
# paper: "selecting two pairs of isolated leptons, each of which is comprised of two leptons with the same flavour and opposite charge"
def cut_lep_type(lep_type):
# for an electron lep_type is 11
# for a muon lep_type is 13
# throw away when none of eeee, mumumumu, eemumu
    sum_lep_type = lep_type[:,0] + lep_type[:,1] + lep_type[:,2] + lep_type[:,3]
    return ((sum_lep_type != 44) & (sum_lep_type != 48) & (sum_lep_type != 52))

In [8]:

class Processor(processor.ProcessorABC):
    def process(self, events):
        dataset = events.metadata['dataset']
        
        # Cut events based on lepton charge
        events = events[~cut_lep_charge(events.lep_charge)]
        
        # Cut events based on lepton type
        events = events[~cut_lep_type(events.lep_type)]
        
        # Calculate the mllll for each event
        mllll = calc_mllll(events.lep_pt, events.lep_eta, events.lep_phi, events.lep_E)
        
        if (dataset == 'Data'):
            # Create and fill a histogram for mllll
            mllllhist = (
                hist.Hist.new
                .Reg(34, 80, 250, name='mllll', label='4-lepton invariant mass $\mathrm{m_{4l}}$ [GeV]')
                .StrCat(['Data'], name='dataset', label='Dataset')
                .Weight()
                .fill(mllll=mllll, dataset=dataset)
            )
            output = {
                'entries': {dataset: len(events)},
                'mllll': {'Data': mllllhist}
            }
        else:
            # Extract the sample name from the filename metadata with regex
            sample = re.findall(r'mc_\d+\.(.+)\.4lep', events.metadata['filename'])[0]
            
            # Calculate the event weights
            xsec_weight = get_xsec_weight(sample)
            totalWeights = calc_weight(xsec_weight, events.mcWeight, events.scaleFactor_PILEUP, events.scaleFactor_ELE, events.scaleFactor_MUON, events.scaleFactor_LepTRIGGER)
            
            # Create and fill a histogram for mllll
            mllllhist = (
                hist.Hist.new
                .Reg(34, 80, 250, name='mllll', label='4-lepton invariant mass $\mathrm{m_{4l}}$ [GeV]')
                .StrCat(['Background $Z,tt^{bar}$', 'Background $ZZ^{star}$', 'Signal ($m_H$ = 125 GeV)'], name='dataset', label='Dataset')
                .Weight()
                .fill(mllll=mllll, dataset=dataset, weight=totalWeights)
            )
            output = {
                'entries': {dataset: len(events)},
                'mllll': {'MC': mllllhist}
            }
        
        return output

    def postprocess(self, accumulator):
        return accumulator

In [9]:

from dask.distributed import Client
import time, os

if os.environ["LABEXTENTION_FACTORY_MODULE"] == "coffea_casa":
# We are using coffea-casa here, but we need to find beter handle to detect difference (TBD)
   client = Client("tls://localhost:8786")
   import zipfile
   zipfile.ZipFile("infofile.zip", "w").write("infofile.py")
   client.upload_file("infofile.zip")
else:
   client = Client()


start_time = time.time()
output = processor.run_uproot_job(fileset,
                                treename = 'mini',
                                processor_instance = Processor(),
                                executor = processor.dask_executor,
                                executor_args = {'schema': BaseSchema, 'client': client}
                                )
finish_time = time.time()

print("Total runtime in seconds: " + str(finish_time - start_time))

Total runtime in seconds: 90.65940117835999| 100% Completed |  1min 20.4s

In [10]:

# Plot the histograms
hist.Hist.plot1d(output['mllll']['Data'], histtype='errorbar', color='black')
hist.Hist.plot1d(output['mllll']['MC'], stack=True, histtype='fill', color=['purple', 'red', 'cyan'])

# Basic bin parameters
xmin = 80
xmax = 250
step_size = 5

bin_centers = np.arange(start=xmin+step_size/2, # The interval includes this value
                        stop=xmax+step_size/2, # The interval doesn't include this value
                        step=step_size ) # Spacing between values

# Calculate background statistical uncertainty
# Remove the "[:,:2]" expressions to use all MC datasets with background and signal combined
mc_tot_var = np.sum(output['mllll']['MC'].variances()[:,:2], axis=1)
mc_err = np.sqrt(mc_tot_var)
mc_tot_height = np.sum(output['mllll']['MC'].values()[:,:2], axis=1)

# Plot background statistical uncertainty
plt.bar(bin_centers, # x
        2*mc_err, # heights
        alpha=0.5, # half transparency
        bottom=mc_tot_height-mc_err, color='none', 
        hatch="////", width=step_size, label='Stat. Unc.')

# Tune plot appearance
main_axes = plt.gca()
main_axes.set_xlim(left=xmin, right=xmax)
main_axes.set_ylim(bottom=0, top=np.amax(output['mllll']['Data'].values())*1.6)
main_axes.set_ylabel('Events / '+str(step_size)+' GeV')
main_axes.legend(frameon=False)

Out[10]:

<matplotlib.legend.Legend at 0x7f1f33bc7700>