#!/usr/bin/env python
# coding: utf-8

# # tmva100_DataPreparation
# This tutorial illustrates how to prepare ROOT datasets to be nicely readable
# by most machine learning methods. This requires filtering the initial complex
# datasets and writing the data in a flat format.
#
#
#
#
# **Author:** Stefan Wunsch
# This notebook tutorial was automatically generated with ROOTBOOK-izer from the macro found in the ROOT repository on Thursday, April 11, 2024 at 09:41 AM.

# In[1]:

import ROOT


def filter_events(df):
    """
    Reduce initial dataset to only events which shall be used for training.

    Keeps events with ``nElectron >= 2`` and ``nMuon >= 2`` and returns the
    filtered dataframe node. The filter is named so that it shows up in the
    cutflow report.
    """
    return df.Filter(
        "nElectron>=2 && nMuon>=2",
        "At least two electrons and two muons",
    )


def define_variables(df):
    """
    Define the variables which shall be used for training.

    Adds four scalar columns holding the first two entries of ``Muon_pt`` and
    ``Electron_pt`` and returns the extended dataframe node.

    NOTE(review): indexing [0] and [1] presumably relies on ``filter_events``
    having been applied first so that both entries exist -- confirm that
    nMuon/nElectron are the sizes of these collections.
    """
    return (
        df.Define("Muon_pt_1", "Muon_pt[0]")
        .Define("Muon_pt_2", "Muon_pt[1]")
        .Define("Electron_pt_1", "Electron_pt[0]")
        .Define("Electron_pt_2", "Electron_pt[1]")
    )


# Names of the flat columns written to the output files; they must match the
# columns created in define_variables().
variables = ["Muon_pt_1", "Muon_pt_2", "Electron_pt_1", "Electron_pt_2"]


if __name__ == "__main__":
    for filename, label in (
        ("SMHiggsToZZTo4L.root", "signal"),
        ("ZZTo2e2mu.root", "background"),
    ):
        print(">>> Extract the training and testing events for {} from the {} dataset.".format(
            label, filename))

        # Load dataset, filter the required events and define the training variables
        filepath = "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/" + filename
        df = ROOT.RDataFrame("Events", filepath)
        df = filter_events(df)
        df = define_variables(df)

        # Book cutflow report
        report = df.Report()

        # Split dataset by event number for training and testing:
        # even event numbers go to train_<label>.root, odd ones to test_<label>.root
        columns = ROOT.std.vector["string"](variables)
        df.Filter(
            "event % 2 == 0",
            "Select events with even event number for training",
        ).Snapshot("Events", "train_" + label + ".root", columns)
        df.Filter(
            "event % 2 == 1",
            "Select events with odd event number for testing",
        ).Snapshot("Events", "test_" + label + ".root", columns)

        # Print cutflow report
        report.Print()