#!/usr/bin/env python
# coding: utf-8

# NOTE: this script is a Jupyter notebook export. The `get_ipython()` calls
# below only work inside an IPython/Jupyter session; the `cpp` cell magic is
# presumably registered by ROOT's Jupyter integration on `import ROOT` --
# confirm for the ROOT version in use.

# # Save dataset to ROOT file after processing
#
# With RDataFrame, you can read your dataset, add new columns with processed values and finally use `Snapshot` to save the resulting data to a ROOT file in TTree format.

# In[ ]:


import ROOT

df = ROOT.RDataFrame("dataset", "data/example_file.root")
df1 = df.Define("c", "a+b")

out_treename = "outtree"
out_filename = "outtree.root"
out_columns = ["a", "b", "c"]

snapdf = df1.Snapshot(out_treename, out_filename, out_columns)


# We can now check that the dataset was correctly stored in a file:

# In[ ]:


get_ipython().run_cell_magic('bash', '', 'rootls -lt outtree.root\n')


# Result of a Snapshot is still an RDataFrame that can be further used:

# In[ ]:


snapdf.Display().Print()


# # Cutflow reports
# Filters applied to the dataset can be given a name. The `Report` method will gather information about filter efficiency and show the data flow between subsequent cuts on the original dataset.
#

# In[ ]:


df = ROOT.RDataFrame("sig_tree", "https://root.cern/files/Higgs_data.root")

# The second argument of each Filter is the name under which the cut is
# listed in the report.
filter1 = df.Filter("lepton_eta > 0", "Lepton eta cut")
filter2 = filter1.Filter("lepton_phi < 1", "Lepton phi cut")

rep = df.Report()
rep.Print()


# # Using C++ functions in Python
# - We still want to perform complex operations in Python but plain Python code is prone to be slow and not thread-safe.
#
# - Instead, you can inject C++ functions that will do the work in your event loop during runtime.
#
# - This mechanism uses the C++ interpreter cling shipped with ROOT, making this possible in a single line of code.
#
# - Let's start by defining a function that will allow us to change the type of the RDataFrame dataset entry numbers (stored in the special column "rdfentry_") from `unsigned long long` to `float`.

# In[ ]:


get_ipython().run_cell_magic(
    'cpp',
    '',
    '\n'
    'float asfloat(unsigned long long entrynumber){\n'
    ' return entrynumber;\n'
    '}\n',
)


# Then let's define another function that takes a `float` value and computes its square.

# In[ ]:


get_ipython().run_cell_magic(
    'cpp',
    '',
    '\n'
    'float square(float val){\n'
    ' return val * val;\n'
    '}\n',
)


# And now let's use these functions with RDataFrame!
#
# We start by creating an empty RDataFrame with 100 consecutive entries and defining new columns on it:

# In[ ]:


# Create a new RDataFrame from scratch with 100 consecutive entries
df = ROOT.RDataFrame(100)

# Create a new column using the previously declared C++ functions
df1 = df.Define("a", "asfloat(rdfentry_)")
df2 = df1.Define("b", "square(a)")


# We can now plot the values of the columns in a graph:

# In[ ]:


# Show the two columns created in a graph
c = ROOT.TCanvas()

graph = df2.Graph("a", "b")
graph.SetMarkerStyle(20)
graph.SetMarkerSize(0.5)
graph.SetMarkerColor(ROOT.kBlue)
graph.SetTitle("My graph")
graph.Draw("AP")

c.Draw()


# # Using all cores of your machine with multi-threaded RDataFrame
# - RDataFrame can transparently perform multi-threaded event loops to speed up the execution of its actions.
#
# - Users have to call `ROOT::EnableImplicitMT()` before constructing the RDataFrame object to indicate that it should take advantage of a pool of worker threads.
#
# - Each worker thread processes a distinct subset of entries, and their partial results are merged before returning the final values to the user.
#
# - RDataFrame operations such as Histo1D or Snapshot are guaranteed to work correctly in multi-thread event loops.
#
# - User-defined expressions, such as strings or lambdas passed to `Filter`, `Define`, `Foreach`, `Reduce` or `Aggregate` will have to be thread-safe, i.e. it should be possible to call them concurrently from different threads.

# In[ ]:


# The cell body is passed as a single string; it is split across adjacent
# literals here purely for readability (the concatenated value is unchanged).
get_ipython().run_cell_magic(
    'time',
    '',
    '# Get a first baseline measurement\n'
    '\n'
    'treename = "Events"\n'
    'filename = "root://eospublic.cern.ch//eos/opendata/cms/derived-data/AOD2NanoAODOutreachTool/Run2012BC_DoubleMuParked_Muons.root"\n'
    'df = ROOT.RDataFrame(treename, filename)\n'
    '\n'
    'df.Sum("nMuon").GetValue()\n',
)


# In[ ]:


# Same measurement as above, with implicit multi-threading switched on before
# the RDataFrame is constructed and switched off again afterwards.
get_ipython().run_cell_magic(
    'time',
    '',
    '# Activate multithreading capabilities\n'
    '# By default takes all available cores on the machine\n'
    'ROOT.EnableImplicitMT()\n'
    '\n'
    'treename = "Events"\n'
    'filename = "root://eospublic.cern.ch//eos/opendata/cms/derived-data/AOD2NanoAODOutreachTool/Run2012BC_DoubleMuParked_Muons.root"\n'
    'df = ROOT.RDataFrame(treename, filename)\n'
    '\n'
    'df.Sum("nMuon").GetValue()\n'
    '\n'
    '# Disable implicit multithreading when done\n'
    'ROOT.DisableImplicitMT()\n',
)