#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().system('jupyter nbextension enable splitcell/splitcell')
get_ipython().system('jupyter nbextension enable rise/main')
get_ipython().run_line_magic('matplotlib', 'inline')


# # Tutorial: Lorentz Boost Networks (LBN)
#
# ### Physics-Inspired Autonomous Feature Engineering ([arXiv:1812.09722](https://arxiv.org/abs/1812.09722))
#
#
# Martin Erdmann, Yannik Rath, **Marcel Rieger**
#
# ### Outline of this tutorial
#
# 1. [LBN introduction](#LBN-introduction)
# 2. [Application: Classification task](#Application:-Classification-task)
# 3. [LBN introspection](#LBN-introspection)
# 4. [Open part](#Open-part)
# ### LBN introduction

# #### Feature engineering in HEP
#
# **Observations**
#
# 1. Physicists' hand-crafted high-level features might not exploit all of the available information
# 2. In practice, it is hard for "standard" DNNs (fully-connected networks, FCNs) to learn complex features on their own

# #### Feature engineering in HEP
#
# - The situation is similar to the transition from **FCNs** to **CNNs**
# - Images encode information in the translation-invariant adjacency of pixels
# → *Exploit this information by changing the network structure!*
#
#
# **→ Encode first principles of the domain (physics) into the network structure**

# #### LBN architecture
#
#
# - Only $2 \cdot M \cdot N$ trainable parameters in the LBN (usually $\mathcal{O}(1k)$)
# - Different boosting configurations are possible (pairwise, product, etc.)
# - Generic feature projections, **not tailored** to the use case

# #### LBN architecture: Boosting
#
# - Lorentz transformation with a boost matrix
#
# - Vectorized formulation to run efficiently on GPUs
#
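# For reference, the standard boost matrix for a boost vector $\vec{\beta}$ (with $\gamma = 1 / \sqrt{1 - \beta^2}$) reads
#
# $$\Lambda = \begin{pmatrix} \gamma & -\gamma \vec{\beta}^{\,T} \\ -\gamma \vec{\beta} & \mathbb{1}_{3} + (\gamma - 1) \frac{\vec{\beta} \vec{\beta}^{\,T}}{\beta^{2}} \end{pmatrix}$$
#
# The following cell is a minimal NumPy sketch (not part of the original tutorial code) that applies such a boost to a batch of four-vectors in the $(E, p_x, p_y, p_z)$ convention used throughout this notebook.

# In[ ]:


import numpy as np

def boost(vectors, beta):
    """Boost four-vectors of shape (N, 4) in (E, px, py, pz) order by a common
    boost vector beta of shape (3,) with 0 < |beta| < 1."""
    beta = np.asarray(beta, dtype=np.float64)
    beta2 = np.dot(beta, beta)
    gamma = 1.0 / np.sqrt(1.0 - beta2)

    E, p = vectors[:, 0], vectors[:, 1:]
    bp = p @ beta  # beta . p per event

    E_prime = gamma * (E - bp)
    p_prime = p + ((gamma - 1.0) * bp / beta2 - gamma * E)[:, None] * beta

    return np.concatenate([E_prime[:, None], p_prime], axis=1)

# quick check: a particle at rest acquires momentum opposite to beta,
# while its invariant mass E'^2 - p'^2 = 1 stays unchanged
v = np.array([[1.0, 0.0, 0.0, 0.0]])
print(boost(v, [0.5, 0.0, 0.0]))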
# #### LBN architecture: Feature engineering
#
# - Extract **generic** features from the boosted particles (see the sketch below)
#   1. Simple features, such as $E$, $m$, $p_T$, $\eta$, $\phi$
#   2. Pairwise features, such as $\cos(\measuredangle)$ between all pairs
#
# - More features are possible, but **not necessary**
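# As an illustration (a sketch, not part of the original tutorial code), the next cell computes the simple features listed above, plus the pairwise $\cos(\measuredangle)$, directly from $(E, p_x, p_y, p_z)$ arrays with NumPy.

# In[ ]:


import numpy as np

def simple_features(v):
    """v: four-vectors of shape (N, 4) in (E, px, py, pz) order."""
    E, px, py, pz = v.T
    pt = np.sqrt(px**2 + py**2)
    p = np.sqrt(pt**2 + pz**2)
    m = np.sqrt(np.maximum(E**2 - p**2, 0.0))  # clip negative numerical noise
    eta = np.arctanh(pz / p)  # assumes pt > 0
    phi = np.arctan2(py, px)
    return E, m, pt, eta, phi

def cos_angle(v1, v2):
    """Cosine of the 3D opening angle between two sets of four-vectors."""
    p1, p2 = v1[:, 1:], v2[:, 1:]
    return np.sum(p1 * p2, axis=1) / (np.linalg.norm(p1, axis=1) * np.linalg.norm(p2, axis=1))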
# ### Application: Classification task
#
# In the following, we want to distinguish $t\bar{t}H (H \rightarrow b\bar{b})$ and $t\bar{t} + b\bar{b}$ events
# **using only low-level features**, i.e., four-vectors
#
# #### The dataset
#
# - Events generated with Pythia ($13\,\mathrm{TeV}$)
# - Detector simulation using Delphes (CMS configuration)
# - ~800k (~200k) events for training (validation)
# - $t\bar{t}H$ / $t\bar{t}b\bar{b}$ $\approx$ 50/50 %
# - Four-vectors ($E$, $p_{x}$, $p_{y}$, $p_{z}$) of 6 jets + charged lepton + neutrino
# - Labels: 0 for $t\bar{t}b\bar{b}$, 1 for $t\bar{t}H$
#
#
# - Object naming:
#   - "bhad", "blep"
#   - "lj1", "lj2"
#   - "bj1", "bj2"
#   - "lep", "nu"

# #### Remark: Jet sorting
#
#
# - **Option 1**: Use generator information (only for LBN introspection) ←
# - **Option 2**: Sorted by $p_T$ ("real-life" scenario)

# #### Inspect the data

# In[2]:


import os

import numpy as np
import tensorflow as tf
from lbn import LBN, LBNLayer

tf.keras.backend.clear_session()

import tutorial as tut
import plotting as plt

print("TF version: {}".format(tf.__version__))


# In[3]:


# load data using the tut.load_lbn_data helper (for pt-sorted jets, use sorting="pt")
data_train = tut.load_lbn_data("train", sorting="gen")
data_valid = tut.load_lbn_data("valid", sorting="gen")

# extract vectors
vectors_train = data_train["features"]
vectors_valid = data_valid["features"]

# extract labels and apply one-hot encoding
labels_train = tf.keras.utils.to_categorical(data_train["labels"], num_classes=2)
labels_valid = tf.keras.utils.to_categorical(data_valid["labels"], num_classes=2)


# In[4]:


# print shapes
print("vectors_train shape: {}".format(vectors_train.shape))
print("vectors_valid shape: {}".format(vectors_valid.shape))
print("labels_train shape : {}".format(labels_train.shape))
print("labels_valid shape : {}".format(labels_valid.shape))


# #### Plot some low-level features

# In[5]:


# store feature column indices in locals
import itertools

components = ["E", "px", "py", "pz"]
particle_names = ["bhad", "blep", "lj1", "lj2", "bj1", "bj2", "lep", "nu"]
for i, (n, c) in enumerate(itertools.product(particle_names, components)):
    locals()["{}_{}".format(c.upper(), n.upper())] = i


# In[6]:


# plot the energy of the jet "bhad"
E_bhad = vectors_valid[:, E_BHAD]

fig, ax = plt.plot_lbn_feature(
    E_bhad, labels_valid, limits=(0, 1000), xlabel="$b_{had}$: E / GeV")


# In[7]:


# plot the pT of the lepton
px_lep = vectors_valid[:, PX_LEP]
py_lep = vectors_valid[:, PY_LEP]
pt_lep = (px_lep**2 + py_lep**2)**0.5

fig, ax = plt.plot_lbn_feature(
    pt_lep, labels_valid, limits=(0, 300), xlabel="$lep$: $p_{T}$ / GeV")


# In[8]:


# plot the pT of the jet "bj1"
px_bj1 = vectors_valid[:, PX_BJ1]
py_bj1 = vectors_valid[:, PY_BJ1]
pt_bj1 = (px_bj1**2 + py_bj1**2)**0.5

fig, ax = plt.plot_lbn_feature(
    pt_bj1, labels_valid, limits=(0, 800), xlabel="$bj_{1}$: $p_{T}$ / GeV")


# In[9]:


# plot the pT of the jet "bj2"
px_bj2 = vectors_valid[:, PX_BJ2]
py_bj2 = vectors_valid[:, PY_BJ2]
pt_bj2 = (px_bj2**2 + py_bj2**2)**0.5

fig, ax = plt.plot_lbn_feature(
    pt_bj2, labels_valid, limits=(0, 400), xlabel="$bj_{2}$: $p_{T}$ / GeV")


# Expect to see differences caused by the two b-jets from $H$ / $g \rightarrow b\bar{b}$
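# As a quick cross-check (a sketch, not part of the original tutorial code), the column flags defined above also allow computing the invariant mass of any input object, e.g. the "bhad" jet:

# In[ ]:


# invariant mass of "bhad" from its four-vector components
m_bhad = np.sqrt(np.maximum(
    vectors_valid[:, E_BHAD]**2
    - vectors_valid[:, PX_BHAD]**2
    - vectors_valid[:, PY_BHAD]**2
    - vectors_valid[:, PZ_BHAD]**2,
    0.0,  # clip negative numerical noise
))
print("mean m(bhad): {:.1f} GeV".format(m_bhad.mean()))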
# #### Model definition

# In[10]:


# create the LBN layer
# possible boost modes are "pairs" and "product"
features = ["E", "pt", "eta", "phi", "m"]
lbn_layer = LBNLayer(n_particles=13, boost_mode="pairs", features=features)

l2_reg = tf.keras.regularizers.l2(1e-4)
dense_kwargs = dict(
    activation="selu",
    kernel_initializer=tf.keras.initializers.lecun_normal(),
    kernel_regularizer=l2_reg,
)

model = tf.keras.models.Sequential()
model.add(lbn_layer)
model.add(tf.keras.layers.BatchNormalization(axis=1))
model.add(tf.keras.layers.Dense(1024, **dense_kwargs))
model.add(tf.keras.layers.Dense(512, **dense_kwargs))
model.add(tf.keras.layers.Dense(256, **dense_kwargs))
model.add(tf.keras.layers.Dense(128, **dense_kwargs))
model.add(tf.keras.layers.Dense(2, activation="softmax", kernel_regularizer=l2_reg))

# define metrics to monitor during training
metrics = [
    tf.keras.metrics.categorical_accuracy,
    tf.keras.metrics.AUC(),
]

# select the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# setup the model for training
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=metrics)


# #### Helpers: Model fitting and loading

# In[11]:


def fit_model(model, name, data=None, validation_data=None, epochs=10, batch_size=512):
    model_dir = os.path.join(tut.data_dir, "lbn", "models", name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    fit_callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(model_dir, name),
            save_best_only=True,
            save_weights_only=True,
            monitor="val_auc",
            mode="max",
        ),
    ]

    if data is None:
        data = (vectors_train, labels_train)
    if validation_data is None:
        validation_data = (vectors_valid, labels_valid)

    model.fit(data[0], data[1],
        validation_data=validation_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=fit_callbacks,
    )


# In[12]:


def load_model(model, name):
    local_dir = tut.get_file(os.path.join("lbn", "models", name), is_dir=True)
    print("loading model from {}".format(local_dir))
    model.load_weights(os.path.join(local_dir, name))


# #### Fit or load the model

# In[13]:


# set this flag to True in case you want to train the network
do_fit = False

if do_fit:
    # run a new fit for 1 epoch, store the weights as "lbn_v1"
    fit_model(model, "lbn_v1", epochs=1)
else:
    load_model(model, "lbn_pretrained")

# make a prediction for the first 5 validation events
print(model.predict(vectors_valid[:5]))

# print the model summary
model.summary()


# #### Check training results

# In[14]:


# run the prediction on the full validation set
pred_valid = model.predict(vectors_valid)


# In[15]:


# plot the output distribution
fig, ax = plt.plot_lbn_outputs("Output distributions (with LBN)", pred_valid, labels_valid)


# In[16]:


# compute the ROC AUC score
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(labels_valid, pred_valid)
print("ROC AUC: {:.3f}".format(auc))
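# A hedged addition (not part of the original tutorial code): the categorical accuracy can also be checked directly from the predictions.

# In[ ]:


# fraction of validation events for which the predicted class matches the label
acc = (np.argmax(pred_valid, axis=1) == np.argmax(labels_valid, axis=1)).mean()
print("accuracy: {:.3f}".format(acc))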
# ### Performance comparison: LBN+DNN vs. DNN-only

# In[17]:


# define the DNN-only model
l2_reg = tf.keras.regularizers.l2(1e-4)
dense_kwargs = dict(
    activation="selu",
    kernel_initializer=tf.keras.initializers.lecun_normal(),
    kernel_regularizer=l2_reg,
)

model_dnn = tf.keras.models.Sequential()
model_dnn.add(tf.keras.layers.BatchNormalization(axis=1))
model_dnn.add(tf.keras.layers.Dense(1024, **dense_kwargs))
model_dnn.add(tf.keras.layers.Dense(512, **dense_kwargs))
model_dnn.add(tf.keras.layers.Dense(256, **dense_kwargs))
model_dnn.add(tf.keras.layers.Dense(128, **dense_kwargs))
model_dnn.add(tf.keras.layers.Dense(2, activation="softmax", kernel_regularizer=l2_reg))

# define metrics to monitor during training
metrics = [
    tf.keras.metrics.categorical_accuracy,
    tf.keras.metrics.AUC(),
]

# select the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# setup the model for training
model_dnn.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=metrics)


# #### Fit or load the model

# In[18]:


# set this flag to True in case you want to train the network
do_fit = False

if do_fit:
    # run a new fit for 1 epoch, store the weights as "dnn_v1"
    fit_model(model_dnn, "dnn_v1", epochs=1)
else:
    # load the pretrained DNN-only model
    load_model(model_dnn, "dnn_pretrained")

# make a test prediction for the first 5 validation events
model_dnn.predict(vectors_valid[:5])


# In[19]:


# run the prediction on the full validation set
pred_valid_dnn = model_dnn.predict(vectors_valid)


# In[20]:


# plot the output distribution
fig, ax = plt.plot_lbn_outputs("Output distributions (DNN-only)", pred_valid_dnn, labels_valid)


# In[21]:


# compute the ROC AUC score
auc = roc_auc_score(labels_valid, pred_valid_dnn)
print("ROC AUC: {:.3f}".format(auc))


# #### ROC curves

# In[22]:


# plot the ROC curves, i.e., background rejection vs. signal efficiency
fig, ax = plt.plot_lbn_rocs(
    dict(label="LBN+DNN", labels=labels_valid, prediction=pred_valid, color="blue"),
    dict(label="DNN-only", labels=labels_valid, prediction=pred_valid_dnn, color="red"),
)
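# The same curves can be reproduced by hand (a sketch, not part of the original tutorial code; matplotlib is imported as "mpl" here since "plt" already refers to the tutorial's plotting module):

# In[ ]:


from sklearn.metrics import roc_curve
import matplotlib.pyplot as mpl

fig2, ax2 = mpl.subplots()
for name, pred, color in [("LBN+DNN", pred_valid, "blue"), ("DNN-only", pred_valid_dnn, "red")]:
    # use the ttH one-hot column (index 1) as the signal score
    fpr, tpr, _ = roc_curve(labels_valid[:, 1], pred[:, 1])
    ax2.plot(tpr, 1.0 - fpr, color=color, label=name)
ax2.set_xlabel("Signal efficiency")
ax2.set_ylabel("Background rejection")
ax2.legend()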
# #### The full picture
#
# - Plots from [arXiv:1812.09722](https://arxiv.org/abs/1812.09722)
# - High-level variables from *JHEP* **03** (2019) 026, [arXiv:1804.03682](https://arxiv.org/abs/1804.03682)

# #### LBN introspection
#
#
# 1. Investigate particle and restframe weights (for generator-ordered jets)
# 2. Study the created features

# #### Weights in combination layers

# In[23]:


particle_weights = np.abs(lbn_layer.particle_weights.numpy())
fig, ax = plt.plot_lbn_weights(particle_weights, "particle", cmap="OrRd")

restframe_weights = np.abs(lbn_layer.restframe_weights.numpy())
fig, ax = plt.plot_lbn_weights(restframe_weights, "restframe", cmap="YlGn", hide_feynman=True)


# #### Evolution of weights over time

# ![](https://cernbox.cern.ch/index.php/s/xDYiSmbleT3rip4/download?path=%2Flbn%2Fimages&files=particle_weights_evo.gif)

# ![](https://cernbox.cern.ch/index.php/s/xDYiSmbleT3rip4/download?path=%2Flbn%2Fimages&files=restframe_weights_evo.gif)

# #### Created features

# In[24]:


# store feature indices in locals (such as E_0, M_12, PT_5, ...)
for i, (n, m) in enumerate(itertools.product(lbn_layer.feature_names, list(range(lbn_layer.lbn.n_out)))):
    locals()["{}_{}".format(n.upper(), m)] = i


# In[25]:


# get the features produced by the LBN
features = plt.get_lbn_features(lbn_layer, vectors_valid)

# the variables to plot are accessible via flags, such as E_0, M_12, PT_5, ...


# In[26]:


# plot the mass of the 4th particle
fig, ax = plt.plot_lbn_feature(
    features[:, M_4], labels_valid, limits=(0, 200), xlabel="LBN Particle 4: $m$ / GeV")


# In[27]:


# plot the mass of the 12th particle
fig, ax = plt.plot_lbn_feature(
    features[:, M_12], labels_valid, limits=(0, 200), xlabel="LBN Particle 12: $m$ / GeV")


# In[28]:


# plot the pseudorapidity of the 4th particle
fig, ax = plt.plot_lbn_feature(
    features[:, ETA_4], labels_valid, limits=(-3.5, 3.5), xlabel=r"LBN Particle 4: $\eta$")


# In[29]:


# plot the pT of the 4th particle
fig, ax = plt.plot_lbn_feature(
    features[:, PT_4], labels_valid, limits=(0, 300), xlabel=r"LBN Particle 4: $p_{T}$")
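# To make the weight plots more tangible (a sketch, not part of the original tutorial code; it assumes the weight matrices are stored with shape (n_inputs, n_particles)), one can print the input object that contributes most to each combined particle:

# In[ ]:


# leading input object per combined particle, based on the absolute particle weights
for j in range(particle_weights.shape[1]):
    leading = np.argmax(particle_weights[:, j])
    print("particle {:2d}: leading input = {}".format(j, particle_names[leading]))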
# ### Open part
#
# - Time for questions
#
# **Suggestions for remaining time**
#
# Repeat the training for $p_{T}$-sorted data,
# e.g. LBN+DNN (low-level) vs. DNN-only (high-level)

# #### Some hints

# load data with $p_{T}$ sorted jets for both low- and high-level data

# In[ ]:


# load data using the tut.load_lbn_data helper (for pt-sorted jets, use sorting="pt")
data_low = tut.load_lbn_data("train", sorting="pt", level="low")
data_high = tut.load_lbn_data("train", sorting="pt", level="high")

# extract low-level vectors
vectors_low = data_low["features"]

# extract high-level variables
features_high = data_high["features"]

# extract labels and apply one-hot encoding
labels_low = tf.keras.utils.to_categorical(data_low["labels"], num_classes=2)
labels_high = tf.keras.utils.to_categorical(data_high["labels"], num_classes=2)


# TODO: define the two models (see above for examples)

# In[ ]:


# define the models here
# you should name them model_lbn_low and model_dnn_high


# train the models (assuming the names are `model_lbn_low` and `model_dnn_high`)

# In[ ]:


fit_model(model_lbn_low, "lbn_low_v1", data=(vectors_low, labels_low), epochs=1)
fit_model(model_dnn_high, "dnn_high_v1", data=(features_high, labels_high), epochs=1)

# make a prediction for the first 5 validation events
print("LBN (low-level) prediction : ...TODO")
print("DNN (high-level) prediction: ...TODO")


# run the prediction (note: we are using the training set here ...)

# In[ ]:


pred_lbn_low = model_lbn_low.predict(vectors_low)
pred_dnn_high = model_dnn_high.predict(features_high)


# draw the ROC curves

# In[ ]:


# plot the ROC curves, i.e., background rejection vs. signal efficiency
fig, ax = plt.plot_lbn_rocs(
    dict(label="LBN+DNN", labels=labels_low, prediction=pred_lbn_low, color="blue"),
    dict(label="DNN-only", labels=labels_high, prediction=pred_dnn_high, color="red"),
)


# draw the LBN weights (assuming the LBN layer is called `lbn_layer` again)

# In[ ]:


particle_weights = np.abs(lbn_layer.particle_weights.numpy())
fig, ax = plt.plot_lbn_weights(particle_weights, "particle", cmap="OrRd", sorting="pt")

restframe_weights = np.abs(lbn_layer.restframe_weights.numpy())
fig, ax = plt.plot_lbn_weights(restframe_weights, "restframe", cmap="YlGn", sorting="pt", hide_feynman=True)
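# A possible solution sketch for the model-definition TODO above (one option, mirroring the architectures used earlier in this notebook; the helper name `make_classifier_head` is ours):

# In[ ]:


# one possible way to define the two models (sketch)
def make_classifier_head(model):
    for units in [1024, 512, 256, 128]:
        model.add(tf.keras.layers.Dense(units, **dense_kwargs))
    model.add(tf.keras.layers.Dense(2, activation="softmax", kernel_regularizer=l2_reg))
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss="categorical_crossentropy",
        metrics=[tf.keras.metrics.categorical_accuracy, tf.keras.metrics.AUC()],
    )
    return model

# LBN+DNN on low-level four-vectors
lbn_layer = LBNLayer(n_particles=13, boost_mode="pairs", features=["E", "pt", "eta", "phi", "m"])
model_lbn_low = tf.keras.models.Sequential([lbn_layer, tf.keras.layers.BatchNormalization(axis=1)])
model_lbn_low = make_classifier_head(model_lbn_low)

# DNN-only on high-level variables
model_dnn_high = tf.keras.models.Sequential([tf.keras.layers.BatchNormalization(axis=1)])
model_dnn_high = make_classifier_head(model_dnn_high)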