In [1]:

%%capture

import numpy as np
import pandas as pd
import xarray as xr

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode()
from IPython.display import Image

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense

from tqdm import tqdm

from trackml.dataset import load_event

I won't explain the content of this competition here, everything can be found on the kaggle site: https://www.kaggle.com/c/trackml-particle-identification. However, I will only analyze one single event in this notebook (because I don't have a lot of computing power at hand). This event is visualized in a few graphs below.

In [2]:

# Load data
hits, cells, particles, truth = load_event("./train_100_events/event000001000")
truth.index = truth.hit_id
hits.index = hits.hit_id

# Add "distance from origin"-column and sort by it
truth['dist'] = (truth.tx**2 + truth.ty**2 + truth.tz**2) ** (1/2)
truth = truth.sort_values('dist')

# Plot all hits
trace1 = go.Scatter3d(x=truth.tx, y=truth.ty, z=truth.tz, mode='markers', opacity=0.3, name="Hits")
trace2 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2], layout=layout)
# py.iplot(fig)

Image('fig1.png')

Out[2]:

In [3]:

# Two test tracks
track1 = truth[truth.particle_id == truth.loc[20906].particle_id]
track2 = truth[truth.particle_id == truth.loc[22405].particle_id]

track1

Out[3]:

	hit_id	particle_id	tx	ty	tz	tpx	tpy	tpz	weight	dist
hit_id
20906	20906	950260552167325696	-3.715120	32.531200	-28.693501	-0.051091	0.364180	-0.275196	0.000016	43.536171
29287	29287	950260552167325696	-10.520200	71.067299	-57.998402	-0.075847	0.359430	-0.274471	0.000013	92.331202
35612	35612	950260552167325696	-20.975201	113.817001	-90.823502	-0.099410	0.351922	-0.271879	0.000010	147.116198
42232	42232	950260552167325696	-38.887001	167.647995	-132.860992	-0.131641	0.341420	-0.270685	0.000008	217.416866
73760	73760	950260552167325696	-78.810699	252.750000	-202.261993	-0.179820	0.318319	-0.266942	0.000005	333.172030
80449	80449	950260552167325696	-132.494995	332.373993	-272.451996	-0.229098	0.284209	-0.266186	0.000005	449.730457
86907	86907	950260552167325696	-244.658997	440.040009	-386.165009	-0.291042	0.219140	-0.266104	0.000005	634.520784
92821	92821	950260552167325696	-395.536011	523.101013	-512.562012	-0.341102	0.127076	-0.265053	0.000008	832.347997
108754	108754	950260552167325696	-601.302979	561.273010	-666.289001	-0.361658	0.002130	-0.266721	0.000010	1058.552668

In [4]:

track2

Out[4]:

	hit_id	particle_id	tx	ty	tz	tpx	tpy	tpz	weight	dist
hit_id
22405	22405	468379102890426368	31.363001	-2.240470	49.250099	0.555573	-0.030916	0.827233	0.000014	58.431411
31033	31033	468379102890426368	71.755699	-3.584530	109.389999	0.555913	-0.005662	0.826708	0.000012	130.873605
38293	38293	468379102890426368	115.221001	-3.005630	174.029999	0.554995	0.020648	0.825945	0.000010	208.737528
44866	44866	468379102890426368	171.330994	0.850648	258.066010	0.550073	0.054562	0.827046	0.000008	309.762964
44872	44872	468379102890426368	173.184998	1.036430	260.854004	0.550368	0.054847	0.826769	0.000007	313.111689
76732	76732	468379102890426368	260.217987	13.868400	392.063995	0.542548	0.108607	0.823840	0.000005	470.765235
83789	83789	468379102890426368	353.845001	37.641102	535.929016	0.529077	0.164183	0.822516	0.000004	643.306328
83778	83778	468379102890426368	359.763000	39.498501	545.138000	0.528109	0.166867	0.822467	0.000004	654.343182
90693	90693	468379102890426368	488.882996	90.821198	751.671021	0.496812	0.242259	0.821913	0.000004	901.257108
97056	97056	468379102890426368	636.452026	179.432999	1008.159973	0.446866	0.323151	0.822002	0.000006	1205.675744
118379	118379	468379102890426368	744.026978	269.036011	1217.500000	0.396525	0.379051	0.822572	0.000007	1451.985795
118915	118915	468379102890426368	870.002014	412.187012	1505.500000	0.321804	0.435167	0.824130	0.000008	1786.989647

In [5]:

# Visualize test tracks

trace1 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', name="Track 1")
trace2 = go.Scatter3d(x=track2.tx, y=track2.ty, z=track2.tz, mode='markers', name="Track 2")
trace3 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
#py.iplot(fig)

Image('fig2.png')

Out[5]:

Now the task is to find these two (and all of the other) tracks from the unannotated mess of points above. A logical first step is to look for the first hits of the track. One might expect these to be the ones closest to the origin point, but because of the cylindrical structure of the detector, distance isn't the way to go, as can be seen below, where all actual first hits are plottet (for orientation, both example tracks are also plottet):

In [6]:

# True first hits

first_true = truth.drop_duplicates(subset='particle_id')

trace1 = go.Scatter3d(x=first_true.tx, y=first_true.ty, z=first_true.tz, mode='markers', opacity=0.3, name="True first hits")
trace2 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', opacity=1, name="Track 1")
trace3 = go.Scatter3d(x=track2.tx, y=track2.ty, z=track2.tz, mode='markers', opacity=1, name="Track 2")
trace4 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)
#py.iplot(fig)

Image('fig3.png')

Out[6]:

It is evident that most first hits lie on the innermost cylindrical detector, as is to be expected. However, a lot of random mess can be seen everywhere, where the particle apparently was not detected by detectors closer to the origin. To predict these first hits, I used a random forest classifier.

In [7]:

# Predict first hits

X = truth
y = truth.hit_id.isin(first_true.hit_id)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4)

model_first = RandomForestClassifier()
model_first.fit(X_train, y_train)

print("Confusion Matrix of test data:\n", confusion_matrix(y_test, model_first.predict(X_test)))

Confusion Matrix of test data:
 [[27288   262]
 [  501  2184]]

These results are not great, but they would probably get better if more data was used to train the classifier. Below is a plot of what the model predicts as the first hits, but remember that it was trained on 3/4 of exactly these points, so if it gets a lot correct that is not coincidence or brilliance, but just the fact that the model was fit to this data.

In [8]:

# Predicted first hits

y_pred = model_first.predict(truth)
first_pred = truth[y_pred]

trace1 = go.Scatter3d(x=first_pred.tx, y=first_pred.ty, z=first_pred.tz, mode='markers', opacity=0.3, name="Predicted first hits")
trace2 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', opacity=1, name="Track 1")
trace3 = go.Scatter3d(x=track2.tx, y=track2.ty, z=track2.tz, mode='markers', opacity=1, name="Track 2")
trace4 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)
#py.iplot(fig)

Image('fig4.png')

Out[8]:

In [9]:

# Similarity

print("Total number of predicted first hits:", y_pred.sum())
print("Total errors made:", (y_pred ^ y).sum())
print("{:f}% of the prediction is correct".format((len(y) - (y_pred ^ y).sum()) / len(y)))

Total number of predicted first hits: 10184
Total errors made: 938
0.992244% of the prediction is correct

With that, we can now turn to the second hits. This is where it gets more tricky, and the huge amount of data really becomes a problem. If one were to look for a second hit for every of our predicted first hits among all of the data, there would be far too many possibilities:

In [10]:

num = len(y_pred) * (len(truth) - 1)    # We can exclude the first hit itself as second hit 🎉
print("Number of possibilities:", num)

Number of possibilities: 14626120782

Now if each of the calculations took only ten microsecond, it would still take:

In [11]:

print(pd.Timedelta('10 us') * num)

1 days 16:37:41.207820

to predict all possible second hits for one event! Now suppose we would take the ten most likely second hits per first hit and predict the third hits for those, that would then take

In [12]:

# This time, we can exclude two hits on our search for the third one!
num = len(y_pred) * 10 * (len(truth) - 2)
print(pd.Timedelta('10 us') * num)

16 days 22:16:39.984300

Alright, that is definetely too long. But, because we are not looking for a random point among all other ones, we can just exclude most points when looking for a second hit. For example, we know the particle physically has to continue moving away from the origin. That already excludes all hits that are closer to the origin than our first hit. Also, the second hit won't be too far away from the first hit, we can calculate the average distance of the two:

In [13]:

l = np.array([])
for _, hits in truth.groupby("particle_id"):
    if len(hits) > 1:
        l = np.append(l, [np.sqrt((hits.tx.iloc[1] - hits.tx.iloc[0])**2 
                                   + (hits.ty.iloc[1] - hits.ty.iloc[0])**2 
                                   + (hits.tz.iloc[1] - hits.tz.iloc[0])**2)])

print(f"The average distance between first and second hit is {l.mean():.2f} "
      f"with a standard deviation of {l.std():.2f}.")

The average distance between first and second hit is 89.87 with a standard deviation of 151.94.

Now we can define a cone with the first hit being the starting point in which the second point must be. We can use efficient linear algebra libraries to rapidly calculate these cones:

In [14]:

%%time

# First layer, I only use 5000 points because otherwise my laptop crashes.
layer = first_true.iloc[1:5001]

# Put all hits into a tensor with coordinates hit_id, and prev; prev here is the hit_id of the previous hit (namely the first hit)
x = xr.DataArray(np.resize(truth.tx.values, (len(layer), len(truth))).T, coords={'hit_id': truth.hit_id.values, 'prev': layer.hit_id.values}, dims=['hit_id', 'prev'])
y = xr.DataArray(np.resize(truth.ty.values, (len(layer), len(truth))).T, coords={'hit_id': truth.hit_id.values, 'prev': layer.hit_id.values}, dims=['hit_id', 'prev'])
z = xr.DataArray(np.resize(truth.tz.values, (len(layer), len(truth))).T, coords={'hit_id': truth.hit_id.values, 'prev': layer.hit_id.values}, dims=['hit_id', 'prev'])
ds = x.to_dataset(name='x')
ds['y'] = y
ds['z'] = z

# Put first hits in similar shape as all other hits (xarray)
X = layer[['tx', 'ty', 'tz']].to_xarray()
X = X.rename({'hit_id': 'prev', 'tx': 'x', 'ty': 'y', 'tz': 'z'})

# Calculate the angle between all first hits and possible second hits
xp = ds - X    # Elements of xp are the vectors between the first hit (x) and possible second hit (p)
xpl = (xp.x**2 + xp.y**2 + xp.z**2)**(1/2)
p = xp/xpl     # Norm all vectors

ox = X         # Elements of ox are the vectors between the origin (o) and the first hit (x)
oxl = (ox.x**2 + ox.y**2 + ox.z**2)**(1/2)
n = ox/oxl     # Norm all vectors

s = n * p      # Calculate cosine of angles between points

# Exclude all hits where the angle is > 36.9° or the distance is < 300
cones = ((((s.x + s.y + s.z) > 0.8)) & (xpl < 300)).T

CPU times: user 1min 8s, sys: 2min 54s, total: 4min 3s
Wall time: 4min 34s

To illustrate this result I will plot the cone of the first test track below:

In [15]:

# Visualize cones
cone1 = truth[cones.sel(prev=20906).values]
trace1 = go.Scatter3d(x=cone1.tx, y=cone1.ty, z=cone1.tz, mode='markers', opacity=0.3, name="Cone")
trace2 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', opacity=1, name="Track")
trace3 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
#py.iplot(fig)

Image('fig5.png')

Out[15]:

Now these cones we can work with. A simple neural net can classify between "looks like a second hit" and "does not look like a second hit" among the hits inside a cone. To train the net I will use the second hit as a positive example and 10 random hits from the cone as negative example, so that the distribution isn't too skewed towards negative examples.

In [16]:

# Process data for NN

remains = ds.where(cones)    # Hits inside cones

X = np.zeros(6)
y = np.array([])

for fhit in tqdm(ds.prev):
    # Get 10 random hits from cone
    remain = remains.sel(prev=fhit).dropna('hit_id')
    remain = remain.sel(hit_id=np.random.choice(remain.hit_id.values, 10)).to_dataframe()
    
    # Add the one true second hit
    try:
        t = truth[truth.particle_id == truth.particle_id.loc[int(fhit)]].iloc[1]
    except:
        continue
    
    remain = remain[remain.index != t.hit_id]
    prev = truth.loc[np.full((len(remain),), fhit)]
    y = np.concatenate((y, len(remain)*[0], [1]))
    X = np.vstack([X, np.vstack([prev.tx, prev.ty, prev.tz, remain.x.values, remain.y.values, remain.z.values]).T, 
                   [prev.tx.iloc[0], prev.ty.iloc[0], prev.tz.iloc[0], t.tx, t.ty, t.tz]])

X = X[1:]

100%|██████████| 5000/5000 [01:36<00:00, 51.66it/s]

In [17]:

# Train the NN

model_second = Sequential()
model_second.add(Dense(64, activation='relu', input_shape=(6,)))
model_second.add(Dense(64, activation='relu'))
model_second.add(Dense(64, activation='relu'))
model_second.add(Dense(64, activation='relu'))
model_second.add(Dense(64, activation='relu'))
model_second.add(Dense(1, activation='sigmoid'))
model_second.compile(optimizer='rmsprop', loss='binary_crossentropy')
model_second.fit(X, y, epochs=5)

Epoch 1/5
53878/53878 [==============================] - 3s 49us/step - loss: 0.1586
Epoch 2/5
53878/53878 [==============================] - 2s 43us/step - loss: 0.1057
Epoch 3/5
53878/53878 [==============================] - 2s 42us/step - loss: 0.0973
Epoch 4/5
53878/53878 [==============================] - 2s 44us/step - loss: 0.0907
Epoch 5/5
53878/53878 [==============================] - 2s 41us/step - loss: 0.0927

Out[17]:

<keras.callbacks.History at 0x1c33c0f320>

In [18]:

# Test the NN on the two test tracks

cone1 = truth[cones.sel(prev=20906).values][['tx', 'ty', 'tz']].values
X1 = np.hstack([np.repeat(truth.loc[20906][['tx', 'ty', 'tz']].values[np.newaxis], len(cone1), axis=0), cone1])
second_1 = cone1[np.argmax(model_second.predict(X1))]

cone2 = truth[cones.sel(prev=22405).values][['tx', 'ty', 'tz']].values
X2 = np.hstack([np.repeat(truth.loc[22405][['tx', 'ty', 'tz']].values[np.newaxis], len(cone2), axis=0), cone2])
second_2 = cone2[np.argmax(model_second.predict(X2))]

trace1 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")

trace2 = go.Scatter3d(x=track1.tx.iloc[2:], y=track1.ty.iloc[2:], z=track1.tz.iloc[2:], mode='markers', opacity=1, name="Track 1")
trace3 = go.Scatter3d(x=[track1.tx.iloc[0]], y=[track1.ty.iloc[0]], z=[track1.tz.iloc[0]], mode='markers', opacity=1, name="First hit 1")
trace4 = go.Scatter3d(x=[track1.tx.iloc[1]], y=[track1.ty.iloc[1]], z=[track1.tz.iloc[1]], mode='markers', opacity=1, name="Second hit 1 (true)")
trace5 = go.Scatter3d(x=[second_1[0]], y=[second_1[1]], z=[second_1[2]], mode='markers', opacity=1, name="Second hit 1 (predicted)")

trace6 = go.Scatter3d(x=track2.tx.iloc[2:], y=track2.ty.iloc[2:], z=track2.tz.iloc[2:], mode='markers', opacity=1, name="Track 2")
trace7 = go.Scatter3d(x=[track2.tx.iloc[0]], y=[track2.ty.iloc[0]], z=[track2.tz.iloc[0]], mode='markers', opacity=1, name="First 2 hit")
trace8 = go.Scatter3d(x=[track2.tx.iloc[1]], y=[track2.ty.iloc[1]], z=[track2.tz.iloc[1]], mode='markers', opacity=1, name="Second 2 hit (true)")
trace9 = go.Scatter3d(x=[second_2[0]], y=[second_2[1]], z=[second_2[2]], mode='markers', opacity=1, name="Second hit 2 (predicted)")

layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3, trace4, trace5, trace6, trace7, trace8, trace9], layout=layout)
#py.iplot(fig)

Image('fig6.png')

Out[18]:

The model accurately predicts the second hit of the first track, but fails to predict the second hit of the second track, though on a previous trial the same network predicted the second hit of the first track accurately.

To predict the third and subsequent hits we will use neural nets that use the previous hits as input and output the predicted position in space of the next hit. Then we can use the closest available hit as a prediction.

In [19]:

# Process data for NN

X = np.zeros(6)
y = np.zeros(3)

for fhit in tqdm(ds.prev):
    t = truth[truth.particle_id == truth.particle_id.loc[int(fhit)]]
    try:
        y = np.vstack([y, np.r_[t.iloc[2].tx, t.iloc[2].ty, t.iloc[2].tz]])
        X = np.vstack([X, np.r_[t.iloc[0].tx, t.iloc[0].ty, t.iloc[0].tz, 
                                t.iloc[1].tx, t.iloc[1].ty, t.iloc[1].tz]])
    except:
        continue
    
X = X[1:]
y = y[1:]

100%|██████████| 5000/5000 [00:14<00:00, 345.75it/s]

In [20]:

# Train the NN

model_third = Sequential()
model_third.add(Dense(64, activation='relu', input_shape=(6,)))
model_third.add(Dense(64, activation='relu'))
model_third.add(Dense(64, activation='relu'))
model_third.add(Dense(64, activation='relu'))
model_third.add(Dense(64, activation='relu'))
model_third.add(Dense(3))
model_third.compile(optimizer='rmsprop', loss='mse')
model_third.fit(X, y, epochs=20)

Epoch 1/20
4837/4837 [==============================] - 0s 96us/step - loss: 2230.0851
Epoch 2/20
4837/4837 [==============================] - 0s 39us/step - loss: 838.3017
Epoch 3/20
4837/4837 [==============================] - 0s 38us/step - loss: 825.0221
Epoch 4/20
4837/4837 [==============================] - 0s 37us/step - loss: 825.1955
Epoch 5/20
4837/4837 [==============================] - 0s 38us/step - loss: 783.8190
Epoch 6/20
4837/4837 [==============================] - 0s 38us/step - loss: 784.6124
Epoch 7/20
4837/4837 [==============================] - 0s 39us/step - loss: 784.2446
Epoch 8/20
4837/4837 [==============================] - 0s 40us/step - loss: 773.4479
Epoch 9/20
4837/4837 [==============================] - 0s 38us/step - loss: 752.6868
Epoch 10/20
4837/4837 [==============================] - 0s 39us/step - loss: 754.0583
Epoch 11/20
4837/4837 [==============================] - 0s 37us/step - loss: 755.4317
Epoch 12/20
4837/4837 [==============================] - 0s 37us/step - loss: 749.5753
Epoch 13/20
4837/4837 [==============================] - 0s 38us/step - loss: 760.9674
Epoch 14/20
4837/4837 [==============================] - 0s 38us/step - loss: 755.6223
Epoch 15/20
4837/4837 [==============================] - 0s 38us/step - loss: 743.6794
Epoch 16/20
4837/4837 [==============================] - 0s 38us/step - loss: 737.1274
Epoch 17/20
4837/4837 [==============================] - 0s 38us/step - loss: 738.6237
Epoch 18/20
4837/4837 [==============================] - 0s 39us/step - loss: 734.3788
Epoch 19/20
4837/4837 [==============================] - 0s 39us/step - loss: 737.1564
Epoch 20/20
4837/4837 [==============================] - 0s 36us/step - loss: 739.3553

Out[20]:

<keras.callbacks.History at 0x1c3143b7f0>

In [21]:

# Test the NN on the two test tracks

X1 = np.r_[track1.tx.iloc[0], track1.ty.iloc[0], track1.tz.iloc[0], track1.tx.iloc[1], track1.ty.iloc[1], track1.tz.iloc[1]]
third_1 = model_third.predict(X1[np.newaxis])[0]    # Predicted position
third_1 = truth.iloc[np.argmin(np.sqrt(np.sum((truth[['tx', 'ty', 'tz']].values - third_1)**2, axis=1)))]    # Nearest hit

X2 = np.r_[track2.tx.iloc[0], track2.ty.iloc[0], track2.tz.iloc[0], track2.tx.iloc[1], track2.ty.iloc[1], track2.tz.iloc[1]]
third_2 = model_third.predict(X2[np.newaxis])[0]
third_2 = truth.iloc[np.argmin(np.sqrt(np.sum((truth[['tx', 'ty', 'tz']].values - third_2)**2, axis=1)))]    # Nearest hit

trace1 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")

trace2 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', opacity=1, name="Track 1")
trace3 = go.Scatter3d(x=[track1.tx.iloc[2]], y=[track1.ty.iloc[2]], z=[track1.tz.iloc[2]], mode='markers', opacity=1, name="Third hit 1 (true)")
trace4 = go.Scatter3d(x=[third_1.tx], y=[third_1.ty], z=[third_1.tz], mode='markers', opacity=1, name="Third hit 1 (predicted)")

trace5 = go.Scatter3d(x=track2.tx, y=track2.ty, z=track2.tz, mode='markers', opacity=1, name="Track 2")
trace6 = go.Scatter3d(x=[track2.tx.iloc[2]], y=[track2.ty.iloc[2]], z=[track2.tz.iloc[2]], mode='markers', opacity=1, name="Third hit 2 (true)")
trace7 = go.Scatter3d(x=[third_2.tx], y=[third_2.ty], z=[third_2.tz], mode='markers', opacity=1, name="Third hit 2 (predicted)")

layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3, trace4, trace5, trace6, trace7], layout=layout)
#py.iplot(fig)

Image('fig7.png')

Out[21]:

The model fails to predict the third hit for the second track, but correctly predicts the third hit for the first track. Again, the model probably would get better if it were trained on a lot more data, but this algorithm is also very rudimentary and can be improved a great deal if more time was invested.

However, I have already missed the deadline on this kaggle competition and won't continue work from here on. Thank you very much for reading :)