In this notebook we download the data from the Telemanom S3 bucket and reformat it as Orion pipelines expect.
import io
import os
import urllib
import urllib.request
import zipfile
DATA_URL = 'https://s3-us-west-2.amazonaws.com/telemanom/data.zip'

# Download and extract the Telemanom dataset, skipping the download if a
# previous run already created the 'data' directory.
if not os.path.exists('data'):
    # NOTE: this needs `import urllib.request` — a bare `import urllib` does
    # not expose the `request` submodule.  The context manager closes the
    # HTTP response once the payload has been read.
    with urllib.request.urlopen(DATA_URL) as response:
        bytes_io = io.BytesIO(response.read())

    # Extract into the working directory; the archive presumably contains a
    # top-level 'data/' folder, given the os.listdir('data/train') calls
    # below — verify against the bucket contents.
    with zipfile.ZipFile(bytes_io) as zf:
        zf.extractall()
# Signal filenames (NPY matrices) for the train and test splits.
train_signals = os.listdir('data/train')
test_signals = os.listdir('data/test')
# Notebook-cell check: both splits contain the same signal files (output: True).
# NOTE(review): os.listdir order is platform-dependent; this presumably holds
# because both directories enumerate identically here — confirm if it matters.
train_signals == test_signals
True
We convert the NPY matrices to CSV files with two columns: `timestamp` and `value`.
For this, we load both the train and test matrices for each signal and concatenate them to generate a single matrix per signal.
Afterwards, we add a timestamp column by taking the value 1222819200 (2008-10-01T00:00:00) for the first row and then increasing the timestamp by 21600 seconds (6 hours) for each subsequent row.
import pandas as pd
import numpy as np
# Template for the NPY paths: data/<split>/<signal>.npy
NASA_DIR = os.path.join('data', '{}', '{}')


def build_df(data, start=0, interval=21600, first_timestamp=1222819200):
    """Build a two-column (timestamp, value) DataFrame from a signal matrix.

    Args:
        data: 2-D numpy array; only the first column (the telemetry value)
            is kept — the remaining columns are command features, presumably,
            and are dropped.
        start: row offset applied before computing timestamps, so that a
            test split can continue the timestamp sequence of its train
            split (see the export loop below).
        interval: seconds between consecutive rows; defaults to 6 hours.
        first_timestamp: timestamp of row 0; defaults to 1222819200,
            i.e. 2008-10-01T00:00:00.

    Returns:
        pandas.DataFrame with 'timestamp' and 'value' columns.
    """
    # np.arange is the idiomatic (and faster) form of np.array(range(...)).
    index = np.arange(start, start + len(data))
    timestamp = index * interval + first_timestamp
    return pd.DataFrame({'timestamp': timestamp, 'value': data[:, 0]})
# Sanity check: build the DataFrame for the S-1 training signal and preview it.
data = build_df(np.load(NASA_DIR.format('train', 'S-1.npy')))
data.head()
timestamp | value | |
---|---|---|
0 | 1222819200 | -0.366359 |
1 | 1222840800 | -0.394108 |
2 | 1222862400 | 0.403625 |
3 | 1222884000 | -0.362759 |
4 | 1222905600 | -0.370746 |
# Export each NASA signal to CSV: the full series plus separate
# train/test split files, all under the 'csv' directory.
os.makedirs('csv', exist_ok=True)
PATH_DIR = os.path.join('csv', '{}')

for signal in train_signals:
    name, _ = os.path.splitext(signal)  # drop the '.npy' extension

    train_np = np.load(NASA_DIR.format('train', signal))
    test_np = np.load(NASA_DIR.format('test', signal))

    # Full series: train followed by test, one continuous timestamp index.
    data = build_df(np.concatenate([train_np, test_np]))
    data.to_csv(PATH_DIR.format(name + '.csv'), index=False)

    # Train split starts at the base timestamp...
    train = build_df(train_np)
    train.to_csv(PATH_DIR.format(name + '-train.csv'), index=False)

    # ...and the test split continues exactly where the train split ends.
    test = build_df(test_np, start=len(train))
    test.to_csv(PATH_DIR.format(name + '-test.csv'), index=False)
# Verify the export round-trips: reload the S-1 CSV and preview its head.
s1 = pd.read_csv(PATH_DIR.format('S-1.csv'))
s1.head()
We will use the `labeled_anomalies.csv` file from the telemanom project and convert it to the CSV that we will later use in Orion.
import os
import json
import pandas as pd
from orion.data import load_signal
# Ground-truth anomaly labels published in the telemanom repository.
CSV_URL = 'https://github.com/khundman/telemanom/raw/master/labeled_anomalies.csv'
df = pd.read_csv(CSV_URL)
df.head()
import os
import json

# One record per signal: its name and the list of labeled anomalous intervals,
# converted from row indices into timestamps.
labels_data = []
for _, row in df.iterrows():
    signal = row.chan_id
    data = load_signal(os.path.join('csv', signal + '.csv'))

    # The labeled indices refer to the test split, which occupies the last
    # num_values rows of the concatenated series.
    test = data[-row.num_values:]

    # anomaly_sequences is a JSON list of [start, end] index pairs; map each
    # pair to its [start_ts, end_ts] timestamps in the test split.
    events = [
        [test.iloc[first].timestamp.astype(int),
         test.iloc[last].timestamp.astype(int)]
        for first, last in json.loads(row.anomaly_sequences)
    ]

    labels_data.append({'signal': signal, 'events': events})

labels = pd.DataFrame(labels_data)[['signal','events']]
labels.head()
labels.to_csv('labels.csv', index=False)