Loading data¶

In [21]:

import numpy as np
import pandas as pd
# Read the input Parquet file into a DataFrame. Later cells expect it to
# contain user_id, timestamp, label, trans_type, merchant_id and amount columns.
df = pd.read_parquet("fraud-all.parquet")

Cleaning data¶

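These data are mostly clean, but we need to add a new field for transaction interarrival time (the time elapsed since the same user's previous transaction) and convert a few columns to more compact types.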

In [22]:

# Order rows by user and then by time so that each user's transactions are
# contiguous and chronological; the interarrival computation in the next cell
# depends on this ordering.
# drop=True discards the old index outright. Materialising it as a column and
# deleting df['index'] afterwards only works when the old index is unnamed and
# the frame has no pre-existing "index" column.
df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

In [23]:

# Interarrival time: the gap between a transaction and the same user's
# previous transaction. Relies on df already being sorted by user_id and then
# timestamp, so that the previous row is the previous transaction.
# Only the two columns involved are shifted/diffed, and no temporary columns
# are added to df.
same_user_as_prev = df['user_id'].eq(df['user_id'].shift(1))

# Rows that begin a new user's history (including the very first row) have no
# predecessor, so .where() leaves them missing. Relying on its default fill
# value avoids the `np.NaN` alias, which no longer exists in NumPy 2.0.
df['interarrival'] = df['timestamp'].diff().where(same_user_as_prev)

In [24]:

# Cast columns to explicit dtypes in one call: the text columns to pandas'
# "string" dtype, the id columns to 32-bit integers and the amount to a
# 32-bit float.
column_dtypes = {
    "label": "string",
    "trans_type": "string",
    "merchant_id": "int32",
    "user_id": "int32",
    "amount": "float32",
}
df = df.astype(column_dtypes)

In [25]:

# Write the cleaned frame to disk.
# Compression: for this data set, brotli output is ~60% the size of snappy
# (the default codec).
# Engine: pyarrow is requested explicitly because fastparquet can't currently
# handle strings (!).
df.to_parquet(
    "fraud-cleaned.parquet",
    engine="pyarrow",
    compression="brotli",
)

In [26]:

# Also write a 5% random sample of the cleaned data, using the same engine and
# compression as the full file.
# A fixed random_state makes the sample identical on every Restart & Run All;
# without it each run would write a different set of rows.
(
    df.sample(frac=0.05, random_state=42)
    .to_parquet("fraud-cleaned-sample.parquet", engine="pyarrow", compression="brotli")
)