import numpy as np
import pandas as pd
# Load the raw input into a DataFrame. Later steps in this script read the
# user_id, timestamp, label, trans_type, merchant_id and amount columns.
df = pd.read_parquet("fraud-all.parquet")
# These data are mostly clean, but we need to add a new field for transaction interarrival time.
# Order rows by user and then by time, so that each row's predecessor is
# either the same user's previous transaction or the last row of a different
# user. drop=True discards the old index outright instead of materialising
# it as a column that then has to be deleted by name (which fails when the
# index carries a name other than "index").
df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

# Shift only the two columns that are needed, and keep them as standalone
# Series so no temporary columns have to be added to and removed from df.
prev_user_id = df["user_id"].shift(1)
prev_timestamp = df["timestamp"].shift(1)

# Interarrival time is the gap since the same user's previous transaction.
# Rows whose predecessor belongs to a different user (including the very
# first row, whose predecessor is missing) are left as missing values.
# Series.where already fills non-matching rows with NaN/NaT by default, so
# there is no need to pass np.NaN, an alias that NumPy 2.0 removed.
df["interarrival"] = (df["timestamp"] - prev_timestamp).where(
    df["user_id"] == prev_user_id
)
# Narrow the column dtypes in a single pass: text columns become pandas
# "string" columns, identifiers become 32-bit integers and the amount
# becomes a 32-bit float. Columns not listed here keep their dtype.
df = df.astype(
    {
        "label": "string",
        "trans_type": "string",
        "merchant_id": "int32",
        "user_id": "int32",
        "amount": "float32",
    }
)
# Writer settings shared by both output files.
# - compression: for this data set, brotli is ~60% the size of snappy
#   (the default).
# - engine: fastparquet can't currently handle strings (!), so pyarrow is
#   requested explicitly.
parquet_options = {"engine": "pyarrow", "compression": "brotli"}

# Full cleaned data set.
df.to_parquet("fraud-cleaned.parquet", **parquet_options)

# A 5% random sample of the rows. No random_state is passed, so the rows
# selected are expected to differ from run to run.
sampled = df.sample(frac=0.05)
sampled.to_parquet("fraud-cleaned-sample.parquet", **parquet_options)