import os
project_name = "reco-tut-gml"; branch = "main"; account = "sparsh-ai"
project_path = os.path.join('/content', project_name)

if not os.path.exists(project_path):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "recotut@recohut.com"
    !git config --global user.name "reco-tut"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout "{branch}"
else:
    %cd "{project_path}"
!pip install -q node2vec python-louvain  # python-louvain provides the community_louvain module imported below
import os
import math
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
from community import community_louvain
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
default_edge_color = 'gray'
default_node_color = '#407cc9'
enhanced_node_color = '#f5b042'
enhanced_edge_color = '#cc2f04'
%matplotlib inline
The dataset used in this tutorial is the Credit Card Transactions Fraud Detection Dataset, available on Kaggle at https://www.kaggle.com/kartik2112/fraud-detection?select=fraudTrain.csv.
The dataset consists of simulated credit card transactions, both legitimate and fraudulent, covering the period January 1, 2019 – December 31, 2020. It includes the credit cards of 1,000 customers transacting with a pool of 800 merchants. The data was generated with Sparkov Data Generation; more information about the generation algorithm is available at https://github.com/namebrandon/Sparkov_Data_Generation.
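The setup cell above and the Kaggle credential copy below both read files from Google Drive. As a minimal sketch (assuming a standard Colab runtime), the drive can be mounted like this:
from google.colab import drive
drive.mount('/content/drive')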
!pip install -q --upgrade --force-reinstall --no-deps kaggle
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!cd /content && kaggle datasets download -d kartik2112/fraud-detection
!cd /content && unzip fraud-detection.zip
df = pd.read_csv('/content/fraudTrain.csv', index_col=0)
# keep every fraudulent transaction and a 20% sample of the legitimate ones
df = pd.concat([df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42),
                df[df["is_fraud"] == 1]])
df.info()
df.to_parquet('./data/financial_fraud_detection_v1.parquet.gzip', compression='gzip')
!git status
!git add . && git commit -m 'ADD data financial_fraud' && git push origin main
df = pd.read_parquet('./data/financial_fraud_detection_v1.parquet.gzip').reset_index(drop=True)
df.head()
df["is_fraud"].value_counts()
Since the dataset contains temporal transactions, multiple interactions can occur between a customer and a merchant. In both of our formalizations, we collapse all of that information into a single graph: if multiple transactions exist between a customer and a merchant, we build a single edge between the two nodes, with its weight given by the sum of all the transaction amounts and its label set to 1 if any of the collapsed transactions was fraudulent.
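As a minimal sketch of that collapsing rule on a hypothetical three-row frame (the column names match the real dataset; the values are made up):
toy = pd.DataFrame({"cc_num": ["c1", "c1", "c2"],
                    "merchant": ["m1", "m1", "m1"],
                    "amt": [10.0, 15.0, 7.0],
                    "is_fraud": [0, 1, 0]})
agg = toy.groupby(["cc_num", "merchant"]).agg({"amt": "sum", "is_fraud": "sum"}).reset_index()
agg["is_fraud"] = (agg["is_fraud"] > 0).astype(int)
agg  # (c1, m1) collapses to amt=25.0, is_fraud=1; (c2, m1) keeps amt=7.0, is_fraud=0
The build_graph_bipartite function below applies the same rule to the full dataset: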
def build_graph_bipartite(df_input, graph_type=nx.Graph):  # class default avoids reusing one shared graph instance across calls
    df = df_input.copy()
    # map every card number and merchant name to an integer node id
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])
    df["to"] = df["merchant"].apply(lambda x: mapping[x])
    # collapse multiple transactions between the same pair into a single edge
    df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg(
        {"is_fraud": "sum", "amt": "sum"}).reset_index()
    # an edge is fraudulent if at least one underlying transaction was
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    nx.set_node_attributes(G, {x: 1 for x in df["from"].unique()}, "bipartite")
    nx.set_node_attributes(G, {x: 2 for x in df["to"].unique()}, "bipartite")
    nx.set_edge_attributes(G,
                           {(int(x["from"]), int(x["to"])): x["is_fraud"]
                            for idx, x in df[["from", "to", "is_fraud"]].iterrows()},
                           "label")
    nx.set_edge_attributes(G,
                           {(int(x["from"]), int(x["to"])): x["amt"]
                            for idx, x in df[["from", "to", "amt"]].iterrows()},
                           "weight")
    return G
In the tripartite formalization, we create a node for each transaction, so there is no need to aggregate multiple transactions between a customer and a merchant; instead, each edge is assigned an (always positive) weight equal to the transaction amount in US dollars. As with the bipartite approach, the formalization allows both directed and undirected graphs.
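A minimal sketch of the pattern for a single hypothetical transaction: the transaction becomes its own node, connected to both its customer and its merchant, and both edges carry the amount as weight:
toy_tri = nx.Graph()
toy_tri.add_edge("customer_c1", "txn_0", weight=10.0, label=0)
toy_tri.add_edge("txn_0", "merchant_m1", weight=10.0, label=0)
list(toy_tri.edges(data=True))
The build_graph_tripartite function below builds this graph from the full dataset: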
def build_graph_tripartite(df_input, graph_type=nx.Graph):  # class default, as above
    df = df_input.copy()
    # map transactions (row indices), card numbers, and merchants to integer node ids
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])
    # every transaction node is linked to both its customer and its merchant
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()],
                         create_using=graph_type)
    nx.set_node_attributes(G, {x["in_node"]: 1 for idx, x in df.iterrows()}, "bipartite")
    nx.set_node_attributes(G, {x["out_node"]: 2 for idx, x in df.iterrows()}, "bipartite")
    nx.set_node_attributes(G, {mapping[idx]: 3 for idx, x in df.iterrows()}, "bipartite")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    return G
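As a quick usage check on a small sample (the G_tri name is ours; the row-wise construction is slow on the full frame), the "bipartite" attribute counts nodes per partition, and partition 3 should have one node per sampled transaction:
from collections import Counter
G_tri = build_graph_tripartite(df.sample(1000, random_state=42))
Counter(nx.get_node_attributes(G_tri, "bipartite").values())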
G = build_graph_bipartite(df, nx.Graph())
bipartite.is_bipartite(G)
plt.figure(figsize=(5,4))
top = nx.bipartite.sets(G)[0]
pos = nx.bipartite_layout(G, top)
nx.draw(G, pos=pos, with_labels=False, node_color=default_node_color, edge_color=default_edge_color)
plt.show()
plt.axis("off")
plt.figure(figsize=(10,10))
# spring_pos = nx.spring_layout(G)
nx.draw_networkx(G, pos=spring_pos, node_color=default_node_color,
edge_cmap=plt.cm.Blues, with_labels=False, node_size=15);
print(G)  # nx.info(G) in NetworkX < 3.0
degrees = pd.Series(dict(nx.degree(G)))
degrees.plot.hist()
plt.yscale("log")
allEdgesWeights = pd.Series({(d[0], d[1]): d[2]["weight"] for d in G.edges(data=True)})
quant_dist = np.quantile(allEdgesWeights.values,[0.10,0.50,0.70,0.9])
quant_dist
# drop edges at or above the 90th-percentile weight so the histogram stays readable
allEdgesWeightsFiltered = pd.Series({(d[0], d[1]): d[2]["weight"] for d in G.edges(data=True)
                                     if d[2]["weight"] < quant_dist[-1]})
allEdgesWeightsFiltered.plot.hist(bins=40)
plt.yscale("log")
# betweenness centrality
bC = nx.betweenness_centrality(G)
bc_distr = pd.Series(bC)
bc_distr.plot.hist()
plt.yscale("log")
# degree centrality
deg_C = nx.degree_centrality(G)
degc_distr = pd.Series(deg_C)
degc_distr.plot.hist()
# closeness centrality
clos_C = nx.closeness_centrality(G)
closc_distr = pd.Series(clos_C)
closc_distr.plot.hist()
# assortativity (Pearson correlation between the degrees at either end of an edge)
nx.degree_pearson_correlation_coefficient(G)
parts = community_louvain.best_partition(G, random_state=42, weight='weight')
communities = pd.Series(parts)
communities.value_counts().sort_values(ascending=False)
graphs = []
d = {}
for x in communities.unique():
    tmp = nx.subgraph(G, communities[communities == x].index)
    # percentage of fraudulent edges within this community
    fraud_edges = sum(nx.get_edge_attributes(tmp, "label").values())
    ratio = 0 if fraud_edges == 0 else (fraud_edges / tmp.number_of_edges()) * 100
    d[x] = ratio
    graphs += [tmp]
pd.Series(d).sort_values(ascending=False)
gId = 10  # index of the community subgraph to inspect
plt.figure(figsize=(10,10))
spring_pos = nx.spring_layout(graphs[gId])
plt.axis("off")
edge_colors = ["r" if x == 1 else "g" for x in nx.get_edge_attributes(graphs[gId], 'label').values()]
nx.draw_networkx(graphs[gId], pos=spring_pos, node_color=default_node_color,
edge_color=edge_colors, with_labels=False, node_size=15)
df_majority = df[df.is_fraud == 0]
df_minority = df[df.is_fraud == 1]
# downsample the legitimate class to the size of the fraudulent class
df_maj_downsampled = resample(df_majority,
                              n_samples=len(df_minority),
                              random_state=42)
df_downsampled = pd.concat([df_minority, df_maj_downsampled])
print(df_downsampled.is_fraud.value_counts())
G_down = build_graph_bipartite(df_downsampled)
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
# add back the nodes that lost all their edges in the split
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
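A quick sanity check, as a sketch: the training subgraph must cover every node of G_down, otherwise node2vec would have no embedding for the endpoints of held-out test edges.
assert train_graph.number_of_nodes() == G_down.number_of_nodes()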
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    # turn the two node embeddings of each edge into a single edge embedding
    embeddings_train = cl(keyed_vectors=model_train.wv)
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)
    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred))
    print('Recall:', metrics.recall_score(test_labels, y_pred))
    print('F1-Score:', metrics.f1_score(test_labels, y_pred))
    print('\n{}\n'.format('=' * 100))
node2vec_unsup = Node2Vec(G_down, weight_key='weight')
unsup_vals = node2vec_unsup.fit(window=10)

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
true_labels = [x for x in nx.get_edge_attributes(G_down, "label").values()]
for cl in classes:
    embedding_edge = cl(keyed_vectors=unsup_vals.wv)
    embedding = [embedding_edge[str(x[0]), str(x[1])] for x in G_down.edges()]
    # cluster the edge embeddings into two groups and compare them with the fraud labels
    kmeans = KMeans(n_clusters=2, random_state=42).fit(embedding)
    ami = metrics.adjusted_mutual_info_score(true_labels, kmeans.labels_)
    ho = metrics.homogeneity_score(true_labels, kmeans.labels_)
    co = metrics.completeness_score(true_labels, kmeans.labels_)
    vmeasure = metrics.v_measure_score(true_labels, kmeans.labels_)
    print(cl)
    print('Adjusted MI:', ami)
    print('Homogeneity:', ho)
    print('Completeness:', co)
    print('V-Measure:', vmeasure)
    print('\n{}\n'.format('=' * 100))