from splink.duckdb.duckdb_linker import DuckDBLinker
import pandas as pd
# Raise the pandas display limit so frames of up to 1000 rows print in full.
pd.set_option("display.max_rows", 1000)

# Read the input records from CSV and preview the first five rows.
df = pd.read_csv("./data/fake_1000.csv")
df.head()
| | unique_id | first_name | surname | dob | city | email | cluster |
---|---|---|---|---|---|---|---|
0 | 0 | Robert | Alan | 1971-06-24 | NaN | robert255@smith.net | 0 |
1 | 1 | Robert | Allen | 1971-05-24 | NaN | roberta25@smith.net | 0 |
2 | 2 | Rob | Allen | 1971-06-24 | London | roberta25@smith.net | 0 |
3 | 3 | Robert | Alen | 1971-06-24 | Lonon | NaN | 0 |
4 | 4 | Grace | NaN | 1997-04-26 | Hull | grace.kelly52@jones.com | 1 |
# Create a DuckDB-backed linker over the input frame, then load its
# settings from the JSON file saved by an earlier demo.
settings_path = "./demo_settings/saved_model_from_demo.json"
linker = DuckDBLinker(df)
linker.load_settings_from_json(settings_path)
Since we have labels in this dataset, we can compute the accuracy of our trained model.
# Read the labelled record pairs from CSV and preview the first five rows.
labels_path = "./data/fake_1000_labels.csv"
df_labels = pd.read_csv(labels_path)
df_labels.head()
| | unique_id_l | source_dataset_l | unique_id_r | source_dataset_r | clerical_match_score |
---|---|---|---|---|---|
0 | 0 | fake_1000 | 1 | fake_1000 | 1.0 |
1 | 0 | fake_1000 | 2 | fake_1000 | 1.0 |
2 | 0 | fake_1000 | 3 | fake_1000 | 1.0 |
3 | 0 | fake_1000 | 4 | fake_1000 | 0.0 |
4 | 0 | fake_1000 | 5 | fake_1000 | 0.0 |
Then to produce the chart:
# Run prediction with the loaded settings; the result is reused by the
# dashboards and the clustering step further down.
df_predictions = linker.predict()
# NOTE(review): underscore-prefixed, so presumably a private Splink helper; the
# name suggests it prepares the concatenated input table with term-frequency
# columns -- confirm it is required before the label charts.
linker._initialise_df_concat_with_tf()
# Register the labels DataFrame on the linker's DuckDB connection under the
# table name "labels", which is the name passed to the *_from_labels charts.
# NOTE(review): `_con` is a private attribute -- prefer a public registration
# API if the installed Splink version offers one.
linker._con.register("labels", df_labels)
<duckdb.DuckDBPyConnection at 0x7fdb1411e770>
Plot the ROC chart using `linker.roc_chart_from_labels`:
# Draw the ROC chart, reading the labels from the table registered as "labels".
linker.roc_chart_from_labels("labels")
A precision-recall chart is also available with `linker.precision_recall_chart_from_labels`:
# Draw the precision-recall chart, reading the labels from the table registered as "labels".
linker.precision_recall_chart_from_labels("labels")
Create a splink_comparison_viewer interactive dashboard and display in an iframe
# Write the comparison viewer dashboard for the predictions to scv.html.
# Keyword arguments replace the bare positional `True, 2` so the call is
# self-describing: overwrite an existing scv.html, with num_example_rows=2.
linker.comparison_viewer_dashboard(
    df_predictions,
    "scv.html",
    overwrite=True,
    num_example_rows=2,
)
from IPython.display import IFrame

# Embed the generated scv.html dashboard in the notebook output.
IFrame(src="./scv.html", width="100%", height=1200)
# Cluster the pairwise predictions at a threshold of 0.2, then preview the
# first five clustered rows as a pandas DataFrame.
cluster_threshold = 0.2
df_clustered = linker.cluster_pairwise_predictions_at_threshold(
    df_predictions, cluster_threshold
)
df_clustered.as_pandas_dataframe(limit=5)
Completed iteration 1, root rows count 32
Completed iteration 2, root rows count 21
Completed iteration 3, root rows count 11
Completed iteration 4, root rows count 8
Completed iteration 5, root rows count 3
Completed iteration 6, root rows count 1
Completed iteration 7, root rows count 0
| | cluster_id | unique_id | first_name | surname | dob | city | email | cluster | tf_city |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | Robert | Alan | 1971-06-24 | NaN | robert255@smith.net | 0 | NaN |
1 | 1 | 1 | Robert | Allen | 1971-05-24 | NaN | roberta25@smith.net | 0 | NaN |
2 | 1 | 2 | Rob | Allen | 1971-06-24 | London | roberta25@smith.net | 0 | 0.212792 |
3 | 0 | 3 | Robert | Alen | 1971-06-24 | Lonon | NaN | 0 | 0.007380 |
4 | 4 | 4 | Grace | NaN | 1997-04-26 | Hull | grace.kelly52@jones.com | 1 | 0.001230 |
# Write the cluster studio dashboard for the predictions and clusters to
# cluster_studio.html, replacing any existing file.
linker.cluster_studio_dashboard(
    df_predictions,
    df_clustered,
    sampling_method="by_cluster_size",
    out_path="cluster_studio.html",
    overwrite=True,
)
from IPython.display import IFrame

# Embed the generated cluster_studio.html dashboard in the notebook output.
IFrame(src="./cluster_studio.html", width="100%", height=1200)