In this example, our data contains a fully populated ground-truth column called `cluster`,
which enables us to analyse the accuracy of the final model.
import pandas as pd
import altair as alt
# Switch Altair to its "mimetype" renderer for the charts produced below.
alt.renderers.enable("mimetype")
# Load the example dataset; it carries the ground-truth `cluster` column
# that the accuracy functions below are pointed at.
df = pd.read_csv("./data/fake_1000.csv")
# Preview the first two records.
df.head(2)
| | unique_id | first_name | surname | dob | city | email | cluster |
---|---|---|---|---|---|---|---|
0 | 0 | Robert | Alan | 1971-06-24 | NaN | robert255@smith.net | 0 |
1 | 1 | Robert | Allen | 1971-05-24 | NaN | roberta25@smith.net | 0 |
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
# Rules used to generate the candidate pairs that get scored at prediction
# time: a pair is considered when it agrees on first name or on surname.
prediction_blocking_rules = [
    "l.first_name = r.first_name",
    "l.surname = r.surname",
]

# One comparison per column. Names use a Levenshtein threshold of 2; dob and
# email use the comparison library's default thresholds; city is an exact
# match with term-frequency adjustments switched on.
column_comparisons = [
    cl.levenshtein_at_thresholds("first_name", 2),
    cl.levenshtein_at_thresholds("surname", 2),
    cl.levenshtein_at_thresholds("dob"),
    cl.exact_match("city", term_frequency_adjustments=True),
    cl.levenshtein_at_thresholds("email"),
]

# Model settings for deduplicating the single input table `df`.
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": prediction_blocking_rules,
    "comparisons": column_comparisons,
    # Keep the compared columns and the intermediate calculation columns in
    # the output so individual predictions can be inspected afterwards.
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
# Rules passed to the prior estimate below (together with recall=0.7) to
# estimate the probability that two random records match.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email",
]
linker.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.7,
)

# Estimate the u parameters from randomly sampled record pairs.
linker.estimate_u_using_random_sampling(target_rows=1e6)

# Run expectation maximisation twice, each time blocking on a different
# column; the returned sessions are kept for later inspection.
dob_training_rule = "l.dob = r.dob"
email_training_rule = "l.email = r.email"
session_dob = linker.estimate_parameters_using_expectation_maximisation(
    dob_training_rule
)
session_email = linker.estimate_parameters_using_expectation_maximisation(
    email_training_rule
)
# Build the truth-space table from the `cluster` ground-truth column,
# rounding match weights to the nearest 0.1, and show the first five rows.
truth_space = linker.truth_space_table_from_labels_column(
    "cluster",
    match_weight_round_to_nearest=0.1,
)
truth_space.as_pandas_dataframe(limit=5)
| | truth_threshold | row_count | P | N | TP | TN | FP | FN | P_rate | N_rate | TP_rate | TN_rate | FP_rate | FN_rate | precision | recall |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -18.4 | 4353.0 | 2031.0 | 2322.0 | 2031.0 | 0.0 | 2322.0 | 0.0 | 0.0 | 0.533425 | 1.000000 | 0.0 | 1.0 | 0.000000 | 0.466575 | 1.000000 |
1 | -17.2 | 4353.0 | 2031.0 | 2322.0 | 2029.0 | 0.0 | 2322.0 | 2.0 | 0.0 | 0.533425 | 0.999015 | 0.0 | 1.0 | 0.000985 | 0.466330 | 0.999015 |
2 | -16.2 | 4353.0 | 2031.0 | 2322.0 | 2026.0 | 0.0 | 2322.0 | 5.0 | 0.0 | 0.533425 | 0.997538 | 0.0 | 1.0 | 0.002462 | 0.465961 | 0.997538 |
3 | -15.4 | 4353.0 | 2031.0 | 2322.0 | 2024.0 | 0.0 | 2322.0 | 7.0 | 0.0 | 0.533425 | 0.996553 | 0.0 | 1.0 | 0.003447 | 0.465716 | 0.996553 |
4 | -14.2 | 4353.0 | 2031.0 | 2322.0 | 2019.0 | 0.0 | 2322.0 | 12.0 | 0.0 | 0.533425 | 0.994092 | 0.0 | 1.0 | 0.005908 | 0.465100 | 0.994092 |
# ROC chart computed against the `cluster` ground-truth column.
linker.roc_chart_from_labels_column("cluster")
# Precision-recall chart computed against the same labels column.
linker.precision_recall_chart_from_labels_column("cluster")
# Tabulate a sample of prediction errors. Both flags are set, so the table
# mixes false positives and false negatives; only five rows are shown.
prediction_errors = linker.prediction_errors_from_labels_column(
    "cluster",
    include_false_negatives=True,
    include_false_positives=True,
)
prediction_errors.as_pandas_dataframe(limit=5)
| | clerical_match_score | found_by_blocking_rules | match_weight | match_probability | unique_id_l | unique_id_r | first_name_l | first_name_r | gamma_first_name | bf_first_name | ... | tf_city_r | bf_city | bf_tf_adj_city | email_l | email_r | gamma_email | bf_email | cluster_l | cluster_r | match_key |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | True | 0.205044 | 0.535472 | 486 | 619 | Hannah | NaN | -1 | 1.000000 | ... | 0.04920 | 10.852673 | 1.120874 | NaN | od@holloway.info | -1 | 1.000000 | 122 | 157 | 1 |
1 | 1.0 | True | -3.399559 | 0.086559 | 617 | 620 | NaN | Olivia | -1 | 1.000000 | ... | 0.04920 | 1.000000 | 1.000000 | NaN | NaN | -1 | 1.000000 | 157 | 157 | 1 |
2 | 1.0 | True | -3.399559 | 0.086559 | 618 | 620 | NaN | Olivia | -1 | 1.000000 | ... | 0.04920 | 1.000000 | 1.000000 | od@holloway.info | NaN | -1 | 1.000000 | 157 | 157 | 1 |
3 | 1.0 | True | -1.472365 | 0.264917 | 660 | 661 | Charlie | Cahlrae | 0 | 0.218912 | ... | 0.00123 | 0.424936 | 1.000000 | NaN | charlieh@sandoval-sanders.info | -1 | 1.000000 | 168 | 168 | 1 |
4 | 1.0 | True | -2.268212 | 0.171902 | 505 | 508 | NaN | NaN | -1 | 1.000000 | ... | 0.00246 | 0.424936 | 1.000000 | f.s@jharp.com | f.j@shrarp.com | 0 | 0.126095 | 126 | 126 | 1 |
5 rows × 32 columns
# Fetch five prediction errors (false positives and false negatives) as
# record dictionaries and pass them to the waterfall chart.
error_table = linker.prediction_errors_from_labels_column(
    "cluster",
    include_false_negatives=True,
    include_false_positives=True,
)
records = error_table.as_record_dict(limit=5)
linker.waterfall_chart(records)