-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathChapter5_Splink_Settings.json
118 lines (118 loc) · 4.95 KB
/
Chapter5_Splink_Settings.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
{
  "link_type": "link_only",
  "blocking_rules_to_generate_predictions": [
    "l.Year = r.Year and l.Month = r.Month"
  ],
  "comparisons": [
    {
      "output_column_name": "Firstname",
      "comparison_levels": [
        {
          "sql_condition": "\"Firstname_l\" IS NULL OR \"Firstname_r\" IS NULL",
          "label_for_charts": "Null",
          "is_null_level": true
        },
        {
          "sql_condition": "\"Firstname_l\" = \"Firstname_r\"",
          "label_for_charts": "Exact match",
          "m_probability": 0.9536693722036732,
          "u_probability": 0.0037622388311744416
        },
        {
          "sql_condition": "jaro_winkler_similarity(\"Firstname_l\", \"Firstname_r\") >= 0.9",
          "label_for_charts": "Jaro_winkler_similarity >= 0.9",
          "m_probability": 0.04185896073102217,
          "u_probability": 0.0011643334683679213
        },
        {
          "sql_condition": "ELSE",
          "label_for_charts": "All other comparisons",
          "m_probability": 0.004471667065304645,
          "u_probability": 0.9950734277004576
        }
      ],
      "comparison_description": "Exact match vs. Firstname within jaro_winkler_similarity threshold 0.9 vs. anything else"
    },
    {
      "output_column_name": "Lastname",
      "comparison_levels": [
        {
          "sql_condition": "\"Lastname_l\" IS NULL OR \"Lastname_r\" IS NULL",
          "label_for_charts": "Null",
          "is_null_level": true
        },
        {
          "sql_condition": "\"Lastname_l\" = \"Lastname_r\"",
          "label_for_charts": "Exact match",
          "m_probability": 0.7039682117133251,
          "u_probability": 0.0006134708934003129
        },
        {
          "sql_condition": "jaro_winkler_similarity(\"Lastname_l\", \"Lastname_r\") >= 0.9",
          "label_for_charts": "Jaro_winkler_similarity >= 0.9",
          "m_probability": 0.2935505556049878,
          "u_probability": 0.00029914541533406863
        },
        {
          "sql_condition": "ELSE",
          "label_for_charts": "All other comparisons",
          "m_probability": 0.0024812326816870916,
          "u_probability": 0.9990873836912656
        }
      ],
      "comparison_description": "Exact match vs. Lastname within jaro_winkler_similarity threshold 0.9 vs. anything else"
    },
    {
      "output_column_name": "Month",
      "comparison_levels": [
        {
          "sql_condition": "\"Month_l\" IS NULL OR \"Month_r\" IS NULL",
          "label_for_charts": "Null",
          "is_null_level": true
        },
        {
          "sql_condition": "\"Month_l\" = \"Month_r\"",
          "label_for_charts": "Exact match",
          "m_probability": 0.5013056458813202,
          "u_probability": 0.0838861594985051
        },
        {
          "sql_condition": "ELSE",
          "label_for_charts": "All other comparisons",
          "m_probability": 0.4986943541186798,
          "u_probability": 0.9161138405014949
        }
      ],
      "comparison_description": "Exact match vs. anything else"
    },
    {
      "output_column_name": "Year",
      "comparison_levels": [
        {
          "sql_condition": "\"Year_l\" IS NULL OR \"Year_r\" IS NULL",
          "label_for_charts": "Null",
          "is_null_level": true
        },
        {
          "sql_condition": "\"Year_l\" = \"Year_r\"",
          "label_for_charts": "Exact match",
          "m_probability": 0.9571810981114466,
          "u_probability": 0.019595044260833656,
          "tf_adjustment_column": "Year",
          "tf_adjustment_weight": 1.0
        },
        {
          "sql_condition": "ELSE",
          "label_for_charts": "All other comparisons",
          "m_probability": 0.042818901888553355,
          "u_probability": 0.9804049557391663
        }
      ],
      "comparison_description": "Exact match vs. anything else"
    }
  ],
  "additional_columns_to_retain": ["company_number"],
  "sql_dialect": "duckdb",
  "linker_uid": "heycobsb",
  "probability_two_random_records_match": 0.0001
}