/**
* Implementation of ngram-based fuzzy searching, at scale. Both index creation and searching
* are supported.
*
* This module contains some top-level exported attributes, primarily data types and record
* definitions, to make it easier for external code to interface with the data created here.
* A shared module contains the implementation, and two exported modules (Build and Search)
* act as the interface. The exported functions are:
*
* Build('<fileScope>').CreateFiles()
* Search('<fileScope>').SearchMany()
* Search('<fileScope>').SearchOne()
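*
* (CreateFiles takes the entity dataset plus an optional ngram length; the Search functions
* take the search input plus a minimum Jaccard similarity threshold in the range 0..1.)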
*
* <fileScope> refers to an HPCC logical file scope. Files created by this module will be
* created there, and the search functions will expect those same files to be available.
*
* The term "entity" denotes "the thing being indexed/searched." In reality, you can
* index/search anything that is a UTF-8 string, and an entity ID is really just a GUID.
* The string itself can be anything: a simple value, a concatenation of other data, etc.
* As long as you normalize and construct the string the same way for both building and
* searching, It Just Works; just use the EntityLayout record structure in both cases.
*
* The only build parameter for "tuning" is the ngram size. The system defaults to a
* value of 2, but 3 is often used as well. YMMV, and you should test.
*
* A complete example of both build and search is in a comment block at the end of this file.
*
* Origin: https://github.com/hpccsystems-solutions-lab/Useful_ECL
*/
EXPORT FuzzyNGramSearch := MODULE
IMPORT Std;
//--------------------------------------------------------------------
SHARED DEFAULT_NGRAM_LENGTH := 2;
//--------------------------------------------------------------------
EXPORT EntityID_t := UNSIGNED8;
EXPORT VocabID_t := UNSIGNED2;
EXPORT EntityLayout := RECORD
EntityID_t id; // Entity GUID
UTF8 s; // Data associated with entity
END;
EXPORT NGramLayout := RECORD
UTF8 ngram;
END;
EXPORT EntityNGramLayout := RECORD
EntityID_t id;
NGramLayout;
END;
EXPORT VocabLayout := RECORD
VocabID_t pos; // 1-based position of the ngram within the vocabulary
NGramLayout;
END;
EXPORT NGramLookupLayout := RECORD
VocabID_t lookup_id; // One of the entity's ngram positions; used as the lookup key
EntityID_t id; // Entity GUID
SET OF VocabID_t ngram_pos_set; // All of the entity's ngram positions, sorted ascending
END;
EXPORT SearchResultLayout := RECORD
EntityID_t search_id; // ID of the search entity
EntityID_t entity_id; // ID of the matching corpus entity
REAL8 similarity; // Jaccard similarity, in the range [0, 1]
END;
//====================================================================
SHARED Util(STRING fileScope) := MODULE
EXPORT FS := MODULE
SHARED fsPrefix := Std.Str.RemoveSuffix(fileScope, '::');
EXPORT VOCABULARY_FILENAME := fsPrefix + '::vocabulary';
EXPORT NGRAM_LOOKUP_FILENAME := fsPrefix + '::ngram_lookup';
EXPORT vocabDS := DATASET(VOCABULARY_FILENAME, VocabLayout, FLAT);
EXPORT corpusNGramsDS := DATASET(NGRAM_LOOKUP_FILENAME, NGramLookupLayout, FLAT);
END;
//--------------------------------------------------------------------
EXPORT DATASET(NGramLayout) MakeNGrams(CONST UTF8 s, UNSIGNED1 ngram_length = DEFAULT_NGRAM_LENGTH) := EMBED(C++)
#option pure
#include <set>
#include <string>
#include <utility>
#include <vector>
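// UTF-8 helpers: determine how many bytes encode the character beginning at a given offset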
inline size_t countTrailingBytes(byte value)
{
if (value < 0xc0) return 0;
if (value < 0xe0) return 1;
if (value < 0xf0) return 2;
if (value < 0xf8) return 3;
if (value < 0xfc) return 4;
return 5;
}
inline size_t bytesForChar(byte ch)
{
size_t trailingByteCount = countTrailingBytes(ch);
if (trailingByteCount > 4)
return 0;
return trailingByteCount + 1;
}
size_t byteCountForChar(const char* inputString, size_t inputStringSize, size_t currentPos)
{
size_t byteCount = bytesForChar(inputString[currentPos]);
if (byteCount == 0 || (currentPos + byteCount > inputStringSize))
{
// Error condition
rtlFail(-1, "Invalid UTF-8 encoding");
}
return byteCount;
}
#body
std::set<std::string> ngramSet;
if (lenS >= ngram_length && ngram_length > 0)
{
size_t sSize = rtlUtf8Size(lenS, s);
size_t currentPos = 0;
std::vector<std::pair<size_t, size_t>> byteSizes;
std::string ngramBuffer;
// Precompute bytes used for each character
byteSizes.reserve(lenS);
for (size_t x = 0; x < lenS; x++)
{
size_t numBytesToCopy = byteCountForChar(s, sSize, currentPos);
byteSizes.push_back(std::make_pair(currentPos, numBytesToCopy));
currentPos += numBytesToCopy;
}
// Collect unique ngrams
for (size_t x = 0; x < (lenS - ngram_length + 1); x++)
{
size_t numBytesToCopy = 0;
currentPos = byteSizes[x].first;
for (size_t y = 0; y < ngram_length; y++)
numBytesToCopy += byteSizes[x + y].second;
ngramBuffer.assign(s + currentPos, numBytesToCopy);
ngramSet.insert(ngramBuffer);
}
}
// Compute result buffer size and allocate
__lenResult = 0;
for (auto& n : ngramSet)
{
// stringLen + stringData
__lenResult += (sizeof(size32_t) + n.size());
}
if (__lenResult > 0)
{
__result = rtlMalloc(__lenResult);
byte* outPtr = static_cast<byte*>(__result);
size32_t stringLen = ngram_length;
for (auto& n : ngramSet)
{
memcpy_iflen(outPtr, &stringLen, sizeof(stringLen));
outPtr += sizeof(stringLen);
memcpy_iflen(outPtr, n.data(), n.size());
outPtr += n.size();
}
}
else
{
__result = nullptr;
}
ENDEMBED;
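// Illustrative example: MakeNGrams(u8'CAMPER', 2) returns the unique bigrams
// 'AM', 'CA', 'ER', 'MP', 'PE' (deduplicated and ordered by the std::set above)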
//--------------------------------------------------------------------
SHARED CreateEntityNGrams(DATASET(EntityLayout) entities, UNSIGNED1 ngramLength) := FUNCTION
entityNGrams := NORMALIZE
(
entities,
MakeNGrams(LEFT.s, ngramLength),
TRANSFORM
(
{
EntityID_t id,
UTF8 ngram
},
SELF.id := LEFT.id,
SELF.ngram := RIGHT.ngram
)
);
RETURN entityNGrams;
END;
//--------------------------------------------------------------------
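// Builds the global ngram vocabulary: one record per distinct ngram found across all
// entities, with a 1-based position assigned to each via PROJECT/COUNTER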
EXPORT CreateVocabulary(DATASET(EntityLayout) entities, UNSIGNED1 ngramLength) := FUNCTION
rawNGrams := CreateEntityNGrams(entities, ngramLength);
// Deduplicate only the ngrams
vocab0 := TABLE(rawNGrams, {ngram}, ngram, MERGE);
vocab := PROJECT
(
vocab0,
TRANSFORM
(
VocabLayout,
SELF.pos := COUNTER,
SELF := LEFT
)
);
RETURN vocab;
END;
//--------------------------------------------------------------------
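// For each entity: map its ngrams to vocabulary positions, gather those positions into a
// sorted set, then emit one record per position (as lookup_id) carrying the full set, so
// any single shared ngram is enough to surface the entity as a candidate during a search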
EXPORT CreateNGramLookups(DATASET(EntityLayout) entities, DATASET(VocabLayout) vocabulary, UNSIGNED1 ngramLength) := FUNCTION
// Convert entity names into ngrams, keeping the ID associated with each
entityNGrams := CreateEntityNGrams(entities, ngramLength);
// Convert found ngrams into their positions
vocabMatches := JOIN
(
entityNGrams,
vocabulary,
LEFT.ngram = RIGHT.ngram,
TRANSFORM
(
{
EntityID_t id,
VocabID_t ngram_pos
},
SELF.id := LEFT.id,
SELF.ngram_pos := RIGHT.pos
),
LOOKUP
);
distVocabMatches := DISTRIBUTE(vocabMatches, HASH64(id));
// Convert the positions to a sorted set
ngramLookups0 := PROJECT
(
distVocabMatches,
TRANSFORM
(
{
EntityID_t id,
SET OF VocabID_t ngram_pos_set,
LEFT.ngram_pos
},
SELF.id := LEFT.id,
SELF.ngram_pos_set := [LEFT.ngram_pos],
SELF := LEFT
)
);
ngramLookups1 := ROLLUP
(
SORT(ngramLookups0, id, ngram_pos, LOCAL),
TRANSFORM
(
RECORDOF(LEFT),
SELF.ngram_pos_set := LEFT.ngram_pos_set + RIGHT.ngram_pos_set,
SELF := LEFT
),
id,
LOCAL
);
// Extract each ngram position; this becomes our primary key to each record
ngramLookups := NORMALIZE
(
ngramLookups1,
COUNT(LEFT.ngram_pos_set),
TRANSFORM
(
NGramLookupLayout,
SELF.lookup_id := LEFT.ngram_pos_set[COUNTER],
SELF := LEFT
)
);
RETURN ngramLookups;
END;
//--------------------------------------------------------------------
// Assumes that SET values are sorted ascending
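// Jaccard similarity: J(A, B) = |A ∩ B| / |A ∪ B|; for example, [1, 2, 3] vs [2, 3, 4]
// shares 2 of 4 distinct values, giving 0.5. The two sorted sets are merged in a single
// pass below, counting the intersection and union without materializing either.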
EXPORT REAL8 JaccardSimilarity(SET OF VocabID_t set1, SET OF VocabID_t set2) := EMBED(C++)
#option pure;
#body
const uint16_t * numSet1 = static_cast<const uint16_t *>(set1);
unsigned long numElements1 = lenSet1 / sizeof(uint16_t);
unsigned long pos1 = 0;
const uint16_t * numSet2 = static_cast<const uint16_t *>(set2);
unsigned long numElements2 = lenSet2 / sizeof(uint16_t);
unsigned long pos2 = 0;
unsigned long intersectionCount = 0;
unsigned long unionCount = 0;
while (pos1 < numElements1 || pos2 < numElements2)
{
if (pos1 < numElements1 && pos2 < numElements2)
{
++unionCount;
if (numSet1[pos1] == numSet2[pos2])
{
++intersectionCount;
++pos1;
++pos2;
}
else if (numSet1[pos1] < numSet2[pos2])
{
++pos1;
}
else
{
++pos2;
}
}
else if (pos1 < numElements1)
{
unionCount += (numElements1 - pos1);
break;
}
else
{
unionCount += (numElements2 - pos2);
break;
}
}
return static_cast<double>(intersectionCount) / static_cast<double>(unionCount);
ENDEMBED;
END; // Module Util
//====================================================================
EXPORT Build(STRING fileScope) := MODULE
SHARED UtilMod := Util(fileScope);
SHARED FSMod := UtilMod.FS;
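// Writes two logical files under fileScope: the ngram vocabulary and the per-entity
// ngram-position lookup records; both are required by the Search module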
EXPORT CreateFiles(DATASET(EntityLayout) entities, UNSIGNED1 ngramLength = DEFAULT_NGRAM_LENGTH) := FUNCTION
// Distribute the corpus for efficiency
distEntities := DISTRIBUTE(entities, SKEW(0.02));
// Create vocabulary
vocab := UtilMod.CreateVocabulary(distEntities, ngramLength);
createVocabFileAction := OUTPUT(vocab, {vocab}, FSMod.VOCABULARY_FILENAME, COMPRESSED, OVERWRITE);
// Create dense signatures
corpusNGrams := UtilMod.CreateNGramLookups(distEntities, vocab, ngramLength);
createSignaturesFileAction := OUTPUT(corpusNGrams, {corpusNGrams}, FSMod.NGRAM_LOOKUP_FILENAME, COMPRESSED, OVERWRITE);
buildAllAction := PARALLEL
(
createVocabFileAction,
createSignaturesFileAction
);
RETURN buildAllAction;
END;
END; // Module Build
//====================================================================
EXPORT Search(STRING fileScope) := MODULE
SHARED UtilMod := Util(fileScope);
SHARED FSMod := UtilMod.FS;
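// SearchMany generates candidates by joining on any shared ngram position (lookup_id), then
// computes exact Jaccard similarity over the full position sets and keeps only results at or
// above minSimilarity. The ngram length is inferred from the stored vocabulary, so it is not
// a parameter here.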
EXPORT SearchMany(DATASET(EntityLayout) searchEntities, REAL8 minSimilarity) := FUNCTION
// Files we need
vocab := FSMod.vocabDS;
corpusNGrams := FSMod.corpusNGramsDS;
// Create dense signatures for the search entities
searchSigs := UtilMod.CreateNGramLookups(searchEntities, vocab, LENGTH(vocab[1].ngram));
// Find initial matches
initialMatches := JOIN
(
corpusNGrams,
searchSigs,
LEFT.lookup_id = RIGHT.lookup_id,
TRANSFORM
(
{
EntityID_t search_id,
EntityID_t entity_id,
SET OF VocabID_t search_ngram_pos_set,
SET OF VocabID_t entity_ngram_pos_set
},
SELF.entity_id := LEFT.id,
SELF.search_id := RIGHT.id,
SELF.entity_ngram_pos_set := LEFT.ngram_pos_set,
SELF.search_ngram_pos_set := RIGHT.ngram_pos_set
),
LOOKUP
);
// Dedup so as to avoid running similarity computation on identical pairs
dedupedMatches := TABLE
(
initialMatches,
{
search_id,
entity_id,
entity_ngram_pos_set,
search_ngram_pos_set
},
search_id, entity_id, entity_ngram_pos_set, search_ngram_pos_set,
MERGE
);
// Filter out dissimilar matches
matches := PROJECT
(
dedupedMatches,
TRANSFORM
(
SearchResultLayout,
sim := UtilMod.JaccardSimilarity(LEFT.entity_ngram_pos_set, LEFT.search_ngram_pos_set);
SELF.similarity := IF(sim >= minSimilarity, sim, SKIP),
SELF := LEFT
)
);
RETURN matches;
END;
//--------------------------------------------------------------------
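// Convenience wrapper for searching with a single string; results carry a search_id of 0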
EXPORT SearchOne(UTF8 searchString, REAL8 minSimilarity) := FUNCTION
RETURN SearchMany(DATASET([{0, searchString}], EntityLayout), minSimilarity);
END;
END; // Module Search
END; // Module FuzzyNGramSearch
/******* EXAMPLE CODE ***********************************************************************
IMPORT FuzzyNGramSearch;
NGRAM_SIZE := 2;
MIN_SIMILARITY := 0.10; // Artificially low to see results
// Make sure the above constants adhere to our setup
ASSERT(NGRAM_SIZE > 1, FAIL);
ASSERT(MIN_SIMILARITY >= 0, FAIL);
//--------------------------------
DO_BUILD := TRUE;
DO_SEARCH := TRUE;
FILE_SCOPE := '~FuzzyNGramSearch_test';
//--------------------------------
corpus0 := DATASET
(
[
{1001, u8'CAMPER'},
{1002, u8'AAAAAABAAAAAAA'},
{1003, u8'FUBAR'},
{1004, u8'Z'}
],
FuzzyNGramSearch.EntityLayout,
DISTRIBUTED
);
corpus := NOFOLD(corpus0);
buildAction := FuzzyNGramSearch.Build(FILE_SCOPE).CreateFiles(corpus, NGRAM_SIZE);
//--------------------------------
searchEntities0 := DATASET
(
[
{1, u8'CAMPER'},
{2, u8'DAN'},
{3, u8'PERSON'},
{4, u8'CAMPBELL'}
],
FuzzyNGramSearch.EntityLayout
);
searchEntities := NOFOLD(searchEntities0);
searchManyResults := FuzzyNGramSearch.Search(FILE_SCOPE).SearchMany(searchEntities, MIN_SIMILARITY);
searchOneResults := FuzzyNGramSearch.Search(FILE_SCOPE).SearchOne(NOFOLD(u8'FUBARSKY'), MIN_SIMILARITY);
//--------------------------------
SEQUENTIAL
(
IF(DO_BUILD, buildAction),
IF(DO_SEARCH,
PARALLEL
(
OUTPUT(searchManyResults, NAMED('search_many_results')),
OUTPUT(searchOneResults, NAMED('search_one_results'))
))
);
*/