From ecc856bfbb2b93a5af27add912a2bddb7673c86d Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Wed, 14 Aug 2024 12:58:38 -0700 Subject: [PATCH] Add class-level docs to bloom filter (and builder) and include the serialization format in the impl file --- filters/include/bloom_filter.hpp | 33 +++++++++++++++++++++++++++ filters/include/bloom_filter_impl.hpp | 23 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index e73170d0..71b170e0 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -37,6 +37,14 @@ template class bloom_filter_builder_alloc; using bloom_filter = bloom_filter_alloc>; using bloom_filter_builder = bloom_filter_builder_alloc>; +/** + *

This class provides methods to help estimate the correct parameters when + * creating a Bloom filter, and methods to create the filter using those values.

+ * + *

The underlying math is described in the + * + * Wikipedia article on Bloom filters.

+ */ template> class bloom_filter_builder_alloc { using A = Allocator; @@ -149,6 +157,31 @@ class bloom_filter_builder_alloc { static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob); }; +/** + *

A Bloom filter is a data structure that can be used for probabilistic + * set membership.

+ * + *

When querying a Bloom filter, there are no false negatives. Specifically: + * When querying an item that has already been inserted into the filter, the filter will + * always indicate that the item is present. There is a chance of false positives, where + * querying an item that has never been presented to the filter will indicate that the + * item has already been seen. Consequently, any query should be interpreted as + * "might have seen."

+ * + *

A standard Bloom filter is unlike typical sketches in that it is not sub-linear + * in size and does not resize itself. A Bloom filter will work up to a target number of + * distinct items, beyond which it will saturate and the false positive rate will start to + * increase. The size of a Bloom filter will be linear in the expected number of + * distinct items.

+ * + *

See the bloom_filter_builder_alloc class for methods to create a filter, especially + * one sized correctly for a target number of distinct elements and a target + * false positive probability.

+ * + *

This implementation uses xxHash64 and follows the approach in Kirsch and Mitzenmacher, + * "Less Hashing, Same Performance: Building a Better Bloom Filter," Random Structures & Algorithms 33(2), Wiley Interscience, 2008, pp. 187-218.

+ */ + template> class bloom_filter_alloc { using A = Allocator; diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index fab4be7b..b8c47a9b 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -244,6 +244,29 @@ bloom_filter_alloc bloom_filter_alloc::deserialize(const void* bytes, size return internal_deserialize_or_wrap(const_cast(bytes), length_bytes, false, false, allocator); } +/* + * A Bloom Filter's serialized image always uses 3 longs of preamble when empty, + * otherwise 4 longs: + * + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |
+ *  0   || Preamble_Longs | SerVer | FamID  |  Flags |----Num Hashes---|-----Unused------|
+ *
+ *      ||       8        |    9   |   10   |   11   |   12   |   13   |   14   |   15   |
+ *  1   ||---------------------------------Hash Seed-------------------------------------|
+ *
+ *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22   |   23   |
+ *  2   ||-------BitArray Length (in longs)----------|-----------Unused------------------|
+ *
+ *      ||      24        |   25   |   26   |   27   |   28   |   29   |   30   |   31   |
+ *  3   ||---------------------------------NumBitsSet------------------------------------|
+ *  
+ * + * The raw BitArray bits, if non-empty, start at byte 32. + */ + template bloom_filter_alloc
bloom_filter_alloc::deserialize(std::istream& is, const A& allocator) { const uint8_t prelongs = read(is);