diff --git a/tuple/include/tuple_filter.hpp b/tuple/include/tuple_filter.hpp new file mode 100644 index 00000000..0cf396ff --- /dev/null +++ b/tuple/include/tuple_filter.hpp @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TUPLE_FILTER_HPP_ +#define TUPLE_FILTER_HPP_ + +#include + +namespace datasketches { + +/* tuple_filter: holds an allocator and offers compute(), which takes a sketch plus a predicate and returns a CompactSketch. NOTE(review): the template parameter lists in this copy of the patch look truncated - confirm against the real header. */template> +class tuple_filter { +public: + /* Type aliases used by compute(): Entry is the pair type handed to the filtering lambda, CompactSketch is the return type, AllocEntry is Allocator rebound through std::allocator_traits. */using Entry = std::pair; + using CompactSketch = compact_tuple_sketch; + using AllocEntry = typename std::allocator_traits::template rebind_alloc; + + /* Stores the given allocator in allocator_; when omitted, a default-constructed Allocator is used. */explicit tuple_filter(const Allocator& allocator = Allocator()); + + /* Returns a CompactSketch built from the entries of the input sketch for which predicate(entry.second) is true. Neither the filter nor the input sketch is modified (const method, const reference). */template + CompactSketch compute(const Sketch& sketch, const Predicate& predicate) const; + +private: + /* allocator supplied at construction; passed to the entries vector created in compute() */Allocator allocator_; +}; + +} + +#include "tuple_filter_impl.hpp" + +#endif diff --git a/tuple/include/tuple_filter_impl.hpp b/tuple/include/tuple_filter_impl.hpp new file mode 100644 index 00000000..95698e90 --- /dev/null +++ b/tuple/include/tuple_filter_impl.hpp @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TUPLE_FILTER_IMPL_HPP_ +#define TUPLE_FILTER_IMPL_HPP_ + +namespace datasketches { + +/* Constructor: copies the supplied allocator into allocator_. */template +tuple_filter::tuple_filter(const A& allocator): +allocator_(allocator) +{} + +/* compute(): copies into a fresh vector every entry of the input sketch for which predicate(entry.second) returns true, then constructs a CompactSketch from that vector. */template +template +auto tuple_filter::compute(const Sketch& sketch, const Predicate& predicate) const -> CompactSketch { + std::vector entries(allocator_); + /* pre-size to the retained-entry count of the input sketch */entries.reserve(sketch.get_num_retained()); + std::copy_if( + sketch.begin(), + sketch.end(), + std::back_inserter(entries), + [&predicate](const Entry& e) {return predicate(e.second);} + ); + /* ask the vector to give back capacity reserved for entries that were filtered out */entries.shrink_to_fit(); + /* The result is flagged empty only when the input is not in estimation mode and no entry passed the filter; the ordered flag, seed hash and theta are taken unchanged from the input sketch. */return CompactSketch( + !sketch.is_estimation_mode() && entries.empty(), + sketch.is_ordered(), + sketch.get_seed_hash(), + sketch.get_theta64(), + std::move(entries) + ); +} + +} + +#endif diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp index 383e5736..e4a40c79 100644 --- a/tuple/include/tuple_sketch.hpp +++ b/tuple/include/tuple_sketch.hpp @@ -578,6 +578,7 @@ class compact_tuple_sketch: public tuple_sketch { template friend class theta_union_base; template friend class theta_intersection_base; template friend class theta_set_difference_base; + template friend class tuple_filter; compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector&& entries); }; diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt index 4ca6a503..5e1a3ab6 100644 --- 
a/tuple/test/CMakeLists.txt +++ b/tuple/test/CMakeLists.txt @@ -43,6 +43,7 @@ target_sources(tuple_test tuple_intersection_test.cpp tuple_a_not_b_test.cpp tuple_jaccard_similarity_test.cpp + tuple_filter_test.cpp array_of_doubles_sketch_test.cpp engagement_test.cpp ) diff --git a/tuple/test/tuple_filter_test.cpp b/tuple/test/tuple_filter_test.cpp new file mode 100644 index 00000000..933a776c --- /dev/null +++ b/tuple/test/tuple_filter_test.cpp @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +namespace datasketches { + +/* Exercises tuple_filter::compute on both the update sketch and its compact() form in three states: empty, exact mode and estimation mode. Each case checks is_empty, is_ordered, and the retained count; the non-empty cases also check is_estimation_mode. */TEST_CASE("test", "[tuple_filter]") { + auto usk = update_tuple_sketch::builder().build(); + tuple_filter f; + + { // empty update sketch + auto sk = f.compute(usk, [](int){return true;}); + REQUIRE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.get_num_retained() == 0); + } + + { // empty compact sketch + auto sk = f.compute(usk.compact(), [](int){return true;}); + REQUIRE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.get_num_retained() == 0); + } + + /* keys 1 and 2 are updated twice and key 3 once, always with value 1; the assertions below expect exactly two entries with v > 1 (presumably the sketch's update policy sums values per key - TODO confirm) */usk.update(1, 1); + usk.update(1, 1); + usk.update(2, 1); + usk.update(2, 1); + usk.update(3, 1); + + { // exact mode update sketch + auto sk = f.compute(usk, [](int v){return v > 1;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE_FALSE(sk.is_ordered()); + REQUIRE_FALSE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + { // exact mode compact sketch + auto sk = f.compute(usk.compact(), [](int v){return v > 1;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE_FALSE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + // only keys 1 and 2 had values of 2, which will become 3 after this update + // some entries are discarded in estimation mode, but these happen to survive + // the process is deterministic, so the test will always work + for (int i = 0; i < 10000; ++i) usk.update(i, 1); + + { // estimation mode update sketch + auto sk = f.compute(usk, [](int v){return v > 2;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE_FALSE(sk.is_ordered()); + REQUIRE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + { // estimation mode compact sketch + auto sk = f.compute(usk.compact(), [](int v){return v > 2;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } +} + +} /* namespace datasketches */