Skip to content

Commit

Permalink
tuple filter
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexanderSaydakov committed May 10, 2024
1 parent 145ab0d commit 5826c0d
Show file tree
Hide file tree
Showing 5 changed files with 189 additions and 0 deletions.
47 changes: 47 additions & 0 deletions tuple/include/tuple_filter.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef TUPLE_FILTER_HPP_
#define TUPLE_FILTER_HPP_

#include <tuple_sketch.hpp>

namespace datasketches {

/**
 * Filter for tuple sketches.
 * Produces a compact tuple sketch holding only those entries of an input
 * sketch whose summary is accepted by a caller-supplied predicate.
 */
template<typename Summary, typename Allocator = std::allocator<Summary>>
class tuple_filter {
public:
  using CompactSketch = compact_tuple_sketch<Summary, Allocator>;
  using Entry = std::pair<uint64_t, Summary>;
  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;

  /**
   * Creates a filter.
   * @param allocator instance kept by the filter and used for the entries of the result
   */
  explicit tuple_filter(const Allocator& allocator = Allocator());

  /**
   * Applies the predicate to the summary of each entry of the given sketch.
   * @param sketch input sketch to iterate over
   * @param predicate callable taking a summary; an entry is kept when the result is true
   * @return compact tuple sketch made of the accepted entries
   */
  template<typename Sketch, typename Predicate>
  auto compute(const Sketch& sketch, const Predicate& predicate) const -> CompactSketch;

private:
  Allocator allocator_;
};

}

#include "tuple_filter_impl.hpp"

#endif
53 changes: 53 additions & 0 deletions tuple/include/tuple_filter_impl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef TUPLE_FILTER_IMPL_HPP_
#define TUPLE_FILTER_IMPL_HPP_

namespace datasketches {

// Keeps a copy of the supplied allocator in the filter.
template<typename Summary, typename Allocator>
tuple_filter<Summary, Allocator>::tuple_filter(const Allocator& allocator)
  : allocator_(allocator) {
}

// Copies the entries whose summary satisfies the predicate into a new compact sketch.
// The ordered flag, seed hash and theta are passed through from the input sketch.
template<typename S, typename A>
template<typename Sketch, typename Predicate>
auto tuple_filter<S, A>::compute(const Sketch& sketch, const Predicate& predicate) const -> CompactSketch {
  std::vector<Entry, AllocEntry> kept(allocator_);
  // the result can never hold more entries than the input retains
  kept.reserve(sketch.get_num_retained());
  for (const Entry& entry: sketch) {
    if (predicate(entry.second)) {
      kept.push_back(entry);
    }
  }
  // give back the capacity reserved for the rejected entries
  kept.shrink_to_fit();

  // the result is reported as empty only if nothing was kept and the input is not in estimation mode
  const bool is_empty = kept.empty() && !sketch.is_estimation_mode();
  const bool is_ordered = sketch.is_ordered();
  const auto seed_hash = sketch.get_seed_hash();
  const auto theta = sketch.get_theta64();
  return CompactSketch(is_empty, is_ordered, seed_hash, theta, std::move(kept));
}

}

#endif
1 change: 1 addition & 0 deletions tuple/include/tuple_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,7 @@ class compact_tuple_sketch: public tuple_sketch<Summary, Allocator> {
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
template<typename S, typename A> friend class tuple_filter;
compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);

};
Expand Down
1 change: 1 addition & 0 deletions tuple/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ target_sources(tuple_test
tuple_intersection_test.cpp
tuple_a_not_b_test.cpp
tuple_jaccard_similarity_test.cpp
tuple_filter_test.cpp
array_of_doubles_sketch_test.cpp
engagement_test.cpp
)
Expand Down
87 changes: 87 additions & 0 deletions tuple/test/tuple_filter_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <catch2/catch.hpp>
#include <tuple_filter.hpp>

namespace datasketches {

TEST_CASE("test", "[tuple_filter]") {
  // predicates applied to the int summaries
  const auto accept_all = [](int) { return true; };
  const auto greater_than_one = [](int value) { return value > 1; };
  const auto greater_than_two = [](int value) { return value > 2; };

  auto update_sketch = update_tuple_sketch<int>::builder().build();
  tuple_filter<int> filter;

  // no updates yet: filtering the update sketch directly
  {
    const auto result = filter.compute(update_sketch, accept_all);
    REQUIRE(result.is_empty());
    REQUIRE(result.is_ordered());
    REQUIRE(result.get_num_retained() == 0);
  }

  // no updates yet: filtering its compact form
  {
    const auto result = filter.compute(update_sketch.compact(), accept_all);
    REQUIRE(result.is_empty());
    REQUIRE(result.is_ordered());
    REQUIRE(result.get_num_retained() == 0);
  }

  // keys 1 and 2 are updated twice, key 3 once
  update_sketch.update(1, 1);
  update_sketch.update(1, 1);
  update_sketch.update(2, 1);
  update_sketch.update(2, 1);
  update_sketch.update(3, 1);

  // exact mode: filtering the update sketch directly
  {
    const auto result = filter.compute(update_sketch, greater_than_one);
    REQUIRE_FALSE(result.is_empty());
    REQUIRE_FALSE(result.is_ordered());
    REQUIRE_FALSE(result.is_estimation_mode());
    REQUIRE(result.get_num_retained() == 2);
  }

  // exact mode: filtering its compact form
  {
    const auto result = filter.compute(update_sketch.compact(), greater_than_one);
    REQUIRE_FALSE(result.is_empty());
    REQUIRE(result.is_ordered());
    REQUIRE_FALSE(result.is_estimation_mode());
    REQUIRE(result.get_num_retained() == 2);
  }

  // Before this loop only keys 1 and 2 had a value of 2; the loop raises them to 3.
  // Estimation mode discards some entries, but these two happen to survive.
  // The process is deterministic, so the expectations below always hold.
  for (int key = 0; key < 10000; ++key) {
    update_sketch.update(key, 1);
  }

  // estimation mode: filtering the update sketch directly
  {
    const auto result = filter.compute(update_sketch, greater_than_two);
    REQUIRE_FALSE(result.is_empty());
    REQUIRE_FALSE(result.is_ordered());
    REQUIRE(result.is_estimation_mode());
    REQUIRE(result.get_num_retained() == 2);
  }

  // estimation mode: filtering its compact form
  {
    const auto result = filter.compute(update_sketch.compact(), greater_than_two);
    REQUIRE_FALSE(result.is_empty());
    REQUIRE(result.is_ordered());
    REQUIRE(result.is_estimation_mode());
    REQUIRE(result.get_num_retained() == 2);
  }
}

} /* namespace datasketches */

0 comments on commit 5826c0d

Please sign in to comment.