From a211236329e9fd97cf8c7172a357e55fe7e14292 Mon Sep 17 00:00:00 2001 From: Christoph Viebig Date: Thu, 2 Jul 2026 17:43:51 +0200 Subject: [PATCH] Avoid allocation in count_min_sketch::get_hashes --- count/include/count_min.hpp | 9 +++++---- count/include/count_min_impl.hpp | 26 +++++++++++--------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/count/include/count_min.hpp b/count/include/count_min.hpp index 1fbba0e6..5d45ee20 100644 --- a/count/include/count_min.hpp +++ b/count/include/count_min.hpp @@ -389,12 +389,13 @@ class count_min_sketch{ static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte); /* - * Obtain the hash values when inserting an item into the sketch. - * @param item pointer to the data item to be inserted into the sketch. + * Compute the hash locations for an input item + * @param item pointer to the data item to be inserted into or queried from the sketch. * @param size of the data in bytes - * @return vector of uint64_t which each represent the index to which `value' must update in the sketch + * @param callback function to invoke for each sketch array location */ - std::vector get_hashes(const void* item, size_t size) const; + template + void foreach_hash_location(const void* item, size_t size, F callback) const; }; diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp index 2f2629fc..528fb619 100644 --- a/count/include/count_min_impl.hpp +++ b/count/include/count_min_impl.hpp @@ -110,9 +110,10 @@ uint8_t count_min_sketch::suggest_num_hashes(double confidence) { } template -std::vector count_min_sketch::get_hashes(const void* item, size_t size) const { +template +void count_min_sketch::foreach_hash_location(const void* item, size_t size, F callback) const { /* - * Returns the hash locations for the input item using the original hashing + * Computes the hash locations for the input item using the original hashing * scheme from [1]. * Generate _num_hashes separate hashes from calls to murmurmhash. * This could be optimized by keeping both of the 64bit parts of the hash @@ -126,8 +127,6 @@ std::vector count_min_sketch::get_hashes(const void* item, size_t * https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf */ uint64_t bucket_index; - std::vector sketch_update_locations; - sketch_update_locations.reserve(_num_hashes); uint64_t hash_seed_index = 0; for (const auto &it: hash_seeds) { @@ -135,10 +134,9 @@ std::vector count_min_sketch::get_hashes(const void* item, size_t MurmurHash3_x64_128(item, size, it, hashes); // ? BEWARE OVERFLOW. uint64_t hash = hashes.h1; bucket_index = hash % _num_buckets; - sketch_update_locations.push_back((hash_seed_index * _num_buckets) + bucket_index); + callback((hash_seed_index * _num_buckets) + bucket_index); hash_seed_index += 1; } - return sketch_update_locations; } template @@ -158,12 +156,11 @@ W count_min_sketch::get_estimate(const void* item, size_t size) const { /* * Returns the estimated frequency of the item */ - std::vector hash_locations = get_hashes(item, size); - std::vector estimates; - for (const auto h: hash_locations) { - estimates.push_back(_sketch_array[h]); - } - return *std::min_element(estimates.begin(), estimates.end()); + W estimate = std::numeric_limits::max(); + foreach_hash_location(item, size, [this, &estimate](uint64_t h) { + estimate = std::min(estimate, _sketch_array[h]); + }); + return estimate; } template @@ -189,10 +186,9 @@ void count_min_sketch::update(const void* item, size_t size, W weight) { * locations by the weight. */ _total_weight += weight >= 0 ? weight : -weight; - std::vector hash_locations = get_hashes(item, size); - for (const auto h: hash_locations) { + foreach_hash_location(item, size, [this, weight](uint64_t h) { _sketch_array[h] += weight; - } + }); } template