Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions count/include/count_min.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,12 +389,13 @@ class count_min_sketch{
static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte);

/*
* Obtain the hash values when inserting an item into the sketch.
* @param item pointer to the data item to be inserted into the sketch.
* Compute the hash locations for an input item
* @param item pointer to the data item to be inserted into or queried from the sketch.
* @param size of the data in bytes
* @return vector of uint64_t which each represent the index to which `value' must update in the sketch
* @param callback function to invoke for each sketch array location
*/
std::vector<uint64_t> get_hashes(const void* item, size_t size) const;
template<typename F>
void foreach_hash_location(const void* item, size_t size, F callback) const;

};

Expand Down
26 changes: 11 additions & 15 deletions count/include/count_min_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,10 @@ uint8_t count_min_sketch<W,A>::suggest_num_hashes(double confidence) {
}

template<typename W, typename A>
std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t size) const {
template<typename F>
void count_min_sketch<W,A>::foreach_hash_location(const void* item, size_t size, F callback) const {
/*
* Returns the hash locations for the input item using the original hashing
* Computes the hash locations for the input item using the original hashing
* scheme from [1].
* Generate _num_hashes separate hashes from calls to murmurmhash.
* This could be optimized by keeping both of the 64bit parts of the hash
Expand All @@ -126,19 +127,16 @@ std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t
* https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
*/
uint64_t bucket_index;
std::vector<uint64_t> sketch_update_locations;
sketch_update_locations.reserve(_num_hashes);

uint64_t hash_seed_index = 0;
for (const auto &it: hash_seeds) {
HashState hashes;
MurmurHash3_x64_128(item, size, it, hashes); // ? BEWARE OVERFLOW.
uint64_t hash = hashes.h1;
bucket_index = hash % _num_buckets;
sketch_update_locations.push_back((hash_seed_index * _num_buckets) + bucket_index);
callback((hash_seed_index * _num_buckets) + bucket_index);
hash_seed_index += 1;
}
return sketch_update_locations;
}

template<typename W, typename A>
Expand All @@ -158,12 +156,11 @@ W count_min_sketch<W,A>::get_estimate(const void* item, size_t size) const {
/*
* Returns the estimated frequency of the item
*/
std::vector<uint64_t> hash_locations = get_hashes(item, size);
std::vector<W> estimates;
for (const auto h: hash_locations) {
estimates.push_back(_sketch_array[h]);
}
return *std::min_element(estimates.begin(), estimates.end());
W estimate = std::numeric_limits<W>::max();
foreach_hash_location(item, size, [this, &estimate](uint64_t h) {
estimate = std::min(estimate, _sketch_array[h]);
});
return estimate;
}

template<typename W, typename A>
Expand All @@ -189,10 +186,9 @@ void count_min_sketch<W,A>::update(const void* item, size_t size, W weight) {
* locations by the weight.
*/
_total_weight += weight >= 0 ? weight : -weight;
std::vector<uint64_t> hash_locations = get_hashes(item, size);
for (const auto h: hash_locations) {
foreach_hash_location(item, size, [this, weight](uint64_t h) {
_sketch_array[h] += weight;
}
});
}

template<typename W, typename A>
Expand Down
Loading