Added cuckoo stuff, uint64 became slower again.

This commit is contained in:
Davi Reis 2012-03-14 11:58:37 -03:00
parent a4d96e6cb2
commit 687cc1b194
4 changed files with 74 additions and 49 deletions

View File

@ -89,12 +89,10 @@ using namespace cxxmph;
int main(int argc, char** argv) { int main(int argc, char** argv) {
srandom(4); srandom(4);
/*
Benchmark::Register(new BM_CreateUrls<mph_map<StringPiece, StringPiece>>("URLS100k")); Benchmark::Register(new BM_CreateUrls<mph_map<StringPiece, StringPiece>>("URLS100k"));
Benchmark::Register(new BM_CreateUrls<unordered_map<StringPiece, StringPiece>>("URLS100k")); Benchmark::Register(new BM_CreateUrls<unordered_map<StringPiece, StringPiece>>("URLS100k"));
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0));
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0));
*/
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>); Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>);

View File

@ -2,6 +2,7 @@
#define __CXXMPH_MPH_BITS_H__ #define __CXXMPH_MPH_BITS_H__
#include <stdint.h> // for uint32_t and friends #include <stdint.h> // for uint32_t and friends
#include <climits>
namespace cxxmph { namespace cxxmph {
@ -12,6 +13,12 @@ static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
} }
// Rounds k up to the nearest power of two. A k that is already a power of two
// is returned unchanged, and 0 maps to 1.
// The result wraps around to 0 when k is greater than 2^31, because the next
// power of two (2^32) does not fit in a uint32_t.
static uint32_t nextpoweroftwo(uint32_t k) {
  if (k == 0) return 1;
  // Work on k - 1 so that exact powers of two map to themselves.
  k--;
  // Smear the highest set bit into every lower position (shifts of 1, 2, 4, 8
  // and 16). The counter is unsigned so it matches the sizeof-based bound and
  // no signed/unsigned comparison takes place.
  for (uint32_t shift = 1; shift < sizeof(uint32_t) * CHAR_BIT; shift <<= 1) {
    k |= k >> shift;
  }
  // Every bit below the top set bit is now one, so adding one carries into
  // the next power of two.
  return k + 1;
}
} // namespace cxxmph } // namespace cxxmph

View File

@ -68,8 +68,8 @@ class MPHIndex {
// Crazy functions. Ignore. // Crazy functions. Ignore.
template <class SeededHashFcn> // must agree with Reset template <class SeededHashFcn> // must agree with Reset
uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const; uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const;
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn> // must agree with Reset
uint8_t cuckoo_nest(const Key& x, const uint32_t* h) const; uint8_t cuckoo_nest(const uint32_t* h) const;
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t cuckoo_nest_index(const Key& x, uint32_t* h) const; uint32_t cuckoo_nest_index(const Key& x, uint32_t* h) const;
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn, class Key> // must agree with Reset
@ -197,24 +197,28 @@ void MPHIndex::hash_vector(const Key& key, uint32_t* h) const {
SeededHashFcn().hash64(key, hash_seed_[0], h); SeededHashFcn().hash64(key, hash_seed_[0], h);
} }
template <class SeededHashFcn, class Key> template <class SeededHashFcn> // must agree with Reset
uint8_t MPHIndex::cuckoo_nest(const Key& key, const uint32_t* h) const { uint8_t MPHIndex::cuckoo_nest(const uint32_t* h) const {
uint32_t x[4]; uint32_t x[4];
if (!g_size_) return 0;
x[0] = (h[0] % r_) + nest_displacement_[0]; x[0] = (h[0] % r_) + nest_displacement_[0];
x[1] = (h[1] % r_) + nest_displacement_[1]; x[1] = (h[1] % r_) + nest_displacement_[1];
x[2] = (h[2] % r_) + nest_displacement_[2]; x[2] = (h[2] % r_) + nest_displacement_[2];
assert((x[0] >> 2) <g_size_);
assert((x[1] >> 2) <g_size_);
assert((x[2] >> 2) <g_size_);
return (get_2bit_value(g_, x[0]) + get_2bit_value(g_, x[1]) + get_2bit_value(g_, x[2])) % 3; return (get_2bit_value(g_, x[0]) + get_2bit_value(g_, x[1]) + get_2bit_value(g_, x[2])) % 3;
} }
template <class SeededHashFcn, class Key> template <class SeededHashFcn, class Key>
uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t MPHIndex::perfect_hash(const Key& key) const {
uint32_t h[4]; uint32_t h[4];
if (!g_size_) return 0;
SeededHashFcn().hash64(key, hash_seed_[0], h); SeededHashFcn().hash64(key, hash_seed_[0], h);
// for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
h[0] = (h[0] % r_) + nest_displacement_[0]; h[0] = (h[0] % r_) + nest_displacement_[0];
h[1] = (h[1] % r_) + nest_displacement_[1]; h[1] = (h[1] % r_) + nest_displacement_[1];
h[2] = (h[2] % r_) + nest_displacement_[2]; h[2] = (h[2] % r_) + nest_displacement_[2];
if (!g_size_) return 0;
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
assert((h[0] >> 2) <g_size_); assert((h[0] >> 2) <g_size_);
assert((h[1] >> 2) <g_size_); assert((h[1] >> 2) <g_size_);
@ -245,7 +249,7 @@ class SimpleMPHIndex : public MPHIndex {
uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); } uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); }
uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash<HashFcn>(key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash<HashFcn>(key); }
uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash<HashFcn>(key); } uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash<HashFcn>(key); }
uint8_t cuckoo_nest(const Key& key, const uint32_t* h) const { return MPHIndex::cuckoo_nest<HashFcn>(key, h); } uint8_t cuckoo_nest(const uint32_t* h) const { return MPHIndex::cuckoo_nest<HashFcn>(h); }
uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const { return MPHIndex::cuckoo_hash<HashFcn>(h, nest); } uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const { return MPHIndex::cuckoo_hash<HashFcn>(h, nest); }
void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector<HashFcn>(key, h); } void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector<HashFcn>(key, h); }
}; };

View File

@ -13,6 +13,7 @@
#include <iostream> #include <iostream>
#include <limits> #include <limits>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include <utility> // for std::pair #include <utility> // for std::pair
@ -101,13 +102,19 @@ class mph_map {
return hollow_const_iterator<std::vector<value_type>>(&values_, &present_, it); return hollow_const_iterator<std::vector<value_type>>(&values_, &present_, it);
} }
iterator slow_find(const key_type& k); iterator slow_find(const key_type& k, uint32_t perfect_hash);
const_iterator slow_find(const key_type& k) const; const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const;
static const uint8_t kNestCollision = 3; // biggest 2 bit value static const uint8_t kNestCollision = 3; // biggest 2 bit value
uint32_t nest_index(const key_type& k, uint32_t* h) const { void set_nest_value(const uint32_t* h, uint8_t value) {
index_.hash_vector(k, h); assert(get_nest_index(h) < nests_.size() * 4);
// Use a pivot to prevent branch in the fast path set_2bit_value(&(nests_[0]), get_nest_index(h), value);
return h[3] % (index_.perfect_hash_size() + 1); }
uint32_t get_nest_value(const uint32_t* h) const {
assert(get_nest_index(h) < nests_.size() * 4);
return get_2bit_value(&(nests_[0]), get_nest_index(h));
}
uint32_t get_nest_index(const uint32_t* h) const {
return h[3] & ((nests_.size() << 2) - 1);
} }
void pack(); void pack();
@ -144,10 +151,11 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
} }
values_.push_back(x); values_.push_back(x);
present_.push_back(true); present_.push_back(true);
nests_.resize(ceil(values_.size() / 2.0), std::numeric_limits<uint8_t>::max()); auto nests_size = nextpoweroftwo(ceil(values_.size() / 4.0) + 1)*10;
nests_.resize(nests_size, std::numeric_limits<uint8_t>::max());
uint32_t h[4]; uint32_t h[4];
auto index = nest_index(x.first, h); index_.hash_vector(x.first, h);
set_2bit_value(&(nests_[0]), index, kNestCollision); set_nest_value(h, kNestCollision);
++size_; ++size_;
slack_.insert(make_pair(x.first, values_.size() - 1)); slack_.insert(make_pair(x.first, values_.size() - 1));
if (should_pack) pack(); if (should_pack) pack();
@ -157,6 +165,7 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
MPH_MAP_METHOD_DECL(void_type, pack)() { MPH_MAP_METHOD_DECL(void_type, pack)() {
if (values_.empty()) return; if (values_.empty()) return;
assert(std::unordered_set<key_type>(make_iterator_first(begin()), make_iterator_first(end())).size() == size());
bool success = index_.Reset( bool success = index_.Reset(
make_iterator_first(begin()), make_iterator_first(begin()),
make_iterator_first(end()), size_); make_iterator_first(end()), size_);
@ -165,28 +174,33 @@ MPH_MAP_METHOD_DECL(void_type, pack)() {
new_values.reserve(new_values.size() * 2); new_values.reserve(new_values.size() * 2);
std::vector<bool> new_present(index_.perfect_hash_size(), false); std::vector<bool> new_present(index_.perfect_hash_size(), false);
new_present.reserve(new_present.size() * 2); new_present.reserve(new_present.size() * 2);
std::vector<uint8_t> new_nests(ceil(index_.perfect_hash_size() / 2.0), std::numeric_limits<uint8_t>::max()); auto new_nests_size = nextpoweroftwo(ceil(new_values.size() / 4.0) + 1)*10;
std::vector<uint8_t> new_nests(new_nests_size, std::numeric_limits<uint8_t>::max());
new_nests.reserve(new_nests.size() * 2); new_nests.reserve(new_nests.size() * 2);
vector<bool> used_nests(new_nests.size() * 2); nests_.swap(new_nests);
vector<bool> used_nests(nests_.size() * 4);
uint32_t collisions = 0;
for (iterator it = begin(), it_end = end(); it != it_end; ++it) { for (iterator it = begin(), it_end = end(); it != it_end; ++it) {
size_type id = index_.perfect_hash(it->first); size_type id = index_.perfect_hash(it->first);
assert(id < new_values.size()); assert(id < new_values.size());
new_values[id] = *it; new_values[id] = *it;
new_present[id] = true; new_present[id] = true;
uint32_t h[4]; uint32_t h[4];
uint32_t index = nest_index(it->first, h); index_.hash_vector(it->first, h);
if (used_nests[index]) { // fprintf(stderr, "Nest index: %d\n", get_nest_index(h));
set_2bit_value(&(new_nests[0]), index, kNestCollision); assert(used_nests.size() > get_nest_index(h));
} if (used_nests[get_nest_index(h)]) {
else { set_nest_value(h, kNestCollision);
set_2bit_value(&(new_nests[0]), index, index_.cuckoo_nest(it->first, h)); ++collisions;
assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, index_.cuckoo_nest(it->first, h))); } else {
used_nests[index] = true; set_nest_value(h, index_.cuckoo_nest(h));
assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, index_.cuckoo_nest(h)));
used_nests[get_nest_index(h)] = true;
} }
} }
fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size());
values_.swap(new_values); values_.swap(new_values);
present_.swap(new_present); present_.swap(new_present);
nests_.swap(new_nests);
slack_type().swap(slack_); slack_type().swap(slack_);
} }
@ -210,7 +224,8 @@ MPH_MAP_METHOD_DECL(void_type, clear)() {
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
present_[pos - begin] = false; present_[pos - begin] = false;
uint32_t h[4]; uint32_t h[4];
nests_[nest_index(pos->first, h)] = kNestCollision; index_.hash_vector(pos->first, &h);
nests_[get_nest_index(h)] = kNestCollision;
*pos = value_type(); *pos = value_type();
--size_; --size_;
} }
@ -222,19 +237,21 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) {
MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
uint32_t h[4]; uint32_t h[4];
auto nest = get_2bit_value(&(nests_[0]), nest_index(k, h)); index_.hash_vector(k, h);
if (nest != kNestCollision) { auto nest = get_nest_value(h);
auto vit = values_.begin() + h[nest]; if (__builtin_expect(nest != kNestCollision, 1)) {
auto vit = values_.begin() + index_.cuckoo_hash(h, nest);
if (equal_(k, vit->first)) return make_iterator(vit); if (equal_(k, vit->first)) return make_iterator(vit);
} }
return slow_find(k); nest = index_.cuckoo_nest(h);
assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest));
return slow_find(k, index_.cuckoo_hash(h, nest));
} }
MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k) const { MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const {
if (index_.perfect_hash_size()) { if (__builtin_expect(index_.perfect_hash_size(), 0)) {
auto id = index_.perfect_hash(k); if (__builtin_expect(present_[perfect_hash], true)) {
if (present_[id]) { auto vit = values_.begin() + perfect_hash;
auto vit = values_.begin() + id;
if (equal_(k, vit->first)) return make_iterator(vit); if (equal_(k, vit->first)) return make_iterator(vit);
} }
} }
@ -247,22 +264,21 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k) const {
MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
uint32_t h[4]; uint32_t h[4];
auto index = nest_index(k, h); index_.hash_vector(k, h);
assert(nests_.size()); auto nest = get_nest_value(h);
assert(nests_.size() > index / 2); if (__builtin_expect(nest != kNestCollision, 1)) {
auto nest = get_2bit_value(&(nests_[0]), index);
if (nest != kNestCollision) {
auto vit = values_.begin() + index_.cuckoo_hash(h, nest); auto vit = values_.begin() + index_.cuckoo_hash(h, nest);
if (equal_(k, vit->first)) return make_iterator(vit); if (equal_(k, vit->first)) return make_iterator(vit);
} }
return slow_find(k); nest = index_.cuckoo_nest(h);
// assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest));
return slow_find(k, index_.cuckoo_hash(h, nest));
} }
MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k) { MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) {
if (index_.perfect_hash_size()) { if (__builtin_expect(index_.perfect_hash_size(), 0)) {
auto id = index_.perfect_hash(k); if (__builtin_expect(present_[perfect_hash], true)) {
if (present_[id]) { auto vit = values_.begin() + perfect_hash;
auto vit = values_.begin() + id;
if (equal_(k, vit->first)) return make_iterator(vit); if (equal_(k, vit->first)) return make_iterator(vit);
} }
} }