diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 0534c68..f99a284 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,5 +1,5 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test +check_PROGRAMS = seeded_hash_test mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la @@ -27,9 +27,10 @@ cxxmph_LDADD = libcxxmph.la cxxmph_SOURCES = cxxmph.cc hollow_iterator_test_SOURCES = hollow_iterator_test.cc + seeded_hash_test_SOURCES = seeded_hash_test.cc +seeded_hash_test_LDADD = libcxxmph.la mph_bits_test_SOURCES = mph_bits_test.cc mph_bits_test_LDADD = libcxxmph.la -mph_bits_test_LDADD = libcxxmph.la diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 0a0b225..1708c06 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -8,6 +8,9 @@ using cxxmph::mph_map; using std::string; using std::unordered_map; +// Another reference benchmark: +// http://blog.aggregateknowledge.com/tag/bigmemory/ + namespace cxxmph { template diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index c9eaabb..a32f31f 100644 --- a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -63,6 +63,23 @@ static uint32_t nextpoweroftwo(uint32_t k) { return k+1; } +template struct bitcount { +enum { value = (n & mask ? 1:0) + bitcount> 1)>::value }; +}; +template struct bitcount { enum { value = 0 }; }; + +template +class CompileTimeRankTable { +public: +CompileTimeRankTable() : current(bitcount::value) { } +int operator[] (int i) { return *(¤t + size - i - 1); } +private: +unsigned char current; +CompileTimeRankTable next; +}; +template class CompileTimeRankTable { }; +typedef CompileTimeRankTable<256> Ranktable; + // Interesting bit tricks that might end up here: // http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord // Fast a % (k*2^t) diff --git a/cxxmph/mph_bits_test.cc b/cxxmph/mph_bits_test.cc index c1680e3..0105265 100644 --- a/cxxmph/mph_bits_test.cc +++ b/cxxmph/mph_bits_test.cc @@ -4,6 +4,8 @@ #include "mph_bits.h" using cxxmph::dynamic_2bitset; +using cxxmph::Ranktable; + int main(int argc, char** argv) { dynamic_2bitset small(256, true); for (int i = 0; i < small.size(); ++i) small.set(i, i % 4); @@ -52,6 +54,12 @@ int main(int argc, char** argv) { empty.clear(); dynamic_2bitset large(1000, true); empty.swap(large); + + Ranktable ranktable; + if (ranktable[0] != 0) exit(-1); + if (ranktable[1] != 1) exit(-1); + if (ranktable[2] != 1) exit(-1); + if (ranktable[255] != 8) exit(-1); } diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index ceb3109..dc7134e 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -112,10 +112,11 @@ class mph_map { std::vector values_; std::vector present_; SimpleMPHIndex::hash_function> index_; - // TODO(davi) optimize slack to hold 128 unique bits from hash64 as key - typedef unordered_map slack_type; + // TODO(davi) optimize slack to use hash from index rather than calculate its own + typedef unordered_map slack_type; slack_type slack_; size_type size_; + typename seeded_hash::hash_function hasher128_; mutable uint64_t fast_; mutable uint64_t fast_taken_; @@ -148,7 +149,9 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { values_.push_back(x); present_.push_back(true); ++size_; - slack_.insert(make_pair(x.first, values_.size() - 1)); + h128 h = hasher128_.hash128(x.first, 0); + if (slack_.find(h) != slack_.end()) should_pack = true; // unavoidable pack + else slack_.insert(std::make_pair(h, values_.size() - 1)); if (should_pack) pack(); it = find(x.first); slow_ = 0; @@ -218,7 +221,7 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfe } if (__builtin_expect(!slack_.empty(), 0)) { ++very_slow_; - auto sit = slack_.find(k); + auto sit = slack_.find(hasher128_.hash128(k, 0)); if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); } return end(); @@ -233,7 +236,7 @@ MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_has } if (__builtin_expect(!slack_.empty(), 0)) { ++very_slow_; - auto sit = slack_.find(k); + auto sit = slack_.find(hasher128_.hash128(k, 0)); if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); } return end(); diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index be0b39e..7332be4 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -4,6 +4,7 @@ #include // for uint32_t and friends #include +#include #include // for std::hash #include "MurmurHash3.h" @@ -17,13 +18,15 @@ uint64_t fmix ( uint64_t h ); namespace cxxmph { struct h128 { - uint32_t operator[](uint8_t i) const { return uint32[i]; } + const uint32_t& operator[](uint8_t i) const { return uint32[i]; } uint32_t& operator[](uint8_t i) { return uint32[i]; } - uint64_t* uint64ptr(bool second) { return reinterpret_cast(&uint32[static_cast(second) << 1]); } - uint64_t uint64(bool second) const { return *reinterpret_cast(&uint32[static_cast(second) << 1]); } - bool operator==(const h128 rhs) const { return uint64(0) == rhs.uint64(0) && uint64(1) == rhs.uint64(1); } + const uint64_t get64(bool second) const { return (static_cast(uint32[second << 1]) << 32) | uint32[1 + (second << 1)]; } + void set64(uint64_t v, bool second) { uint32[second << 1] = v >> 32; uint32[1+(second<<1)] = ((v << 32) >> 32); } + bool operator==(const h128 rhs) const { return memcmp(uint32, rhs.uint32, sizeof(uint32)) == 0; } uint32_t uint32[4]; + + struct hash32 { uint32_t operator()(const cxxmph::h128& h) const { return h[3]; } }; }; template @@ -83,8 +86,8 @@ struct Murmur3Fmix64bitsType { template h128 hash128(const Key& k) const { h128 h; - *h.uint64ptr(0) = fmix(k); - *h.uint64ptr(1) = fmix(h.uint64(0)); + h.set64(fmix(k), 0); + h.set64(fmix(h.get64(0)), 1); } }; @@ -131,8 +134,8 @@ struct seeded_hash_function { template h128 hash128(const Key& k, uint32_t seed) const { h128 h; - *h.uint64ptr(0) = fmix(k ^ seed); - *h.uint64ptr(1) = fmix(h.uint64(0)); + h.set64(fmix(k ^ seed), 0); + h.set64(fmix(h.get64(0)), 1); return h; } }; diff --git a/cxxmph/seeded_hash_test.cc b/cxxmph/seeded_hash_test.cc new file mode 100644 index 0000000..e2983b0 --- /dev/null +++ b/cxxmph/seeded_hash_test.cc @@ -0,0 +1,59 @@ +#include "seeded_hash.h" + +#include +#include +#include + +using std::cerr; +using std::endl; +using std::string; +using std::unordered_map; +using namespace cxxmph; + +int main(int argc, char** argv) { + auto hasher = seeded_hash_function(); + string key1("0"); + string key2("1"); + auto h1 = hasher.hash128(key1, 1); + auto h2 = hasher.hash128(key2, 1); + if (h1 == h2) { + fprintf(stderr, "unexpected murmur collision\n"); + exit(-1); + } + + unordered_map g; + for (int i = 0; i < 1000; ++i) g[i] = i; + for (int i = 0; i < 1000; ++i) if (g[i] != i) exit(-1); + + auto inthasher = seeded_hash_function>(); + unordered_map g2; + for (uint64_t i = 0; i < 1000; ++i) { + auto h = inthasher.hash128(i, 0); + if (g2.find(h) != g2.end()) { + std::cerr << "Incorrectly found " << i << std::endl; + exit(-1); + } + if (h128::hash32()(h) != h[3]) { + cerr << "Buggy hash method." << endl; + exit(-1); + } + auto h2 = inthasher.hash128(i, 0); + if (!(h == h2)) { + cerr << "h 64(0) " << h.get64(0) << " h 64(1) " << h.get64(1) << endl; + cerr << " h2 64(0) " << h2.get64(0) << " h2 64(1) " << h2.get64(1) << endl; + cerr << "Broken equality for h128" << endl; + exit(-1); + } + if (h128::hash32()(h) != h128::hash32()(h2)) { + cerr << "Inconsistent hash method." << endl; + exit(-1); + } + g2[h] = i; + if (g2.find(h) == g2.end()) { + std::cerr << "Incorrectly missed " << i << std::endl; + exit(-1); + } + } + + for (uint64_t i = 0; i < 1000; ++i) if (g2[inthasher.hash128(i, 0)] != i) exit(-1); +}