From fd0bc2ae439fd1381317343a80c3a8e7e9a86fd9 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Tue, 13 Mar 2012 19:34:24 -0300 Subject: [PATCH] Added Murmur3 support. --- cxxmph/MurmurHash2.h | 74 ---------------------------------- cxxmph/bm_common.h | 6 ++- cxxmph/bm_index.cc | 2 +- cxxmph/bm_map.cc | 4 +- cxxmph/mph_index.h | 5 ++- cxxmph/mph_index_test.cc | 3 +- cxxmph/mph_map.h | 3 +- cxxmph/seeded_hash.h | 85 ++++++++++++++++++++++++++++------------ 8 files changed, 74 insertions(+), 108 deletions(-) delete mode 100644 cxxmph/MurmurHash2.h diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h deleted file mode 100644 index 0d318a3..0000000 --- a/cxxmph/MurmurHash2.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef __CXXMPH_MURMUR_HASH2__ -#define __CXXMPH_MURMUR_HASH2__ - -//----------------------------------------------------------------------------- -// MurmurHash2, by Austin Appleby - -// Note - This code makes a few assumptions about how your machine behaves - - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 - -// And it has a few limitations - - -// 1. It will not work incrementally. -// 2. It will not produce the same results on little-endian and big-endian -// machines. - -namespace cxxmph { - -inline // not measured, for making compilation easier only -unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - - const unsigned int m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - unsigned int h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - unsigned int k = *(unsigned int *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -} // namespace cxxmph - -#endif // __CXXMPH_MURMUR_HASH2__ diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index aaf12b9..eed12df 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -6,14 +6,16 @@ #include #include #include // std::hash -#include "MurmurHash2.h" +#include "MurmurHash3.h" #include "benchmark.h" namespace std { template <> struct hash { uint32_t operator()(const cxxmph::StringPiece& k) const { - return cxxmph::MurmurHash2(k.data(), k.length(), 1); + uint32_t out; + MurmurHash3_x86_32(k.data(), k.length(), 1, &out); + return out; } }; } // namespace std diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 924231c..443178f 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -47,7 +47,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { for (auto it = random_.begin(); it != random_.end(); ++it) { auto idx = index_.index(*it); // Collision check to be fair with STL - if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; + // if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; } } protected: diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 5c0f7a4..25ba463 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -90,9 +90,9 @@ int main(int argc, char** argv) { Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index d2e4a01..46d8ebe 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -157,7 +157,8 @@ bool MPHIndex::Mapping( std::vector* edges, std::vector* queue) { TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { - uint32_t h[3]; + uint32_t h[4]; + // SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); uint32_t v0 = h[0] % r_; uint32_t v1 = h[1] % r_ + r_; @@ -200,7 +201,7 @@ uint32_t MPHIndex::index(const Key& key) const { // Simple wrapper around MPHIndex to simplify calling code. Please refer to the // MPHIndex class for documentation. -template >::hash_function> +template >::hash_function> class SimpleMPHIndex : public MPHIndex { public: template diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index 7a7d036..f2482b7 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -7,7 +7,7 @@ using std::string; using std::vector; -using cxxmph::SimpleMPHIndex; +using namespace cxxmph; int main(int argc, char** argv) { @@ -38,4 +38,3 @@ int main(int argc, char** argv) { SimpleMPHIndex other_mph_index; other_mph_index.deserialize(serialized); } - diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 7541c45..405a7f9 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -6,13 +6,14 @@ // This class is about 20% to 100% slower than unordered_map (or ext/hash_map) // and should not be used if performance is a concern. In fact, you should only // use it for educational purposes. +// +// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl #include #include #include #include // for std::pair -#include "MurmurHash2.h" #include "mph_index.h" namespace cxxmph { diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index d079a57..e204d36 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -6,9 +6,12 @@ #include #include // for std::hash -#include "MurmurHash2.h" +#include "MurmurHash3.h" #include "stringpiece.h" +// From murmur, only used naively to extend 32 bits functions to 64 bits. +uint32_t fmix ( uint32_t h ); + namespace cxxmph { template @@ -17,72 +20,106 @@ struct seeded_hash_function { uint32_t operator()(const Key& k, uint32_t seed) const { return HashFcn()(k) ^ seed; } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + for (int i = 0; i < 4; ++i) { + out[i] = HashFcn()(k) ^ seed; + seed = fmix(seed); + } + } }; -struct Murmur2 { +struct Murmur3 { template uint32_t operator()(const Key& k) const { - return MurmurHash2(reinterpret_cast(&k), sizeof(Key), 1 /* seed */); + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, out); } }; -struct Murmur2StringPiece { + +struct Murmur3StringPiece { template uint32_t operator()(const Key& k) const { StringPiece s(k); - return MurmurHash2(s.data(), s.length(), 1 /* seed */); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), 1 /* seed */, out); } }; template <> -struct seeded_hash_function { +struct seeded_hash_function { template uint32_t operator()(const Key& k, uint32_t seed) const { - return MurmurHash2(reinterpret_cast(&k), sizeof(Key), seed); + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), seed, &out); + return out; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), seed, out); } }; template <> -struct seeded_hash_function { +struct seeded_hash_function { template uint32_t operator()(const Key& k, uint32_t seed) const { StringPiece s(k); - return MurmurHash2(s.data(), s.length(), seed); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), seed, &out); + return out; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), seed, out); } }; template struct seeded_hash { typedef seeded_hash_function hash_function; }; -// Use Murmur2 instead for all types defined in std::hash, plus +// Use Murmur3 instead for all types defined in std::hash, plus // std::string which is commonly extended. template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; } // namespace cxxmph