From 9dcf0450f00fd5bbf12ab39f38565babb15546de Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 12 Mar 2012 01:43:06 -0300 Subject: [PATCH] Added Murmur3 support. Not necessarily faster. --- cxxmph/Makefile.am | 4 +- cxxmph/MurmurHash2.h | 74 ---------------------------------- cxxmph/bm_common.h | 6 ++- cxxmph/bm_index.cc | 2 +- cxxmph/bm_map.cc | 4 +- cxxmph/mph_index.h | 5 ++- cxxmph/mph_index_test.cc | 3 +- cxxmph/mph_map.h | 3 +- cxxmph/seeded_hash.h | 85 ++++++++++++++++++++++++++++------------ 9 files changed, 76 insertions(+), 110 deletions(-) delete mode 100644 cxxmph/MurmurHash2.h diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index cec2073..0de662f 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -3,10 +3,10 @@ check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc +libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h mph_map_test_LDADD = libcxxmph.la mph_map_test_SOURCES = mph_map_test.cc diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h deleted file mode 100644 index 0d318a3..0000000 --- a/cxxmph/MurmurHash2.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef __CXXMPH_MURMUR_HASH2__ -#define __CXXMPH_MURMUR_HASH2__ - -//----------------------------------------------------------------------------- -// MurmurHash2, by Austin Appleby - -// Note - This code makes a few assumptions about how your machine behaves - - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 - -// And it has a few limitations - - -// 1. It will not work incrementally. -// 2. It will not produce the same results on little-endian and big-endian -// machines. - -namespace cxxmph { - -inline // not measured, for making compilation easier only -unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - - const unsigned int m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - unsigned int h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - unsigned int k = *(unsigned int *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -} // namespace cxxmph - -#endif // __CXXMPH_MURMUR_HASH2__ diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index aaf12b9..eed12df 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -6,14 +6,16 @@ #include #include #include // std::hash -#include "MurmurHash2.h" +#include "MurmurHash3.h" #include "benchmark.h" namespace std { template <> struct hash { uint32_t operator()(const cxxmph::StringPiece& k) const { - return cxxmph::MurmurHash2(k.data(), k.length(), 1); + uint32_t out; + MurmurHash3_x86_32(k.data(), k.length(), 1, &out); + return out; } }; } // namespace std diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index d1cbc00..9345a11 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -47,7 +47,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { for (auto it = random_.begin(); it != random_.end(); ++it) { auto idx = index_.index(*it); // Collision check to be fair with STL - if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; + // if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; } } protected: diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index e381976..a90b7b2 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -91,9 +91,9 @@ int main(int argc, char** argv) { Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index ad5bc6e..7b54250 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -158,7 +158,8 @@ bool MPHIndex::Mapping( std::vector* edges, std::vector* queue) { TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { - uint32_t h[3]; + uint32_t h[4]; + // SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); uint32_t v0 = h[0] % r_; uint32_t v1 = h[1] % r_ + r_; @@ -201,7 +202,7 @@ uint32_t MPHIndex::index(const Key& key) const { // Simple wrapper around MPHIndex to simplify calling code. Please refer to the // MPHIndex class for documentation. -template >::hash_function> +template >::hash_function> class SimpleMPHIndex : public MPHIndex { public: template diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index 70e01bc..b4101df 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -7,7 +7,7 @@ using std::string; using std::vector; -using cxxmph::SimpleMPHIndex; +using namespace cxxmph; int main(int argc, char** argv) { @@ -38,4 +38,3 @@ int main(int argc, char** argv) { SimpleMPHIndex other_mph_index; other_mph_index.deserialize(serialized); } - diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index ac77a06..fa264c8 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -6,6 +6,8 @@ // This class is about 20% to 100% slower than unordered_map (or ext/hash_map) // and should not be used if performance is a concern. In fact, you should only // use it for educational purposes. +// +// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl #include #include @@ -13,7 +15,6 @@ #include #include // for std::pair -#include "MurmurHash2.h" #include "mph_index.h" #include "hollow_iterator.h" diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index d079a57..e204d36 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -6,9 +6,12 @@ #include #include // for std::hash -#include "MurmurHash2.h" +#include "MurmurHash3.h" #include "stringpiece.h" +// From murmur, only used naively to extend 32 bits functions to 64 bits. +uint32_t fmix ( uint32_t h ); + namespace cxxmph { template @@ -17,72 +20,106 @@ struct seeded_hash_function { uint32_t operator()(const Key& k, uint32_t seed) const { return HashFcn()(k) ^ seed; } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + for (int i = 0; i < 4; ++i) { + out[i] = HashFcn()(k) ^ seed; + seed = fmix(seed); + } + } }; -struct Murmur2 { +struct Murmur3 { template uint32_t operator()(const Key& k) const { - return MurmurHash2(reinterpret_cast(&k), sizeof(Key), 1 /* seed */); + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, out); } }; -struct Murmur2StringPiece { + +struct Murmur3StringPiece { template uint32_t operator()(const Key& k) const { StringPiece s(k); - return MurmurHash2(s.data(), s.length(), 1 /* seed */); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), 1 /* seed */, out); } }; template <> -struct seeded_hash_function { +struct seeded_hash_function { template uint32_t operator()(const Key& k, uint32_t seed) const { - return MurmurHash2(reinterpret_cast(&k), sizeof(Key), seed); + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), seed, &out); + return out; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), seed, out); } }; template <> -struct seeded_hash_function { +struct seeded_hash_function { template uint32_t operator()(const Key& k, uint32_t seed) const { StringPiece s(k); - return MurmurHash2(s.data(), s.length(), seed); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), seed, &out); + return out; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), seed, out); } }; template struct seeded_hash { typedef seeded_hash_function hash_function; }; -// Use Murmur2 instead for all types defined in std::hash, plus +// Use Murmur3 instead for all types defined in std::hash, plus // std::string which is commonly extended. template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; } // namespace cxxmph