Added Murmur3 support.
This commit is contained in:
parent
bd9efab766
commit
fd0bc2ae43
|
@ -1,74 +0,0 @@
|
||||||
#ifndef __CXXMPH_MURMUR_HASH2__
|
|
||||||
#define __CXXMPH_MURMUR_HASH2__

#include <cstring>  // std::memcpy
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
|
||||||
// MurmurHash2, by Austin Appleby
|
|
||||||
|
|
||||||
// Note - This code makes a few assumptions about how your machine behaves -
|
|
||||||
|
|
||||||
// 1. We can read a 4-byte value from any address without crashing
|
|
||||||
// 2. sizeof(int) == 4
|
|
||||||
|
|
||||||
// And it has a few limitations -
|
|
||||||
|
|
||||||
// 1. It will not work incrementally.
|
|
||||||
// 2. It will not produce the same results on little-endian and big-endian
|
|
||||||
// machines.
|
|
||||||
|
|
||||||
namespace cxxmph {
|
|
||||||
|
|
||||||
// MurmurHash2 (by Austin Appleby): hashes `len` bytes starting at `key` into
// a 32-bit value.
//
// Parameters:
//   key  - first byte of the input; never dereferenced when len <= 0.
//   len  - number of bytes to hash.
//   seed - XORed with `len` to form the initial hash state.
//
// Returns the 32-bit hash. Input words are loaded in the machine's native
// byte order, so little-endian and big-endian hosts produce different values.
// The buffer does not need to be 4-byte aligned.
inline // not measured, for making compilation easier only
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
  // The algorithm consumes the input as 32-bit words.
  static_assert(sizeof(unsigned int) == 4,
                "MurmurHash2 requires a 32-bit unsigned int");

  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.
  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value
  unsigned int h = seed ^ len;

  // Mix 4 bytes at a time into the hash
  const unsigned char * data = static_cast<const unsigned char *>(key);

  while(len >= 4)
  {
    // Copy the word out instead of dereferencing a casted pointer: the cast
    // violates strict aliasing and faults on targets that trap on misaligned
    // loads. Compilers lower this memcpy to a single load where that is legal.
    unsigned int k;
    std::memcpy(&k, data, sizeof(k));

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array. Each case intentionally
  // continues into the next so that all remaining bytes are folded in.
  switch(len)
  {
  case 3:
    h ^= static_cast<unsigned int>(data[2]) << 16;
    // fall through
  case 2:
    h ^= static_cast<unsigned int>(data[1]) << 8;
    // fall through
  case 1:
    h ^= data[0];
    h *= m;
    break;
  default:
    break;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}
|
|
||||||
|
|
||||||
} // namespace cxxmph
|
|
||||||
|
|
||||||
#endif // __CXXMPH_MURMUR_HASH2__
|
|
|
@ -6,14 +6,16 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_map> // std::hash
|
#include <unordered_map> // std::hash
|
||||||
#include "MurmurHash2.h"
|
#include "MurmurHash3.h"
|
||||||
|
|
||||||
#include "benchmark.h"
|
#include "benchmark.h"
|
||||||
|
|
||||||
namespace std {
|
namespace std {
|
||||||
template <> struct hash<cxxmph::StringPiece> {
|
template <> struct hash<cxxmph::StringPiece> {
|
||||||
uint32_t operator()(const cxxmph::StringPiece& k) const {
|
uint32_t operator()(const cxxmph::StringPiece& k) const {
|
||||||
return cxxmph::MurmurHash2(k.data(), k.length(), 1);
|
uint32_t out;
|
||||||
|
MurmurHash3_x86_32(k.data(), k.length(), 1, &out);
|
||||||
|
return out;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} // namespace std
|
} // namespace std
|
||||||
|
|
|
@ -47,7 +47,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark {
|
||||||
for (auto it = random_.begin(); it != random_.end(); ++it) {
|
for (auto it = random_.begin(); it != random_.end(); ++it) {
|
||||||
auto idx = index_.index(*it);
|
auto idx = index_.index(*it);
|
||||||
// Collision check to be fair with STL
|
// Collision check to be fair with STL
|
||||||
if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1;
|
// if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
protected:
|
protected:
|
||||||
|
|
|
@ -90,9 +90,9 @@ int main(int argc, char** argv) {
|
||||||
Benchmark::Register(new BM_CreateUrls<mph_map<StringPiece, StringPiece>>("URLS100k"));
|
Benchmark::Register(new BM_CreateUrls<mph_map<StringPiece, StringPiece>>("URLS100k"));
|
||||||
Benchmark::Register(new BM_CreateUrls<unordered_map<StringPiece, StringPiece>>("URLS100k"));
|
Benchmark::Register(new BM_CreateUrls<unordered_map<StringPiece, StringPiece>>("URLS100k"));
|
||||||
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
||||||
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur2StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
||||||
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
||||||
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur2StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
||||||
Benchmark::Register(new BM_SearchUint64<unordered_map<uint64_t, uint64_t>>);
|
Benchmark::Register(new BM_SearchUint64<unordered_map<uint64_t, uint64_t>>);
|
||||||
Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>);
|
Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>);
|
||||||
Benchmark::RunAll();
|
Benchmark::RunAll();
|
||||||
|
|
|
@ -157,7 +157,8 @@ bool MPHIndex::Mapping(
|
||||||
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
|
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
|
||||||
TriGraph graph(n_, m_);
|
TriGraph graph(n_, m_);
|
||||||
for (ForwardIterator it = begin; it != end; ++it) {
|
for (ForwardIterator it = begin; it != end; ++it) {
|
||||||
uint32_t h[3];
|
uint32_t h[4];
|
||||||
|
// SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast<uint32_t*>(&h));
|
||||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
|
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
|
||||||
uint32_t v0 = h[0] % r_;
|
uint32_t v0 = h[0] % r_;
|
||||||
uint32_t v1 = h[1] % r_ + r_;
|
uint32_t v1 = h[1] % r_ + r_;
|
||||||
|
@ -200,7 +201,7 @@ uint32_t MPHIndex::index(const Key& key) const {
|
||||||
|
|
||||||
// Simple wrapper around MPHIndex to simplify calling code. Please refer to the
|
// Simple wrapper around MPHIndex to simplify calling code. Please refer to the
|
||||||
// MPHIndex class for documentation.
|
// MPHIndex class for documentation.
|
||||||
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
|
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key>>::hash_function>
|
||||||
class SimpleMPHIndex : public MPHIndex {
|
class SimpleMPHIndex : public MPHIndex {
|
||||||
public:
|
public:
|
||||||
template <class ForwardIterator>
|
template <class ForwardIterator>
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using cxxmph::SimpleMPHIndex;
|
using namespace cxxmph;
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
|
||||||
|
@ -38,4 +38,3 @@ int main(int argc, char** argv) {
|
||||||
SimpleMPHIndex<string> other_mph_index;
|
SimpleMPHIndex<string> other_mph_index;
|
||||||
other_mph_index.deserialize(serialized);
|
other_mph_index.deserialize(serialized);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6,13 +6,14 @@
|
||||||
// This class is about 20% to 100% slower than unordered_map (or ext/hash_map)
|
// This class is about 20% to 100% slower than unordered_map (or ext/hash_map)
|
||||||
// and should not be used if performance is a concern. In fact, you should only
|
// and should not be used if performance is a concern. In fact, you should only
|
||||||
// use it for educational purposes.
|
// use it for educational purposes.
|
||||||
|
//
|
||||||
|
// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <utility> // for std::pair
|
#include <utility> // for std::pair
|
||||||
|
|
||||||
#include "MurmurHash2.h"
|
|
||||||
#include "mph_index.h"
|
#include "mph_index.h"
|
||||||
|
|
||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
|
|
|
@ -6,9 +6,12 @@
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <unordered_map> // for std::hash
|
#include <unordered_map> // for std::hash
|
||||||
|
|
||||||
#include "MurmurHash2.h"
|
#include "MurmurHash3.h"
|
||||||
#include "stringpiece.h"
|
#include "stringpiece.h"
|
||||||
|
|
||||||
|
// From murmur, only used naively to extend 32 bits functions to 64 bits.
|
||||||
|
uint32_t fmix ( uint32_t h );
|
||||||
|
|
||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
|
|
||||||
template <class HashFcn>
|
template <class HashFcn>
|
||||||
|
@ -17,72 +20,106 @@ struct seeded_hash_function {
|
||||||
uint32_t operator()(const Key& k, uint32_t seed) const {
|
uint32_t operator()(const Key& k, uint32_t seed) const {
|
||||||
return HashFcn()(k) ^ seed;
|
return HashFcn()(k) ^ seed;
|
||||||
}
|
}
|
||||||
|
template <class Key>
|
||||||
|
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
out[i] = HashFcn()(k) ^ seed;
|
||||||
|
seed = fmix(seed);
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Murmur2 {
|
struct Murmur3 {
|
||||||
template<class Key>
|
template<class Key>
|
||||||
uint32_t operator()(const Key& k) const {
|
uint32_t operator()(const Key& k) const {
|
||||||
return MurmurHash2(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */);
|
uint32_t out;
|
||||||
|
MurmurHash3_x86_32(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */, &out);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
template <class Key>
|
||||||
|
void hash64(const Key& k, uint32_t* out) const {
|
||||||
|
MurmurHash3_x64_128(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */, out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
struct Murmur2StringPiece {
|
|
||||||
|
struct Murmur3StringPiece {
|
||||||
template <class Key>
|
template <class Key>
|
||||||
uint32_t operator()(const Key& k) const {
|
uint32_t operator()(const Key& k) const {
|
||||||
StringPiece s(k);
|
StringPiece s(k);
|
||||||
return MurmurHash2(s.data(), s.length(), 1 /* seed */);
|
uint32_t out;
|
||||||
|
MurmurHash3_x86_32(s.data(), s.length(), 1 /* seed */, &out);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
template <class Key>
|
||||||
|
void hash64(const Key& k, uint32_t* out) const {
|
||||||
|
StringPiece s(k);
|
||||||
|
MurmurHash3_x64_128(s.data(), s.length(), 1 /* seed */, out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct seeded_hash_function<Murmur2> {
|
struct seeded_hash_function<Murmur3> {
|
||||||
template <class Key>
|
template <class Key>
|
||||||
uint32_t operator()(const Key& k, uint32_t seed) const {
|
uint32_t operator()(const Key& k, uint32_t seed) const {
|
||||||
return MurmurHash2(reinterpret_cast<const void*>(&k), sizeof(Key), seed);
|
uint32_t out;
|
||||||
|
MurmurHash3_x86_32(reinterpret_cast<const void*>(&k), sizeof(Key), seed, &out);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
template <class Key>
|
||||||
|
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
|
||||||
|
MurmurHash3_x64_128(reinterpret_cast<const void*>(&k), sizeof(Key), seed, out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct seeded_hash_function<Murmur2StringPiece> {
|
struct seeded_hash_function<Murmur3StringPiece> {
|
||||||
template <class Key>
|
template <class Key>
|
||||||
uint32_t operator()(const Key& k, uint32_t seed) const {
|
uint32_t operator()(const Key& k, uint32_t seed) const {
|
||||||
StringPiece s(k);
|
StringPiece s(k);
|
||||||
return MurmurHash2(s.data(), s.length(), seed);
|
uint32_t out;
|
||||||
|
MurmurHash3_x86_32(s.data(), s.length(), seed, &out);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
template <class Key>
|
||||||
|
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
|
||||||
|
StringPiece s(k);
|
||||||
|
MurmurHash3_x64_128(s.data(), s.length(), seed, out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class HashFcn> struct seeded_hash
|
template <class HashFcn> struct seeded_hash
|
||||||
{ typedef seeded_hash_function<HashFcn> hash_function; };
|
{ typedef seeded_hash_function<HashFcn> hash_function; };
|
||||||
// Use Murmur2 instead for all types defined in std::hash, plus
|
// Use Murmur3 instead for all types defined in std::hash, plus
|
||||||
// std::string which is commonly extended.
|
// std::string which is commonly extended.
|
||||||
template <> struct seeded_hash<std::hash<char*> >
|
template <> struct seeded_hash<std::hash<char*> >
|
||||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<const char*> >
|
template <> struct seeded_hash<std::hash<const char*> >
|
||||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<std::string> >
|
template <> struct seeded_hash<std::hash<std::string> >
|
||||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<cxxmph::StringPiece> >
|
template <> struct seeded_hash<std::hash<cxxmph::StringPiece> >
|
||||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
|
||||||
|
|
||||||
template <> struct seeded_hash<std::hash<char> >
|
template <> struct seeded_hash<std::hash<char> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<unsigned char> >
|
template <> struct seeded_hash<std::hash<unsigned char> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<short> >
|
template <> struct seeded_hash<std::hash<short> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<unsigned short> >
|
template <> struct seeded_hash<std::hash<unsigned short> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<int> >
|
template <> struct seeded_hash<std::hash<int> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<unsigned int> >
|
template <> struct seeded_hash<std::hash<unsigned int> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<long> >
|
template <> struct seeded_hash<std::hash<long> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<unsigned long> >
|
template <> struct seeded_hash<std::hash<unsigned long> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<long long> >
|
template <> struct seeded_hash<std::hash<long long> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
template <> struct seeded_hash<std::hash<unsigned long long> >
|
template <> struct seeded_hash<std::hash<unsigned long long> >
|
||||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||||
|
|
||||||
} // namespace cxxmph
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue