Added Murmur3 support. Not necessarily faster.
This commit is contained in:
parent
09c1af7771
commit
9dcf0450f0
@ -3,10 +3,10 @@ check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test
|
||||
noinst_PROGRAMS = bm_index bm_map
|
||||
bin_PROGRAMS = cxxmph
|
||||
lib_LTLIBRARIES = libcxxmph.la
|
||||
libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
|
||||
libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
|
||||
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
||||
cxxmph_includedir = $(includedir)/cxxmph/
|
||||
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h
|
||||
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h
|
||||
|
||||
mph_map_test_LDADD = libcxxmph.la
|
||||
mph_map_test_SOURCES = mph_map_test.cc
|
||||
|
@ -1,74 +0,0 @@
|
||||
#ifndef __CXXMPH_MURMUR_HASH2__
|
||||
#define __CXXMPH_MURMUR_HASH2__
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// MurmurHash2, by Austin Appleby
|
||||
|
||||
// Note - This code makes a few assumptions about how your machine behaves -
|
||||
|
||||
// 1. We can read a 4-byte value from any address without crashing
|
||||
// 2. sizeof(int) == 4
|
||||
|
||||
// And it has a few limitations -
|
||||
|
||||
// 1. It will not work incrementally.
|
||||
// 2. It will not produce the same results on little-endian and big-endian
|
||||
// machines.
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
inline // not measured, for making compilation easier only
|
||||
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
|
||||
{
|
||||
// 'm' and 'r' are mixing constants generated offline.
|
||||
// They're not really 'magic', they just happen to work well.
|
||||
|
||||
const unsigned int m = 0x5bd1e995;
|
||||
const int r = 24;
|
||||
|
||||
// Initialize the hash to a 'random' value
|
||||
|
||||
unsigned int h = seed ^ len;
|
||||
|
||||
// Mix 4 bytes at a time into the hash
|
||||
|
||||
const unsigned char * data = (const unsigned char *)key;
|
||||
|
||||
while(len >= 4)
|
||||
{
|
||||
unsigned int k = *(unsigned int *)data;
|
||||
|
||||
k *= m;
|
||||
k ^= k >> r;
|
||||
k *= m;
|
||||
|
||||
h *= m;
|
||||
h ^= k;
|
||||
|
||||
data += 4;
|
||||
len -= 4;
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
|
||||
switch(len)
|
||||
{
|
||||
case 3: h ^= data[2] << 16;
|
||||
case 2: h ^= data[1] << 8;
|
||||
case 1: h ^= data[0];
|
||||
h *= m;
|
||||
};
|
||||
|
||||
// Do a few final mixes of the hash to ensure the last few
|
||||
// bytes are well-incorporated.
|
||||
|
||||
h ^= h >> 13;
|
||||
h *= m;
|
||||
h ^= h >> 15;
|
||||
|
||||
return h;
|
||||
}
|
||||
|
||||
} // namespace cxxmph
|
||||
|
||||
#endif // __CXXMPH_MURMUR_HASH2__
|
@ -6,14 +6,16 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map> // std::hash
|
||||
#include "MurmurHash2.h"
|
||||
#include "MurmurHash3.h"
|
||||
|
||||
#include "benchmark.h"
|
||||
|
||||
namespace std {
|
||||
template <> struct hash<cxxmph::StringPiece> {
|
||||
uint32_t operator()(const cxxmph::StringPiece& k) const {
|
||||
return cxxmph::MurmurHash2(k.data(), k.length(), 1);
|
||||
uint32_t out;
|
||||
MurmurHash3_x86_32(k.data(), k.length(), 1, &out);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
} // namespace std
|
||||
|
@ -47,7 +47,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark {
|
||||
for (auto it = random_.begin(); it != random_.end(); ++it) {
|
||||
auto idx = index_.index(*it);
|
||||
// Collision check to be fair with STL
|
||||
if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1;
|
||||
// if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1;
|
||||
}
|
||||
}
|
||||
protected:
|
||||
|
@ -91,9 +91,9 @@ int main(int argc, char** argv) {
|
||||
Benchmark::Register(new BM_CreateUrls<mph_map<StringPiece, StringPiece>>("URLS100k"));
|
||||
Benchmark::Register(new BM_CreateUrls<unordered_map<StringPiece, StringPiece>>("URLS100k"));
|
||||
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
||||
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur2StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
||||
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
||||
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
||||
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur2StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
||||
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
||||
Benchmark::Register(new BM_SearchUint64<unordered_map<uint64_t, uint64_t>>);
|
||||
Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>);
|
||||
Benchmark::RunAll();
|
||||
|
@ -158,7 +158,8 @@ bool MPHIndex::Mapping(
|
||||
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
|
||||
TriGraph graph(n_, m_);
|
||||
for (ForwardIterator it = begin; it != end; ++it) {
|
||||
uint32_t h[3];
|
||||
uint32_t h[4];
|
||||
// SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast<uint32_t*>(&h));
|
||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
|
||||
uint32_t v0 = h[0] % r_;
|
||||
uint32_t v1 = h[1] % r_ + r_;
|
||||
@ -201,7 +202,7 @@ uint32_t MPHIndex::index(const Key& key) const {
|
||||
|
||||
// Simple wrapper around MPHIndex to simplify calling code. Please refer to the
|
||||
// MPHIndex class for documentation.
|
||||
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
|
||||
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key>>::hash_function>
|
||||
class SimpleMPHIndex : public MPHIndex {
|
||||
public:
|
||||
template <class ForwardIterator>
|
||||
|
@ -7,7 +7,7 @@
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using cxxmph::SimpleMPHIndex;
|
||||
using namespace cxxmph;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
@ -38,4 +38,3 @@ int main(int argc, char** argv) {
|
||||
SimpleMPHIndex<string> other_mph_index;
|
||||
other_mph_index.deserialize(serialized);
|
||||
}
|
||||
|
||||
|
@ -6,6 +6,8 @@
|
||||
// This class is about 20% to 100% slower than unordered_map (or ext/hash_map)
|
||||
// and should not be used if performance is a concern. In fact, you should only
|
||||
// use it for educational purposes.
|
||||
//
|
||||
// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
@ -13,7 +15,6 @@
|
||||
#include <vector>
|
||||
#include <utility> // for std::pair
|
||||
|
||||
#include "MurmurHash2.h"
|
||||
#include "mph_index.h"
|
||||
#include "hollow_iterator.h"
|
||||
|
||||
|
@ -6,9 +6,12 @@
|
||||
#include <cstdlib>
|
||||
#include <unordered_map> // for std::hash
|
||||
|
||||
#include "MurmurHash2.h"
|
||||
#include "MurmurHash3.h"
|
||||
#include "stringpiece.h"
|
||||
|
||||
// From murmur, only used naively to extend 32 bits functions to 64 bits.
|
||||
uint32_t fmix ( uint32_t h );
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
template <class HashFcn>
|
||||
@ -17,72 +20,106 @@ struct seeded_hash_function {
|
||||
uint32_t operator()(const Key& k, uint32_t seed) const {
|
||||
return HashFcn()(k) ^ seed;
|
||||
}
|
||||
template <class Key>
|
||||
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
out[i] = HashFcn()(k) ^ seed;
|
||||
seed = fmix(seed);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct Murmur2 {
|
||||
struct Murmur3 {
|
||||
template<class Key>
|
||||
uint32_t operator()(const Key& k) const {
|
||||
return MurmurHash2(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */);
|
||||
uint32_t out;
|
||||
MurmurHash3_x86_32(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */, &out);
|
||||
return out;
|
||||
}
|
||||
template <class Key>
|
||||
void hash64(const Key& k, uint32_t* out) const {
|
||||
MurmurHash3_x64_128(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */, out);
|
||||
}
|
||||
};
|
||||
struct Murmur2StringPiece {
|
||||
|
||||
struct Murmur3StringPiece {
|
||||
template <class Key>
|
||||
uint32_t operator()(const Key& k) const {
|
||||
StringPiece s(k);
|
||||
return MurmurHash2(s.data(), s.length(), 1 /* seed */);
|
||||
uint32_t out;
|
||||
MurmurHash3_x86_32(s.data(), s.length(), 1 /* seed */, &out);
|
||||
return out;
|
||||
}
|
||||
template <class Key>
|
||||
void hash64(const Key& k, uint32_t* out) const {
|
||||
StringPiece s(k);
|
||||
MurmurHash3_x64_128(s.data(), s.length(), 1 /* seed */, out);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct seeded_hash_function<Murmur2> {
|
||||
struct seeded_hash_function<Murmur3> {
|
||||
template <class Key>
|
||||
uint32_t operator()(const Key& k, uint32_t seed) const {
|
||||
return MurmurHash2(reinterpret_cast<const void*>(&k), sizeof(Key), seed);
|
||||
uint32_t out;
|
||||
MurmurHash3_x86_32(reinterpret_cast<const void*>(&k), sizeof(Key), seed, &out);
|
||||
return out;
|
||||
}
|
||||
template <class Key>
|
||||
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
|
||||
MurmurHash3_x64_128(reinterpret_cast<const void*>(&k), sizeof(Key), seed, out);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct seeded_hash_function<Murmur2StringPiece> {
|
||||
struct seeded_hash_function<Murmur3StringPiece> {
|
||||
template <class Key>
|
||||
uint32_t operator()(const Key& k, uint32_t seed) const {
|
||||
StringPiece s(k);
|
||||
return MurmurHash2(s.data(), s.length(), seed);
|
||||
uint32_t out;
|
||||
MurmurHash3_x86_32(s.data(), s.length(), seed, &out);
|
||||
return out;
|
||||
}
|
||||
template <class Key>
|
||||
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
|
||||
StringPiece s(k);
|
||||
MurmurHash3_x64_128(s.data(), s.length(), seed, out);
|
||||
}
|
||||
};
|
||||
|
||||
template <class HashFcn> struct seeded_hash
|
||||
{ typedef seeded_hash_function<HashFcn> hash_function; };
|
||||
// Use Murmur2 instead for all types defined in std::hash, plus
|
||||
// Use Murmur3 instead for all types defined in std::hash, plus
|
||||
// std::string which is commonly extended.
|
||||
template <> struct seeded_hash<std::hash<char*> >
|
||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<const char*> >
|
||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<std::string> >
|
||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<cxxmph::StringPiece> >
|
||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
|
||||
|
||||
template <> struct seeded_hash<std::hash<char> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<unsigned char> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<short> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<unsigned short> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<int> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<unsigned int> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<long> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<unsigned long> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<long long> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
template <> struct seeded_hash<std::hash<unsigned long long> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
{ typedef seeded_hash_function<Murmur3> hash_function; };
|
||||
|
||||
} // namespace cxxmph
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user