1
Fork 0

Added Murmur3 support. Not necessarily faster.

This commit is contained in:
Davi Reis 2012-03-12 01:43:06 -03:00
parent 09c1af7771
commit 9dcf0450f0
9 changed files with 76 additions and 110 deletions

View File

@ -3,10 +3,10 @@ check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test
noinst_PROGRAMS = bm_index bm_map
bin_PROGRAMS = cxxmph
lib_LTLIBRARIES = libcxxmph.la
libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
libcxxmph_la_LDFLAGS = -version-info 0:0:0
cxxmph_includedir = $(includedir)/cxxmph/
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h
mph_map_test_LDADD = libcxxmph.la
mph_map_test_SOURCES = mph_map_test.cc

View File

@ -1,74 +0,0 @@
#ifndef __CXXMPH_MURMUR_HASH2__
#define __CXXMPH_MURMUR_HASH2__
#include <cstring>  // std::memcpy
//-----------------------------------------------------------------------------
// MurmurHash2, by Austin Appleby
// Note - This code makes a few assumptions about how your machine behaves -
// 1. We can read a 4-byte value from any address without crashing
// 2. sizeof(int) == 4
// And it has a few limitations -
// 1. It will not work incrementally.
// 2. It will not produce the same results on little-endian and big-endian
// machines.
namespace cxxmph {
// Computes the 32-bit MurmurHash2 of the `len` bytes starting at `key`.
//
//   key  - pointer to the bytes to hash; no alignment is required.
//   len  - number of bytes to hash. When len < 4 (including negative values)
//          the word loop is skipped; only the 1..3 byte tail, if any, is read.
//   seed - value mixed into the initial state together with `len`.
//
// Returns the hash value. Words are loaded in native byte order, so the result
// differs between little-endian and big-endian machines.
inline // not measured, for making compilation easier only
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
  // The algorithm consumes the input in 4-byte words held in an unsigned int.
  static_assert(sizeof(unsigned int) == 4,
                "MurmurHash2 requires a 4-byte unsigned int");

  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.
  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value
  unsigned int h = seed ^ len;

  // Mix 4 bytes at a time into the hash
  const unsigned char * data = static_cast<const unsigned char *>(key);
  while (len >= 4)
  {
    // Load through memcpy rather than dereferencing a cast pointer: the key
    // may be misaligned and may not actually hold unsigned int objects.
    unsigned int k;
    std::memcpy(&k, data, sizeof(k));

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array
  switch (len)
  {
    case 3:
      h ^= static_cast<unsigned int>(data[2]) << 16;
      // fall through
    case 2:
      h ^= static_cast<unsigned int>(data[1]) << 8;
      // fall through
    case 1:
      h ^= data[0];
      h *= m;
      break;
    default:
      break;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;
  return h;
}
} // namespace cxxmph
#endif // __CXXMPH_MURMUR_HASH2__

View File

@ -6,14 +6,16 @@
#include <string>
#include <vector>
#include <unordered_map> // std::hash
#include "MurmurHash2.h"
#include "MurmurHash3.h"
#include "benchmark.h"
namespace std {
template <> struct hash<cxxmph::StringPiece> {
uint32_t operator()(const cxxmph::StringPiece& k) const {
return cxxmph::MurmurHash2(k.data(), k.length(), 1);
uint32_t out;
MurmurHash3_x86_32(k.data(), k.length(), 1, &out);
return out;
}
};
} // namespace std

View File

@ -47,7 +47,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark {
for (auto it = random_.begin(); it != random_.end(); ++it) {
auto idx = index_.index(*it);
// Collision check to be fair with STL
if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1;
// if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1;
}
}
protected:

View File

@ -91,9 +91,9 @@ int main(int argc, char** argv) {
Benchmark::Register(new BM_CreateUrls<mph_map<StringPiece, StringPiece>>("URLS100k"));
Benchmark::Register(new BM_CreateUrls<unordered_map<StringPiece, StringPiece>>("URLS100k"));
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0));
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur2StringPiece>>("URLS100k", 10*1000 * 1000, 0));
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0));
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur2StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
Benchmark::Register(new BM_SearchUint64<unordered_map<uint64_t, uint64_t>>);
Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>);
Benchmark::RunAll();

View File

@ -158,7 +158,8 @@ bool MPHIndex::Mapping(
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
TriGraph graph(n_, m_);
for (ForwardIterator it = begin; it != end; ++it) {
uint32_t h[3];
uint32_t h[4];
// SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast<uint32_t*>(&h));
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
uint32_t v0 = h[0] % r_;
uint32_t v1 = h[1] % r_ + r_;
@ -201,7 +202,7 @@ uint32_t MPHIndex::index(const Key& key) const {
// Simple wrapper around MPHIndex to simplify calling code. Please refer to the
// MPHIndex class for documentation.
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key>>::hash_function>
class SimpleMPHIndex : public MPHIndex {
public:
template <class ForwardIterator>

View File

@ -7,7 +7,7 @@
using std::string;
using std::vector;
using cxxmph::SimpleMPHIndex;
using namespace cxxmph;
int main(int argc, char** argv) {
@ -38,4 +38,3 @@ int main(int argc, char** argv) {
SimpleMPHIndex<string> other_mph_index;
other_mph_index.deserialize(serialized);
}

View File

@ -6,6 +6,8 @@
// This class is about 20% to 100% slower than unordered_map (or ext/hash_map)
// and should not be used if performance is a concern. In fact, you should only
// use it for educational purposes.
//
// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl
#include <iostream>
#include <algorithm>
@ -13,7 +15,6 @@
#include <vector>
#include <utility> // for std::pair
#include "MurmurHash2.h"
#include "mph_index.h"
#include "hollow_iterator.h"

View File

@ -6,9 +6,12 @@
#include <cstdlib>
#include <unordered_map> // for std::hash
#include "MurmurHash2.h"
#include "MurmurHash3.h"
#include "stringpiece.h"
// From murmur, only used naively to extend 32 bits functions to 64 bits.
uint32_t fmix ( uint32_t h );
namespace cxxmph {
template <class HashFcn>
@ -17,72 +20,106 @@ struct seeded_hash_function {
uint32_t operator()(const Key& k, uint32_t seed) const {
return HashFcn()(k) ^ seed;
}
template <class Key>
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
for (int i = 0; i < 4; ++i) {
out[i] = HashFcn()(k) ^ seed;
seed = fmix(seed);
}
}
};
struct Murmur2 {
struct Murmur3 {
template<class Key>
uint32_t operator()(const Key& k) const {
return MurmurHash2(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */);
uint32_t out;
MurmurHash3_x86_32(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */, &out);
return out;
}
template <class Key>
void hash64(const Key& k, uint32_t* out) const {
MurmurHash3_x64_128(reinterpret_cast<const void*>(&k), sizeof(Key), 1 /* seed */, out);
}
};
struct Murmur2StringPiece {
struct Murmur3StringPiece {
template <class Key>
uint32_t operator()(const Key& k) const {
StringPiece s(k);
return MurmurHash2(s.data(), s.length(), 1 /* seed */);
uint32_t out;
MurmurHash3_x86_32(s.data(), s.length(), 1 /* seed */, &out);
return out;
}
template <class Key>
void hash64(const Key& k, uint32_t* out) const {
StringPiece s(k);
MurmurHash3_x64_128(s.data(), s.length(), 1 /* seed */, out);
}
};
template <>
struct seeded_hash_function<Murmur2> {
struct seeded_hash_function<Murmur3> {
template <class Key>
uint32_t operator()(const Key& k, uint32_t seed) const {
return MurmurHash2(reinterpret_cast<const void*>(&k), sizeof(Key), seed);
uint32_t out;
MurmurHash3_x86_32(reinterpret_cast<const void*>(&k), sizeof(Key), seed, &out);
return out;
}
template <class Key>
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
MurmurHash3_x64_128(reinterpret_cast<const void*>(&k), sizeof(Key), seed, out);
}
};
template <>
struct seeded_hash_function<Murmur2StringPiece> {
struct seeded_hash_function<Murmur3StringPiece> {
template <class Key>
uint32_t operator()(const Key& k, uint32_t seed) const {
StringPiece s(k);
return MurmurHash2(s.data(), s.length(), seed);
uint32_t out;
MurmurHash3_x86_32(s.data(), s.length(), seed, &out);
return out;
}
template <class Key>
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
StringPiece s(k);
MurmurHash3_x64_128(s.data(), s.length(), seed, out);
}
};
template <class HashFcn> struct seeded_hash
{ typedef seeded_hash_function<HashFcn> hash_function; };
// Use Murmur2 instead for all types defined in std::hash, plus
// Use Murmur3 instead for all types defined in std::hash, plus
// std::string which is commonly extended.
template <> struct seeded_hash<std::hash<char*> >
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
template <> struct seeded_hash<std::hash<const char*> >
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
template <> struct seeded_hash<std::hash<std::string> >
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
template <> struct seeded_hash<std::hash<cxxmph::StringPiece> >
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
{ typedef seeded_hash_function<Murmur3StringPiece> hash_function; };
template <> struct seeded_hash<std::hash<char> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<unsigned char> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<short> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<unsigned short> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<int> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<unsigned int> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<long> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<unsigned long> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<long long> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
template <> struct seeded_hash<std::hash<unsigned long long> >
{ typedef seeded_hash_function<Murmur2> hash_function; };
{ typedef seeded_hash_function<Murmur3> hash_function; };
} // namespace cxxmph