Cleanup for upload.

This commit is contained in:
Davi Reis 2012-03-15 18:14:39 -03:00
commit 40c6626d87
16 changed files with 941 additions and 93 deletions

View File

@ -1,12 +1,12 @@
TESTS = $(check_PROGRAMS) TESTS = $(check_PROGRAMS)
check_PROGRAMS = mph_map_test mph_index_test trigraph_test check_PROGRAMS = mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test
noinst_PROGRAMS = bm_index bm_map noinst_PROGRAMS = bm_index bm_map
bin_PROGRAMS = cxxmph bin_PROGRAMS = cxxmph
lib_LTLIBRARIES = libcxxmph.la lib_LTLIBRARIES = libcxxmph.la
libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc mph_bits.h mph_bits.cc
libcxxmph_la_LDFLAGS = -version-info 0:0:0 libcxxmph_la_LDFLAGS = -version-info 0:0:0
cxxmph_includedir = $(includedir)/cxxmph/ cxxmph_includedir = $(includedir)/cxxmph/
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h
mph_map_test_LDADD = libcxxmph.la mph_map_test_LDADD = libcxxmph.la
mph_map_test_SOURCES = mph_map_test.cc mph_map_test_SOURCES = mph_map_test.cc
@ -25,3 +25,8 @@ bm_map_SOURCES = bm_common.cc bm_map.cc
cxxmph_LDADD = libcxxmph.la cxxmph_LDADD = libcxxmph.la
cxxmph_SOURCES = cxxmph.cc cxxmph_SOURCES = cxxmph.cc
hollow_iterator_test_SOURCES = hollow_iterator_test.cc
mph_bits_test_SOURCES = mph_bits_test.cc
mph_bits_test_LDADD = libcxxmph.la

335
cxxmph/MurmurHash3.cpp Normal file
View File

@ -0,0 +1,335 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.
#include "MurmurHash3.h"
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
#define FORCE_INLINE __forceinline
#include <stdlib.h>
#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)
#define BIG_CONSTANT(x) (x)
// Other compilers
#else // defined(_MSC_VER)
#define FORCE_INLINE __attribute__((always_inline))
// Rotates the 32-bit value x left by r bit positions.
// The rotate count is reduced modulo 32 and both shift amounts are masked to
// the range 0..31, so a count of 0 (or any multiple of 32) returns x unchanged
// instead of shifting by the full width, which is undefined behaviour.
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
  const uint32_t n = static_cast<uint32_t>(r) & 31u;
  return (x << n) | (x >> ((32u - n) & 31u));
}
// Rotates the 64-bit value x left by r bit positions.
// The rotate count is reduced modulo 64 and both shift amounts are masked to
// the range 0..63, so a count of 0 (or any multiple of 64) returns x unchanged
// instead of shifting by the full width, which is undefined behaviour.
inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
  const uint32_t n = static_cast<uint32_t>(r) & 63u;
  return (x << n) | (x >> ((64u - n) & 63u));
}
#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)
#define BIG_CONSTANT(x) (x##LLU)
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here
// Reads the i-th 32-bit word relative to p exactly as stored in memory.
// This is the single place to add byte swapping or aligned-read handling.
FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
{
  const uint32_t * word = p + i;
  return *word;
}
// Reads the i-th 64-bit word relative to p exactly as stored in memory.
// This is the single place to add byte swapping or aligned-read handling.
FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
{
  const uint64_t * word = p + i;
  return *word;
}
//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche
// 32-bit finalizer: three xor-shift steps interleaved with two multiplications
// (all arithmetic modulo 2^32) applied to the hash word h.
FORCE_INLINE uint32_t fmix ( uint32_t h )
{
  h = (h ^ (h >> 16)) * 0x85ebca6b;
  h = (h ^ (h >> 13)) * 0xc2b2ae35;
  return h ^ (h >> 16);
}
//----------
// 64-bit finalizer: three xor-shift steps interleaved with two multiplications
// (all arithmetic modulo 2^64) applied to the hash word k.
FORCE_INLINE uint64_t fmix ( uint64_t k )
{
  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);
  return k ^ (k >> 33);
}
//-----------------------------------------------------------------------------
// Hashes the `len` bytes starting at `key` into a single 32-bit value.
//   key  - start of the input bytes
//   len  - number of input bytes
//   seed - initial value of the hash state
//   out  - destination; exactly one uint32_t is stored through it
// Whole 4-byte words are fetched through getblock(), i.e. as they are laid out
// in memory; the 0-3 leftover bytes are folded in separately.
void MurmurHash3_x86_32 ( const void * key, int len,
uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 4;
uint32_t h1 = seed;
uint32_t c1 = 0xcc9e2d51;
uint32_t c2 = 0x1b873593;
//----------
// body: mix every complete 4-byte word into h1.
// `blocks` points just past the last complete word and the index runs from
// -nblocks up to -1, so the words are still visited front to back.
const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
for(int i = -nblocks; i; i++)
{
uint32_t k1 = getblock(blocks,i);
k1 *= c1;
k1 = ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
}
//----------
// tail: gather the remaining (len & 3) bytes into k1, tail[0] in the low byte.
// The cases fall through on purpose; when there are no leftover bytes the
// switch does nothing.
const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
uint32_t k1 = 0;
switch(len & 3)
{
case 3: k1 ^= tail[2] << 16;
case 2: k1 ^= tail[1] << 8;
case 1: k1 ^= tail[0];
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
};
//----------
// finalization: fold in the input length, then run the 32-bit finalizer.
h1 ^= len;
h1 = fmix(h1);
*(uint32_t*)out = h1;
}
//-----------------------------------------------------------------------------
// Hashes the `len` bytes starting at `key` into a 128-bit value using four
// 32-bit state words.
//   key  - start of the input bytes
//   len  - number of input bytes
//   seed - initial value of all four state words
//   out  - destination; four consecutive uint32_t values are stored through it
// Whole 16-byte blocks are fetched as four 32-bit words through getblock(),
// i.e. as they are laid out in memory; the 0-15 leftover bytes are folded in
// separately.
void MurmurHash3_x86_128 ( const void * key, const int len,
uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;
uint32_t h1 = seed;
uint32_t h2 = seed;
uint32_t h3 = seed;
uint32_t h4 = seed;
uint32_t c1 = 0x239b961b;
uint32_t c2 = 0xab0e9789;
uint32_t c3 = 0x38b34ae5;
uint32_t c4 = 0xa1e38b93;
//----------
// body: mix every complete 16-byte block into h1..h4.
// `blocks` points just past the last complete block and the index runs from
// -nblocks up to -1, so the blocks are still visited front to back.
const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
for(int i = -nblocks; i; i++)
{
uint32_t k1 = getblock(blocks,i*4+0);
uint32_t k2 = getblock(blocks,i*4+1);
uint32_t k3 = getblock(blocks,i*4+2);
uint32_t k4 = getblock(blocks,i*4+3);
// Each word is scrambled and xored into its own state word, which is then
// rotated and combined with the next state word (h4 wraps around to h1).
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
}
//----------
// tail: gather the remaining (len & 15) bytes into k1..k4, four bytes per
// word with the lowest-addressed byte in the low bits. The cases fall through
// on purpose, so a longer tail also processes every shorter-tail step below it.
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
uint32_t k1 = 0;
uint32_t k2 = 0;
uint32_t k3 = 0;
uint32_t k4 = 0;
switch(len & 15)
{
case 15: k4 ^= tail[14] << 16;
case 14: k4 ^= tail[13] << 8;
case 13: k4 ^= tail[12] << 0;
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
case 12: k3 ^= tail[11] << 24;
case 11: k3 ^= tail[10] << 16;
case 10: k3 ^= tail[ 9] << 8;
case 9: k3 ^= tail[ 8] << 0;
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
case 8: k2 ^= tail[ 7] << 24;
case 7: k2 ^= tail[ 6] << 16;
case 6: k2 ^= tail[ 5] << 8;
case 5: k2 ^= tail[ 4] << 0;
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
case 4: k1 ^= tail[ 3] << 24;
case 3: k1 ^= tail[ 2] << 16;
case 2: k1 ^= tail[ 1] << 8;
case 1: k1 ^= tail[ 0] << 0;
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
};
//----------
// finalization: fold the input length into every state word, cross-add the
// words, run the 32-bit finalizer on each, then cross-add once more.
h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
h1 += h2; h1 += h3; h1 += h4;
h2 += h1; h3 += h1; h4 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h3 = fmix(h3);
h4 = fmix(h4);
h1 += h2; h1 += h3; h1 += h4;
h2 += h1; h3 += h1; h4 += h1;
((uint32_t*)out)[0] = h1;
((uint32_t*)out)[1] = h2;
((uint32_t*)out)[2] = h3;
((uint32_t*)out)[3] = h4;
}
//-----------------------------------------------------------------------------
// Hashes the `len` bytes starting at `key` into a 128-bit value using two
// 64-bit state words.
//   key  - start of the input bytes
//   len  - number of input bytes
//   seed - initial value of both state words
//   out  - destination; two consecutive uint64_t values are stored through it
// Whole 16-byte blocks are fetched as two 64-bit words through getblock(),
// i.e. as they are laid out in memory; the 0-15 leftover bytes are folded in
// separately.
void MurmurHash3_x64_128 ( const void * key, const int len,
const uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
//----------
// body: mix every complete 16-byte block into h1 and h2. Unlike the x86
// variants, `blocks` points at the start of the input and the index counts up
// from 0.
const uint64_t * blocks = (const uint64_t *)(data);
for(int i = 0; i < nblocks; i++)
{
uint64_t k1 = getblock(blocks,i*2+0);
uint64_t k2 = getblock(blocks,i*2+1);
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
}
//----------
// tail: gather the remaining (len & 15) bytes into k1 (bytes 0-7) and k2
// (bytes 8-14), lowest-addressed byte in the low bits. Each byte is widened to
// 64 bits before shifting. The cases fall through on purpose.
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
uint64_t k1 = 0;
uint64_t k2 = 0;
switch(len & 15)
{
case 15: k2 ^= uint64_t(tail[14]) << 48;
case 14: k2 ^= uint64_t(tail[13]) << 40;
case 13: k2 ^= uint64_t(tail[12]) << 32;
case 12: k2 ^= uint64_t(tail[11]) << 24;
case 11: k2 ^= uint64_t(tail[10]) << 16;
case 10: k2 ^= uint64_t(tail[ 9]) << 8;
case 9: k2 ^= uint64_t(tail[ 8]) << 0;
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= uint64_t(tail[ 7]) << 56;
case 7: k1 ^= uint64_t(tail[ 6]) << 48;
case 6: k1 ^= uint64_t(tail[ 5]) << 40;
case 5: k1 ^= uint64_t(tail[ 4]) << 32;
case 4: k1 ^= uint64_t(tail[ 3]) << 24;
case 3: k1 ^= uint64_t(tail[ 2]) << 16;
case 2: k1 ^= uint64_t(tail[ 1]) << 8;
case 1: k1 ^= uint64_t(tail[ 0]) << 0;
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
};
//----------
// finalization: fold the input length into both state words, cross-add them,
// run the 64-bit finalizer on each, then cross-add once more.
h1 ^= len; h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h1 += h2;
h2 += h1;
((uint64_t*)out)[0] = h1;
((uint64_t*)out)[1] = h2;
}
//-----------------------------------------------------------------------------

37
cxxmph/MurmurHash3.h Normal file
View File

@ -0,0 +1,37 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;
// Other compilers
#else // defined(_MSC_VER)
#include <stdint.h>
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
//-----------------------------------------------------------------------------
#endif // _MURMURHASH3_H_

View File

@ -21,7 +21,7 @@ class BM_MPHIndexCreate : public UrlsBenchmark {
protected: protected:
virtual void Run() { virtual void Run() {
SimpleMPHIndex<StringPiece> index; SimpleMPHIndex<StringPiece> index;
index.Reset(urls_.begin(), urls_.end()); index.Reset(urls_.begin(), urls_.end(), urls_.size());
} }
}; };
@ -53,7 +53,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark {
protected: protected:
virtual bool SetUp () { virtual bool SetUp () {
if (!SearchUrlsBenchmark::SetUp()) return false; if (!SearchUrlsBenchmark::SetUp()) return false;
index_.Reset(urls_.begin(), urls_.end()); index_.Reset(urls_.begin(), urls_.end(), urls_.size());
return true; return true;
} }
SimpleMPHIndex<StringPiece> index_; SimpleMPHIndex<StringPiece> index_;

View File

@ -13,7 +13,8 @@ namespace cxxmph {
template<class MapType, class T> template<class MapType, class T>
const T* myfind(const MapType& mymap, const T& k) { const T* myfind(const MapType& mymap, const T& k) {
auto it = mymap.find(k); auto it = mymap.find(k);
if (it == mymap.end()) return NULL; auto end = mymap.end();
if (it == end) return NULL;
return &it->second; return &it->second;
} }
@ -48,6 +49,7 @@ class BM_SearchUrls : public SearchUrlsBenchmark {
mymap_[*it] = *it; mymap_[*it] = *it;
} }
mymap_.rehash(mymap_.bucket_count()); mymap_.rehash(mymap_.bucket_count());
fprintf(stderr, "Occupation: %f\n", static_cast<float>(mymap_.size())/mymap_.bucket_count());
return true; return true;
} }
MapType mymap_; MapType mymap_;
@ -56,7 +58,7 @@ class BM_SearchUrls : public SearchUrlsBenchmark {
template <class MapType> template <class MapType>
class BM_SearchUint64 : public SearchUint64Benchmark { class BM_SearchUint64 : public SearchUint64Benchmark {
public: public:
BM_SearchUint64() : SearchUint64Benchmark(10000, 10*1000*1000) { } BM_SearchUint64() : SearchUint64Benchmark(100000, 10*1000*1000) { }
virtual bool SetUp() { virtual bool SetUp() {
if (!SearchUint64Benchmark::SetUp()) return false; if (!SearchUint64Benchmark::SetUp()) return false;
for (int i = 0; i < values_.size(); ++i) { for (int i = 0; i < values_.size(); ++i) {
@ -93,7 +95,7 @@ int main(int argc, char** argv) {
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0));
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
Benchmark::Register(new BM_SearchUint64<unordered_map<uint64_t, uint64_t>>);
Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>); Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>);
Benchmark::Register(new BM_SearchUint64<unordered_map<uint64_t, uint64_t>>);
Benchmark::RunAll(); Benchmark::RunAll();
} }

View File

@ -63,8 +63,8 @@ int main(int argc, char** argv) {
for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i]; for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i];
mph_map<string, string>::const_iterator it = table.begin(); mph_map<string, string>::const_iterator it = table.begin();
mph_map<string, string>::const_iterator end = table.end(); mph_map<string, string>::const_iterator end = table.end();
for (; it != end; ++it) { for (int i = 0; it != end; ++it, ++i) {
cout << (it - table.begin()) << ": " << it->first cout << i << ": " << it->first
<<" -> " << it->second << endl; <<" -> " << it->second << endl;
} }
} }

71
cxxmph/hollow_iterator.h Normal file
View File

@ -0,0 +1,71 @@
#ifndef __CXXMPH_HOLLOW_ITERATOR_H__
#define __CXXMPH_HOLLOW_ITERATOR_H__
#include <vector>
namespace cxxmph {
// Forward iterator over a random-access container in which only some slots
// hold a live element. A parallel "presence" sequence (indexed by slot number)
// says which slots are live; the iterator silently skips the others.
//
//   container_type - the underlying container (possibly const-qualified)
//   presence_type  - indexable sequence of bool-like flags, one per slot
//   iterator_type  - the container's iterator or const_iterator
//
// Iterators built with a null container (the default-constructed derived
// types) may be assigned to and compared, but must not be incremented.
template <typename container_type, typename presence_type, typename iterator_type>
struct hollow_iterator_base
    : public std::iterator<std::forward_iterator_tag,
                           typename container_type::value_type> {
  typedef presence_type presence;
  typedef container_type container;
  typedef iterator_type iterator;
  typedef hollow_iterator_base<container, presence, iterator>& self_reference;
  typedef typename iterator::reference reference;
  typedef typename iterator::pointer pointer;

  // Wraps `it`, immediately advancing it to the first live slot at or after
  // its position. The skip is bypassed when no container is supplied.
  hollow_iterator_base(container* c, presence* p, iterator it)
      : c_(c), p_(p), it_(it) { if (c_) find_present(); }

  // Pre-increment: step to the next live slot (or to end()) and return *this.
  // The return statement was previously missing, which made every use of this
  // operator undefined behaviour.
  self_reference operator++() {
    ++it_;
    find_present();
    return *this;
  }
  reference operator*() const { return *it_; }
  pointer operator->() const { return &(*it_); }

  // Two hollow iterators are equal when they wrap the same position. The
  // template parameter lets const and non-const flavours be compared.
  // TODO find syntax to make this less permissible at compile time
  template <class T>
  bool operator==(const T& rhs) const { return rhs.it_ == this->it_; }
  template <class T>
  bool operator!=(const T& rhs) const { return rhs.it_ != this->it_; }

 public:  // TODO find syntax to make this friend of const iterator
  // Advances it_ until it points at a live slot or reaches the container end.
  void find_present() {
    while (it_ != c_->end() && !((*p_)[it_ - c_->begin()])) ++it_;
  }
  container* c_;
  presence* p_;
  iterator it_;
};
// Mutable hollow iterator: walks a container_type whose live slots are flagged
// in a std::vector<bool>.
template <typename container_type>
struct hollow_iterator
    : public hollow_iterator_base<container_type,
                                  std::vector<bool>,
                                  typename container_type::iterator> {
  typedef hollow_iterator_base<container_type,
                               std::vector<bool>,
                               typename container_type::iterator> parent_class;

  // Positions the iterator at `it` inside `*c`, using `*p` as the presence map.
  hollow_iterator(container_type* c,
                  std::vector<bool>* p,
                  typename container_type::iterator it)
      : parent_class(c, p, it) { }

  // Detached iterator: no container, no presence map, value-initialized position.
  hollow_iterator()
      : parent_class(NULL, NULL, typename container_type::iterator()) { }
};
// Read-only hollow iterator: walks a const container_type whose live slots are
// flagged in a const std::vector<bool>. Implicitly constructible from the
// mutable hollow_iterator over the same container type.
template <typename container_type>
struct hollow_const_iterator
    : public hollow_iterator_base<const container_type,
                                  const std::vector<bool>,
                                  typename container_type::const_iterator> {
  typedef hollow_iterator_base<const container_type,
                               const std::vector<bool>,
                               typename container_type::const_iterator> parent_class;
  typedef hollow_const_iterator<container_type> self_type;
  typedef hollow_iterator<container_type> non_const_type;

  // Detached iterator: no container, no presence map, value-initialized position.
  hollow_const_iterator()
      : parent_class(NULL, NULL, typename container_type::iterator()) { }

  // Conversion from the mutable iterator; shares its container and presence map.
  hollow_const_iterator(non_const_type rhs)
      : parent_class(rhs.c_,
                     rhs.p_,
                     typename container_type::const_iterator(rhs.it_)) { }

  // Positions the iterator at `it` inside `*c`, using `*p` as the presence map.
  hollow_const_iterator(const container_type* c,
                        const std::vector<bool>* p,
                        typename container_type::const_iterator it)
      : parent_class(c, p, it) { }
};
} // namespace cxxmph
#endif // __CXXMPH_HOLLOW_ITERATOR_H__

View File

@ -0,0 +1,38 @@
#include <cstdlib>
#include <cstdio>
#include <vector>
#include "hollow_iterator.h"
using std::vector;
using cxxmph::hollow_iterator;
using cxxmph::hollow_const_iterator;
int main(int argc, char** argv) {
vector<int> v;
vector<bool> p;
for (int i = 0; i < 100; ++i) {
v.push_back(i);
p.push_back(i % 2 == 0);
}
auto begin = hollow_iterator<vector<int>>(&v, &p, v.begin());
auto end = hollow_iterator<vector<int>>(&v, &p, v.end());
for (auto it = begin; it != end; ++it) {
if (((*it) % 2) != 0) exit(-1);
}
hollow_const_iterator<vector<int>> const_begin(begin);
hollow_const_iterator<vector<int>> const_end(end);
for (auto it = const_begin; it != const_end; ++it) {
if (((*it) % 2) != 0) exit(-1);
}
vector<int>::iterator vit1 = v.begin();
vector<int>::const_iterator vit2 = v.begin();
if (vit1 != vit2) exit(-1);
auto it1 = hollow_iterator<vector<int>>(&v, &p, v.begin());
auto it2 = hollow_const_iterator<vector<int>>(&v, &p, v.begin());
if (it1 != it2) exit(-1);
hollow_iterator<vector<int>> default_constructed;
default_constructed = hollow_iterator<vector<int>>(&v, &p, v.begin());
}

7
cxxmph/mph_bits.cc Normal file
View File

@ -0,0 +1,7 @@
#include "mph_bits.h"
namespace cxxmph {
// Keep-masks for the four 2-bit slots packed into one byte: vmask[k] has zeros
// in the two bits of slot k (bits 2k and 2k+1) and ones everywhere else, so
// AND-ing a byte with it clears slot k and leaves the other three untouched.
const uint8_t dynamic_2bitset::vmask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
}

67
cxxmph/mph_bits.h Normal file
View File

@ -0,0 +1,67 @@
#ifndef __CXXMPH_MPH_BITS_H__
#define __CXXMPH_MPH_BITS_H__
#include <stdint.h> // for uint32_t and friends
#include <cassert>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <limits>
#include <vector>
namespace cxxmph {
// Resizable array of 2-bit values (0..3), packed four to a byte. Entry i lives
// in byte i/4 at bit offset 2*(i%4).
class dynamic_2bitset {
 public:
  // Creates an empty array whose later growth is filled with zeros.
  dynamic_2bitset() : size_(0), fill_(false) {}

  // Creates `size` entries, each 3 when `fill` is true and 0 otherwise. The
  // fill choice is remembered and reused by resize().
  dynamic_2bitset(uint32_t size, bool fill = false)
      : size_(size), fill_(fill), data_(bytes_for(size), fill_byte(fill)) {
  }

  // Returns entry i (0..3). i must be smaller than size().
  uint8_t operator[](uint32_t i) const { return get(i); }
  uint8_t get(uint32_t i) const {
    assert((i >> 2) < data_.size());
    return (data_[i >> 2] >> ((i & 3) << 1)) & 3;
  }

  // Stores v (which must be 0..3) in entry i and returns v. i must be smaller
  // than size(). The neighbouring entries in the same byte are left untouched.
  uint8_t set(uint32_t i, uint8_t v) {
    assert(v <= 3);
    assert((i >> 2) < data_.size());
    const uint32_t shift = (i & 3) << 1;
    uint8_t& byte = data_[i >> 2];
    // Clear the slot, then drop the new value in.
    byte = static_cast<uint8_t>((byte & ~(3u << shift)) | ((v & 3u) << shift));
    assert(get(i) == v);
    return v;
  }

  // Changes the number of entries. Storage is rounded up to whole bytes so the
  // last entries are addressable even when size is not a multiple of four;
  // newly added bytes carry the fill pattern chosen at construction.
  void resize(uint32_t size) {
    size_ = size;
    data_.resize(bytes_for(size), fill_byte(fill_));
  }

  void swap(dynamic_2bitset& other) {
    std::swap(other.size_, size_);
    std::swap(other.fill_, fill_);
    std::swap(other.data_, data_);
  }

  // Drops all entries; size() is 0 afterwards.
  void clear() {
    data_.clear();
    size_ = 0;
  }
  uint32_t size() const { return size_; }

  // Per-slot keep-masks, defined in mph_bits.cc and used by set_2bit_value().
  static const uint8_t vmask[];

 private:
  uint32_t size_;              // number of 2-bit entries
  bool fill_;                  // true: new storage is all ones, false: zeros
  std::vector<uint8_t> data_;  // packed entries, four per byte

  static uint8_t ones() { return std::numeric_limits<uint8_t>::max(); }
  // Byte pattern used to initialize storage for the given fill choice.
  static uint8_t fill_byte(bool fill) {
    return fill ? ones() : static_cast<uint8_t>(0);
  }
  // Number of bytes needed to hold `size` entries (rounded up, overflow-free).
  static std::vector<uint8_t>::size_type bytes_for(uint32_t size) {
    return (size >> 2) + ((size & 3) ? 1 : 0);
  }
};
// Writes v into 2-bit slot i of the packed byte array d.
// The update is a pure AND, so bits can only be cleared: the slot ends up
// holding (previous slot value & v). Other slots in the byte are unchanged.
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
  const uint32_t slot = i & 3;
  uint8_t& cell = d[i >> 2];
  cell &= ((v << (slot << 1)) | dynamic_2bitset::vmask[slot]);
}
// Returns the value (0..3) stored in 2-bit slot i of the packed byte array d.
// Slot i lives in byte i/4 at bit offset 2*(i%4).
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
  const uint32_t byte_index = i >> 2;
  const uint32_t bit_offset = (i & 3) << 1;
  return (d[byte_index] >> bit_offset) & 3;
}
// Returns the smallest power of two that is >= k; 0 maps to 1.
// Works by smearing the highest set bit of k-1 into every lower position and
// then adding one.
static uint32_t nextpoweroftwo(uint32_t k) {
  if (k == 0) return 1;
  uint32_t smeared = k - 1;
  uint32_t shift = 1;
  while (shift < sizeof(uint32_t) * CHAR_BIT) {
    smeared |= smeared >> shift;
    shift <<= 1;
  }
  return smeared + 1;
}
} // namespace cxxmph
#endif

49
cxxmph/mph_bits_test.cc Normal file
View File

@ -0,0 +1,49 @@
#include <cstdio>
#include <cstdlib>
#include "mph_bits.h"
using cxxmph::dynamic_2bitset;
// Exercises dynamic_2bitset: fill-on-construct, set/get round trips over every
// slot, and size bookkeeping for tiny sizes and swap. Prints a diagnostic and
// exits with -1 on the first mismatch.
int main(int argc, char** argv) {
  // Round trip on a bitset that starts out all ones.
  dynamic_2bitset small(256, true);
  for (int i = 0; i < small.size(); ++i) small.set(i, i % 4);
  for (int i = 0; i < small.size(); ++i) {
    const int expected = i % 4;
    if (small[i] != expected) {
      fprintf(stderr, "wrong bits %d at %d expected %d\n", small[i], i, expected);
      exit(-1);
    }
  }

  int size = 256;
  dynamic_2bitset bits(size, true /* fill with ones */);

  // Freshly filled: every slot must read back as 3.
  for (int i = 0; i < size; ++i) {
    const int expected = 3;
    if (bits[i] != expected) {
      fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, expected);
      exit(-1);
    }
  }

  // Clear every slot and verify.
  for (int i = 0; i < size; ++i) bits.set(i, 0);
  for (int i = 0; i < size; ++i) {
    const int expected = 0;
    if (bits[i] != expected) {
      fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, expected);
      exit(-1);
    }
  }

  // Overwrite with a repeating 0,1,2,3 pattern and verify.
  for (int i = 0; i < size; ++i) bits.set(i, i % 4);
  for (int i = 0; i < size; ++i) {
    const int expected = i % 4;
    if (bits[i] != expected) {
      fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, expected);
      exit(-1);
    }
  }

  // Size bookkeeping for sizes that are not a multiple of four, and for swap.
  dynamic_2bitset size_corner1(1);
  if (size_corner1.size() != 1) exit(-1);
  dynamic_2bitset size_corner2(2);
  if (size_corner2.size() != 2) exit(-1);
  (dynamic_2bitset(4)).swap(size_corner2);
  if (size_corner2.size() != 4) exit(-1);
}

View File

@ -25,6 +25,7 @@
#include <stdint.h> #include <stdint.h>
#include <cassert> #include <cassert>
#include <climits>
#include <cmath> #include <cmath>
#include <unordered_map> // for std::hash #include <unordered_map> // for std::hash
#include <vector> #include <vector>
@ -35,6 +36,7 @@ using std::cerr;
using std::endl; using std::endl;
#include "seeded_hash.h" #include "seeded_hash.h"
#include "mph_bits.h"
#include "trigraph.h" #include "trigraph.h"
namespace cxxmph { namespace cxxmph {
@ -42,13 +44,13 @@ namespace cxxmph {
class MPHIndex { class MPHIndex {
public: public:
MPHIndex(double c = 1.23, uint8_t b = 7) : MPHIndex(double c = 1.23, uint8_t b = 7) :
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), c_(c), b_(b), m_(0), n_(0), k_(0), r_(1),
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
deserialized_(false) { } deserialized_(false) { }
~MPHIndex(); ~MPHIndex();
template <class SeededHashFcn, class ForwardIterator> template <class SeededHashFcn, class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end); bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size);
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn, class Key> // must agree with Reset
// Get a unique identifier for k, in the range [0;size()). If x wasn't part // Get a unique identifier for k, in the range [0;size()). If x wasn't part
// of the input in the last Reset call, returns a random value. // of the input in the last Reset call, returns a random value.
@ -63,6 +65,16 @@ class MPHIndex {
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t minimal_perfect_hash(const Key& x) const; uint32_t minimal_perfect_hash(const Key& x) const;
// Crazy functions. Ignore.
template <class SeededHashFcn> // must agree with Reset
uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const;
template <class SeededHashFcn> // must agree with Reset
uint8_t cuckoo_nest(const uint32_t* h) const;
template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t cuckoo_nest_index(const Key& x, uint32_t* h) const;
template <class SeededHashFcn, class Key> // must agree with Reset
void hash_vector(const Key& x, uint32_t* h) const;
// Serialization for mmap usage - not tested well, ping me if you care. // Serialization for mmap usage - not tested well, ping me if you care.
// Serialized tables are not guaranteed to work across versions or different // Serialized tables are not guaranteed to work across versions or different
// endianness (although they could easily be made to be). // endianness (although they could easily be made to be).
@ -94,6 +106,8 @@ class MPHIndex {
// Partition vertex count, derived from c parameter. // Partition vertex count, derived from c parameter.
uint32_t r_; uint32_t r_;
uint32_t nest_displacement_[3]; // derived from r_
// The array containing the minimal perfect hash function graph. Do not use // The array containing the minimal perfect hash function graph. Do not use
// c++ vector to make mmap based backing easier. // c++ vector to make mmap based backing easier.
const uint8_t* g_; const uint8_t* g_;
@ -108,26 +122,26 @@ class MPHIndex {
bool deserialized_; bool deserialized_;
static const uint8_t valuemask[]; static const uint8_t valuemask[];
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
}
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
}
}; };
// Template method needs to go in the header file. // Template method needs to go in the header file.
template <class SeededHashFcn, class ForwardIterator> template <class SeededHashFcn, class ForwardIterator>
bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { bool MPHIndex::Reset(
ForwardIterator begin, ForwardIterator end, uint32_t size) {
if (end == begin) { if (end == begin) {
clear(); clear();
return true; return true;
} }
m_ = end - begin; m_ = size;
r_ = static_cast<uint32_t>(ceil((c_*m_)/3)); r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
if ((r_ % 2) == 0) r_ += 1; if ((r_ % 2) == 0) r_ += 1;
nest_displacement_[0] = 0;
nest_displacement_[1] = r_;
nest_displacement_[2] = (r_ << 1);
// This can be used to speed mods, but increases occupation too much.
// Needs to try http://gmplib.org/manual/Integer-Exponentiation.html instead
// r_ = nextpoweroftwo(r_);
n_ = 3*r_; n_ = 3*r_;
k_ = 1U << b_; k_ = 1U << b_;
@ -173,21 +187,44 @@ bool MPHIndex::Mapping(
return false; return false;
} }
// Maps hash word h[nest] to a vertex index: the word is reduced modulo r_ and
// shifted by that nest's displacement.
template <class SeededHashFcn>
uint32_t MPHIndex::cuckoo_hash(const uint32_t* h, uint8_t nest) const {
  const uint32_t offset_in_nest = h[nest] % r_;
  return offset_in_nest + nest_displacement_[nest];
}
// Computes the hash words for `key` and stores them through `h`.
// Delegates to SeededHashFcn::hash64, seeded with hash_seed_[0].
// NOTE(review): perfect_hash hands a uint32_t[4] to the same hash64 call, so
// `h` presumably needs room for four words; confirm against hash64.
template <class SeededHashFcn, class Key>
void MPHIndex::hash_vector(const Key& key, uint32_t* h) const {
SeededHashFcn().hash64(key, hash_seed_[0], h);
}
// Picks which of the three hash words (0, 1 or 2) selects the vertex for the
// key hashed into h: the 2-bit g_ values at the three candidate vertices are
// summed and reduced modulo 3. Returns 0 when g_ is empty.
template <class SeededHashFcn> // must agree with Reset
uint8_t MPHIndex::cuckoo_nest(const uint32_t* h) const {
  if (!g_size_) return 0;
  uint32_t g_sum = 0;
  for (int nest = 0; nest < 3; ++nest) {
    const uint32_t vertex = (h[nest] % r_) + nest_displacement_[nest];
    assert((vertex >> 2) <g_size_);
    g_sum += get_2bit_value(g_, vertex);
  }
  return g_sum % 3;
}
template <class SeededHashFcn, class Key> template <class SeededHashFcn, class Key>
uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t MPHIndex::perfect_hash(const Key& key) const {
uint32_t h[4]; uint32_t h[4];
SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast<uint32_t*>(&h)); if (!g_size_) return 0;
SeededHashFcn().hash64(key, hash_seed_[0], h);
// for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
assert(r_); h[0] = (h[0] % r_) + nest_displacement_[0];
h[0] = h[0] % r_; h[1] = (h[1] % r_) + nest_displacement_[1];
h[1] = h[1] % r_ + r_; h[2] = (h[2] % r_) + nest_displacement_[2];
h[2] = h[2] % r_ + (r_ << 1);
assert(g_size_);
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
assert((h[0] >> 2) <g_size_); assert((h[0] >> 2) <g_size_);
assert((h[1] >> 2) <g_size_); assert((h[1] >> 2) <g_size_);
assert((h[2] >> 2) <g_size_); assert((h[2] >> 2) <g_size_);
uint32_t vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3]; uint8_t nest = (get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3;
uint32_t vertex = h[nest];
return vertex; return vertex;
} }
template <class SeededHashFcn, class Key> template <class SeededHashFcn, class Key>
@ -206,12 +243,15 @@ template <class Key, class HashFcn = typename seeded_hash<std::hash<Key>>::hash_
class SimpleMPHIndex : public MPHIndex { class SimpleMPHIndex : public MPHIndex {
public: public:
template <class ForwardIterator> template <class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end) { bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size) {
return MPHIndex::Reset<HashFcn>(begin, end); return MPHIndex::Reset<HashFcn>(begin, end, size);
} }
uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); } uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); }
uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash<HashFcn>(key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash<HashFcn>(key); }
uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash<HashFcn>(key); } uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash<HashFcn>(key); }
uint8_t cuckoo_nest(const uint32_t* h) const { return MPHIndex::cuckoo_nest<HashFcn>(h); }
uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const { return MPHIndex::cuckoo_hash<HashFcn>(h, nest); }
void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector<HashFcn>(key, h); }
}; };
} // namespace cxxmph } // namespace cxxmph

View File

@ -24,7 +24,7 @@ int main(int argc, char** argv) {
keys.push_back("algume"); keys.push_back("algume");
SimpleMPHIndex<string> mph_index; SimpleMPHIndex<string> mph_index;
if (!mph_index.Reset(keys.begin(), keys.end())) { exit(-1); } if (!mph_index.Reset(keys.begin(), keys.end(), keys.size())) { exit(-1); }
vector<int> ids; vector<int> ids;
for (vector<int>::size_type i = 0; i < keys.size(); ++i) { for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
ids.push_back(mph_index.index(keys[i])); ids.push_back(mph_index.index(keys[i]));

View File

@ -10,11 +10,16 @@
// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl // See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl
#include <algorithm> #include <algorithm>
#include <iostream>
#include <limits>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include <utility> // for std::pair #include <utility> // for std::pair
#include "mph_bits.h"
#include "mph_index.h" #include "mph_index.h"
#include "hollow_iterator.h"
namespace cxxmph { namespace cxxmph {
@ -42,8 +47,9 @@ class mph_map {
typedef typename std::vector<value_type>::const_reference const_reference; typedef typename std::vector<value_type>::const_reference const_reference;
typedef typename std::vector<value_type>::size_type size_type; typedef typename std::vector<value_type>::size_type size_type;
typedef typename std::vector<value_type>::difference_type difference_type; typedef typename std::vector<value_type>::difference_type difference_type;
typedef typename std::vector<value_type>::iterator iterator;
typedef typename std::vector<value_type>::const_iterator const_iterator; typedef hollow_iterator<std::vector<value_type>> iterator;
typedef hollow_const_iterator<std::vector<value_type>> const_iterator;
// For making macros simpler. // For making macros simpler.
typedef void void_type; typedef void void_type;
@ -63,16 +69,15 @@ class mph_map {
void erase(iterator pos); void erase(iterator pos);
void erase(const key_type& k); void erase(const key_type& k);
pair<iterator, bool> insert(const value_type& x); pair<iterator, bool> insert(const value_type& x);
iterator find(const key_type& k); iterator find(const key_type& k) { return slow_find(k, index_.perfect_hash(k)); }
const_iterator find(const key_type& k) const; const_iterator find(const key_type& k) const { return slow_find(k, index_.perfect_hash(k)); };
typedef int32_t my_int32_t; // help macros typedef int32_t my_int32_t; // help macros
int32_t index(const key_type& k) const; int32_t index(const key_type& k) const;
data_type& operator[](const key_type &k); data_type& operator[](const key_type &k);
const data_type& operator[](const key_type &k) const; const data_type& operator[](const key_type &k) const;
size_type bucket_count() const { return size(); } size_type bucket_count() const { return index_.perfect_hash_size() + slack_.bucket_count(); }
// FIXME: not sure if this has the semantics I want void rehash(size_type nbuckets /*ignored*/);
void rehash(size_type nbuckets /*ignored*/) { pack(); }
protected: // mimicking STL implementation protected: // mimicking STL implementation
EqualKey equal_; EqualKey equal_;
@ -81,7 +86,7 @@ class mph_map {
template <typename iterator> template <typename iterator>
struct iterator_first : public iterator { struct iterator_first : public iterator {
iterator_first(iterator it) : iterator(it) { } iterator_first(iterator it) : iterator(it) { }
const typename iterator::value_type::first_type& operator*() const { const typename iterator::value_type::first_type& operator*() {
return this->iterator::operator*().first; return this->iterator::operator*().first;
} }
}; };
@ -91,72 +96,173 @@ class mph_map {
return iterator_first<iterator>(it); return iterator_first<iterator>(it);
} }
iterator make_iterator(typename std::vector<value_type>::iterator it) {
return hollow_iterator<std::vector<value_type>>(&values_, &present_, it);
}
const_iterator make_iterator(typename std::vector<value_type>::const_iterator it) const {
return hollow_const_iterator<std::vector<value_type>>(&values_, &present_, it);
}
// Experimental functions, not always faster
iterator fast_find(const key_type& k);
const_iterator fast_find(const key_type& k) const;
iterator slow_find(const key_type& k, uint32_t perfect_hash);
const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const;
static const uint8_t kNestCollision = 3; // biggest 2 bit value
void set_nest_value(const uint32_t* h, uint8_t value) {
auto index = get_nest_index(h);
assert(get_nest_index(h) < nests_.size());
assert(get_nest_index(h) >> 2 < nests_.size());
assert(value < 4);
nests_.set(index, value);
assert(nests_[index] == value);
}
uint32_t get_nest_value(const uint32_t* h) const {
assert(get_nest_index(h) < nests_.size());
return nests_[get_nest_index(h)];
}
uint32_t get_nest_index(const uint32_t* h) const {
assert(nests_.size());
assert(nests_.size() % 2 == 0);
assert((nests_.size() & (nests_.size() - 1)) == 0);
assert((h[3] % nests_.size()) == (h[3] & (nests_.size() - 1)));
return (h[3] & (nests_.size() - 1)); // a mod 2^n == a & 2^n - 1
}
void pack(); void pack();
std::vector<value_type> values_; std::vector<value_type> values_;
std::vector<bool> present_;
dynamic_2bitset nests_;
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_; SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
// TODO(davi) optimize slack to no hold a copy of the key // TODO(davi) optimize slack to hold 128 unique bits from hash64 as key
typedef unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type; typedef unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
slack_type slack_; slack_type slack_;
size_type size_;
mutable uint64_t fast_;
mutable uint64_t fast_taken_;
mutable uint64_t slow_;
mutable uint64_t very_slow_;
}; };
MPH_MAP_TMPL_SPEC MPH_MAP_TMPL_SPEC
bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) {
return lhs.values_ == rhs.values_; return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin());
} }
MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() { MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) {
clear();
pack(); pack();
} }
MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() {
// fprintf(stderr, "Fast taken: %d Fast: %d Slow %d very_slow %d ratio %f\n", fast_taken_, fast_, slow_, very_slow_, fast_*1.0/slow_);
} }
MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
iterator it = find(x.first); auto it = find(x.first);
if (it != end()) return make_pair(it, false); auto it_end = end();
values_.push_back(x); if (it != it_end) return make_pair(it, false);
slack_.insert(make_pair(x.first, values_.size() - 1)); bool should_pack = false;
if (slack_.size() == index_.size() || if (values_.capacity() == values_.size() && values_.size() > 256) {
(slack_.size() >= 256 && index_.size() == 0)) { should_pack = true;
pack();
} }
values_.push_back(x);
present_.push_back(true);
uint32_t h[4];
index_.hash_vector(x.first, h);
set_nest_value(h, kNestCollision);
++size_;
slack_.insert(make_pair(x.first, values_.size() - 1));
if (should_pack) pack();
it = find(x.first); it = find(x.first);
slow_ = 0;
very_slow_ = 0;
fast_ = 0;
fast_taken_ = 0;
return make_pair(it, true); return make_pair(it, true);
} }
MPH_MAP_METHOD_DECL(void_type, pack)() { MPH_MAP_METHOD_DECL(void_type, pack)() {
// fprintf(stderr, "Paki %d values\n", values_.size());
if (values_.empty()) return; if (values_.empty()) return;
slack_type().swap(slack_); assert(std::unordered_set<key_type>(make_iterator_first(begin()), make_iterator_first(end())).size() == size());
bool success = index_.Reset( bool success = index_.Reset(
make_iterator_first(values_.begin()), make_iterator_first(begin()),
make_iterator_first(values_.end())); make_iterator_first(end()), size_);
assert(success); assert(success);
std::vector<value_type> new_values(values_.size()); std::vector<value_type> new_values(index_.perfect_hash_size());
for (const_iterator it = values_.begin(), end = values_.end(); new_values.reserve(new_values.size() * 2);
it != end; ++it) { std::vector<bool> new_present(index_.perfect_hash_size(), false);
size_type id = index_.index(it->first); new_present.reserve(new_present.size() * 2);
auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*10000 + 1);
dynamic_2bitset(new_nests_size, true /* fill with 1s */).swap(nests_);
vector<bool> used_nests(nests_.size());
uint32_t collisions = 0;
for (iterator it = begin(), it_end = end(); it != it_end; ++it) {
size_type id = index_.perfect_hash(it->first);
assert(id < new_values.size()); assert(id < new_values.size());
new_values[id] = *it; new_values[id] = *it;
new_present[id] = true;
uint32_t h[4];
index_.hash_vector(it->first, h);
// fprintf(stderr, "Nest index: %d\n", get_nest_index(h));
assert(used_nests.size() > get_nest_index(h));
if (used_nests[get_nest_index(h)]) {
set_nest_value(h, kNestCollision);
assert(get_nest_value(h) == kNestCollision);
// fprintf(stderr, "Collision at nest index %d among %d positions\n", get_nest_index(h), nests_.size());
++collisions;
} else {
set_nest_value(h, index_.cuckoo_nest(h));
assert(get_nest_value(h) == index_.cuckoo_nest(h));
assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, get_nest_value(h)));
used_nests[get_nest_index(h)] = true;
}
} }
// fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size());
values_.swap(new_values); values_.swap(new_values);
present_.swap(new_present);
slack_type().swap(slack_);
int32_t fast = 0;
int32_t slow= 0;
for (iterator it = begin(), it_end = end(); it != it_end; ++it) {
uint32_t h[4];
index_.hash_vector(it->first, h);
if (get_nest_value(h) == kNestCollision) ++slow;
else {
++fast;
auto cit = values_.begin() + index_.cuckoo_hash(h, get_nest_value(h));
assert(index_.perfect_hash(it->first) == cit - values_.begin());
assert(equal_(it->first, cit->first));
}
}
// fprintf(stderr, "Predicted fast: %d slow %d\n", fast, slow);
} }
MPH_MAP_METHOD_DECL(iterator, begin)() { return values_.begin(); } MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); }
MPH_MAP_METHOD_DECL(iterator, end)() { return values_.end(); } MPH_MAP_METHOD_DECL(iterator, end)() { return make_iterator(values_.end()); }
MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return make_iterator(values_.begin()); }
MPH_MAP_METHOD_DECL(const_iterator, end)() const { return values_.end(); } MPH_MAP_METHOD_DECL(const_iterator, end)() const { return make_iterator(values_.end()); }
MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } MPH_MAP_METHOD_DECL(bool_type, empty)() const { return size_ == 0; }
MPH_MAP_METHOD_DECL(size_type, size)() const { return values_.size(); } MPH_MAP_METHOD_DECL(size_type, size)() const { return size_; }
MPH_MAP_METHOD_DECL(void_type, clear)() { MPH_MAP_METHOD_DECL(void_type, clear)() {
values_.clear(); values_.clear();
present_.clear();
slack_.clear(); slack_.clear();
index_.clear(); index_.clear();
dynamic_2bitset(8, true /* fill with 1s */).swap(nests_);
size_ = 0;
} }
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
values_.erase(pos); present_[pos - begin] = false;
pack(); uint32_t h[4];
index_.hash_vector(pos->first, &h);
nests_[get_nest_index(h)] = kNestCollision;
*pos = value_type();
--size_;
} }
MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) {
iterator it = find(k); iterator it = find(k);
@ -164,36 +270,88 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) {
erase(it); erase(it);
} }
MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { MPH_MAP_METHOD_DECL(const_iterator, fast_find)(const key_type& k) const {
if (__builtin_expect(!slack_.empty(), 0)) { uint32_t h[4];
typename slack_type::const_iterator it = slack_.find(k); index_.hash_vector(k, h);
if (it != slack_.end()) return values_.begin() + it->second; auto nest = get_nest_value(h);
if (__builtin_expect(nest != kNestCollision, 1)) {
++fast_taken_;
auto vit = values_.begin() + index_.cuckoo_hash(h, nest);
// do not hold for unknown keys
assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first));
if (equal_(k, vit->first)) {
++fast_;
return make_iterator(vit);
}
}
nest = index_.cuckoo_nest(h);
++slow_;
return slow_find(k, index_.cuckoo_hash(h, nest));
}
MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const {
if (__builtin_expect(index_.perfect_hash_size(), 1)) {
if (__builtin_expect(present_[perfect_hash], true)) {
auto vit = values_.begin() + perfect_hash;
if (equal_(k, vit->first)) return make_iterator(vit);
}
}
if (__builtin_expect(!slack_.empty(), 0)) {
++very_slow_;
auto sit = slack_.find(k);
if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second);
} }
if (__builtin_expect(index_.size() == 0, 0)) return end();
const_iterator it = values_.begin() + index_.index(k);
if (__builtin_expect(equal_(k, it->first), 1)) return it;
return end(); return end();
} }
MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { MPH_MAP_METHOD_DECL(iterator, fast_find)(const key_type& k) {
if (!slack_.empty()) { uint32_t h[4];
typename slack_type::const_iterator it = slack_.find(k); index_.hash_vector(k, h);
if (it != slack_.end()) return values_.begin() + it->second; auto nest = get_nest_value(h);
if (__builtin_expect(nest != kNestCollision, 1)) {
++fast_taken_;
auto vit = values_.begin() + index_.cuckoo_hash(h, nest);
assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first));
if (equal_(k, vit->first)) {
++fast_;
return make_iterator(vit);
}
}
nest = index_.cuckoo_nest(h);
++slow_;
return slow_find(k, index_.cuckoo_hash(h, nest));
}
MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) {
if (__builtin_expect(index_.perfect_hash_size(), 1)) {
if (__builtin_expect(present_[perfect_hash], true)) {
auto vit = values_.begin() + perfect_hash;
if (equal_(k, vit->first)) return make_iterator(vit);
}
}
if (__builtin_expect(!slack_.empty(), 0)) {
++very_slow_;
auto sit = slack_.find(k);
if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second);
} }
if (index_.size() == 0) return end();
iterator it = values_.begin() + index_.index(k);
if (equal_(it->first, k)) return it;
return end(); return end();
} }
MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const {
if (index_.size() == 0) return -1; if (index_.size() == 0) return -1;
return index_.index(k); return index_.perfect_hash(k);
} }
MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) {
return insert(make_pair(k, data_type())).first->second; return insert(make_pair(k, data_type())).first->second;
} }
MPH_MAP_METHOD_DECL(void_type, rehash)(size_type nbuckets) {
pack();
vector<value_type>(values_.begin(), values_.end()).swap(values_);
vector<bool>(present_.begin(), present_.end()).swap(present_);
slack_type().swap(slack_);
}
} // namespace cxxmph } // namespace cxxmph

View File

@ -11,21 +11,32 @@ using cxxmph::mph_map;
int main(int argc, char** argv) { int main(int argc, char** argv) {
mph_map<int64_t, int64_t> b; mph_map<int64_t, int64_t> b;
for (int i = 0; i < 100*1000; ++i) { int32_t num_keys = 1000*10;
for (int i = 0; i < num_keys; ++i) {
b.insert(make_pair(i, i)); b.insert(make_pair(i, i));
} }
for (int i = 0; i < 1000*1000; ++i) { b.rehash(b.size());
b.find(i); fprintf(stderr, "Insertion finished\n");
for (int i = 0; i < 1000000; ++i) {
auto it = b.find(i % num_keys);
if (it == b.end()) {
std::cerr << "Failed to find " << i << std::endl;
exit(-1);
}
if (it->first != it->second || it->first != i % num_keys) {
std::cerr << "Found " << it->first << " looking for " << i << std::endl;
exit(-1);
}
} }
/* /*
mph_map<string, int> h; mph_map<string, int> h;
h.insert(std::make_pair("-1",-1)); h.insert(std::make_pair("-1",-1));
mph_map<string, int>::const_iterator it; mph_map<string, int>::const_iterator it;
for (it = h.begin(); it != h.end(); ++it) { for (it = h.begin(); it != h.end(); ++it) {
std::cerr << it->first << " -> " << it->second << std::endl; if (it->second != -1) exit(-1);
} }
std::cerr << "Search -1 gives " << h.find("-1")->second << std::endl; int32_t num_valid = 100;
for (int i = 0; i < 100; ++i) { for (int i = 0; i < num_valid; ++i) {
char buf[10]; char buf[10];
snprintf(buf, 10, "%d", i); snprintf(buf, 10, "%d", i);
h.insert(std::make_pair(buf, i)); h.insert(std::make_pair(buf, i));
@ -34,18 +45,18 @@ int main(int argc, char** argv) {
for (int i = 1000; i > 0; --i) { for (int i = 1000; i > 0; --i) {
char buf[10]; char buf[10];
snprintf(buf, 10, "%d", i - 1); snprintf(buf, 10, "%d", i - 1);
h.find(buf); auto it = h.find(buf);
std::cerr << "Search " << i - 1 << " gives " << h.find(buf)->second << std::endl; if (i < num_valid && it->second != i - 1) exit(-1);
} }
} }
for (int j = 0; j < 100; ++j) { for (int j = 0; j < 100; ++j) {
for (int i = 1000; i > 0; --i) { for (int i = 1000; i > 0; --i) {
char buf[10]; char buf[10];
snprintf(buf, 10, "%d", i*100 - 1); int key = i*100 - 1;
h.find(buf); snprintf(buf, 10, "%d", key);
std::cerr << "Search " << i*100 - 1 << " gives " << h.find(buf)->second << std::endl; auto it = h.find(buf);
if (key < num_valid && it->second != key) exit(-1);
} }
} }
*/ */
} }

View File

@ -9,8 +9,10 @@
#include "MurmurHash3.h" #include "MurmurHash3.h"
#include "stringpiece.h" #include "stringpiece.h"
// From murmur, only used naively to extend 32 bits functions to 64 bits. // From murmur, only used naively to extend 32 bits functions to 128 bits.
uint32_t fmix ( uint32_t h ); uint32_t fmix ( uint32_t h );
// Used for a quick and dirty hash function for integers. Probably a bad idea.
uint64_t fmix ( uint64_t h );
namespace cxxmph { namespace cxxmph {
@ -57,6 +59,18 @@ struct Murmur3StringPiece {
} }
}; };
struct Murmur3Fmix64bitsType {
template <class Key>
uint32_t operator()(const Key& k) const {
return fmix(*reinterpret_cast<const uint64_t*>(&k));
}
template <class Key>
void hash64(const Key& k, uint32_t* out) const {
*reinterpret_cast<uint64_t*>(out) = fmix(k);
*(out + 2) = fmix(*out);
}
};
template <> template <>
struct seeded_hash_function<Murmur3> { struct seeded_hash_function<Murmur3> {
template <class Key> template <class Key>
@ -87,6 +101,20 @@ struct seeded_hash_function<Murmur3StringPiece> {
} }
}; };
template <>
struct seeded_hash_function<Murmur3Fmix64bitsType> {
template <class Key>
uint32_t operator()(const Key& k, uint32_t seed) const {
return fmix(k + seed);
}
template <class Key>
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
*reinterpret_cast<uint64_t*>(out) = fmix(k ^ seed);
*(out + 2) = fmix(*out);
}
};
template <class HashFcn> struct seeded_hash template <class HashFcn> struct seeded_hash
{ typedef seeded_hash_function<HashFcn> hash_function; }; { typedef seeded_hash_function<HashFcn> hash_function; };
// Use Murmur3 instead for all types defined in std::hash, plus // Use Murmur3 instead for all types defined in std::hash, plus