Cleanup for upload.
This commit is contained in:
commit
40c6626d87
@ -1,12 +1,12 @@
|
|||||||
TESTS = $(check_PROGRAMS)
|
TESTS = $(check_PROGRAMS)
|
||||||
check_PROGRAMS = mph_map_test mph_index_test trigraph_test
|
check_PROGRAMS = mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test
|
||||||
noinst_PROGRAMS = bm_index bm_map
|
noinst_PROGRAMS = bm_index bm_map
|
||||||
bin_PROGRAMS = cxxmph
|
bin_PROGRAMS = cxxmph
|
||||||
lib_LTLIBRARIES = libcxxmph.la
|
lib_LTLIBRARIES = libcxxmph.la
|
||||||
libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
|
# Sources and private headers compiled into libcxxmph (header name is trigraph.h).
libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigraph.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc mph_bits.h mph_bits.cc
|
||||||
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
||||||
cxxmph_includedir = $(includedir)/cxxmph/
|
cxxmph_includedir = $(includedir)/cxxmph/
|
||||||
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h
|
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h
|
||||||
|
|
||||||
mph_map_test_LDADD = libcxxmph.la
|
mph_map_test_LDADD = libcxxmph.la
|
||||||
mph_map_test_SOURCES = mph_map_test.cc
|
mph_map_test_SOURCES = mph_map_test.cc
|
||||||
@ -25,3 +25,8 @@ bm_map_SOURCES = bm_common.cc bm_map.cc
|
|||||||
|
|
||||||
cxxmph_LDADD = libcxxmph.la
|
cxxmph_LDADD = libcxxmph.la
|
||||||
cxxmph_SOURCES = cxxmph.cc
|
cxxmph_SOURCES = cxxmph.cc
|
||||||
|
|
||||||
|
hollow_iterator_test_SOURCES = hollow_iterator_test.cc
|
||||||
|
mph_bits_test_SOURCES = mph_bits_test.cc
|
||||||
|
mph_bits_test_LDADD = libcxxmph.la
|
||||||
|
|
||||||
|
335
cxxmph/MurmurHash3.cpp
Normal file
335
cxxmph/MurmurHash3.cpp
Normal file
@ -0,0 +1,335 @@
|
|||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// MurmurHash3 was written by Austin Appleby, and is placed in the public
|
||||||
|
// domain. The author hereby disclaims copyright to this source code.
|
||||||
|
|
||||||
|
// Note - The x86 and x64 versions do _not_ produce the same results, as the
|
||||||
|
// algorithms are optimized for their respective platforms. You can still
|
||||||
|
// compile and run any of them on any platform, but your performance with the
|
||||||
|
// non-native version will be less than optimal.
|
||||||
|
|
||||||
|
#include "MurmurHash3.h"
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Platform-specific functions and macros
|
||||||
|
|
||||||
|
// Microsoft Visual Studio
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
|
||||||
|
#define FORCE_INLINE __forceinline
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#define ROTL32(x,y) _rotl(x,y)
|
||||||
|
#define ROTL64(x,y) _rotl64(x,y)
|
||||||
|
|
||||||
|
#define BIG_CONSTANT(x) (x)
|
||||||
|
|
||||||
|
// Other compilers
|
||||||
|
|
||||||
|
#else // defined(_MSC_VER)
|
||||||
|
|
||||||
|
#define FORCE_INLINE __attribute__((always_inline))
|
||||||
|
|
||||||
|
// Rotates the 32-bit value x left by r bit positions.
// The count is reduced modulo 32 so that r == 0 (or any multiple of 32) is a
// no-op instead of an out-of-range shift by the full word width.
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
  const unsigned s = static_cast<unsigned>(r) & 31u;
  return (x << s) | (x >> ((32u - s) & 31u));
}
|
||||||
|
|
||||||
|
// Rotates the 64-bit value x left by r bit positions.
// The count is reduced modulo 64 so that r == 0 (or any multiple of 64) is a
// no-op instead of an out-of-range shift by the full word width.
inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
  const unsigned s = static_cast<unsigned>(r) & 63u;
  return (x << s) | (x >> ((64u - s) & 63u));
}
|
||||||
|
|
||||||
|
#define ROTL32(x,y) rotl32(x,y)
|
||||||
|
#define ROTL64(x,y) rotl64(x,y)
|
||||||
|
|
||||||
|
#define BIG_CONSTANT(x) (x##LLU)
|
||||||
|
|
||||||
|
#endif // !defined(_MSC_VER)
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Block read - if your platform needs to do endian-swapping or can only
|
||||||
|
// handle aligned reads, do the conversion here
|
||||||
|
|
||||||
|
// Block read hook for 32-bit words: returns the word located i elements away
// from p. Endian swapping or aligned-read workarounds would be added here.
FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
{
  const uint32_t * word = p + i;
  return *word;
}
|
||||||
|
|
||||||
|
// Block read hook for 64-bit words: returns the word located i elements away
// from p. Endian swapping or aligned-read workarounds would be added here.
FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
{
  const uint64_t * word = p + i;
  return *word;
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Finalization mix - force all bits of a hash block to avalanche
|
||||||
|
|
||||||
|
// Finalization mix for a 32-bit hash word: alternates xor-shift and multiply
// steps to force all bits of the block to avalanche.
FORCE_INLINE uint32_t fmix ( uint32_t h )
{
  h = (h ^ (h >> 16)) * 0x85ebca6b;
  h = (h ^ (h >> 13)) * 0xc2b2ae35;

  return h ^ (h >> 16);
}
|
||||||
|
|
||||||
|
//----------
|
||||||
|
|
||||||
|
// Finalization mix for a 64-bit hash word: alternates xor-shift and multiply
// steps to force all bits of the block to avalanche.
FORCE_INLINE uint64_t fmix ( uint64_t k )
{
  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);

  return k ^ (k >> 33);
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
void MurmurHash3_x86_32 ( const void * key, int len,
|
||||||
|
uint32_t seed, void * out )
|
||||||
|
{
|
||||||
|
const uint8_t * data = (const uint8_t*)key;
|
||||||
|
const int nblocks = len / 4;
|
||||||
|
|
||||||
|
uint32_t h1 = seed;
|
||||||
|
|
||||||
|
uint32_t c1 = 0xcc9e2d51;
|
||||||
|
uint32_t c2 = 0x1b873593;
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// body
|
||||||
|
|
||||||
|
const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
|
||||||
|
|
||||||
|
for(int i = -nblocks; i; i++)
|
||||||
|
{
|
||||||
|
uint32_t k1 = getblock(blocks,i);
|
||||||
|
|
||||||
|
k1 *= c1;
|
||||||
|
k1 = ROTL32(k1,15);
|
||||||
|
k1 *= c2;
|
||||||
|
|
||||||
|
h1 ^= k1;
|
||||||
|
h1 = ROTL32(h1,13);
|
||||||
|
h1 = h1*5+0xe6546b64;
|
||||||
|
}
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// tail
|
||||||
|
|
||||||
|
const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
|
||||||
|
|
||||||
|
uint32_t k1 = 0;
|
||||||
|
|
||||||
|
switch(len & 3)
|
||||||
|
{
|
||||||
|
case 3: k1 ^= tail[2] << 16;
|
||||||
|
case 2: k1 ^= tail[1] << 8;
|
||||||
|
case 1: k1 ^= tail[0];
|
||||||
|
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
|
||||||
|
};
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// finalization
|
||||||
|
|
||||||
|
h1 ^= len;
|
||||||
|
|
||||||
|
h1 = fmix(h1);
|
||||||
|
|
||||||
|
*(uint32_t*)out = h1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
void MurmurHash3_x86_128 ( const void * key, const int len,
|
||||||
|
uint32_t seed, void * out )
|
||||||
|
{
|
||||||
|
const uint8_t * data = (const uint8_t*)key;
|
||||||
|
const int nblocks = len / 16;
|
||||||
|
|
||||||
|
uint32_t h1 = seed;
|
||||||
|
uint32_t h2 = seed;
|
||||||
|
uint32_t h3 = seed;
|
||||||
|
uint32_t h4 = seed;
|
||||||
|
|
||||||
|
uint32_t c1 = 0x239b961b;
|
||||||
|
uint32_t c2 = 0xab0e9789;
|
||||||
|
uint32_t c3 = 0x38b34ae5;
|
||||||
|
uint32_t c4 = 0xa1e38b93;
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// body
|
||||||
|
|
||||||
|
const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
|
||||||
|
|
||||||
|
for(int i = -nblocks; i; i++)
|
||||||
|
{
|
||||||
|
uint32_t k1 = getblock(blocks,i*4+0);
|
||||||
|
uint32_t k2 = getblock(blocks,i*4+1);
|
||||||
|
uint32_t k3 = getblock(blocks,i*4+2);
|
||||||
|
uint32_t k4 = getblock(blocks,i*4+3);
|
||||||
|
|
||||||
|
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
|
||||||
|
|
||||||
|
h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
|
||||||
|
|
||||||
|
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
|
||||||
|
|
||||||
|
h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
|
||||||
|
|
||||||
|
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
|
||||||
|
|
||||||
|
h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
|
||||||
|
|
||||||
|
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
|
||||||
|
|
||||||
|
h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
|
||||||
|
}
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// tail
|
||||||
|
|
||||||
|
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
|
||||||
|
|
||||||
|
uint32_t k1 = 0;
|
||||||
|
uint32_t k2 = 0;
|
||||||
|
uint32_t k3 = 0;
|
||||||
|
uint32_t k4 = 0;
|
||||||
|
|
||||||
|
switch(len & 15)
|
||||||
|
{
|
||||||
|
case 15: k4 ^= tail[14] << 16;
|
||||||
|
case 14: k4 ^= tail[13] << 8;
|
||||||
|
case 13: k4 ^= tail[12] << 0;
|
||||||
|
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
|
||||||
|
|
||||||
|
case 12: k3 ^= tail[11] << 24;
|
||||||
|
case 11: k3 ^= tail[10] << 16;
|
||||||
|
case 10: k3 ^= tail[ 9] << 8;
|
||||||
|
case 9: k3 ^= tail[ 8] << 0;
|
||||||
|
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
|
||||||
|
|
||||||
|
case 8: k2 ^= tail[ 7] << 24;
|
||||||
|
case 7: k2 ^= tail[ 6] << 16;
|
||||||
|
case 6: k2 ^= tail[ 5] << 8;
|
||||||
|
case 5: k2 ^= tail[ 4] << 0;
|
||||||
|
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
|
||||||
|
|
||||||
|
case 4: k1 ^= tail[ 3] << 24;
|
||||||
|
case 3: k1 ^= tail[ 2] << 16;
|
||||||
|
case 2: k1 ^= tail[ 1] << 8;
|
||||||
|
case 1: k1 ^= tail[ 0] << 0;
|
||||||
|
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
|
||||||
|
};
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// finalization
|
||||||
|
|
||||||
|
h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
|
||||||
|
|
||||||
|
h1 += h2; h1 += h3; h1 += h4;
|
||||||
|
h2 += h1; h3 += h1; h4 += h1;
|
||||||
|
|
||||||
|
h1 = fmix(h1);
|
||||||
|
h2 = fmix(h2);
|
||||||
|
h3 = fmix(h3);
|
||||||
|
h4 = fmix(h4);
|
||||||
|
|
||||||
|
h1 += h2; h1 += h3; h1 += h4;
|
||||||
|
h2 += h1; h3 += h1; h4 += h1;
|
||||||
|
|
||||||
|
((uint32_t*)out)[0] = h1;
|
||||||
|
((uint32_t*)out)[1] = h2;
|
||||||
|
((uint32_t*)out)[2] = h3;
|
||||||
|
((uint32_t*)out)[3] = h4;
|
||||||
|
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
void MurmurHash3_x64_128 ( const void * key, const int len,
|
||||||
|
const uint32_t seed, void * out )
|
||||||
|
{
|
||||||
|
const uint8_t * data = (const uint8_t*)key;
|
||||||
|
const int nblocks = len / 16;
|
||||||
|
|
||||||
|
uint64_t h1 = seed;
|
||||||
|
uint64_t h2 = seed;
|
||||||
|
|
||||||
|
uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
|
||||||
|
uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// body
|
||||||
|
|
||||||
|
const uint64_t * blocks = (const uint64_t *)(data);
|
||||||
|
|
||||||
|
for(int i = 0; i < nblocks; i++)
|
||||||
|
{
|
||||||
|
uint64_t k1 = getblock(blocks,i*2+0);
|
||||||
|
uint64_t k2 = getblock(blocks,i*2+1);
|
||||||
|
|
||||||
|
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
|
||||||
|
|
||||||
|
h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
|
||||||
|
|
||||||
|
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
|
||||||
|
|
||||||
|
h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
|
||||||
|
}
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// tail
|
||||||
|
|
||||||
|
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
|
||||||
|
|
||||||
|
uint64_t k1 = 0;
|
||||||
|
uint64_t k2 = 0;
|
||||||
|
|
||||||
|
switch(len & 15)
|
||||||
|
{
|
||||||
|
case 15: k2 ^= uint64_t(tail[14]) << 48;
|
||||||
|
case 14: k2 ^= uint64_t(tail[13]) << 40;
|
||||||
|
case 13: k2 ^= uint64_t(tail[12]) << 32;
|
||||||
|
case 12: k2 ^= uint64_t(tail[11]) << 24;
|
||||||
|
case 11: k2 ^= uint64_t(tail[10]) << 16;
|
||||||
|
case 10: k2 ^= uint64_t(tail[ 9]) << 8;
|
||||||
|
case 9: k2 ^= uint64_t(tail[ 8]) << 0;
|
||||||
|
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
|
||||||
|
|
||||||
|
case 8: k1 ^= uint64_t(tail[ 7]) << 56;
|
||||||
|
case 7: k1 ^= uint64_t(tail[ 6]) << 48;
|
||||||
|
case 6: k1 ^= uint64_t(tail[ 5]) << 40;
|
||||||
|
case 5: k1 ^= uint64_t(tail[ 4]) << 32;
|
||||||
|
case 4: k1 ^= uint64_t(tail[ 3]) << 24;
|
||||||
|
case 3: k1 ^= uint64_t(tail[ 2]) << 16;
|
||||||
|
case 2: k1 ^= uint64_t(tail[ 1]) << 8;
|
||||||
|
case 1: k1 ^= uint64_t(tail[ 0]) << 0;
|
||||||
|
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
|
||||||
|
};
|
||||||
|
|
||||||
|
//----------
|
||||||
|
// finalization
|
||||||
|
|
||||||
|
h1 ^= len; h2 ^= len;
|
||||||
|
|
||||||
|
h1 += h2;
|
||||||
|
h2 += h1;
|
||||||
|
|
||||||
|
h1 = fmix(h1);
|
||||||
|
h2 = fmix(h2);
|
||||||
|
|
||||||
|
h1 += h2;
|
||||||
|
h2 += h1;
|
||||||
|
|
||||||
|
((uint64_t*)out)[0] = h1;
|
||||||
|
((uint64_t*)out)[1] = h2;
|
||||||
|
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
|
37
cxxmph/MurmurHash3.h
Normal file
37
cxxmph/MurmurHash3.h
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// MurmurHash3 was written by Austin Appleby, and is placed in the public
|
||||||
|
// domain. The author hereby disclaims copyright to this source code.
|
||||||
|
|
||||||
|
#ifndef _MURMURHASH3_H_
|
||||||
|
#define _MURMURHASH3_H_
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Platform-specific functions and macros
|
||||||
|
|
||||||
|
// Microsoft Visual Studio
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
|
||||||
|
typedef unsigned char uint8_t;
|
||||||
|
typedef unsigned long uint32_t;
|
||||||
|
typedef unsigned __int64 uint64_t;
|
||||||
|
|
||||||
|
// Other compilers
|
||||||
|
|
||||||
|
#else // defined(_MSC_VER)
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#endif // !defined(_MSC_VER)
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
|
||||||
|
|
||||||
|
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
|
||||||
|
|
||||||
|
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#endif // _MURMURHASH3_H_
|
@ -21,7 +21,7 @@ class BM_MPHIndexCreate : public UrlsBenchmark {
|
|||||||
protected:
|
protected:
|
||||||
virtual void Run() {
|
virtual void Run() {
|
||||||
SimpleMPHIndex<StringPiece> index;
|
SimpleMPHIndex<StringPiece> index;
|
||||||
index.Reset(urls_.begin(), urls_.end());
|
index.Reset(urls_.begin(), urls_.end(), urls_.size());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -53,7 +53,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark {
|
|||||||
protected:
|
protected:
|
||||||
virtual bool SetUp () {
|
virtual bool SetUp () {
|
||||||
if (!SearchUrlsBenchmark::SetUp()) return false;
|
if (!SearchUrlsBenchmark::SetUp()) return false;
|
||||||
index_.Reset(urls_.begin(), urls_.end());
|
index_.Reset(urls_.begin(), urls_.end(), urls_.size());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
SimpleMPHIndex<StringPiece> index_;
|
SimpleMPHIndex<StringPiece> index_;
|
||||||
|
@ -13,7 +13,8 @@ namespace cxxmph {
|
|||||||
template<class MapType, class T>
|
template<class MapType, class T>
|
||||||
const T* myfind(const MapType& mymap, const T& k) {
|
const T* myfind(const MapType& mymap, const T& k) {
|
||||||
auto it = mymap.find(k);
|
auto it = mymap.find(k);
|
||||||
if (it == mymap.end()) return NULL;
|
auto end = mymap.end();
|
||||||
|
if (it == end) return NULL;
|
||||||
return &it->second;
|
return &it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -48,6 +49,7 @@ class BM_SearchUrls : public SearchUrlsBenchmark {
|
|||||||
mymap_[*it] = *it;
|
mymap_[*it] = *it;
|
||||||
}
|
}
|
||||||
mymap_.rehash(mymap_.bucket_count());
|
mymap_.rehash(mymap_.bucket_count());
|
||||||
|
fprintf(stderr, "Occupation: %f\n", static_cast<float>(mymap_.size())/mymap_.bucket_count());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
MapType mymap_;
|
MapType mymap_;
|
||||||
@ -56,7 +58,7 @@ class BM_SearchUrls : public SearchUrlsBenchmark {
|
|||||||
template <class MapType>
|
template <class MapType>
|
||||||
class BM_SearchUint64 : public SearchUint64Benchmark {
|
class BM_SearchUint64 : public SearchUint64Benchmark {
|
||||||
public:
|
public:
|
||||||
BM_SearchUint64() : SearchUint64Benchmark(10000, 10*1000*1000) { }
|
BM_SearchUint64() : SearchUint64Benchmark(100000, 10*1000*1000) { }
|
||||||
virtual bool SetUp() {
|
virtual bool SetUp() {
|
||||||
if (!SearchUint64Benchmark::SetUp()) return false;
|
if (!SearchUint64Benchmark::SetUp()) return false;
|
||||||
for (int i = 0; i < values_.size(); ++i) {
|
for (int i = 0; i < values_.size(); ++i) {
|
||||||
@ -93,7 +95,7 @@ int main(int argc, char** argv) {
|
|||||||
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0));
|
||||||
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
Benchmark::Register(new BM_SearchUrls<mph_map<StringPiece, StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
||||||
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
Benchmark::Register(new BM_SearchUrls<unordered_map<StringPiece, StringPiece, Murmur3StringPiece>>("URLS100k", 10*1000 * 1000, 0.9));
|
||||||
Benchmark::Register(new BM_SearchUint64<unordered_map<uint64_t, uint64_t>>);
|
|
||||||
Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>);
|
Benchmark::Register(new BM_SearchUint64<mph_map<uint64_t, uint64_t>>);
|
||||||
|
Benchmark::Register(new BM_SearchUint64<unordered_map<uint64_t, uint64_t>>);
|
||||||
Benchmark::RunAll();
|
Benchmark::RunAll();
|
||||||
}
|
}
|
||||||
|
@ -63,8 +63,8 @@ int main(int argc, char** argv) {
|
|||||||
for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i];
|
for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i];
|
||||||
mph_map<string, string>::const_iterator it = table.begin();
|
mph_map<string, string>::const_iterator it = table.begin();
|
||||||
mph_map<string, string>::const_iterator end = table.end();
|
mph_map<string, string>::const_iterator end = table.end();
|
||||||
for (; it != end; ++it) {
|
for (int i = 0; it != end; ++it, ++i) {
|
||||||
cout << (it - table.begin()) << ": " << it->first
|
cout << i << ": " << it->first
|
||||||
<<" -> " << it->second << endl;
|
<<" -> " << it->second << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
71
cxxmph/hollow_iterator.h
Normal file
71
cxxmph/hollow_iterator.h
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
#ifndef __CXXMPH_HOLLOW_ITERATOR_H__
|
||||||
|
#define __CXXMPH_HOLLOW_ITERATOR_H__
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
|
// Forward iterator over a random-access container that skips "holes":
// positions whose flag in a parallel presence container is false.
//   container_type - underlying container (possibly const-qualified)
//   presence_type  - indexable container of flags, one per container slot
//   iterator_type  - iterator of container_type that is actually advanced
template <typename container_type, typename presence_type, typename iterator_type>
struct hollow_iterator_base
    : public std::iterator<std::forward_iterator_tag,
                           typename container_type::value_type> {
  typedef presence_type presence;
  typedef container_type container;
  typedef iterator_type iterator;
  typedef hollow_iterator_base<container, presence, iterator>& self_reference;
  typedef typename iterator::reference reference;
  typedef typename iterator::pointer pointer;

  // Wraps `it`; when a container is supplied, immediately advances to the
  // first present element at or after `it`. A null container yields a
  // placeholder iterator that must be assigned before being used.
  hollow_iterator_base(container* c, presence* p, iterator it)
      : c_(c), p_(p), it_(it) { if (c_) find_present(); }

  // Pre-increment: moves to the next present element (or to end()) and
  // returns this iterator.
  self_reference operator++() {
    ++it_;
    find_present();
    return *this;
  }
  reference operator*() const { return *it_; }
  pointer operator->() const { return &(*it_); }

  // Comparison only looks at the wrapped position, which lets the const and
  // non-const flavours be compared with each other.
  // TODO find syntax to make this less permissible at compile time
  template <class T>
  bool operator==(const T& rhs) const { return rhs.it_ == this->it_; }
  template <class T>
  bool operator!=(const T& rhs) const { return rhs.it_ != this->it_; }

 public:  // TODO find syntax to make this friend of const iterator
  // Advances it_ until it reaches end() or a slot flagged as present.
  void find_present() {
    while (it_ != c_->end() && !((*p_)[it_-c_->begin()])) ++it_;
  }
  container* c_;
  presence* p_;
  iterator it_;
};
|
||||||
|
|
||||||
|
template <typename container_type>
|
||||||
|
struct hollow_iterator : public hollow_iterator_base<
|
||||||
|
container_type, std::vector<bool>, typename container_type::iterator> {
|
||||||
|
typedef hollow_iterator_base<
|
||||||
|
container_type, std::vector<bool>, typename container_type::iterator> parent_class;
|
||||||
|
hollow_iterator() : parent_class(NULL, NULL, typename container_type::iterator()) { }
|
||||||
|
hollow_iterator(typename parent_class::container* c,
|
||||||
|
typename parent_class::presence* p,
|
||||||
|
typename parent_class::iterator it)
|
||||||
|
: parent_class(c, p, it) { }
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename container_type>
|
||||||
|
struct hollow_const_iterator : public hollow_iterator_base<
|
||||||
|
const container_type, const std::vector<bool>, typename container_type::const_iterator> {
|
||||||
|
typedef hollow_iterator_base<
|
||||||
|
const container_type, const std::vector<bool>, typename container_type::const_iterator> parent_class;
|
||||||
|
typedef hollow_const_iterator<container_type> self_type;
|
||||||
|
typedef hollow_iterator<container_type> non_const_type;
|
||||||
|
hollow_const_iterator(non_const_type rhs) : parent_class(rhs.c_, rhs.p_, typename container_type::const_iterator(rhs.it_)) { }
|
||||||
|
hollow_const_iterator() : parent_class(NULL, NULL, typename container_type::iterator()) { }
|
||||||
|
hollow_const_iterator(const typename parent_class::container* c,
|
||||||
|
const typename parent_class::presence* p,
|
||||||
|
typename parent_class::iterator it)
|
||||||
|
: parent_class(c, p, it) { }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
#endif // __CXXMPH_HOLLOW_ITERATOR_H__
|
38
cxxmph/hollow_iterator_test.cc
Normal file
38
cxxmph/hollow_iterator_test.cc
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
#include <cstdlib>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "hollow_iterator.h"
|
||||||
|
|
||||||
|
using std::vector;
|
||||||
|
using cxxmph::hollow_iterator;
|
||||||
|
using cxxmph::hollow_const_iterator;
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
vector<int> v;
|
||||||
|
vector<bool> p;
|
||||||
|
for (int i = 0; i < 100; ++i) {
|
||||||
|
v.push_back(i);
|
||||||
|
p.push_back(i % 2 == 0);
|
||||||
|
}
|
||||||
|
auto begin = hollow_iterator<vector<int>>(&v, &p, v.begin());
|
||||||
|
auto end = hollow_iterator<vector<int>>(&v, &p, v.end());
|
||||||
|
for (auto it = begin; it != end; ++it) {
|
||||||
|
if (((*it) % 2) != 0) exit(-1);
|
||||||
|
}
|
||||||
|
hollow_const_iterator<vector<int>> const_begin(begin);
|
||||||
|
hollow_const_iterator<vector<int>> const_end(end);
|
||||||
|
for (auto it = const_begin; it != const_end; ++it) {
|
||||||
|
if (((*it) % 2) != 0) exit(-1);
|
||||||
|
}
|
||||||
|
vector<int>::iterator vit1 = v.begin();
|
||||||
|
vector<int>::const_iterator vit2 = v.begin();
|
||||||
|
if (vit1 != vit2) exit(-1);
|
||||||
|
auto it1 = hollow_iterator<vector<int>>(&v, &p, v.begin());
|
||||||
|
auto it2 = hollow_const_iterator<vector<int>>(&v, &p, v.begin());
|
||||||
|
if (it1 != it2) exit(-1);
|
||||||
|
|
||||||
|
hollow_iterator<vector<int>> default_constructed;
|
||||||
|
default_constructed = hollow_iterator<vector<int>>(&v, &p, v.begin());
|
||||||
|
}
|
||||||
|
|
7
cxxmph/mph_bits.cc
Normal file
7
cxxmph/mph_bits.cc
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
#include "mph_bits.h"
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
|
// Per-slot keep masks, indexed by (i & 3): entry k has every bit set except
// the two bits of slot k inside a byte
// (0xfc == ~0x03, 0xf3 == ~0x0c, 0xcf == ~0x30, 0x3f == ~0xc0).
const uint8_t dynamic_2bitset::vmask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
|
||||||
|
|
||||||
|
}
|
67
cxxmph/mph_bits.h
Normal file
67
cxxmph/mph_bits.h
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
#ifndef __CXXMPH_MPH_BITS_H__
|
||||||
|
#define __CXXMPH_MPH_BITS_H__
|
||||||
|
|
||||||
|
#include <stdint.h> // for uint32_t and friends
|
||||||
|
#include <cassert>
|
||||||
|
#include <climits>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <limits>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
|
// Dynamically sized array of 2-bit values (each in [0;3]) packed four to a
// byte: slot i lives in byte i / 4 at bit offset 2 * (i % 4).
class dynamic_2bitset {
 public:
  // Empty bitset; later growth is filled with zeros.
  dynamic_2bitset() : size_(0), fill_(false) {}
  // Bitset of `size` slots, all initialised to 3 when `fill` is true and to
  // 0 otherwise. The same fill is used for slots added by resize().
  dynamic_2bitset(uint32_t size, bool fill = false)
      : size_(size), fill_(fill), data_(bytes_for(size), fill_byte(fill)) {
  }

  const uint8_t operator[](uint32_t i) const { return get(i); }
  // Returns the value stored in slot i (i must be < size()).
  uint8_t get(uint32_t i) const {
    return (data_[(i >> 2)] >> ((i & 3) << 1)) & 3;
  }
  // Stores v (which must be <= 3) in slot i and returns the stored value.
  uint8_t set(uint32_t i, uint8_t v) {
    assert(v <= 3);  // checked before writing so neighbours are never touched
    const uint32_t shift = (i & 3) << 1;
    uint8_t& cell = data_[(i >> 2)];
    // Clear the slot (same masks as vmask[i & 3]) and drop the new value in.
    cell = static_cast<uint8_t>((cell & ~(3u << shift)) |
                                (static_cast<uint32_t>(v & 3) << shift));
    assert(get(i) == v);
    return v;
  }
  // Changes the number of slots; new slots take the fill value given at
  // construction time.
  void resize(uint32_t size) {
    // Round up: a trailing partial group of slots still needs its own byte.
    data_.resize(bytes_for(size), fill_byte(fill_));
    // When shrinking, slots dropped from the last kept byte are reset so a
    // later grow exposes fill values rather than stale data.
    const uint8_t fill_value = fill_ ? 3 : 0;
    for (uint32_t i = size; (i & 3) != 0 && i < size_; ++i) set(i, fill_value);
    size_ = size;
  }
  void swap(dynamic_2bitset& other) {
    std::swap(other.size_, size_);
    std::swap(other.fill_, fill_);
    data_.swap(other.data_);
  }
  // Removes every slot; size() becomes 0.
  void clear() {
    data_.clear();
    size_ = 0;
  }

  uint32_t size() const { return size_; }
  // Per-slot keep masks; defined in mph_bits.cc and also used by the free
  // set_2bit_value() helper.
  static const uint8_t vmask[];
 private:
  uint32_t size_;
  bool fill_;
  std::vector<uint8_t> data_;
  static uint8_t ones() { return std::numeric_limits<uint8_t>::max(); }
  // Byte pattern representing four slots of the fill value.
  static uint8_t fill_byte(bool fill) { return fill ? ones() : 0; }
  // Number of bytes needed to hold `slots` 2-bit values.
  static std::vector<uint8_t>::size_type bytes_for(uint32_t slots) {
    return (static_cast<std::vector<uint8_t>::size_type>(slots) + 3) / 4;
  }
};
|
||||||
|
|
||||||
|
// Writes v into 2-bit slot i of the packed array d using only a bitwise AND:
// the other slots of the byte are preserved by the keep mask, and the slot
// itself becomes (old value & v), so it ends up equal to v only if it
// previously held 3 (both bits set).
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
  const uint32_t slot = i & 3;
  const uint32_t byte_index = i >> 2;
  d[byte_index] &= ((v << (slot << 1)) | dynamic_2bitset::vmask[slot]);
}
|
||||||
|
// Reads 2-bit slot i from the packed array d: slot i lives in byte i / 4 at
// bit offset 2 * (i % 4).
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
  const uint32_t shift = (i & 3) << 1;
  const uint8_t packed = d[i >> 2];
  return (packed >> shift) & 3;
}
|
||||||
|
// Returns the smallest power of two that is >= k, with 0 mapping to 1.
// Works by smearing the highest set bit of k - 1 into every lower position
// and then adding one.
static uint32_t nextpoweroftwo(uint32_t k) {
  if (k == 0) return 1;
  uint32_t smeared = k - 1;
  smeared |= smeared >> 1;
  smeared |= smeared >> 2;
  smeared |= smeared >> 4;
  smeared |= smeared >> 8;
  smeared |= smeared >> 16;
  return smeared + 1;
}
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
#endif
|
49
cxxmph/mph_bits_test.cc
Normal file
49
cxxmph/mph_bits_test.cc
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
|
||||||
|
#include "mph_bits.h"
|
||||||
|
|
||||||
|
using cxxmph::dynamic_2bitset;
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
dynamic_2bitset small(256, true);
|
||||||
|
for (int i = 0; i < small.size(); ++i) small.set(i, i % 4);
|
||||||
|
for (int i = 0; i < small.size(); ++i) {
|
||||||
|
if (small[i] != i % 4) {
|
||||||
|
fprintf(stderr, "wrong bits %d at %d expected %d\n", small[i], i, i % 4);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int size = 256;
|
||||||
|
dynamic_2bitset bits(size, true /* fill with ones */);
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
if (bits[i] != 3) {
|
||||||
|
fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, 3);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < size; ++i) bits.set(i, 0);
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
if (bits[i] != 0) {
|
||||||
|
fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, 0);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < size; ++i) bits.set(i, i % 4);
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
if (bits[i] != i % 4) {
|
||||||
|
fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, i % 4);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dynamic_2bitset size_corner1(1);
|
||||||
|
if (size_corner1.size() != 1) exit(-1);
|
||||||
|
dynamic_2bitset size_corner2(2);
|
||||||
|
if (size_corner2.size() != 2) exit(-1);
|
||||||
|
(dynamic_2bitset(4)).swap(size_corner2);
|
||||||
|
if (size_corner2.size() != 4) exit(-1);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -25,6 +25,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <climits>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <unordered_map> // for std::hash
|
#include <unordered_map> // for std::hash
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -35,6 +36,7 @@ using std::cerr;
|
|||||||
using std::endl;
|
using std::endl;
|
||||||
|
|
||||||
#include "seeded_hash.h"
|
#include "seeded_hash.h"
|
||||||
|
#include "mph_bits.h"
|
||||||
#include "trigraph.h"
|
#include "trigraph.h"
|
||||||
|
|
||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
@ -42,13 +44,13 @@ namespace cxxmph {
|
|||||||
class MPHIndex {
|
class MPHIndex {
|
||||||
public:
|
public:
|
||||||
MPHIndex(double c = 1.23, uint8_t b = 7) :
|
MPHIndex(double c = 1.23, uint8_t b = 7) :
|
||||||
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0),
|
c_(c), b_(b), m_(0), n_(0), k_(0), r_(1),
|
||||||
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
|
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
|
||||||
deserialized_(false) { }
|
deserialized_(false) { }
|
||||||
~MPHIndex();
|
~MPHIndex();
|
||||||
|
|
||||||
template <class SeededHashFcn, class ForwardIterator>
|
template <class SeededHashFcn, class ForwardIterator>
|
||||||
bool Reset(ForwardIterator begin, ForwardIterator end);
|
bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size);
|
||||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||||
// Get a unique identifier for k, in the range [0;size()). If x wasn't part
|
// Get a unique identifier for k, in the range [0;size()). If x wasn't part
|
||||||
// of the input in the last Reset call, returns a random value.
|
// of the input in the last Reset call, returns a random value.
|
||||||
@ -63,6 +65,16 @@ class MPHIndex {
|
|||||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||||
uint32_t minimal_perfect_hash(const Key& x) const;
|
uint32_t minimal_perfect_hash(const Key& x) const;
|
||||||
|
|
||||||
|
// Crazy functions. Ignore.
|
||||||
|
template <class SeededHashFcn> // must agree with Reset
|
||||||
|
uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const;
|
||||||
|
template <class SeededHashFcn> // must agree with Reset
|
||||||
|
uint8_t cuckoo_nest(const uint32_t* h) const;
|
||||||
|
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||||
|
uint32_t cuckoo_nest_index(const Key& x, uint32_t* h) const;
|
||||||
|
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||||
|
void hash_vector(const Key& x, uint32_t* h) const;
|
||||||
|
|
||||||
// Serialization for mmap usage - not tested well, ping me if you care.
|
// Serialization for mmap usage - not tested well, ping me if you care.
|
||||||
// Serialized tables are not guaranteed to work across versions or different
|
// Serialized tables are not guaranteed to work across versions or different
|
||||||
// endianness (although they could easily be made to be).
|
// endianness (although they could easily be made to be).
|
||||||
@ -94,6 +106,8 @@ class MPHIndex {
|
|||||||
|
|
||||||
// Partition vertex count, derived from c parameter.
|
// Partition vertex count, derived from c parameter.
|
||||||
uint32_t r_;
|
uint32_t r_;
|
||||||
|
uint32_t nest_displacement_[3]; // derived from r_
|
||||||
|
|
||||||
// The array containing the minimal perfect hash function graph. Do not use
|
// The array containing the minimal perfect hash function graph. Do not use
|
||||||
// c++ vector to make mmap based backing easier.
|
// c++ vector to make mmap based backing easier.
|
||||||
const uint8_t* g_;
|
const uint8_t* g_;
|
||||||
@ -108,26 +122,26 @@ class MPHIndex {
|
|||||||
bool deserialized_;
|
bool deserialized_;
|
||||||
|
|
||||||
static const uint8_t valuemask[];
|
static const uint8_t valuemask[];
|
||||||
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
|
|
||||||
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
|
|
||||||
}
|
|
||||||
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
|
|
||||||
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Template method needs to go in the header file.
|
// Template method needs to go in the header file.
|
||||||
template <class SeededHashFcn, class ForwardIterator>
|
template <class SeededHashFcn, class ForwardIterator>
|
||||||
bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) {
|
bool MPHIndex::Reset(
|
||||||
|
ForwardIterator begin, ForwardIterator end, uint32_t size) {
|
||||||
if (end == begin) {
|
if (end == begin) {
|
||||||
clear();
|
clear();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
m_ = end - begin;
|
m_ = size;
|
||||||
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
|
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
|
||||||
if ((r_ % 2) == 0) r_ += 1;
|
if ((r_ % 2) == 0) r_ += 1;
|
||||||
|
nest_displacement_[0] = 0;
|
||||||
|
nest_displacement_[1] = r_;
|
||||||
|
nest_displacement_[2] = (r_ << 1);
|
||||||
|
// This can be used to speed mods, but increases occupation too much.
|
||||||
|
// Needs to try http://gmplib.org/manual/Integer-Exponentiation.html instead
|
||||||
|
// r_ = nextpoweroftwo(r_);
|
||||||
|
|
||||||
n_ = 3*r_;
|
n_ = 3*r_;
|
||||||
k_ = 1U << b_;
|
k_ = 1U << b_;
|
||||||
|
|
||||||
@ -173,21 +187,44 @@ bool MPHIndex::Mapping(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class SeededHashFcn>
|
||||||
|
uint32_t MPHIndex::cuckoo_hash(const uint32_t* h, uint8_t nest) const {
|
||||||
|
return (h[nest] % r_) + nest_displacement_[nest];
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class SeededHashFcn, class Key>
|
||||||
|
void MPHIndex::hash_vector(const Key& key, uint32_t* h) const {
|
||||||
|
SeededHashFcn().hash64(key, hash_seed_[0], h);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class SeededHashFcn> // must agree with Reset
|
||||||
|
uint8_t MPHIndex::cuckoo_nest(const uint32_t* h) const {
|
||||||
|
uint32_t x[4];
|
||||||
|
if (!g_size_) return 0;
|
||||||
|
x[0] = (h[0] % r_) + nest_displacement_[0];
|
||||||
|
x[1] = (h[1] % r_) + nest_displacement_[1];
|
||||||
|
x[2] = (h[2] % r_) + nest_displacement_[2];
|
||||||
|
assert((x[0] >> 2) <g_size_);
|
||||||
|
assert((x[1] >> 2) <g_size_);
|
||||||
|
assert((x[2] >> 2) <g_size_);
|
||||||
|
return (get_2bit_value(g_, x[0]) + get_2bit_value(g_, x[1]) + get_2bit_value(g_, x[2])) % 3;
|
||||||
|
}
|
||||||
|
|
||||||
template <class SeededHashFcn, class Key>
|
template <class SeededHashFcn, class Key>
|
||||||
uint32_t MPHIndex::perfect_hash(const Key& key) const {
|
uint32_t MPHIndex::perfect_hash(const Key& key) const {
|
||||||
uint32_t h[4];
|
uint32_t h[4];
|
||||||
SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast<uint32_t*>(&h));
|
if (!g_size_) return 0;
|
||||||
|
SeededHashFcn().hash64(key, hash_seed_[0], h);
|
||||||
// for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
|
// for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
|
||||||
assert(r_);
|
h[0] = (h[0] % r_) + nest_displacement_[0];
|
||||||
h[0] = h[0] % r_;
|
h[1] = (h[1] % r_) + nest_displacement_[1];
|
||||||
h[1] = h[1] % r_ + r_;
|
h[2] = (h[2] % r_) + nest_displacement_[2];
|
||||||
h[2] = h[2] % r_ + (r_ << 1);
|
|
||||||
assert(g_size_);
|
|
||||||
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
|
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
|
||||||
assert((h[0] >> 2) <g_size_);
|
assert((h[0] >> 2) <g_size_);
|
||||||
assert((h[1] >> 2) <g_size_);
|
assert((h[1] >> 2) <g_size_);
|
||||||
assert((h[2] >> 2) <g_size_);
|
assert((h[2] >> 2) <g_size_);
|
||||||
uint32_t vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3];
|
uint8_t nest = (get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3;
|
||||||
|
uint32_t vertex = h[nest];
|
||||||
return vertex;
|
return vertex;
|
||||||
}
|
}
|
||||||
template <class SeededHashFcn, class Key>
|
template <class SeededHashFcn, class Key>
|
||||||
@ -206,12 +243,15 @@ template <class Key, class HashFcn = typename seeded_hash<std::hash<Key>>::hash_
|
|||||||
class SimpleMPHIndex : public MPHIndex {
|
class SimpleMPHIndex : public MPHIndex {
|
||||||
public:
|
public:
|
||||||
template <class ForwardIterator>
|
template <class ForwardIterator>
|
||||||
bool Reset(ForwardIterator begin, ForwardIterator end) {
|
bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size) {
|
||||||
return MPHIndex::Reset<HashFcn>(begin, end);
|
return MPHIndex::Reset<HashFcn>(begin, end, size);
|
||||||
}
|
}
|
||||||
uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); }
|
uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); }
|
||||||
uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash<HashFcn>(key); }
|
uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash<HashFcn>(key); }
|
||||||
uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash<HashFcn>(key); }
|
uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash<HashFcn>(key); }
|
||||||
|
uint8_t cuckoo_nest(const uint32_t* h) const { return MPHIndex::cuckoo_nest<HashFcn>(h); }
|
||||||
|
uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const { return MPHIndex::cuckoo_hash<HashFcn>(h, nest); }
|
||||||
|
void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector<HashFcn>(key, h); }
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace cxxmph
|
} // namespace cxxmph
|
||||||
|
@ -24,7 +24,7 @@ int main(int argc, char** argv) {
|
|||||||
keys.push_back("algume");
|
keys.push_back("algume");
|
||||||
|
|
||||||
SimpleMPHIndex<string> mph_index;
|
SimpleMPHIndex<string> mph_index;
|
||||||
if (!mph_index.Reset(keys.begin(), keys.end())) { exit(-1); }
|
if (!mph_index.Reset(keys.begin(), keys.end(), keys.size())) { exit(-1); }
|
||||||
vector<int> ids;
|
vector<int> ids;
|
||||||
for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
|
for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
|
||||||
ids.push_back(mph_index.index(keys[i]));
|
ids.push_back(mph_index.index(keys[i]));
|
||||||
|
254
cxxmph/mph_map.h
254
cxxmph/mph_map.h
@ -10,11 +10,16 @@
|
|||||||
// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl
|
// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <limits>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
#include <unordered_set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <utility> // for std::pair
|
#include <utility> // for std::pair
|
||||||
|
|
||||||
|
#include "mph_bits.h"
|
||||||
#include "mph_index.h"
|
#include "mph_index.h"
|
||||||
|
#include "hollow_iterator.h"
|
||||||
|
|
||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
|
|
||||||
@ -42,8 +47,9 @@ class mph_map {
|
|||||||
typedef typename std::vector<value_type>::const_reference const_reference;
|
typedef typename std::vector<value_type>::const_reference const_reference;
|
||||||
typedef typename std::vector<value_type>::size_type size_type;
|
typedef typename std::vector<value_type>::size_type size_type;
|
||||||
typedef typename std::vector<value_type>::difference_type difference_type;
|
typedef typename std::vector<value_type>::difference_type difference_type;
|
||||||
typedef typename std::vector<value_type>::iterator iterator;
|
|
||||||
typedef typename std::vector<value_type>::const_iterator const_iterator;
|
typedef hollow_iterator<std::vector<value_type>> iterator;
|
||||||
|
typedef hollow_const_iterator<std::vector<value_type>> const_iterator;
|
||||||
|
|
||||||
// For making macros simpler.
|
// For making macros simpler.
|
||||||
typedef void void_type;
|
typedef void void_type;
|
||||||
@ -63,16 +69,15 @@ class mph_map {
|
|||||||
void erase(iterator pos);
|
void erase(iterator pos);
|
||||||
void erase(const key_type& k);
|
void erase(const key_type& k);
|
||||||
pair<iterator, bool> insert(const value_type& x);
|
pair<iterator, bool> insert(const value_type& x);
|
||||||
iterator find(const key_type& k);
|
iterator find(const key_type& k) { return slow_find(k, index_.perfect_hash(k)); }
|
||||||
const_iterator find(const key_type& k) const;
|
const_iterator find(const key_type& k) const { return slow_find(k, index_.perfect_hash(k)); };
|
||||||
typedef int32_t my_int32_t; // help macros
|
typedef int32_t my_int32_t; // help macros
|
||||||
int32_t index(const key_type& k) const;
|
int32_t index(const key_type& k) const;
|
||||||
data_type& operator[](const key_type &k);
|
data_type& operator[](const key_type &k);
|
||||||
const data_type& operator[](const key_type &k) const;
|
const data_type& operator[](const key_type &k) const;
|
||||||
|
|
||||||
size_type bucket_count() const { return size(); }
|
size_type bucket_count() const { return index_.perfect_hash_size() + slack_.bucket_count(); }
|
||||||
// FIXME: not sure if this has the semantics I want
|
void rehash(size_type nbuckets /*ignored*/);
|
||||||
void rehash(size_type nbuckets /*ignored*/) { pack(); }
|
|
||||||
|
|
||||||
protected: // mimicking STL implementation
|
protected: // mimicking STL implementation
|
||||||
EqualKey equal_;
|
EqualKey equal_;
|
||||||
@ -81,7 +86,7 @@ class mph_map {
|
|||||||
template <typename iterator>
|
template <typename iterator>
|
||||||
struct iterator_first : public iterator {
|
struct iterator_first : public iterator {
|
||||||
iterator_first(iterator it) : iterator(it) { }
|
iterator_first(iterator it) : iterator(it) { }
|
||||||
const typename iterator::value_type::first_type& operator*() const {
|
const typename iterator::value_type::first_type& operator*() {
|
||||||
return this->iterator::operator*().first;
|
return this->iterator::operator*().first;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -91,72 +96,173 @@ class mph_map {
|
|||||||
return iterator_first<iterator>(it);
|
return iterator_first<iterator>(it);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
iterator make_iterator(typename std::vector<value_type>::iterator it) {
|
||||||
|
return hollow_iterator<std::vector<value_type>>(&values_, &present_, it);
|
||||||
|
}
|
||||||
|
const_iterator make_iterator(typename std::vector<value_type>::const_iterator it) const {
|
||||||
|
return hollow_const_iterator<std::vector<value_type>>(&values_, &present_, it);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Experimental functions, not always faster
|
||||||
|
iterator fast_find(const key_type& k);
|
||||||
|
const_iterator fast_find(const key_type& k) const;
|
||||||
|
iterator slow_find(const key_type& k, uint32_t perfect_hash);
|
||||||
|
const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const;
|
||||||
|
static const uint8_t kNestCollision = 3; // biggest 2 bit value
|
||||||
|
void set_nest_value(const uint32_t* h, uint8_t value) {
|
||||||
|
auto index = get_nest_index(h);
|
||||||
|
assert(get_nest_index(h) < nests_.size());
|
||||||
|
assert(get_nest_index(h) >> 2 < nests_.size());
|
||||||
|
assert(value < 4);
|
||||||
|
nests_.set(index, value);
|
||||||
|
assert(nests_[index] == value);
|
||||||
|
}
|
||||||
|
uint32_t get_nest_value(const uint32_t* h) const {
|
||||||
|
assert(get_nest_index(h) < nests_.size());
|
||||||
|
return nests_[get_nest_index(h)];
|
||||||
|
}
|
||||||
|
uint32_t get_nest_index(const uint32_t* h) const {
|
||||||
|
assert(nests_.size());
|
||||||
|
assert(nests_.size() % 2 == 0);
|
||||||
|
assert((nests_.size() & (nests_.size() - 1)) == 0);
|
||||||
|
assert((h[3] % nests_.size()) == (h[3] & (nests_.size() - 1)));
|
||||||
|
return (h[3] & (nests_.size() - 1)); // a mod 2^n == a & 2^n - 1
|
||||||
|
}
|
||||||
|
|
||||||
void pack();
|
void pack();
|
||||||
std::vector<value_type> values_;
|
std::vector<value_type> values_;
|
||||||
|
std::vector<bool> present_;
|
||||||
|
dynamic_2bitset nests_;
|
||||||
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
|
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
|
||||||
// TODO(davi) optimize slack to no hold a copy of the key
|
// TODO(davi) optimize slack to hold 128 unique bits from hash64 as key
|
||||||
typedef unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
|
typedef unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
|
||||||
slack_type slack_;
|
slack_type slack_;
|
||||||
|
size_type size_;
|
||||||
|
|
||||||
|
mutable uint64_t fast_;
|
||||||
|
mutable uint64_t fast_taken_;
|
||||||
|
mutable uint64_t slow_;
|
||||||
|
mutable uint64_t very_slow_;
|
||||||
};
|
};
|
||||||
|
|
||||||
MPH_MAP_TMPL_SPEC
|
MPH_MAP_TMPL_SPEC
|
||||||
bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) {
|
bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) {
|
||||||
return lhs.values_ == rhs.values_;
|
return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin());
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() {
|
MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) {
|
||||||
|
clear();
|
||||||
pack();
|
pack();
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() {
|
MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() {
|
||||||
|
// fprintf(stderr, "Fast taken: %d Fast: %d Slow %d very_slow %d ratio %f\n", fast_taken_, fast_, slow_, very_slow_, fast_*1.0/slow_);
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
|
MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
|
||||||
iterator it = find(x.first);
|
auto it = find(x.first);
|
||||||
if (it != end()) return make_pair(it, false);
|
auto it_end = end();
|
||||||
values_.push_back(x);
|
if (it != it_end) return make_pair(it, false);
|
||||||
slack_.insert(make_pair(x.first, values_.size() - 1));
|
bool should_pack = false;
|
||||||
if (slack_.size() == index_.size() ||
|
if (values_.capacity() == values_.size() && values_.size() > 256) {
|
||||||
(slack_.size() >= 256 && index_.size() == 0)) {
|
should_pack = true;
|
||||||
pack();
|
|
||||||
}
|
}
|
||||||
|
values_.push_back(x);
|
||||||
|
present_.push_back(true);
|
||||||
|
uint32_t h[4];
|
||||||
|
index_.hash_vector(x.first, h);
|
||||||
|
set_nest_value(h, kNestCollision);
|
||||||
|
++size_;
|
||||||
|
slack_.insert(make_pair(x.first, values_.size() - 1));
|
||||||
|
if (should_pack) pack();
|
||||||
it = find(x.first);
|
it = find(x.first);
|
||||||
|
slow_ = 0;
|
||||||
|
very_slow_ = 0;
|
||||||
|
fast_ = 0;
|
||||||
|
fast_taken_ = 0;
|
||||||
return make_pair(it, true);
|
return make_pair(it, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(void_type, pack)() {
|
MPH_MAP_METHOD_DECL(void_type, pack)() {
|
||||||
|
// fprintf(stderr, "Paki %d values\n", values_.size());
|
||||||
if (values_.empty()) return;
|
if (values_.empty()) return;
|
||||||
slack_type().swap(slack_);
|
assert(std::unordered_set<key_type>(make_iterator_first(begin()), make_iterator_first(end())).size() == size());
|
||||||
bool success = index_.Reset(
|
bool success = index_.Reset(
|
||||||
make_iterator_first(values_.begin()),
|
make_iterator_first(begin()),
|
||||||
make_iterator_first(values_.end()));
|
make_iterator_first(end()), size_);
|
||||||
assert(success);
|
assert(success);
|
||||||
std::vector<value_type> new_values(values_.size());
|
std::vector<value_type> new_values(index_.perfect_hash_size());
|
||||||
for (const_iterator it = values_.begin(), end = values_.end();
|
new_values.reserve(new_values.size() * 2);
|
||||||
it != end; ++it) {
|
std::vector<bool> new_present(index_.perfect_hash_size(), false);
|
||||||
size_type id = index_.index(it->first);
|
new_present.reserve(new_present.size() * 2);
|
||||||
|
auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*10000 + 1);
|
||||||
|
dynamic_2bitset(new_nests_size, true /* fill with 1s */).swap(nests_);
|
||||||
|
vector<bool> used_nests(nests_.size());
|
||||||
|
uint32_t collisions = 0;
|
||||||
|
for (iterator it = begin(), it_end = end(); it != it_end; ++it) {
|
||||||
|
size_type id = index_.perfect_hash(it->first);
|
||||||
assert(id < new_values.size());
|
assert(id < new_values.size());
|
||||||
new_values[id] = *it;
|
new_values[id] = *it;
|
||||||
|
new_present[id] = true;
|
||||||
|
uint32_t h[4];
|
||||||
|
index_.hash_vector(it->first, h);
|
||||||
|
// fprintf(stderr, "Nest index: %d\n", get_nest_index(h));
|
||||||
|
assert(used_nests.size() > get_nest_index(h));
|
||||||
|
if (used_nests[get_nest_index(h)]) {
|
||||||
|
set_nest_value(h, kNestCollision);
|
||||||
|
assert(get_nest_value(h) == kNestCollision);
|
||||||
|
// fprintf(stderr, "Collision at nest index %d among %d positions\n", get_nest_index(h), nests_.size());
|
||||||
|
++collisions;
|
||||||
|
} else {
|
||||||
|
set_nest_value(h, index_.cuckoo_nest(h));
|
||||||
|
assert(get_nest_value(h) == index_.cuckoo_nest(h));
|
||||||
|
assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, get_nest_value(h)));
|
||||||
|
used_nests[get_nest_index(h)] = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size());
|
||||||
values_.swap(new_values);
|
values_.swap(new_values);
|
||||||
|
present_.swap(new_present);
|
||||||
|
slack_type().swap(slack_);
|
||||||
|
int32_t fast = 0;
|
||||||
|
int32_t slow= 0;
|
||||||
|
for (iterator it = begin(), it_end = end(); it != it_end; ++it) {
|
||||||
|
uint32_t h[4];
|
||||||
|
index_.hash_vector(it->first, h);
|
||||||
|
if (get_nest_value(h) == kNestCollision) ++slow;
|
||||||
|
else {
|
||||||
|
++fast;
|
||||||
|
auto cit = values_.begin() + index_.cuckoo_hash(h, get_nest_value(h));
|
||||||
|
assert(index_.perfect_hash(it->first) == cit - values_.begin());
|
||||||
|
assert(equal_(it->first, cit->first));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// fprintf(stderr, "Predicted fast: %d slow %d\n", fast, slow);
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(iterator, begin)() { return values_.begin(); }
|
MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); }
|
||||||
MPH_MAP_METHOD_DECL(iterator, end)() { return values_.end(); }
|
MPH_MAP_METHOD_DECL(iterator, end)() { return make_iterator(values_.end()); }
|
||||||
MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); }
|
MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return make_iterator(values_.begin()); }
|
||||||
MPH_MAP_METHOD_DECL(const_iterator, end)() const { return values_.end(); }
|
MPH_MAP_METHOD_DECL(const_iterator, end)() const { return make_iterator(values_.end()); }
|
||||||
MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); }
|
MPH_MAP_METHOD_DECL(bool_type, empty)() const { return size_ == 0; }
|
||||||
MPH_MAP_METHOD_DECL(size_type, size)() const { return values_.size(); }
|
MPH_MAP_METHOD_DECL(size_type, size)() const { return size_; }
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(void_type, clear)() {
|
MPH_MAP_METHOD_DECL(void_type, clear)() {
|
||||||
values_.clear();
|
values_.clear();
|
||||||
|
present_.clear();
|
||||||
slack_.clear();
|
slack_.clear();
|
||||||
index_.clear();
|
index_.clear();
|
||||||
|
dynamic_2bitset(8, true /* fill with 1s */).swap(nests_);
|
||||||
|
size_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
|
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
|
||||||
values_.erase(pos);
|
present_[pos - begin] = false;
|
||||||
pack();
|
uint32_t h[4];
|
||||||
|
index_.hash_vector(pos->first, &h);
|
||||||
|
nests_[get_nest_index(h)] = kNestCollision;
|
||||||
|
*pos = value_type();
|
||||||
|
--size_;
|
||||||
}
|
}
|
||||||
MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) {
|
MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) {
|
||||||
iterator it = find(k);
|
iterator it = find(k);
|
||||||
@ -164,36 +270,88 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) {
|
|||||||
erase(it);
|
erase(it);
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
|
MPH_MAP_METHOD_DECL(const_iterator, fast_find)(const key_type& k) const {
|
||||||
if (__builtin_expect(!slack_.empty(), 0)) {
|
uint32_t h[4];
|
||||||
typename slack_type::const_iterator it = slack_.find(k);
|
index_.hash_vector(k, h);
|
||||||
if (it != slack_.end()) return values_.begin() + it->second;
|
auto nest = get_nest_value(h);
|
||||||
|
if (__builtin_expect(nest != kNestCollision, 1)) {
|
||||||
|
++fast_taken_;
|
||||||
|
auto vit = values_.begin() + index_.cuckoo_hash(h, nest);
|
||||||
|
// do not hold for unknown keys
|
||||||
|
assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first));
|
||||||
|
if (equal_(k, vit->first)) {
|
||||||
|
++fast_;
|
||||||
|
return make_iterator(vit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nest = index_.cuckoo_nest(h);
|
||||||
|
++slow_;
|
||||||
|
return slow_find(k, index_.cuckoo_hash(h, nest));
|
||||||
|
}
|
||||||
|
|
||||||
|
MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const {
|
||||||
|
if (__builtin_expect(index_.perfect_hash_size(), 1)) {
|
||||||
|
if (__builtin_expect(present_[perfect_hash], true)) {
|
||||||
|
auto vit = values_.begin() + perfect_hash;
|
||||||
|
if (equal_(k, vit->first)) return make_iterator(vit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (__builtin_expect(!slack_.empty(), 0)) {
|
||||||
|
++very_slow_;
|
||||||
|
auto sit = slack_.find(k);
|
||||||
|
if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second);
|
||||||
}
|
}
|
||||||
if (__builtin_expect(index_.size() == 0, 0)) return end();
|
|
||||||
const_iterator it = values_.begin() + index_.index(k);
|
|
||||||
if (__builtin_expect(equal_(k, it->first), 1)) return it;
|
|
||||||
return end();
|
return end();
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
|
MPH_MAP_METHOD_DECL(iterator, fast_find)(const key_type& k) {
|
||||||
if (!slack_.empty()) {
|
uint32_t h[4];
|
||||||
typename slack_type::const_iterator it = slack_.find(k);
|
index_.hash_vector(k, h);
|
||||||
if (it != slack_.end()) return values_.begin() + it->second;
|
auto nest = get_nest_value(h);
|
||||||
|
if (__builtin_expect(nest != kNestCollision, 1)) {
|
||||||
|
++fast_taken_;
|
||||||
|
auto vit = values_.begin() + index_.cuckoo_hash(h, nest);
|
||||||
|
assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first));
|
||||||
|
if (equal_(k, vit->first)) {
|
||||||
|
++fast_;
|
||||||
|
return make_iterator(vit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nest = index_.cuckoo_nest(h);
|
||||||
|
++slow_;
|
||||||
|
return slow_find(k, index_.cuckoo_hash(h, nest));
|
||||||
|
}
|
||||||
|
|
||||||
|
MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) {
|
||||||
|
if (__builtin_expect(index_.perfect_hash_size(), 1)) {
|
||||||
|
if (__builtin_expect(present_[perfect_hash], true)) {
|
||||||
|
auto vit = values_.begin() + perfect_hash;
|
||||||
|
if (equal_(k, vit->first)) return make_iterator(vit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (__builtin_expect(!slack_.empty(), 0)) {
|
||||||
|
++very_slow_;
|
||||||
|
auto sit = slack_.find(k);
|
||||||
|
if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second);
|
||||||
}
|
}
|
||||||
if (index_.size() == 0) return end();
|
|
||||||
iterator it = values_.begin() + index_.index(k);
|
|
||||||
if (equal_(it->first, k)) return it;
|
|
||||||
return end();
|
return end();
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const {
|
MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const {
|
||||||
if (index_.size() == 0) return -1;
|
if (index_.size() == 0) return -1;
|
||||||
return index_.index(k);
|
return index_.perfect_hash(k);
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) {
|
MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) {
|
||||||
return insert(make_pair(k, data_type())).first->second;
|
return insert(make_pair(k, data_type())).first->second;
|
||||||
}
|
}
|
||||||
|
MPH_MAP_METHOD_DECL(void_type, rehash)(size_type nbuckets) {
|
||||||
|
pack();
|
||||||
|
vector<value_type>(values_.begin(), values_.end()).swap(values_);
|
||||||
|
vector<bool>(present_.begin(), present_.end()).swap(present_);
|
||||||
|
slack_type().swap(slack_);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
} // namespace cxxmph
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
@ -11,21 +11,32 @@ using cxxmph::mph_map;
|
|||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
mph_map<int64_t, int64_t> b;
|
mph_map<int64_t, int64_t> b;
|
||||||
for (int i = 0; i < 100*1000; ++i) {
|
int32_t num_keys = 1000*10;
|
||||||
|
for (int i = 0; i < num_keys; ++i) {
|
||||||
b.insert(make_pair(i, i));
|
b.insert(make_pair(i, i));
|
||||||
}
|
}
|
||||||
for (int i = 0; i < 1000*1000; ++i) {
|
b.rehash(b.size());
|
||||||
b.find(i);
|
fprintf(stderr, "Insertion finished\n");
|
||||||
|
for (int i = 0; i < 1000000; ++i) {
|
||||||
|
auto it = b.find(i % num_keys);
|
||||||
|
if (it == b.end()) {
|
||||||
|
std::cerr << "Failed to find " << i << std::endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
if (it->first != it->second || it->first != i % num_keys) {
|
||||||
|
std::cerr << "Found " << it->first << " looking for " << i << std::endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
mph_map<string, int> h;
|
mph_map<string, int> h;
|
||||||
h.insert(std::make_pair("-1",-1));
|
h.insert(std::make_pair("-1",-1));
|
||||||
mph_map<string, int>::const_iterator it;
|
mph_map<string, int>::const_iterator it;
|
||||||
for (it = h.begin(); it != h.end(); ++it) {
|
for (it = h.begin(); it != h.end(); ++it) {
|
||||||
std::cerr << it->first << " -> " << it->second << std::endl;
|
if (it->second != -1) exit(-1);
|
||||||
}
|
}
|
||||||
std::cerr << "Search -1 gives " << h.find("-1")->second << std::endl;
|
int32_t num_valid = 100;
|
||||||
for (int i = 0; i < 100; ++i) {
|
for (int i = 0; i < num_valid; ++i) {
|
||||||
char buf[10];
|
char buf[10];
|
||||||
snprintf(buf, 10, "%d", i);
|
snprintf(buf, 10, "%d", i);
|
||||||
h.insert(std::make_pair(buf, i));
|
h.insert(std::make_pair(buf, i));
|
||||||
@ -34,18 +45,18 @@ int main(int argc, char** argv) {
|
|||||||
for (int i = 1000; i > 0; --i) {
|
for (int i = 1000; i > 0; --i) {
|
||||||
char buf[10];
|
char buf[10];
|
||||||
snprintf(buf, 10, "%d", i - 1);
|
snprintf(buf, 10, "%d", i - 1);
|
||||||
h.find(buf);
|
auto it = h.find(buf);
|
||||||
std::cerr << "Search " << i - 1 << " gives " << h.find(buf)->second << std::endl;
|
if (i < num_valid && it->second != i - 1) exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int j = 0; j < 100; ++j) {
|
for (int j = 0; j < 100; ++j) {
|
||||||
for (int i = 1000; i > 0; --i) {
|
for (int i = 1000; i > 0; --i) {
|
||||||
char buf[10];
|
char buf[10];
|
||||||
snprintf(buf, 10, "%d", i*100 - 1);
|
int key = i*100 - 1;
|
||||||
h.find(buf);
|
snprintf(buf, 10, "%d", key);
|
||||||
std::cerr << "Search " << i*100 - 1 << " gives " << h.find(buf)->second << std::endl;
|
auto it = h.find(buf);
|
||||||
|
if (key < num_valid && it->second != key) exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -9,8 +9,10 @@
|
|||||||
#include "MurmurHash3.h"
|
#include "MurmurHash3.h"
|
||||||
#include "stringpiece.h"
|
#include "stringpiece.h"
|
||||||
|
|
||||||
// From murmur, only used naively to extend 32 bits functions to 64 bits.
|
// From murmur, only used naively to extend 32 bits functions to 128 bits.
|
||||||
uint32_t fmix ( uint32_t h );
|
uint32_t fmix ( uint32_t h );
|
||||||
|
// Used for a quick and dirty hash function for integers. Probably a bad idea.
|
||||||
|
uint64_t fmix ( uint64_t h );
|
||||||
|
|
||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
|
|
||||||
@ -57,6 +59,18 @@ struct Murmur3StringPiece {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct Murmur3Fmix64bitsType {
|
||||||
|
template <class Key>
|
||||||
|
uint32_t operator()(const Key& k) const {
|
||||||
|
return fmix(*reinterpret_cast<const uint64_t*>(&k));
|
||||||
|
}
|
||||||
|
template <class Key>
|
||||||
|
void hash64(const Key& k, uint32_t* out) const {
|
||||||
|
*reinterpret_cast<uint64_t*>(out) = fmix(k);
|
||||||
|
*(out + 2) = fmix(*out);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct seeded_hash_function<Murmur3> {
|
struct seeded_hash_function<Murmur3> {
|
||||||
template <class Key>
|
template <class Key>
|
||||||
@ -87,6 +101,20 @@ struct seeded_hash_function<Murmur3StringPiece> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct seeded_hash_function<Murmur3Fmix64bitsType> {
|
||||||
|
template <class Key>
|
||||||
|
uint32_t operator()(const Key& k, uint32_t seed) const {
|
||||||
|
return fmix(k + seed);
|
||||||
|
}
|
||||||
|
template <class Key>
|
||||||
|
void hash64(const Key& k, uint32_t seed, uint32_t* out) const {
|
||||||
|
*reinterpret_cast<uint64_t*>(out) = fmix(k ^ seed);
|
||||||
|
*(out + 2) = fmix(*out);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
template <class HashFcn> struct seeded_hash
|
template <class HashFcn> struct seeded_hash
|
||||||
{ typedef seeded_hash_function<HashFcn> hash_function; };
|
{ typedef seeded_hash_function<HashFcn> hash_function; };
|
||||||
// Use Murmur3 instead for all types defined in std::hash, plus
|
// Use Murmur3 instead for all types defined in std::hash, plus
|
||||||
|
Loading…
Reference in New Issue
Block a user