diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index f1129d4..db8ffa1 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,12 +1,12 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_index_test trigraph_test +check_PROGRAMS = mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc +libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigraph.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc mph_bits.h mph_bits.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h mph_map_test_LDADD = libcxxmph.la mph_map_test_SOURCES = mph_map_test.cc @@ -25,3 +25,8 @@ bm_map_SOURCES = bm_common.cc bm_map.cc cxxmph_LDADD = libcxxmph.la cxxmph_SOURCES = cxxmph.cc + +hollow_iterator_test_SOURCES = hollow_iterator_test.cc +mph_bits_test_SOURCES = mph_bits_test.cc +mph_bits_test_LDADD = libcxxmph.la + diff --git a/cxxmph/MurmurHash3.cpp b/cxxmph/MurmurHash3.cpp new file mode 100644 index 0000000..09ffb26 --- /dev/null +++ b/cxxmph/MurmurHash3.cpp @@ -0,0 +1,335 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. 
You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const 
uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = 
h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + h3 = fmix(h3); + h4 = fmix(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = 
ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= uint64_t(tail[14]) << 48; + case 14: k2 ^= uint64_t(tail[13]) << 40; + case 13: k2 ^= uint64_t(tail[12]) << 32; + case 12: k2 ^= uint64_t(tail[11]) << 24; + case 11: k2 ^= uint64_t(tail[10]) << 16; + case 10: k2 ^= uint64_t(tail[ 9]) << 8; + case 9: k2 ^= uint64_t(tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= uint64_t(tail[ 7]) << 56; + case 7: k1 ^= uint64_t(tail[ 6]) << 48; + case 6: k1 ^= uint64_t(tail[ 5]) << 40; + case 5: k1 ^= uint64_t(tail[ 4]) << 32; + case 4: k1 ^= uint64_t(tail[ 3]) << 24; + case 3: k1 ^= uint64_t(tail[ 2]) << 16; + case 2: k1 ^= uint64_t(tail[ 1]) << 8; + case 1: k1 ^= uint64_t(tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + diff --git a/cxxmph/MurmurHash3.h b/cxxmph/MurmurHash3.h new file mode 100644 index 0000000..54e9d3f --- /dev/null +++ b/cxxmph/MurmurHash3.h @@ -0,0 +1,37 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
+ +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 443178f..9345a11 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -21,7 +21,7 @@ class BM_MPHIndexCreate : public UrlsBenchmark { protected: virtual void Run() { SimpleMPHIndex index; - index.Reset(urls_.begin(), urls_.end()); + index.Reset(urls_.begin(), urls_.end(), urls_.size()); } }; @@ -53,7 +53,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { protected: virtual bool SetUp () { if (!SearchUrlsBenchmark::SetUp()) return false; - index_.Reset(urls_.begin(), urls_.end()); + index_.Reset(urls_.begin(), urls_.end(), urls_.size()); return true; } SimpleMPHIndex index_; diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 25ba463..0a0b225 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -13,7 +13,8 @@ namespace cxxmph { template const T* myfind(const MapType& mymap, const T& k) { auto it = mymap.find(k); - if (it == mymap.end()) return NULL; + auto end = mymap.end(); + if (it == end) return NULL; return &it->second; } @@ -48,6 +49,7 @@ class BM_SearchUrls : public SearchUrlsBenchmark { mymap_[*it] = *it; } 
mymap_.rehash(mymap_.bucket_count()); + fprintf(stderr, "Occupation: %f\n", static_cast(mymap_.size())/mymap_.bucket_count()); return true; } MapType mymap_; @@ -56,7 +58,7 @@ class BM_SearchUrls : public SearchUrlsBenchmark { template class BM_SearchUint64 : public SearchUint64Benchmark { public: - BM_SearchUint64() : SearchUint64Benchmark(10000, 10*1000*1000) { } + BM_SearchUint64() : SearchUint64Benchmark(100000, 10*1000*1000) { } virtual bool SetUp() { if (!SearchUint64Benchmark::SetUp()) return false; for (int i = 0; i < values_.size(); ++i) { @@ -93,7 +95,7 @@ int main(int argc, char** argv) { Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); + Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); } diff --git a/cxxmph/cxxmph.cc b/cxxmph/cxxmph.cc index 68bb23e..e9bffd0 100644 --- a/cxxmph/cxxmph.cc +++ b/cxxmph/cxxmph.cc @@ -63,8 +63,8 @@ int main(int argc, char** argv) { for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i]; mph_map::const_iterator it = table.begin(); mph_map::const_iterator end = table.end(); - for (; it != end; ++it) { - cout << (it - table.begin()) << ": " << it->first + for (int i = 0; it != end; ++it, ++i) { + cout << i << ": " << it->first <<" -> " << it->second << endl; } } diff --git a/cxxmph/hollow_iterator.h b/cxxmph/hollow_iterator.h new file mode 100644 index 0000000..c650d21 --- /dev/null +++ b/cxxmph/hollow_iterator.h @@ -0,0 +1,71 @@ +#ifndef __CXXMPH_HOLLOW_ITERATOR_H__ +#define __CXXMPH_HOLLOW_ITERATOR_H__ + +#include + +namespace cxxmph { + +template +struct hollow_iterator_base + : public std::iterator { + typedef presence_type presence; + typedef container_type container; + typedef iterator_type iterator; + typedef hollow_iterator_base& 
self_reference; + typedef typename iterator::reference reference; + typedef typename iterator::pointer pointer; + + hollow_iterator_base(container* c, presence* p, iterator it) + : c_(c), p_(p), it_(it) { if (c_) find_present(); } + self_reference operator++() { + ++it_; find_present(); return *this; // step once, skip absent slots, then return the iterator itself + } + reference operator*() { return *it_; } + pointer operator->() { return &(*it_); } + + // TODO find syntax to make this less permissible at compile time + template + bool operator==(const T& rhs) const { return rhs.it_ == this->it_; } + template + bool operator!=(const T& rhs) const { return rhs.it_ != this->it_; } + + public: // TODO find syntax to make this friend of const iterator + void find_present() { + while (it_ != c_->end() && !((*p_)[it_-c_->begin()])) ++it_; + } + container* c_; + presence* p_; + iterator it_; +}; + +template +struct hollow_iterator : public hollow_iterator_base< + container_type, std::vector, typename container_type::iterator> { + typedef hollow_iterator_base< + container_type, std::vector, typename container_type::iterator> parent_class; + hollow_iterator() : parent_class(NULL, NULL, typename container_type::iterator()) { } + hollow_iterator(typename parent_class::container* c, + typename parent_class::presence* p, + typename parent_class::iterator it) + : parent_class(c, p, it) { } +}; + +template +struct hollow_const_iterator : public hollow_iterator_base< + const container_type, const std::vector, typename container_type::const_iterator> { + typedef hollow_iterator_base< + const container_type, const std::vector, typename container_type::const_iterator> parent_class; + typedef hollow_const_iterator self_type; + typedef hollow_iterator non_const_type; + hollow_const_iterator(non_const_type rhs) : parent_class(rhs.c_, rhs.p_, typename container_type::const_iterator(rhs.it_)) { } + hollow_const_iterator() : parent_class(NULL, NULL, typename container_type::const_iterator()) { } + hollow_const_iterator(const typename parent_class::container* c, + const typename 
parent_class::presence* p, + typename parent_class::iterator it) + : parent_class(c, p, it) { } +}; + +} // namespace cxxmph + +#endif // __CXXMPH_HOLLOW_ITERATOR_H__ diff --git a/cxxmph/hollow_iterator_test.cc b/cxxmph/hollow_iterator_test.cc new file mode 100644 index 0000000..07963ae --- /dev/null +++ b/cxxmph/hollow_iterator_test.cc @@ -0,0 +1,38 @@ +#include +#include +#include + +#include "hollow_iterator.h" + +using std::vector; +using cxxmph::hollow_iterator; +using cxxmph::hollow_const_iterator; + +int main(int argc, char** argv) { + vector v; + vector p; + for (int i = 0; i < 100; ++i) { + v.push_back(i); + p.push_back(i % 2 == 0); + } + auto begin = hollow_iterator>(&v, &p, v.begin()); + auto end = hollow_iterator>(&v, &p, v.end()); + for (auto it = begin; it != end; ++it) { + if (((*it) % 2) != 0) exit(-1); + } + hollow_const_iterator> const_begin(begin); + hollow_const_iterator> const_end(end); + for (auto it = const_begin; it != const_end; ++it) { + if (((*it) % 2) != 0) exit(-1); + } + vector::iterator vit1 = v.begin(); + vector::const_iterator vit2 = v.begin(); + if (vit1 != vit2) exit(-1); + auto it1 = hollow_iterator>(&v, &p, v.begin()); + auto it2 = hollow_const_iterator>(&v, &p, v.begin()); + if (it1 != it2) exit(-1); + + hollow_iterator> default_constructed; + default_constructed = hollow_iterator>(&v, &p, v.begin()); +} + diff --git a/cxxmph/mph_bits.cc b/cxxmph/mph_bits.cc new file mode 100644 index 0000000..510572c --- /dev/null +++ b/cxxmph/mph_bits.cc @@ -0,0 +1,7 @@ +#include "mph_bits.h" + +namespace cxxmph { + +const uint8_t dynamic_2bitset::vmask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; + +} diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h new file mode 100644 index 0000000..06b2946 --- /dev/null +++ b/cxxmph/mph_bits.h @@ -0,0 +1,67 @@ +#ifndef __CXXMPH_MPH_BITS_H__ +#define __CXXMPH_MPH_BITS_H__ + +#include // for uint32_t and friends +#include +#include +#include +#include +#include +#include +#include + +namespace cxxmph { + +class 
dynamic_2bitset { + public: + dynamic_2bitset() : size_(0), fill_(false) {} + dynamic_2bitset(uint32_t size, bool fill = false) + : size_(size), fill_(fill), data_(ceil(size / 4.0), ones()*fill) { + } + + const uint8_t operator[](uint32_t i) const { return get(i); } + uint8_t get(uint32_t i) const { + return (data_[(i >> 2)] >> (((i & 3) << 1)) & 3); + } + void set(uint32_t i, uint8_t v) { // stores the 2-bit value v at slot i; nothing to return + assert(v <= 3); // reject out-of-range values before touching data_ + data_[(i >> 2)] |= ones() ^ dynamic_2bitset::vmask[i & 3]; + data_[(i >> 2)] &= ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); + assert(get(i) == v); + } + void resize(uint32_t size) { + size_ = size; + data_.resize(ceil(size / 4.0), fill_*ones()); // four slots per byte, rounded up like the constructor + } + void swap(dynamic_2bitset& other) { + std::swap(other.size_, size_); + std::swap(other.fill_, fill_); + std::swap(other.data_, data_); + } + void clear() { data_.clear(); size_ = 0; } + + uint32_t size() const { return size_; } + static const uint8_t vmask[]; + private: + uint32_t size_; + bool fill_; + std::vector<uint8_t> data_; + static uint8_t ones() { return std::numeric_limits<uint8_t>::max(); } +}; + +static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { + d[(i >> 2)] &= ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); +} +static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { + return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); +} +static uint32_t nextpoweroftwo(uint32_t k) { + if (k == 0) return 1; + k--; + for (uint32_t i = 1; i < 32; i <<= 1) k = k | k >> i; // smear the top set bit into every lower bit of the 32-bit word + return k+1; +} + +} // namespace cxxmph + +#endif diff --git a/cxxmph/mph_bits_test.cc b/cxxmph/mph_bits_test.cc new file mode 100644 index 0000000..e6a764d --- /dev/null +++ b/cxxmph/mph_bits_test.cc @@ -0,0 +1,49 @@ +#include +#include + +#include "mph_bits.h" + +using cxxmph::dynamic_2bitset; +int main(int argc, char** argv) { + dynamic_2bitset small(256, true); + for (int i = 0; i < small.size(); ++i) small.set(i, i % 4); + for (int i = 0; i < small.size(); ++i) { + if (small[i] != i % 4) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", small[i], i, i % 4); + exit(-1); + } + } + + 
int size = 256; + dynamic_2bitset bits(size, true /* fill with ones */); + for (int i = 0; i < size; ++i) { + if (bits[i] != 3) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, 3); + exit(-1); + } + } + for (int i = 0; i < size; ++i) bits.set(i, 0); + for (int i = 0; i < size; ++i) { + if (bits[i] != 0) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, 0); + exit(-1); + } + } + for (int i = 0; i < size; ++i) bits.set(i, i % 4); + for (int i = 0; i < size; ++i) { + if (bits[i] != i % 4) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, i % 4); + exit(-1); + } + } + dynamic_2bitset size_corner1(1); + if (size_corner1.size() != 1) exit(-1); + dynamic_2bitset size_corner2(2); + if (size_corner2.size() != 2) exit(-1); + (dynamic_2bitset(4)).swap(size_corner2); + if (size_corner2.size() != 4) exit(-1); + + +} + + diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 9970943..deccf22 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -25,6 +25,7 @@ #include #include +#include #include #include // for std::hash #include @@ -35,6 +36,7 @@ using std::cerr; using std::endl; #include "seeded_hash.h" +#include "mph_bits.h" #include "trigraph.h" namespace cxxmph { @@ -42,13 +44,13 @@ namespace cxxmph { class MPHIndex { public: MPHIndex(double c = 1.23, uint8_t b = 7) : - c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), + c_(c), b_(b), m_(0), n_(0), k_(0), r_(1), g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), deserialized_(false) { } ~MPHIndex(); template - bool Reset(ForwardIterator begin, ForwardIterator end); + bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size); template // must agree with Reset // Get a unique identifier for k, in the range [0;size()). If x wasn't part // of the input in the last Reset call, returns a random value. @@ -63,6 +65,16 @@ class MPHIndex { template // must agree with Reset uint32_t minimal_perfect_hash(const Key& x) const; + // Crazy functions. Ignore. 
+ template // must agree with Reset + uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const; + template // must agree with Reset + uint8_t cuckoo_nest(const uint32_t* h) const; + template // must agree with Reset + uint32_t cuckoo_nest_index(const Key& x, uint32_t* h) const; + template // must agree with Reset + void hash_vector(const Key& x, uint32_t* h) const; + // Serialization for mmap usage - not tested well, ping me if you care. // Serialized tables are not guaranteed to work across versions or different // endianness (although they could easily be made to be). @@ -94,6 +106,8 @@ class MPHIndex { // Partition vertex count, derived from c parameter. uint32_t r_; + uint32_t nest_displacement_[3]; // derived from r_ + // The array containing the minimal perfect hash function graph. Do not use // c++ vector to make mmap based backing easier. const uint8_t* g_; @@ -108,26 +122,26 @@ class MPHIndex { bool deserialized_; static const uint8_t valuemask[]; - static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { - d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); - } - static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { - return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); - } - - }; // Template method needs to go in the header file. template -bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { +bool MPHIndex::Reset( + ForwardIterator begin, ForwardIterator end, uint32_t size) { if (end == begin) { clear(); return true; } - m_ = end - begin; + m_ = size; r_ = static_cast(ceil((c_*m_)/3)); if ((r_ % 2) == 0) r_ += 1; + nest_displacement_[0] = 0; + nest_displacement_[1] = r_; + nest_displacement_[2] = (r_ << 1); + // This can be used to speed mods, but increases occupation too much. 
+ // Needs to try http://gmplib.org/manual/Integer-Exponentiation.html instead + // r_ = nextpoweroftwo(r_); + n_ = 3*r_; k_ = 1U << b_; @@ -173,21 +187,44 @@ bool MPHIndex::Mapping( return false; } +template +uint32_t MPHIndex::cuckoo_hash(const uint32_t* h, uint8_t nest) const { + return (h[nest] % r_) + nest_displacement_[nest]; +} + +template +void MPHIndex::hash_vector(const Key& key, uint32_t* h) const { + SeededHashFcn().hash64(key, hash_seed_[0], h); +} + +template // must agree with Reset +uint8_t MPHIndex::cuckoo_nest(const uint32_t* h) const { + uint32_t x[4]; + if (!g_size_) return 0; + x[0] = (h[0] % r_) + nest_displacement_[0]; + x[1] = (h[1] % r_) + nest_displacement_[1]; + x[2] = (h[2] % r_) + nest_displacement_[2]; + assert((x[0] >> 2) > 2) > 2) uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t h[4]; - SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast(&h)); + if (!g_size_) return 0; + SeededHashFcn().hash64(key, hash_seed_[0], h); // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); - assert(r_); - h[0] = h[0] % r_; - h[1] = h[1] % r_ + r_; - h[2] = h[2] % r_ + (r_ << 1); - assert(g_size_); + h[0] = (h[0] % r_) + nest_displacement_[0]; + h[1] = (h[1] % r_) + nest_displacement_[1]; + h[2] = (h[2] % r_) + nest_displacement_[2]; // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; assert((h[0] >> 2) > 2) > 2) @@ -206,12 +243,15 @@ template >::hash_ class SimpleMPHIndex : public MPHIndex { public: template - bool Reset(ForwardIterator begin, ForwardIterator end) { - return MPHIndex::Reset(begin, end); + bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size) { + return MPHIndex::Reset(begin, end, size); } uint32_t index(const Key& key) const { return MPHIndex::index(key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash(key); } uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash(key); } + uint8_t 
cuckoo_nest(const uint32_t* h) const { return MPHIndex::cuckoo_nest(h); } + uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const { return MPHIndex::cuckoo_hash(h, nest); } + void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector(key, h); } }; } // namespace cxxmph diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index f2482b7..b4101df 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -24,7 +24,7 @@ int main(int argc, char** argv) { keys.push_back("algume"); SimpleMPHIndex mph_index; - if (!mph_index.Reset(keys.begin(), keys.end())) { exit(-1); } + if (!mph_index.Reset(keys.begin(), keys.end(), keys.size())) { exit(-1); } vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { ids.push_back(mph_index.index(keys[i])); diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 405a7f9..dd7bb08 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -10,11 +10,16 @@ // See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl #include +#include +#include #include +#include #include #include // for std::pair +#include "mph_bits.h" #include "mph_index.h" +#include "hollow_iterator.h" namespace cxxmph { @@ -42,8 +47,9 @@ class mph_map { typedef typename std::vector::const_reference const_reference; typedef typename std::vector::size_type size_type; typedef typename std::vector::difference_type difference_type; - typedef typename std::vector::iterator iterator; - typedef typename std::vector::const_iterator const_iterator; + + typedef hollow_iterator> iterator; + typedef hollow_const_iterator> const_iterator; // For making macros simpler. 
typedef void void_type; @@ -63,16 +69,15 @@ class mph_map { void erase(iterator pos); void erase(const key_type& k); pair insert(const value_type& x); - iterator find(const key_type& k); - const_iterator find(const key_type& k) const; + iterator find(const key_type& k) { return slow_find(k, index_.perfect_hash(k)); } + const_iterator find(const key_type& k) const { return slow_find(k, index_.perfect_hash(k)); }; typedef int32_t my_int32_t; // help macros int32_t index(const key_type& k) const; data_type& operator[](const key_type &k); const data_type& operator[](const key_type &k) const; - size_type bucket_count() const { return size(); } - // FIXME: not sure if this has the semantics I want - void rehash(size_type nbuckets /*ignored*/) { pack(); } + size_type bucket_count() const { return index_.perfect_hash_size() + slack_.bucket_count(); } + void rehash(size_type nbuckets /*ignored*/); protected: // mimicking STL implementation EqualKey equal_; @@ -81,7 +86,7 @@ class mph_map { template struct iterator_first : public iterator { iterator_first(iterator it) : iterator(it) { } - const typename iterator::value_type::first_type& operator*() const { + const typename iterator::value_type::first_type& operator*() { return this->iterator::operator*().first; } }; @@ -91,72 +96,173 @@ class mph_map { return iterator_first(it); } + iterator make_iterator(typename std::vector::iterator it) { + return hollow_iterator>(&values_, &present_, it); + } + const_iterator make_iterator(typename std::vector::const_iterator it) const { + return hollow_const_iterator>(&values_, &present_, it); + } + + // Experimental functions, not always faster + iterator fast_find(const key_type& k); + const_iterator fast_find(const key_type& k) const; + iterator slow_find(const key_type& k, uint32_t perfect_hash); + const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const; + static const uint8_t kNestCollision = 3; // biggest 2 bit value + void set_nest_value(const uint32_t* h, 
uint8_t value) { + auto index = get_nest_index(h); + assert(get_nest_index(h) < nests_.size()); + assert(get_nest_index(h) >> 2 < nests_.size()); + assert(value < 4); + nests_.set(index, value); + assert(nests_[index] == value); + } + uint32_t get_nest_value(const uint32_t* h) const { + assert(get_nest_index(h) < nests_.size()); + return nests_[get_nest_index(h)]; + } + uint32_t get_nest_index(const uint32_t* h) const { + assert(nests_.size()); + assert(nests_.size() % 2 == 0); + assert((nests_.size() & (nests_.size() - 1)) == 0); + assert((h[3] % nests_.size()) == (h[3] & (nests_.size() - 1))); + return (h[3] & (nests_.size() - 1)); // a mod 2^n == a & 2^n - 1 + } + void pack(); std::vector values_; + std::vector present_; + dynamic_2bitset nests_; SimpleMPHIndex::hash_function> index_; - // TODO(davi) optimize slack to no hold a copy of the key + // TODO(davi) optimize slack to hold 128 unique bits from hash64 as key typedef unordered_map slack_type; slack_type slack_; + size_type size_; + + mutable uint64_t fast_; + mutable uint64_t fast_taken_; + mutable uint64_t slow_; + mutable uint64_t very_slow_; }; MPH_MAP_TMPL_SPEC bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { - return lhs.values_ == rhs.values_; + return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); } -MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() { +MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) { + clear(); pack(); } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { + // fprintf(stderr, "Fast taken: %d Fast: %d Slow %d very_slow %d ratio %f\n", fast_taken_, fast_, slow_, very_slow_, fast_*1.0/slow_); } MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { - iterator it = find(x.first); - if (it != end()) return make_pair(it, false); - values_.push_back(x); - slack_.insert(make_pair(x.first, values_.size() - 1)); - if (slack_.size() == index_.size() || - (slack_.size() >= 256 && index_.size() == 0)) { - 
pack(); + auto it = find(x.first); + auto it_end = end(); + if (it != it_end) return make_pair(it, false); + bool should_pack = false; + if (values_.capacity() == values_.size() && values_.size() > 256) { + should_pack = true; } + values_.push_back(x); + present_.push_back(true); + uint32_t h[4]; + index_.hash_vector(x.first, h); + set_nest_value(h, kNestCollision); + ++size_; + slack_.insert(make_pair(x.first, values_.size() - 1)); + if (should_pack) pack(); it = find(x.first); + slow_ = 0; + very_slow_ = 0; + fast_ = 0; + fast_taken_ = 0; return make_pair(it, true); } MPH_MAP_METHOD_DECL(void_type, pack)() { + // fprintf(stderr, "Paki %d values\n", values_.size()); if (values_.empty()) return; - slack_type().swap(slack_); + assert(std::unordered_set(make_iterator_first(begin()), make_iterator_first(end())).size() == size()); bool success = index_.Reset( - make_iterator_first(values_.begin()), - make_iterator_first(values_.end())); + make_iterator_first(begin()), + make_iterator_first(end()), size_); assert(success); - std::vector new_values(values_.size()); - for (const_iterator it = values_.begin(), end = values_.end(); - it != end; ++it) { - size_type id = index_.index(it->first); + std::vector new_values(index_.perfect_hash_size()); + new_values.reserve(new_values.size() * 2); + std::vector new_present(index_.perfect_hash_size(), false); + new_present.reserve(new_present.size() * 2); + auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*10000 + 1); + dynamic_2bitset(new_nests_size, true /* fill with 1s */).swap(nests_); + vector used_nests(nests_.size()); + uint32_t collisions = 0; + for (iterator it = begin(), it_end = end(); it != it_end; ++it) { + size_type id = index_.perfect_hash(it->first); assert(id < new_values.size()); new_values[id] = *it; + new_present[id] = true; + uint32_t h[4]; + index_.hash_vector(it->first, h); + // fprintf(stderr, "Nest index: %d\n", get_nest_index(h)); + assert(used_nests.size() > get_nest_index(h)); + if 
(used_nests[get_nest_index(h)]) { + set_nest_value(h, kNestCollision); + assert(get_nest_value(h) == kNestCollision); + // fprintf(stderr, "Collision at nest index %d among %d positions\n", get_nest_index(h), nests_.size()); + ++collisions; + } else { + set_nest_value(h, index_.cuckoo_nest(h)); + assert(get_nest_value(h) == index_.cuckoo_nest(h)); + assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, get_nest_value(h))); + used_nests[get_nest_index(h)] = true; + } } + // fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); values_.swap(new_values); + present_.swap(new_present); + slack_type().swap(slack_); + int32_t fast = 0; + int32_t slow= 0; + for (iterator it = begin(), it_end = end(); it != it_end; ++it) { + uint32_t h[4]; + index_.hash_vector(it->first, h); + if (get_nest_value(h) == kNestCollision) ++slow; + else { + ++fast; + auto cit = values_.begin() + index_.cuckoo_hash(h, get_nest_value(h)); + assert(index_.perfect_hash(it->first) == cit - values_.begin()); + assert(equal_(it->first, cit->first)); + } + } + // fprintf(stderr, "Predicted fast: %d slow %d\n", fast, slow); } -MPH_MAP_METHOD_DECL(iterator, begin)() { return values_.begin(); } -MPH_MAP_METHOD_DECL(iterator, end)() { return values_.end(); } -MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } -MPH_MAP_METHOD_DECL(const_iterator, end)() const { return values_.end(); } -MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } -MPH_MAP_METHOD_DECL(size_type, size)() const { return values_.size(); } +MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); } +MPH_MAP_METHOD_DECL(iterator, end)() { return make_iterator(values_.end()); } +MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return make_iterator(values_.begin()); } +MPH_MAP_METHOD_DECL(const_iterator, end)() const { return make_iterator(values_.end()); } +MPH_MAP_METHOD_DECL(bool_type, empty)() const { return size_ == 0; } 
+MPH_MAP_METHOD_DECL(size_type, size)() const { return size_; } MPH_MAP_METHOD_DECL(void_type, clear)() { values_.clear(); + present_.clear(); slack_.clear(); index_.clear(); + dynamic_2bitset(8, true /* fill with 1s */).swap(nests_); + size_ = 0; } MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { - values_.erase(pos); - pack(); + present_[pos - begin] = false; + uint32_t h[4]; + index_.hash_vector(pos->first, &h); + nests_[get_nest_index(h)] = kNestCollision; + *pos = value_type(); + --size_; } MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { iterator it = find(k); @@ -164,36 +270,88 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { erase(it); } -MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { - if (__builtin_expect(!slack_.empty(), 0)) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; +MPH_MAP_METHOD_DECL(const_iterator, fast_find)(const key_type& k) const { + uint32_t h[4]; + index_.hash_vector(k, h); + auto nest = get_nest_value(h); + if (__builtin_expect(nest != kNestCollision, 1)) { + ++fast_taken_; + auto vit = values_.begin() + index_.cuckoo_hash(h, nest); + // do not hold for unknown keys + assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first)); + if (equal_(k, vit->first)) { + ++fast_; + return make_iterator(vit); + } + } + nest = index_.cuckoo_nest(h); + ++slow_; + return slow_find(k, index_.cuckoo_hash(h, nest)); +} + +MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const { + if (__builtin_expect(index_.perfect_hash_size(), 1)) { + if (__builtin_expect(present_[perfect_hash], true)) { + auto vit = values_.begin() + perfect_hash; + if (equal_(k, vit->first)) return make_iterator(vit); + } + } + if (__builtin_expect(!slack_.empty(), 0)) { + ++very_slow_; + auto sit = slack_.find(k); + if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); } - if 
(__builtin_expect(index_.size() == 0, 0)) return end(); - const_iterator it = values_.begin() + index_.index(k); - if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } -MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { - if (!slack_.empty()) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; +MPH_MAP_METHOD_DECL(iterator, fast_find)(const key_type& k) { + uint32_t h[4]; + index_.hash_vector(k, h); + auto nest = get_nest_value(h); + if (__builtin_expect(nest != kNestCollision, 1)) { + ++fast_taken_; + auto vit = values_.begin() + index_.cuckoo_hash(h, nest); + assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first)); + if (equal_(k, vit->first)) { + ++fast_; + return make_iterator(vit); + } + } + nest = index_.cuckoo_nest(h); + ++slow_; + return slow_find(k, index_.cuckoo_hash(h, nest)); +} + +MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) { + if (__builtin_expect(index_.perfect_hash_size(), 1)) { + if (__builtin_expect(present_[perfect_hash], true)) { + auto vit = values_.begin() + perfect_hash; + if (equal_(k, vit->first)) return make_iterator(vit); + } + } + if (__builtin_expect(!slack_.empty(), 0)) { + ++very_slow_; + auto sit = slack_.find(k); + if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); } - if (index_.size() == 0) return end(); - iterator it = values_.begin() + index_.index(k); - if (equal_(it->first, k)) return it; return end(); } MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { if (index_.size() == 0) return -1; - return index_.index(k); + return index_.perfect_hash(k); } MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(make_pair(k, data_type())).first->second; } +MPH_MAP_METHOD_DECL(void_type, rehash)(size_type nbuckets) { + pack(); + vector(values_.begin(), values_.end()).swap(values_); + vector(present_.begin(), 
present_.end()).swap(present_); + slack_type().swap(slack_); +} + } // namespace cxxmph diff --git a/cxxmph/mph_map_test.cc b/cxxmph/mph_map_test.cc index 579e0ca..1d489c6 100644 --- a/cxxmph/mph_map_test.cc +++ b/cxxmph/mph_map_test.cc @@ -11,21 +11,32 @@ using cxxmph::mph_map; int main(int argc, char** argv) { mph_map b; - for (int i = 0; i < 100*1000; ++i) { + int32_t num_keys = 1000*10; + for (int i = 0; i < num_keys; ++i) { b.insert(make_pair(i, i)); } - for (int i = 0; i < 1000*1000; ++i) { - b.find(i); + b.rehash(b.size()); + fprintf(stderr, "Insertion finished\n"); + for (int i = 0; i < 1000000; ++i) { + auto it = b.find(i % num_keys); + if (it == b.end()) { + std::cerr << "Failed to find " << i << std::endl; + exit(-1); + } + if (it->first != it->second || it->first != i % num_keys) { + std::cerr << "Found " << it->first << " looking for " << i << std::endl; + exit(-1); + } } /* mph_map h; h.insert(std::make_pair("-1",-1)); mph_map::const_iterator it; for (it = h.begin(); it != h.end(); ++it) { - std::cerr << it->first << " -> " << it->second << std::endl; + if (it->second != -1) exit(-1); } - std::cerr << "Search -1 gives " << h.find("-1")->second << std::endl; - for (int i = 0; i < 100; ++i) { + int32_t num_valid = 100; + for (int i = 0; i < num_valid; ++i) { char buf[10]; snprintf(buf, 10, "%d", i); h.insert(std::make_pair(buf, i)); @@ -34,18 +45,18 @@ int main(int argc, char** argv) { for (int i = 1000; i > 0; --i) { char buf[10]; snprintf(buf, 10, "%d", i - 1); - h.find(buf); - std::cerr << "Search " << i - 1 << " gives " << h.find(buf)->second << std::endl; + auto it = h.find(buf); + if (i < num_valid && it->second != i - 1) exit(-1); } } for (int j = 0; j < 100; ++j) { for (int i = 1000; i > 0; --i) { char buf[10]; - snprintf(buf, 10, "%d", i*100 - 1); - h.find(buf); - std::cerr << "Search " << i*100 - 1 << " gives " << h.find(buf)->second << std::endl; + int key = i*100 - 1; + snprintf(buf, 10, "%d", key); + auto it = h.find(buf); + if (key < 
num_valid && it->second != key) exit(-1); } } */ - } diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index e204d36..0979ef1 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -9,8 +9,10 @@ #include "MurmurHash3.h" #include "stringpiece.h" -// From murmur, only used naively to extend 32 bits functions to 64 bits. +// From murmur, only used naively to extend 32 bits functions to 128 bits. uint32_t fmix ( uint32_t h ); +// Used for a quick and dirty hash function for integers. Probably a bad idea. +uint64_t fmix ( uint64_t h ); namespace cxxmph { @@ -57,6 +59,18 @@ struct Murmur3StringPiece { } }; +struct Murmur3Fmix64bitsType { + template + uint32_t operator()(const Key& k) const { + return fmix(*reinterpret_cast(&k)); + } + template + void hash64(const Key& k, uint32_t* out) const { + *reinterpret_cast(out) = fmix(k); + *(out + 2) = fmix(*out); + } +}; + template <> struct seeded_hash_function { template @@ -87,6 +101,20 @@ struct seeded_hash_function { } }; +template <> +struct seeded_hash_function { + template + uint32_t operator()(const Key& k, uint32_t seed) const { + return fmix(k + seed); + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + *reinterpret_cast(out) = fmix(k ^ seed); + *(out + 2) = fmix(*out); + } +}; + + template struct seeded_hash { typedef seeded_hash_function hash_function; }; // Use Murmur3 instead for all types defined in std::hash, plus