Optimized slack_type.
This commit is contained in:
parent
b8610f52e1
commit
1bb2d6a4dc
@ -1,5 +1,5 @@
|
|||||||
TESTS = $(check_PROGRAMS)
|
TESTS = $(check_PROGRAMS)
|
||||||
check_PROGRAMS = mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test
|
check_PROGRAMS = seeded_hash_test mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test
|
||||||
noinst_PROGRAMS = bm_index bm_map
|
noinst_PROGRAMS = bm_index bm_map
|
||||||
bin_PROGRAMS = cxxmph
|
bin_PROGRAMS = cxxmph
|
||||||
lib_LTLIBRARIES = libcxxmph.la
|
lib_LTLIBRARIES = libcxxmph.la
|
||||||
@ -27,9 +27,10 @@ cxxmph_LDADD = libcxxmph.la
|
|||||||
cxxmph_SOURCES = cxxmph.cc
|
cxxmph_SOURCES = cxxmph.cc
|
||||||
|
|
||||||
hollow_iterator_test_SOURCES = hollow_iterator_test.cc
|
hollow_iterator_test_SOURCES = hollow_iterator_test.cc
|
||||||
|
|
||||||
seeded_hash_test_SOURCES = seeded_hash_test.cc
|
seeded_hash_test_SOURCES = seeded_hash_test.cc
|
||||||
|
seeded_hash_test_LDADD = libcxxmph.la
|
||||||
|
|
||||||
mph_bits_test_SOURCES = mph_bits_test.cc
|
mph_bits_test_SOURCES = mph_bits_test.cc
|
||||||
mph_bits_test_LDADD = libcxxmph.la
|
mph_bits_test_LDADD = libcxxmph.la
|
||||||
mph_bits_test_LDADD = libcxxmph.la
|
|
||||||
|
|
||||||
|
@ -8,6 +8,9 @@ using cxxmph::mph_map;
|
|||||||
using std::string;
|
using std::string;
|
||||||
using std::unordered_map;
|
using std::unordered_map;
|
||||||
|
|
||||||
|
// Another reference benchmark:
|
||||||
|
// http://blog.aggregateknowledge.com/tag/bigmemory/
|
||||||
|
|
||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
|
|
||||||
template<class MapType, class T>
|
template<class MapType, class T>
|
||||||
|
@ -63,6 +63,23 @@ static uint32_t nextpoweroftwo(uint32_t k) {
|
|||||||
return k+1;
|
return k+1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <int n, int mask = (1 << 7)> struct bitcount {
|
||||||
|
enum { value = (n & mask ? 1:0) + bitcount<n, (mask >> 1)>::value };
|
||||||
|
};
|
||||||
|
template <int n> struct bitcount<n, 0> { enum { value = 0 }; };
|
||||||
|
|
||||||
|
template <int size, int index = size>
|
||||||
|
class CompileTimeRankTable {
|
||||||
|
public:
|
||||||
|
CompileTimeRankTable() : current(bitcount<index - 1>::value) { }
|
||||||
|
int operator[] (int i) { return *(&current + size - i - 1); }
|
||||||
|
private:
|
||||||
|
unsigned char current;
|
||||||
|
CompileTimeRankTable<index -1> next;
|
||||||
|
};
|
||||||
|
template <int size> class CompileTimeRankTable<size, 0> { };
|
||||||
|
typedef CompileTimeRankTable<256> Ranktable;
|
||||||
|
|
||||||
// Interesting bit tricks that might end up here:
|
// Interesting bit tricks that might end up here:
|
||||||
// http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
|
// http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
|
||||||
// Fast a % (k*2^t)
|
// Fast a % (k*2^t)
|
||||||
|
@ -4,6 +4,8 @@
|
|||||||
#include "mph_bits.h"
|
#include "mph_bits.h"
|
||||||
|
|
||||||
using cxxmph::dynamic_2bitset;
|
using cxxmph::dynamic_2bitset;
|
||||||
|
using cxxmph::Ranktable;
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
dynamic_2bitset small(256, true);
|
dynamic_2bitset small(256, true);
|
||||||
for (int i = 0; i < small.size(); ++i) small.set(i, i % 4);
|
for (int i = 0; i < small.size(); ++i) small.set(i, i % 4);
|
||||||
@ -52,6 +54,12 @@ int main(int argc, char** argv) {
|
|||||||
empty.clear();
|
empty.clear();
|
||||||
dynamic_2bitset large(1000, true);
|
dynamic_2bitset large(1000, true);
|
||||||
empty.swap(large);
|
empty.swap(large);
|
||||||
|
|
||||||
|
Ranktable ranktable;
|
||||||
|
if (ranktable[0] != 0) exit(-1);
|
||||||
|
if (ranktable[1] != 1) exit(-1);
|
||||||
|
if (ranktable[2] != 1) exit(-1);
|
||||||
|
if (ranktable[255] != 8) exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -112,10 +112,11 @@ class mph_map {
|
|||||||
std::vector<value_type> values_;
|
std::vector<value_type> values_;
|
||||||
std::vector<bool> present_;
|
std::vector<bool> present_;
|
||||||
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
|
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
|
||||||
// TODO(davi) optimize slack to hold 128 unique bits from hash64 as key
|
// TODO(davi) optimize slack to use hash from index rather than calculate its own
|
||||||
typedef unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
|
typedef unordered_map<h128, uint32_t, h128::hash32> slack_type;
|
||||||
slack_type slack_;
|
slack_type slack_;
|
||||||
size_type size_;
|
size_type size_;
|
||||||
|
typename seeded_hash<HashFcn>::hash_function hasher128_;
|
||||||
|
|
||||||
mutable uint64_t fast_;
|
mutable uint64_t fast_;
|
||||||
mutable uint64_t fast_taken_;
|
mutable uint64_t fast_taken_;
|
||||||
@ -148,7 +149,9 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
|
|||||||
values_.push_back(x);
|
values_.push_back(x);
|
||||||
present_.push_back(true);
|
present_.push_back(true);
|
||||||
++size_;
|
++size_;
|
||||||
slack_.insert(make_pair(x.first, values_.size() - 1));
|
h128 h = hasher128_.hash128(x.first, 0);
|
||||||
|
if (slack_.find(h) != slack_.end()) should_pack = true; // unavoidable pack
|
||||||
|
else slack_.insert(std::make_pair(h, values_.size() - 1));
|
||||||
if (should_pack) pack();
|
if (should_pack) pack();
|
||||||
it = find(x.first);
|
it = find(x.first);
|
||||||
slow_ = 0;
|
slow_ = 0;
|
||||||
@ -218,7 +221,7 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfe
|
|||||||
}
|
}
|
||||||
if (__builtin_expect(!slack_.empty(), 0)) {
|
if (__builtin_expect(!slack_.empty(), 0)) {
|
||||||
++very_slow_;
|
++very_slow_;
|
||||||
auto sit = slack_.find(k);
|
auto sit = slack_.find(hasher128_.hash128(k, 0));
|
||||||
if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second);
|
if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second);
|
||||||
}
|
}
|
||||||
return end();
|
return end();
|
||||||
@ -233,7 +236,7 @@ MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_has
|
|||||||
}
|
}
|
||||||
if (__builtin_expect(!slack_.empty(), 0)) {
|
if (__builtin_expect(!slack_.empty(), 0)) {
|
||||||
++very_slow_;
|
++very_slow_;
|
||||||
auto sit = slack_.find(k);
|
auto sit = slack_.find(hasher128_.hash128(k, 0));
|
||||||
if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second);
|
if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second);
|
||||||
}
|
}
|
||||||
return end();
|
return end();
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
#include <stdint.h> // for uint32_t and friends
|
#include <stdint.h> // for uint32_t and friends
|
||||||
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
#include <unordered_map> // for std::hash
|
#include <unordered_map> // for std::hash
|
||||||
|
|
||||||
#include "MurmurHash3.h"
|
#include "MurmurHash3.h"
|
||||||
@ -17,13 +18,15 @@ uint64_t fmix ( uint64_t h );
|
|||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
|
|
||||||
struct h128 {
|
struct h128 {
|
||||||
uint32_t operator[](uint8_t i) const { return uint32[i]; }
|
const uint32_t& operator[](uint8_t i) const { return uint32[i]; }
|
||||||
uint32_t& operator[](uint8_t i) { return uint32[i]; }
|
uint32_t& operator[](uint8_t i) { return uint32[i]; }
|
||||||
uint64_t* uint64ptr(bool second) { return reinterpret_cast<uint64_t*>(&uint32[static_cast<uint8_t>(second) << 1]); }
|
const uint64_t get64(bool second) const { return (static_cast<uint64_t>(uint32[second << 1]) << 32) | uint32[1 + (second << 1)]; }
|
||||||
uint64_t uint64(bool second) const { return *reinterpret_cast<const uint64_t*>(&uint32[static_cast<uint8_t>(second) << 1]); }
|
void set64(uint64_t v, bool second) { uint32[second << 1] = v >> 32; uint32[1+(second<<1)] = ((v << 32) >> 32); }
|
||||||
bool operator==(const h128 rhs) const { return uint64(0) == rhs.uint64(0) && uint64(1) == rhs.uint64(1); }
|
bool operator==(const h128 rhs) const { return memcmp(uint32, rhs.uint32, sizeof(uint32)) == 0; }
|
||||||
|
|
||||||
uint32_t uint32[4];
|
uint32_t uint32[4];
|
||||||
|
|
||||||
|
struct hash32 { uint32_t operator()(const cxxmph::h128& h) const { return h[3]; } };
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class HashFcn>
|
template <class HashFcn>
|
||||||
@ -83,8 +86,8 @@ struct Murmur3Fmix64bitsType {
|
|||||||
template <class Key>
|
template <class Key>
|
||||||
h128 hash128(const Key& k) const {
|
h128 hash128(const Key& k) const {
|
||||||
h128 h;
|
h128 h;
|
||||||
*h.uint64ptr(0) = fmix(k);
|
h.set64(fmix(k), 0);
|
||||||
*h.uint64ptr(1) = fmix(h.uint64(0));
|
h.set64(fmix(h.get64(0)), 1);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -131,8 +134,8 @@ struct seeded_hash_function<Murmur3Fmix64bitsType> {
|
|||||||
template <class Key>
|
template <class Key>
|
||||||
h128 hash128(const Key& k, uint32_t seed) const {
|
h128 hash128(const Key& k, uint32_t seed) const {
|
||||||
h128 h;
|
h128 h;
|
||||||
*h.uint64ptr(0) = fmix(k ^ seed);
|
h.set64(fmix(k ^ seed), 0);
|
||||||
*h.uint64ptr(1) = fmix(h.uint64(0));
|
h.set64(fmix(h.get64(0)), 1);
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
59
cxxmph/seeded_hash_test.cc
Normal file
59
cxxmph/seeded_hash_test.cc
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
#include "seeded_hash.h"
|
||||||
|
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <string>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
using std::cerr;
|
||||||
|
using std::endl;
|
||||||
|
using std::string;
|
||||||
|
using std::unordered_map;
|
||||||
|
using namespace cxxmph;
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
auto hasher = seeded_hash_function<Murmur3StringPiece>();
|
||||||
|
string key1("0");
|
||||||
|
string key2("1");
|
||||||
|
auto h1 = hasher.hash128(key1, 1);
|
||||||
|
auto h2 = hasher.hash128(key2, 1);
|
||||||
|
if (h1 == h2) {
|
||||||
|
fprintf(stderr, "unexpected murmur collision\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
unordered_map<uint64_t, int, Murmur3Fmix64bitsType> g;
|
||||||
|
for (int i = 0; i < 1000; ++i) g[i] = i;
|
||||||
|
for (int i = 0; i < 1000; ++i) if (g[i] != i) exit(-1);
|
||||||
|
|
||||||
|
auto inthasher = seeded_hash_function<std::hash<uint64_t>>();
|
||||||
|
unordered_map<h128, int, h128::hash32> g2;
|
||||||
|
for (uint64_t i = 0; i < 1000; ++i) {
|
||||||
|
auto h = inthasher.hash128(i, 0);
|
||||||
|
if (g2.find(h) != g2.end()) {
|
||||||
|
std::cerr << "Incorrectly found " << i << std::endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
if (h128::hash32()(h) != h[3]) {
|
||||||
|
cerr << "Buggy hash method." << endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
auto h2 = inthasher.hash128(i, 0);
|
||||||
|
if (!(h == h2)) {
|
||||||
|
cerr << "h 64(0) " << h.get64(0) << " h 64(1) " << h.get64(1) << endl;
|
||||||
|
cerr << " h2 64(0) " << h2.get64(0) << " h2 64(1) " << h2.get64(1) << endl;
|
||||||
|
cerr << "Broken equality for h128" << endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
if (h128::hash32()(h) != h128::hash32()(h2)) {
|
||||||
|
cerr << "Inconsistent hash method." << endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
g2[h] = i;
|
||||||
|
if (g2.find(h) == g2.end()) {
|
||||||
|
std::cerr << "Incorrectly missed " << i << std::endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uint64_t i = 0; i < 1000; ++i) if (g2[inthasher.hash128(i, 0)] != i) exit(-1);
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user