Better design for hash templates.
This commit is contained in:
parent
6c69aa0a8f
commit
8663285897
|
@ -1,7 +1,8 @@
|
|||
bin_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test
|
||||
lib_LTLIBRARIES = libcxxmph.la
|
||||
include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cmph_hash_function.h
|
||||
|
||||
libcxxmph_la_SOURCES = stringpiece.h MurmurHash2.h randomly_seeded_hash.h trigragh.h trigraph.cc mphtable.h mphtable.cc
|
||||
# Sources of libcxxmph; the header is trigraph.h (as listed in include_HEADERS).
libcxxmph_la_SOURCES = MurmurHash2.h trigraph.h trigraph.cc mphtable.h mphtable.cc cmph_hash_function.h
|
||||
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
||||
|
||||
cmph_hash_map_test_LDADD = libcxxmph.la
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
#include <cstdlib>
|
||||
#include <ext/hash_map> // for __gnu_cxx::hash
|
||||
|
||||
#include "MurmurHash2.h"
|
||||
#include "stringpiece.h"
|
||||
#include "cmph_types.h"
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
template <class HashFcn>
|
||||
struct seeded_hash_function {
|
||||
template <class Key>
|
||||
cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const {
|
||||
return HashFcn()(k) ^ seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct Murmur2 {
|
||||
template<class Key>
|
||||
cmph_uint32 operator()(const Key& k) const {
|
||||
return MurmurHash2(k, sizeof(Key), 1 /* seed */);
|
||||
}
|
||||
};
|
||||
struct Murmur2StringPiece {
|
||||
template <class Key>
|
||||
cmph_uint32 operator()(const Key& k) const {
|
||||
StringPiece s(k);
|
||||
return MurmurHash2(k.data(), k.length(), 1 /* seed */);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct seeded_hash_function<Murmur2> {
|
||||
template <class Key>
|
||||
cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const {
|
||||
return MurmurHash2(k, sizeof(Key), seed);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct seeded_hash_function<Murmur2StringPiece> {
|
||||
template <class Key>
|
||||
cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const {
|
||||
StringPiece s(k);
|
||||
return MurmurHash2(k.data(), k.length(), seed);
|
||||
}
|
||||
};
|
||||
|
||||
template <class HashFcn> struct OptimizedSeededHashFunction
|
||||
{ typedef seeded_hash_function<HashFcn> hash_function; };
|
||||
// Use Murmur2 instead for all types defined in __gnu_cxx::hash, plus
|
||||
// std::string which is commonly extended.
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<char*> >
|
||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<const char*> >
|
||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<std::string> >
|
||||
{ typedef seeded_hash_function<Murmur2StringPiece> hash_function; };
|
||||
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<char> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<unsigned char> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<short> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<unsigned short> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<int> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<unsigned int> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<long> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash<unsigned long> >
|
||||
{ typedef seeded_hash_function<Murmur2> hash_function; };
|
||||
|
||||
} // namespace cxxmph
|
|
@ -12,7 +12,12 @@ template <> struct hash<std::string> {
|
|||
return MurmurHash2(s.c_str(), s.length(), 1 /* seed */);
|
||||
}
|
||||
};
|
||||
}
|
||||
template <> struct hash<long long int> {
|
||||
std::size_t operator()(const long long int& s) const {
|
||||
return MurmurHash2(reinterpret_cast<const char*>(&s), sizeof(long long int), 1 /* seed */);
|
||||
}
|
||||
};
|
||||
} // namespace __gnu_cxx
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
|
@ -63,11 +68,25 @@ class cmph_hash_map {
|
|||
void pack() { rehash(); }
|
||||
|
||||
private:
|
||||
void rehash();
|
||||
std::vector<value_type> values_;
|
||||
MPHTable table_;
|
||||
typedef typename __gnu_cxx::hash_map<Key, Data, HashFcn, EqualKey, Alloc> slack_type;
|
||||
slack_type slack_;
|
||||
// Iterator adaptor over a sequence of pairs: behaves like the wrapped
// iterator, except that dereferencing yields only the pair's first member.
template <typename iterator>
struct iterator_first : public iterator {
  // Wraps a copy of it.
  iterator_first(iterator it) : iterator(it) { }

  // Returns a const reference to the first member of the current element.
  const typename iterator::value_type::first_type& operator*() const {
    // Go through the base-class view so the wrapped iterator's own
    // dereference operator is used instead of this one.
    const iterator& underlying = *this;
    return (*underlying).first;
  }
};
|
||||
|
||||
template <typename iterator>
|
||||
iterator_first<iterator> make_iterator_first(iterator it) {
|
||||
return iterator_first<iterator>(it);
|
||||
}
|
||||
|
||||
|
||||
void rehash();
|
||||
std::vector<value_type> values_;
|
||||
SimpleMPHTable<Key, typename OptimizedSeededHashFunction<HashFcn>::hash_function> table_;
|
||||
typedef typename __gnu_cxx::hash_map<Key, Data, HashFcn, EqualKey, Alloc> slack_type;
|
||||
slack_type slack_;
|
||||
};
|
||||
|
||||
CMPH_TMPL_SPEC
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
#include "stringpiece.h"
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
template <typename iterator>
|
||||
struct iterator_first : public iterator {
|
||||
iterator_first(iterator it) : iterator(it) { }
|
||||
|
@ -10,3 +14,30 @@ template <typename iterator>
|
|||
iterator_first<iterator> make_iterator_first(iterator it) {
|
||||
return iterator_first<iterator>(it);
|
||||
}
|
||||
|
||||
template <typename value> class MakeStringPiece {
|
||||
public:
|
||||
StringPiece operator()(const value& v) { return StringPiece(reinterpret_cast<const char*>(&v), sizeof(value)); }
|
||||
};
|
||||
template <> class MakeStringPiece<std::string> {
|
||||
public:
|
||||
StringPiece operator()(const std::string& v) { return StringPiece(v); }
|
||||
};
|
||||
template <> class MakeStringPiece<const char*> {
|
||||
public:
|
||||
StringPiece operator()(const char* v) { return StringPiece(v); }
|
||||
};
|
||||
|
||||
template <typename iterator>
|
||||
struct iterator_stringpiece : public iterator {
|
||||
iterator_stringpiece(iterator it) : iterator(it) { }
|
||||
StringPiece operator*() const {
|
||||
return MakeStringPiece<typename iterator::value_type::first_type>()(this->iterator::operator*());
|
||||
}
|
||||
};
|
||||
template <typename iterator>
|
||||
iterator_stringpiece<iterator> make_iterator_stringpiece(iterator it) {
|
||||
return iterator_stringpiece<iterator>(it);
|
||||
}
|
||||
|
||||
} // namespace cxxmph
|
||||
|
|
|
@ -32,18 +32,12 @@ static cmph_uint8 kBdzLookupTable[] =
|
|||
2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0
|
||||
};
|
||||
|
||||
static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
|
||||
void set_2bit_value(vector<cmph_uint8> *d, cmph_uint8 i, cmph_uint8 v) {
|
||||
(*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3];
|
||||
}
|
||||
// Reads the 2-bit value stored in slot i of the packed array d (four slots
// per byte: i >> 2 selects the byte, (i & 3) << 1 the bit offset inside it).
// The slot index is 32 bits wide: callers pass cmph_uint32 vertex ids, which
// an 8-bit index would silently truncate.
cmph_uint32 get_2bit_value(const vector<cmph_uint8>& d, cmph_uint32 i) {
  return (d[(i >> 2)] >> ((i & 3) << 1)) & 3;
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
const cmph_uint8 MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
|
||||
|
||||
void MPHTable::clear() {
|
||||
// TODO(davi) implement me
|
||||
}
|
||||
|
@ -166,18 +160,6 @@ void MPHTable::Ranking() {
|
|||
}
|
||||
}
|
||||
|
||||
// Looks up key: hashes it with the three selected hash functions, maps each
// hash into its own third of the vertex range, uses the 2-bit values stored
// in g_ for those vertices to pick one of them, and returns its rank.
cmph_uint32 MPHTable::Search(const key_type& key) const {
  cmph_uint32 v0 = hash_function_[0](key);
  cmph_uint32 v1 = hash_function_[1](key);
  cmph_uint32 v2 = hash_function_[2](key);
  // Vertex ranges: [0, r_), [r_, 2 * r_), [2 * r_, 3 * r_).
  v0 = v0 % r_;
  v1 = v1 % r_ + r_;
  v2 = v2 % r_ + (r_ << 1);
  const cmph_uint32 candidates[3] = { v0, v1, v2 };
  const cmph_uint32 selector =
      (get_2bit_value(g_, v0) + get_2bit_value(g_, v1) + get_2bit_value(g_, v2)) % 3;
  cmph_uint32 vertex = candidates[selector];
  cerr << "Search found vertex " << vertex << endl;
  return Rank(vertex);
}
|
||||
|
||||
cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const {
|
||||
cmph_uint32 index = vertex >> b_;
|
||||
cmph_uint32 base_rank = ranktable_[index];
|
||||
|
@ -202,8 +184,4 @@ cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const {
|
|||
return base_rank;
|
||||
}
|
||||
|
||||
// Returns the index assigned to key; thin wrapper around Search().
cmph_uint32 MPHTable::index(const key_type& key) const {
  const cmph_uint32 found = Search(key);
  return found;
}
|
||||
|
||||
} // namespace cxxmph
|
||||
|
|
|
@ -11,31 +11,26 @@
|
|||
using std::cerr;
|
||||
using std::endl;
|
||||
|
||||
#include "randomly_seeded_hash.h"
|
||||
#include "stringpiece.h"
|
||||
#include "cmph_hash_function.h"
|
||||
#include "trigraph.h"
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
class MPHTable {
|
||||
public:
|
||||
// This class could be a template for both key type and hash function, but we
|
||||
// chose to go with simplicity.
|
||||
typedef StringPiece key_type;
|
||||
typedef RandomlySeededHashFunction<Murmur2StringPiece> hasher_type;
|
||||
|
||||
MPHTable(double c = 1.23, cmph_uint8 b = 7) :
|
||||
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0) { }
|
||||
~MPHTable() {}
|
||||
|
||||
template <class ForwardIterator>
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool Reset(ForwardIterator begin, ForwardIterator end);
|
||||
cmph_uint32 index(const key_type& x) const;
|
||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||
cmph_uint32 index(const Key& x) const;
|
||||
cmph_uint32 size() const { return m_; }
|
||||
void clear();
|
||||
|
||||
private:
|
||||
template <class ForwardIterator>
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool Mapping(ForwardIterator begin, ForwardIterator end,
|
||||
std::vector<TriGraph::Edge>* edges,
|
||||
std::vector<cmph_uint32>* queue);
|
||||
|
@ -43,7 +38,6 @@ class MPHTable {
|
|||
void Assigning(const std::vector<TriGraph::Edge>& edges,
|
||||
const std::vector<cmph_uint32>& queue);
|
||||
void Ranking();
|
||||
cmph_uint32 Search(const key_type& key) const;
|
||||
cmph_uint32 Rank(cmph_uint32 vertex) const;
|
||||
|
||||
// Algorithm parameters
|
||||
|
@ -63,14 +57,23 @@ class MPHTable {
|
|||
std::vector<cmph_uint8> g_;
|
||||
// The table used for the rank step of the minimal perfect hash function
|
||||
std::vector<cmph_uint32> ranktable_;
|
||||
// The selected hash function triplet for finding the edges in the minimal
|
||||
// The selected hash seed triplet for finding the edges in the minimal
|
||||
// perfect hash function graph.
|
||||
hasher_type hash_function_[3];
|
||||
cmph_uint32 hash_seed_[3];
|
||||
|
||||
static const cmph_uint8 valuemask[];
|
||||
static void set_2bit_value(std::vector<cmph_uint8> *d, cmph_uint8 i, cmph_uint8 v) {
|
||||
(*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3];
|
||||
}
|
||||
// Reads the 2-bit value stored in slot i of the packed array d (four slots
// per byte: i >> 2 selects the byte, (i & 3) << 1 the bit offset inside it).
// The slot index is 32 bits wide: callers pass cmph_uint32 vertex ids, which
// an 8-bit index would silently truncate.
static cmph_uint32 get_2bit_value(const std::vector<cmph_uint8>& d, cmph_uint32 i) {
  return (d[(i >> 2)] >> ((i & 3) << 1)) & 3;
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
// Template method needs to go in the header file.
|
||||
template <class ForwardIterator>
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) {
|
||||
m_ = end - begin;
|
||||
r_ = static_cast<cmph_uint32>(ceil((c_*m_)/3));
|
||||
|
@ -85,10 +88,8 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) {
|
|||
std::vector<cmph_uint32> queue;
|
||||
while (1) {
|
||||
cerr << "Iterations missing: " << iterations << endl;
|
||||
for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type();
|
||||
// hash_function_[0] = hasher_type();
|
||||
cerr << "Seed: " << hash_function_[0].seed << endl;
|
||||
if (Mapping(begin, end, &edges, &queue)) break;
|
||||
for (int i = 0; i < 3; ++i) hash_seed_[i] = random();
|
||||
if (Mapping<SeededHashFcn>(begin, end, &edges, &queue)) break;
|
||||
else --iterations;
|
||||
if (iterations == 0) break;
|
||||
}
|
||||
|
@ -99,15 +100,14 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) {
|
|||
return true;
|
||||
}
|
||||
|
||||
template <class ForwardIterator>
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool MPHTable::Mapping(
|
||||
ForwardIterator begin, ForwardIterator end,
|
||||
std::vector<TriGraph::Edge>* edges, std::vector<cmph_uint32>* queue) {
|
||||
TriGraph graph(n_, m_);
|
||||
for (ForwardIterator it = begin; it != end; ++it) {
|
||||
cmph_uint32 h[3];
|
||||
for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it);
|
||||
// hash_function_[0](*it, h);
|
||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
|
||||
cmph_uint32 v0 = h[0] % r_;
|
||||
cmph_uint32 v1 = h[1] % r_ + r_;
|
||||
cmph_uint32 v2 = h[2] % r_ + (r_ << 1);
|
||||
|
@ -121,6 +121,28 @@ bool MPHTable::Mapping(
|
|||
return false;
|
||||
}
|
||||
|
||||
template <class SeededHashFcn, class Key>
|
||||
cmph_uint32 MPHTable::index(const Key& key) const {
|
||||
cmph_uint32 h[3];
|
||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
|
||||
h[0] = h[0] % r_;
|
||||
h[1] = h[1] % r_ + r_;
|
||||
h[2] = h[2] % r_ + (r_ << 1);
|
||||
cmph_uint32 vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3];
|
||||
cerr << "Search found vertex " << vertex << endl;
|
||||
return Rank(vertex);
|
||||
}
|
||||
|
||||
template <class Key, class HashFcn = typename OptimizedSeededHashFunction<__gnu_cxx::hash<Key> >::hash_function>
|
||||
class SimpleMPHTable : public MPHTable {
|
||||
public:
|
||||
template <class ForwardIterator>
|
||||
bool Reset(ForwardIterator begin, ForwardIterator end) {
|
||||
return MPHTable::Reset<HashFcn>(begin, end);
|
||||
}
|
||||
cmph_uint32 index(const Key& key) { return MPHTable::index<HashFcn>(key); }
|
||||
};
|
||||
|
||||
} // namespace cxxmph
|
||||
|
||||
#endif // __CXXMPH_MPHTABLE_H__
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using cxxmph::MPHTable;
|
||||
using cxxmph::SimpleMPHTable;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
|
@ -23,7 +23,7 @@ int main(int argc, char** argv) {
|
|||
keys.push_back("diogo");
|
||||
keys.push_back("algume");
|
||||
|
||||
MPHTable mphtable;
|
||||
SimpleMPHTable<string> mphtable;
|
||||
assert(mphtable.Reset(keys.begin(), keys.end()));
|
||||
vector<int> ids;
|
||||
for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "../src/cmph_types.h"
|
||||
#include "cmph_types.h"
|
||||
#include "MurmurHash2.h"
|
||||
#include "stringpiece.h"
|
||||
|
||||
|
|
|
@ -172,6 +172,8 @@ inline bool operator>=(const cxxmph::StringPiece& x, const cxxmph::StringPiece&
|
|||
}
|
||||
|
||||
// allow StringPiece to be logged
|
||||
extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece);
|
||||
// Streams the characters viewed by piece by copying them into a temporary
// std::string and inserting that string into o.
inline std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece) {
  const std::string text(piece.data(), piece.size());
  return o << text;
}
|
||||
|
||||
#endif // CXXMPH_STRINGPIECE_H__
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
|
||||
#include <vector>
|
||||
|
||||
#include "../src/cmph_types.h"
|
||||
#include "cmph_types.h"
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
|
|
Loading…
Reference in New Issue