From 8663285897d319e0bbc5ce7df04140093ddc04c4 Mon Sep 17 00:00:00 2001 From: davi Date: Fri, 5 Nov 2010 04:40:15 -0200 Subject: [PATCH] Better design for hash templates. --- cxxmph/Makefile.am | 3 +- cxxmph/cmph_hash_function.h | 77 +++++++++++++++++++++++++++++++++++ cxxmph/cmph_hash_map.h | 31 +++++++++++--- cxxmph/iterator_first.h | 31 ++++++++++++++ cxxmph/mphtable.cc | 26 +----------- cxxmph/mphtable.h | 64 +++++++++++++++++++---------- cxxmph/mphtable_test.cc | 4 +- cxxmph/randomly_seeded_hash.h | 2 +- cxxmph/stringpiece.h | 4 +- cxxmph/trigraph.h | 2 +- 10 files changed, 187 insertions(+), 57 deletions(-) create mode 100644 cxxmph/cmph_hash_function.h diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 10bd278..7566f00 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,7 +1,8 @@ bin_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test lib_LTLIBRARIES = libcxxmph.la +include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cmph_hash_function.h -libcxxmph_la_SOURCES = stringpiece.h MurmurHash2.h randomly_seeded_hash.h trigragh.h trigraph.cc mphtable.h mphtable.cc +libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cmph_hash_function.h libcxxmph_la_LDFLAGS = -version-info 0:0:0 cmph_hash_map_test_LDADD = libcxxmph.la diff --git a/cxxmph/cmph_hash_function.h b/cxxmph/cmph_hash_function.h new file mode 100644 index 0000000..900491d --- /dev/null +++ b/cxxmph/cmph_hash_function.h @@ -0,0 +1,77 @@ +#include +#include // for __gnu_cxx::hash + +#include "MurmurHash2.h" +#include "stringpiece.h" +#include "cmph_types.h" + +namespace cxxmph { + +template +struct seeded_hash_function { + template + cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + return HashFcn()(k) ^ seed; + } +}; + +struct Murmur2 { + template + cmph_uint32 operator()(const Key& k) const { + return MurmurHash2(k, sizeof(Key), 1 /* seed */); + } +}; +struct Murmur2StringPiece { + template + cmph_uint32 operator()(const Key& k) const { + StringPiece s(k); + return MurmurHash2(k.data(), k.length(), 1 /* seed */); + } +}; + +template <> +struct seeded_hash_function { + template + cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + return MurmurHash2(k, sizeof(Key), seed); + } +}; + +template <> +struct seeded_hash_function { + template + cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + StringPiece s(k); + return MurmurHash2(k.data(), k.length(), seed); + } +}; + +template struct OptimizedSeededHashFunction +{ typedef seeded_hash_function hash_function; }; +// Use Murmur2 instead for all types defined in __gnu_cxx::hash, plus +// std::string which is commonly extended. +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; + +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; + +} // namespace cxxmph diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 12d39f1..a606c32 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -12,7 +12,12 @@ template <> struct hash { return MurmurHash2(s.c_str(), s.length(), 1 /* seed */); } }; -} +template <> struct hash { + std::size_t operator()(const long long int& s) const { + return MurmurHash2(reinterpret_cast(&s), sizeof(long long int), 1 /* seed */); + } +}; +} // namespace __gnu_cxx namespace cxxmph { @@ -63,11 +68,25 @@ class cmph_hash_map { void pack() { rehash(); } private: - void rehash(); - std::vector values_; - MPHTable table_; - typedef typename __gnu_cxx::hash_map slack_type; - slack_type slack_; + template + struct iterator_first : public iterator { + iterator_first(iterator it) : iterator(it) { } + const typename iterator::value_type::first_type& operator*() const { + return this->iterator::operator*().first; + } + }; + + template + iterator_first make_iterator_first(iterator it) { + return iterator_first(it); + } + + + void rehash(); + std::vector values_; + SimpleMPHTable::hash_function> table_; + typedef typename __gnu_cxx::hash_map slack_type; + slack_type slack_; }; CMPH_TMPL_SPEC diff --git a/cxxmph/iterator_first.h b/cxxmph/iterator_first.h index d8350af..1babb77 100644 --- a/cxxmph/iterator_first.h +++ b/cxxmph/iterator_first.h @@ -1,3 +1,7 @@ +#include "stringpiece.h" + +namespace cxxmph { + template struct iterator_first : public iterator { iterator_first(iterator it) : iterator(it) { } @@ -10,3 +14,30 @@ template iterator_first make_iterator_first(iterator it) { return iterator_first(it); } + +template class MakeStringPiece { + public: + StringPiece operator()(const value& v) { return StringPiece(reinterpret_cast(&v), sizeof(value)); } +}; +template <> class MakeStringPiece { + public: + StringPiece operator()(const std::string& v) { return StringPiece(v); } +}; +template <> class MakeStringPiece { + public: + StringPiece operator()(const char* v) { return StringPiece(v); } +}; + +template +struct iterator_stringpiece : public iterator { + iterator_stringpiece(iterator it) : iterator(it) { } + StringPiece operator*() const { + return MakeStringPiece()(this->iterator::operator*()); + } +}; +template +iterator_stringpiece make_iterator_stringpiece(iterator it) { + return iterator_stringpiece(it); +} + +} // namespace cxxmph diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 0b899da..d3537a9 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -32,18 +32,12 @@ static cmph_uint8 kBdzLookupTable[] = 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 }; -static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; -void set_2bit_value(vector *d, cmph_uint8 i, cmph_uint8 v) { - (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; -} -cmph_uint32 get_2bit_value(const vector& d, cmph_uint8 i) { - return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; -} - } // anonymous namespace namespace cxxmph { +const cmph_uint8 MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; + void MPHTable::clear() { // TODO(davi) impolement me } @@ -166,18 +160,6 @@ void MPHTable::Ranking() { } } -cmph_uint32 MPHTable::Search(const key_type& key) const { - cmph_uint32 h[3]; - for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); - // hash_function_[0](key, h); - h[0] = h[0] % r_; - h[1] = h[1] % r_ + r_; - h[2] = h[2] % r_ + (r_ << 1); - cmph_uint32 vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3]; - cerr << "Search found vertex " << vertex << endl; - return Rank(vertex); -} - cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { cmph_uint32 index = vertex >> b_; cmph_uint32 base_rank = ranktable_[index]; @@ -202,8 +184,4 @@ cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { return base_rank; } -cmph_uint32 MPHTable::index(const key_type& key) const { - return Search(key); -} - } // namespace cxxmph diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index 0a37799..2a3786a 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -11,31 +11,26 @@ using std::cerr; using std::endl; -#include "randomly_seeded_hash.h" -#include "stringpiece.h" +#include "cmph_hash_function.h" #include "trigraph.h" namespace cxxmph { class MPHTable { public: - // This class could be a template for both key type and hash function, but we - // chose to go with simplicity. - typedef StringPiece key_type; - typedef RandomlySeededHashFunction hasher_type; - MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b), m_(0), n_(0), k_(0), r_(0) { } ~MPHTable() {} - template + template bool Reset(ForwardIterator begin, ForwardIterator end); - cmph_uint32 index(const key_type& x) const; + template // must agree with Reset + cmph_uint32 index(const Key& x) const; cmph_uint32 size() const { return m_; } void clear(); private: - template + template bool Mapping(ForwardIterator begin, ForwardIterator end, std::vector* edges, std::vector* queue); @@ -43,7 +38,6 @@ class MPHTable { void Assigning(const std::vector& edges, const std::vector& queue); void Ranking(); - cmph_uint32 Search(const key_type& key) const; cmph_uint32 Rank(cmph_uint32 vertex) const; // Algorithm parameters @@ -63,14 +57,23 @@ class MPHTable { std::vector g_; // The table used for the rank step of the minimal perfect hash function std::vector ranktable_; - // The selected hash function triplet for finding the edges in the minimal + // The selected hash seed triplet for finding the edges in the minimal // perfect hash function graph. - hasher_type hash_function_[3]; + cmph_uint32 hash_seed_[3]; + + static const cmph_uint8 valuemask[]; + static void set_2bit_value(std::vector *d, cmph_uint8 i, cmph_uint8 v) { + (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; + } + static cmph_uint32 get_2bit_value(const std::vector& d, cmph_uint8 i) { + return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; + } + }; // Template method needs to go in the header file. -template +template bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { m_ = end - begin; r_ = static_cast(ceil((c_*m_)/3)); @@ -85,10 +88,8 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { std::vector queue; while (1) { cerr << "Iterations missing: " << iterations << endl; - for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type(); - // hash_function_[0] = hasher_type(); - cerr << "Seed: " << hash_function_[0].seed << endl; - if (Mapping(begin, end, &edges, &queue)) break; + for (int i = 0; i < 3; ++i) hash_seed_[i] = random(); + if (Mapping(begin, end, &edges, &queue)) break; else --iterations; if (iterations == 0) break; } @@ -99,15 +100,14 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { return true; } -template +template bool MPHTable::Mapping( ForwardIterator begin, ForwardIterator end, std::vector* edges, std::vector* queue) { TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { cmph_uint32 h[3]; - for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); - // hash_function_[0](*it, h); + for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); cmph_uint32 v0 = h[0] % r_; cmph_uint32 v1 = h[1] % r_ + r_; cmph_uint32 v2 = h[2] % r_ + (r_ << 1); @@ -121,6 +121,28 @@ bool MPHTable::Mapping( return false; } +template +cmph_uint32 MPHTable::index(const Key& key) const { + cmph_uint32 h[3]; + for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); + h[0] = h[0] % r_; + h[1] = h[1] % r_ + r_; + h[2] = h[2] % r_ + (r_ << 1); + cmph_uint32 vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3]; + cerr << "Search found vertex " << vertex << endl; + return Rank(vertex); +} + +template >::hash_function> +class SimpleMPHTable : public MPHTable { + public: + template + bool Reset(ForwardIterator begin, ForwardIterator end) { + return MPHTable::Reset(begin, end); + } + cmph_uint32 index(const Key& key) { return MPHTable::index(key); } +}; + } // namespace cxxmph #endif // __CXXMPH_MPHTABLE_H__ diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mphtable_test.cc index a745718..eb6ed3f 100644 --- a/cxxmph/mphtable_test.cc +++ b/cxxmph/mphtable_test.cc @@ -7,7 +7,7 @@ using std::string; using std::vector; -using cxxmph::MPHTable; +using cxxmph::SimpleMPHTable; int main(int argc, char** argv) { @@ -23,7 +23,7 @@ int main(int argc, char** argv) { keys.push_back("diogo"); keys.push_back("algume"); - MPHTable mphtable; + SimpleMPHTable mphtable; assert(mphtable.Reset(keys.begin(), keys.end())); vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { diff --git a/cxxmph/randomly_seeded_hash.h b/cxxmph/randomly_seeded_hash.h index fa382dd..747bbf3 100644 --- a/cxxmph/randomly_seeded_hash.h +++ b/cxxmph/randomly_seeded_hash.h @@ -6,7 +6,7 @@ #include -#include "../src/cmph_types.h" +#include "cmph_types.h" #include "MurmurHash2.h" #include "stringpiece.h" diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h index fdd8f75..4595dc7 100644 --- a/cxxmph/stringpiece.h +++ b/cxxmph/stringpiece.h @@ -172,6 +172,8 @@ inline bool operator>=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& } // allow StringPiece to be logged -extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece); +inline std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece) { + return operator<<(o, std::string(piece.data(), piece.size())); +} #endif // CXXMPH_STRINGPIECE_H__ diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h index 18d8d98..7321d5a 100644 --- a/cxxmph/trigraph.h +++ b/cxxmph/trigraph.h @@ -9,7 +9,7 @@ #include -#include "../src/cmph_types.h" +#include "cmph_types.h" namespace cxxmph {