From d4ee76b7bfacbde849fa2467e2e9b39e4f959959 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sat, 5 Nov 2011 15:15:11 -0200 Subject: [PATCH] Small fixes, more comments. --- cxxmph/Makefile.am | 5 ++++- cxxmph/mph_index.h | 29 +++++++++++++++++++++++++++-- cxxmph/mph_map.h | 12 ++++++++++-- cxxmph/stringpiece.h | 4 +++- cxxmph/trigraph_test.cc | 22 ++++++++++++++++++++++ 5 files changed, 66 insertions(+), 6 deletions(-) create mode 100644 cxxmph/trigraph_test.cc diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 04a90fe..55df057 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,5 +1,5 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_index_test +check_PROGRAMS = mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la @@ -17,6 +17,9 @@ mph_index_test_SOURCES = mph_index_test.cc bm_index_LDADD = libcxxmph.la bm_index_SOURCES = bm_common.cc bm_index.cc +trigraph_test_LDADD = libcxxmph.la +trigraph_test_SOURCES = trigraph_test.cc + bm_map_LDADD = libcxxmph.la bm_map_SOURCES = bm_common.cc bm_map.cc diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 3afc518..70cee68 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -2,6 +2,25 @@ #define __CXXMPH_MPH_INDEX_H__ // Minimal perfect hash abstraction implementing the BDZ algorithm +// +// This is a data structure that, given a set of known keys S, will create a +// mapping from S to [0..|S|). The class is informed about S through the Reset +// method and the mapping is queried by calling index(key). +// +// This is a pretty uncommon data structure, and if your application has a real +// use case for it, chances are that it is a real win. 
If all you are doing is +// a straightforward implementation of an in-memory associative mapping data +// structure (e.g., mph_map.h), then it will probably be slower, since the +// evaluation of index() is typically slower than the total cost of running a +// traditional hash function over a key and doing 2-3 conflict resolutions on +// 100byte-ish strings. +// +// Notes: +// +// Most users can use the SimpleMPHIndex wrapper instead of MPHIndex, which +// has confusing template parameters. +// This class only implements a minimal perfect hash function; it does not +// implement an associative mapping data structure. #include @@ -31,16 +50,20 @@ class MPHIndex { template bool Reset(ForwardIterator begin, ForwardIterator end); template // must agree with Reset + // Get a unique identifier for x, in the range [0;size()). If x wasn't part + // of the input in the last Reset call, returns a random value. uint32_t index(const Key& x) const; uint32_t size() const { return m_; } void clear(); + // Advanced user functions. Please avoid unless you know what you are doing. uint32_t perfect_hash_size() const { return n_; } template // must agree with Reset uint32_t perfect_hash(const Key& x) const; template // must agree with Reset uint32_t minimal_perfect_hash(const Key& x) const; - // Serialization machinery for mmap usage. + + // Serialization for mmap usage - not tested well, ping me if you care. // Serialized tables are not guaranteed to work across versions or different // endianness (although they could easily be made to be). 
uint32_t serialize_bytes_needed() const; @@ -110,7 +133,7 @@ bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; - int iterations = 10; + int iterations = 1000; std::vector edges; std::vector queue; while (1) { @@ -176,6 +199,8 @@ uint32_t MPHIndex::index(const Key& key) const { return minimal_perfect_hash(key); } +// Simple wrapper around MPHIndex to simplify calling code. Please refer to the +// MPHIndex class for documentation. template >::hash_function> class SimpleMPHIndex : public MPHIndex { public: diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index cd8f684..bcfebb6 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -1,3 +1,10 @@ +// Implementation of the unordered associative mapping interface using a +// minimal perfect hash function. +// +// This class is about 20% to 100% slower than unordered_map (or ext/hash_map) +// and should not be used if performance is a concern. In fact, you should only +// use it for educational purposes. 
+ #include #include #include @@ -58,6 +65,7 @@ class mph_map { const data_type& operator[](const key_type &k) const; size_type bucket_count() const { return size(); } + // FIXME: not sure if this has the semantics I want void rehash(size_type nbuckets /*ignored*/) { pack(); } protected: // mimicking STL implementation @@ -156,7 +164,7 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { if (it != slack_.end()) return values_.begin() + it->second; } if (__builtin_expect(index_.size() == 0, 0)) return end(); - auto it = values_.begin() + index_.index(k); + const_iterator it = values_.begin() + index_.index(k); if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } @@ -167,7 +175,7 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { if (it != slack_.end()) return values_.begin() + it->second; } if (index_.size() == 0) return end(); - auto it = values_.begin() + index_.index(k); + iterator it = values_.begin() + index_.index(k); if (equal_(it->first, k)) return it; return end(); } diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h index ee6d125..f1327ea 100644 --- a/cxxmph/stringpiece.h +++ b/cxxmph/stringpiece.h @@ -174,6 +174,8 @@ inline bool operator>=(const StringPiece& x, StringPiece& y) { } // namespace cxxmph // allow StringPiece to be logged -extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece); +inline std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece) { + o << piece.as_string(); return o; +} #endif // CXXMPH_STRINGPIECE_H__ diff --git a/cxxmph/trigraph_test.cc b/cxxmph/trigraph_test.cc new file mode 100644 index 0000000..6220138 --- /dev/null +++ b/cxxmph/trigraph_test.cc @@ -0,0 +1,22 @@ +#include + +#include "trigraph.h" + +using cxxmph::TriGraph; + +int main(int argc, char** argv) { + TriGraph g(4, 2); + g.AddEdge(TriGraph::Edge(0, 1, 2)); + g.AddEdge(TriGraph::Edge(1, 3, 2)); + assert(g.vertex_degree()[0] == 1); + assert(g.vertex_degree()[1] == 2); + 
assert(g.vertex_degree()[2] == 2); + assert(g.vertex_degree()[3] == 1); + g.RemoveEdge(0); + assert(g.vertex_degree()[0] == 0); + assert(g.vertex_degree()[1] == 1); + assert(g.vertex_degree()[2] == 1); + assert(g.vertex_degree()[3] == 1); + std::vector edges; + g.ExtractEdgesAndClear(&edges); +}