Small fixes, more comments.

This commit is contained in:
Davi de Castro Reis 2011-11-05 15:15:11 -02:00
parent d3b3b3dfba
commit d4ee76b7bf
5 changed files with 66 additions and 6 deletions

View File

@ -1,5 +1,5 @@
TESTS = $(check_PROGRAMS) TESTS = $(check_PROGRAMS)
check_PROGRAMS = mph_map_test mph_index_test check_PROGRAMS = mph_map_test mph_index_test trigraph_test
noinst_PROGRAMS = bm_index bm_map noinst_PROGRAMS = bm_index bm_map
bin_PROGRAMS = cxxmph bin_PROGRAMS = cxxmph
lib_LTLIBRARIES = libcxxmph.la lib_LTLIBRARIES = libcxxmph.la
@ -17,6 +17,9 @@ mph_index_test_SOURCES = mph_index_test.cc
bm_index_LDADD = libcxxmph.la bm_index_LDADD = libcxxmph.la
bm_index_SOURCES = bm_common.cc bm_index.cc bm_index_SOURCES = bm_common.cc bm_index.cc
trigraph_test_LDADD = libcxxmph.la
trigraph_test_SOURCES = trigraph_test.cc
bm_map_LDADD = libcxxmph.la bm_map_LDADD = libcxxmph.la
bm_map_SOURCES = bm_common.cc bm_map.cc bm_map_SOURCES = bm_common.cc bm_map.cc

View File

@ -2,6 +2,25 @@
#define __CXXMPH_MPH_INDEX_H__ #define __CXXMPH_MPH_INDEX_H__
// Minimal perfect hash abstraction implementing the BDZ algorithm // Minimal perfect hash abstraction implementing the BDZ algorithm
//
// This is a data structure that given a set of known keys S, will create a
// mapping from S to [0..|S|). The class is informed about S through the Reset
// method and the mapping is queried by calling index(key).
//
// This is a pretty uncommon data structure, and if your application has a real
// use case for it, chances are that it is a real win. If all you are doing is
// a straightforward implementation of an in-memory associative mapping data
// structure (e.g., mph_map.h), then it will probably be slower, since the
// evaluation of index() is typically slower than the total cost of running a
// traditional hash function over a key and doing 2-3 conflict resolutions on
// 100byte-ish strings.
//
// Notes:
//
// Most users can use the SimpleMPHIndex wrapper instead of MPHIndex, which
// has confusing template parameters.
// This class only implements a minimal perfect hash function; it does not
// implement an associative mapping data structure.
#include <stdint.h> #include <stdint.h>
@ -31,16 +50,20 @@ class MPHIndex {
template <class SeededHashFcn, class ForwardIterator> template <class SeededHashFcn, class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end); bool Reset(ForwardIterator begin, ForwardIterator end);
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn, class Key> // must agree with Reset
// Get a unique identifier for x, in the range [0;size()). If x wasn't part
// of the input in the last Reset call, returns a random value.
uint32_t index(const Key& x) const; uint32_t index(const Key& x) const;
uint32_t size() const { return m_; } uint32_t size() const { return m_; }
void clear(); void clear();
// Functions for advanced users. Please avoid unless you know what you are doing.
uint32_t perfect_hash_size() const { return n_; } uint32_t perfect_hash_size() const { return n_; }
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t perfect_hash(const Key& x) const; uint32_t perfect_hash(const Key& x) const;
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t minimal_perfect_hash(const Key& x) const; uint32_t minimal_perfect_hash(const Key& x) const;
// Serialization machinery for mmap usage.
// Serialization for mmap usage - not tested well, ping me if you care.
// Serialized tables are not guaranteed to work across versions or different // Serialized tables are not guaranteed to work across versions or different
// endianness (although they could easily be made to be). // endianness (although they could easily be made to be).
uint32_t serialize_bytes_needed() const; uint32_t serialize_bytes_needed() const;
@ -110,7 +133,7 @@ bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) {
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
int iterations = 10; int iterations = 1000;
std::vector<TriGraph::Edge> edges; std::vector<TriGraph::Edge> edges;
std::vector<uint32_t> queue; std::vector<uint32_t> queue;
while (1) { while (1) {
@ -176,6 +199,8 @@ uint32_t MPHIndex::index(const Key& key) const {
return minimal_perfect_hash<SeededHashFcn, Key>(key); return minimal_perfect_hash<SeededHashFcn, Key>(key);
} }
// Simple wrapper around MPHIndex to simplify calling code. Please refer to the
// MPHIndex class for documentation.
template <class Key, class HashFcn = typename seeded_hash<std::tr1::hash<Key> >::hash_function> template <class Key, class HashFcn = typename seeded_hash<std::tr1::hash<Key> >::hash_function>
class SimpleMPHIndex : public MPHIndex { class SimpleMPHIndex : public MPHIndex {
public: public:

View File

@ -1,3 +1,10 @@
// Implementation of the unordered associative mapping interface using a
// minimal perfect hash function.
//
// This class is about 20% to 100% slower than unordered_map (or ext/hash_map)
// and should not be used if performance is a concern. In fact, you should only
// use it for educational purposes.
#include <algorithm> #include <algorithm>
#include <tr1/unordered_map> #include <tr1/unordered_map>
#include <vector> #include <vector>
@ -58,6 +65,7 @@ class mph_map {
const data_type& operator[](const key_type &k) const; const data_type& operator[](const key_type &k) const;
size_type bucket_count() const { return size(); } size_type bucket_count() const { return size(); }
// FIXME: not sure if this has the semantics I want
void rehash(size_type nbuckets /*ignored*/) { pack(); } void rehash(size_type nbuckets /*ignored*/) { pack(); }
protected: // mimicking STL implementation protected: // mimicking STL implementation
@ -156,7 +164,7 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
if (it != slack_.end()) return values_.begin() + it->second; if (it != slack_.end()) return values_.begin() + it->second;
} }
if (__builtin_expect(index_.size() == 0, 0)) return end(); if (__builtin_expect(index_.size() == 0, 0)) return end();
auto it = values_.begin() + index_.index(k); const_iterator it = values_.begin() + index_.index(k);
if (__builtin_expect(equal_(k, it->first), 1)) return it; if (__builtin_expect(equal_(k, it->first), 1)) return it;
return end(); return end();
} }
@ -167,7 +175,7 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
if (it != slack_.end()) return values_.begin() + it->second; if (it != slack_.end()) return values_.begin() + it->second;
} }
if (index_.size() == 0) return end(); if (index_.size() == 0) return end();
auto it = values_.begin() + index_.index(k); iterator it = values_.begin() + index_.index(k);
if (equal_(it->first, k)) return it; if (equal_(it->first, k)) return it;
return end(); return end();
} }

View File

@ -174,6 +174,8 @@ inline bool operator>=(const StringPiece& x, StringPiece& y) {
} // namespace cxxmph } // namespace cxxmph
// allow StringPiece to be logged // allow StringPiece to be logged
extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece); inline std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece) {
o << piece.as_string(); return o;
}
#endif // CXXMPH_STRINGPIECE_H__ #endif // CXXMPH_STRINGPIECE_H__

22
cxxmph/trigraph_test.cc Normal file
View File

@ -0,0 +1,22 @@
#include <cassert>
#include "trigraph.h"
using cxxmph::TriGraph;
int main(int argc, char** argv) {
TriGraph g(4, 2);
g.AddEdge(TriGraph::Edge(0, 1, 2));
g.AddEdge(TriGraph::Edge(1, 3, 2));
assert(g.vertex_degree()[0] == 1);
assert(g.vertex_degree()[1] == 2);
assert(g.vertex_degree()[2] == 2);
assert(g.vertex_degree()[3] == 1);
g.RemoveEdge(0);
assert(g.vertex_degree()[0] == 0);
assert(g.vertex_degree()[1] == 1);
assert(g.vertex_degree()[2] == 1);
assert(g.vertex_degree()[3] == 1);
std::vector<TriGraph::Edge> edges;
g.ExtractEdgesAndClear(&edges);
}