Small fixes, more comments.
This commit is contained in:
parent
d3b3b3dfba
commit
d4ee76b7bf
|
@ -1,5 +1,5 @@
|
||||||
TESTS = $(check_PROGRAMS)
|
TESTS = $(check_PROGRAMS)
|
||||||
check_PROGRAMS = mph_map_test mph_index_test
|
check_PROGRAMS = mph_map_test mph_index_test trigraph_test
|
||||||
noinst_PROGRAMS = bm_index bm_map
|
noinst_PROGRAMS = bm_index bm_map
|
||||||
bin_PROGRAMS = cxxmph
|
bin_PROGRAMS = cxxmph
|
||||||
lib_LTLIBRARIES = libcxxmph.la
|
lib_LTLIBRARIES = libcxxmph.la
|
||||||
|
@ -17,6 +17,9 @@ mph_index_test_SOURCES = mph_index_test.cc
|
||||||
bm_index_LDADD = libcxxmph.la
|
bm_index_LDADD = libcxxmph.la
|
||||||
bm_index_SOURCES = bm_common.cc bm_index.cc
|
bm_index_SOURCES = bm_common.cc bm_index.cc
|
||||||
|
|
||||||
|
trigraph_test_LDADD = libcxxmph.la
|
||||||
|
trigraph_test_SOURCES = trigraph_test.cc
|
||||||
|
|
||||||
bm_map_LDADD = libcxxmph.la
|
bm_map_LDADD = libcxxmph.la
|
||||||
bm_map_SOURCES = bm_common.cc bm_map.cc
|
bm_map_SOURCES = bm_common.cc bm_map.cc
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,25 @@
|
||||||
#define __CXXMPH_MPH_INDEX_H__
|
#define __CXXMPH_MPH_INDEX_H__
|
||||||
|
|
||||||
// Minimal perfect hash abstraction implementing the BDZ algorithm
|
// Minimal perfect hash abstraction implementing the BDZ algorithm
|
||||||
|
//
|
||||||
|
// This is a data structure that given a set of known keys S, will create a
|
||||||
|
// mapping from S to [0..|S|). The class is informed about S through the Reset
|
||||||
|
// method and the mapping is queried by calling index(key).
|
||||||
|
//
|
||||||
|
// This is a pretty uncommon data structure, and if your application has a real
|
||||||
|
// use case for it, chances are that it is a real win. If all you are doing is
|
||||||
|
// a straightforward implementation of an in-memory associative mapping data
|
||||||
|
// structure (e.g., mph_map.h), then it will probably be slower, since the
|
||||||
|
// evaluation of index() is typically slower than the total cost of running a
|
||||||
|
// traditional hash function over a key and doing 2-3 conflict resolutions on
|
||||||
|
// 100byte-ish strings.
|
||||||
|
//
|
||||||
|
// Notes:
|
||||||
|
//
|
||||||
|
// Most users can use the SimpleMPHIndex wrapper instead of the MPHIndex which
|
||||||
|
// has confusing template parameters.
|
||||||
|
// This class only implements a minimal perfect hash function, it does not
|
||||||
|
// implement an associative mapping data structure.
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
@ -31,16 +50,20 @@ class MPHIndex {
|
||||||
template <class SeededHashFcn, class ForwardIterator>
|
template <class SeededHashFcn, class ForwardIterator>
|
||||||
bool Reset(ForwardIterator begin, ForwardIterator end);
|
bool Reset(ForwardIterator begin, ForwardIterator end);
|
||||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||||
|
// Get a unique identifier for x, in the range [0, size()). If x wasn't part
|
||||||
|
// of the input in the last Reset call, returns a random value.
|
||||||
uint32_t index(const Key& x) const;
|
uint32_t index(const Key& x) const;
|
||||||
uint32_t size() const { return m_; }
|
uint32_t size() const { return m_; }
|
||||||
void clear();
|
void clear();
|
||||||
|
|
||||||
|
// Advanced-user functions. Please avoid unless you know what you are doing.
|
||||||
uint32_t perfect_hash_size() const { return n_; }
|
uint32_t perfect_hash_size() const { return n_; }
|
||||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||||
uint32_t perfect_hash(const Key& x) const;
|
uint32_t perfect_hash(const Key& x) const;
|
||||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||||
uint32_t minimal_perfect_hash(const Key& x) const;
|
uint32_t minimal_perfect_hash(const Key& x) const;
|
||||||
// Serialization machinery for mmap usage.
|
|
||||||
|
// Serialization for mmap usage - not tested well, ping me if you care.
|
||||||
// Serialized tables are not guaranteed to work across versions or different
|
// Serialized tables are not guaranteed to work across versions or different
|
||||||
// endianness (although they could easily be made to be).
|
// endianness (although they could easily be made to be).
|
||||||
uint32_t serialize_bytes_needed() const;
|
uint32_t serialize_bytes_needed() const;
|
||||||
|
@ -110,7 +133,7 @@ bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) {
|
||||||
|
|
||||||
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
|
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
|
||||||
|
|
||||||
int iterations = 10;
|
int iterations = 1000;
|
||||||
std::vector<TriGraph::Edge> edges;
|
std::vector<TriGraph::Edge> edges;
|
||||||
std::vector<uint32_t> queue;
|
std::vector<uint32_t> queue;
|
||||||
while (1) {
|
while (1) {
|
||||||
|
@ -176,6 +199,8 @@ uint32_t MPHIndex::index(const Key& key) const {
|
||||||
return minimal_perfect_hash<SeededHashFcn, Key>(key);
|
return minimal_perfect_hash<SeededHashFcn, Key>(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Simple wrapper around MPHIndex to simplify calling code. Please refer to the
|
||||||
|
// MPHIndex class for documentation.
|
||||||
template <class Key, class HashFcn = typename seeded_hash<std::tr1::hash<Key> >::hash_function>
|
template <class Key, class HashFcn = typename seeded_hash<std::tr1::hash<Key> >::hash_function>
|
||||||
class SimpleMPHIndex : public MPHIndex {
|
class SimpleMPHIndex : public MPHIndex {
|
||||||
public:
|
public:
|
||||||
|
|
|
@ -1,3 +1,10 @@
|
||||||
|
// Implementation of the unordered associative mapping interface using a
|
||||||
|
// minimal perfect hash function.
|
||||||
|
//
|
||||||
|
// This class is about 20% to 100% slower than unordered_map (or ext/hash_map)
|
||||||
|
// and should not be used if performance is a concern. In fact, you should only
|
||||||
|
// use it for educational purposes.
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <tr1/unordered_map>
|
#include <tr1/unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -58,6 +65,7 @@ class mph_map {
|
||||||
const data_type& operator[](const key_type &k) const;
|
const data_type& operator[](const key_type &k) const;
|
||||||
|
|
||||||
size_type bucket_count() const { return size(); }
|
size_type bucket_count() const { return size(); }
|
||||||
|
// FIXME: not sure if this has the semantics I want
|
||||||
void rehash(size_type nbuckets /*ignored*/) { pack(); }
|
void rehash(size_type nbuckets /*ignored*/) { pack(); }
|
||||||
|
|
||||||
protected: // mimicking STL implementation
|
protected: // mimicking STL implementation
|
||||||
|
@ -156,7 +164,7 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
|
||||||
if (it != slack_.end()) return values_.begin() + it->second;
|
if (it != slack_.end()) return values_.begin() + it->second;
|
||||||
}
|
}
|
||||||
if (__builtin_expect(index_.size() == 0, 0)) return end();
|
if (__builtin_expect(index_.size() == 0, 0)) return end();
|
||||||
auto it = values_.begin() + index_.index(k);
|
const_iterator it = values_.begin() + index_.index(k);
|
||||||
if (__builtin_expect(equal_(k, it->first), 1)) return it;
|
if (__builtin_expect(equal_(k, it->first), 1)) return it;
|
||||||
return end();
|
return end();
|
||||||
}
|
}
|
||||||
|
@ -167,7 +175,7 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
|
||||||
if (it != slack_.end()) return values_.begin() + it->second;
|
if (it != slack_.end()) return values_.begin() + it->second;
|
||||||
}
|
}
|
||||||
if (index_.size() == 0) return end();
|
if (index_.size() == 0) return end();
|
||||||
auto it = values_.begin() + index_.index(k);
|
iterator it = values_.begin() + index_.index(k);
|
||||||
if (equal_(it->first, k)) return it;
|
if (equal_(it->first, k)) return it;
|
||||||
return end();
|
return end();
|
||||||
}
|
}
|
||||||
|
|
|
@ -174,6 +174,8 @@ inline bool operator>=(const StringPiece& x, StringPiece& y) {
|
||||||
} // namespace cxxmph
|
} // namespace cxxmph
|
||||||
|
|
||||||
// allow StringPiece to be logged
|
// allow StringPiece to be logged
|
||||||
extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece);
|
// Stream insertion for cxxmph::StringPiece: writes piece.as_string() to `o`
// and returns `o` so that insertions can be chained. Defined inline in the
// header (the previous revision only declared it `extern`).
inline std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece) {
|
||||||
|
o << piece.as_string(); return o;
|
||||||
|
}
|
||||||
|
|
||||||
#endif // CXXMPH_STRINGPIECE_H__
|
#endif // CXXMPH_STRINGPIECE_H__
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
|
#include "trigraph.h"
|
||||||
|
|
||||||
|
using cxxmph::TriGraph;
|
||||||
|
|
||||||
|
// Smoke test for TriGraph: adds two edges, checks the per-vertex degree
// counters, removes one edge and re-checks them, then extracts the edges.
// argc/argv are unused. The checks rely on assert(), so they are compiled out
// when NDEBUG is defined.
int main(int argc, char** argv) {
|
||||||
|
// NOTE(review): arguments are presumably (vertex count, edge count) — confirm against trigraph.h.
TriGraph g(4, 2);
|
||||||
|
g.AddEdge(TriGraph::Edge(0, 1, 2));
|
||||||
|
g.AddEdge(TriGraph::Edge(1, 3, 2));
|
||||||
|
// Vertices 1 and 2 appear in both edges; vertices 0 and 3 appear in one each.
assert(g.vertex_degree()[0] == 1);
|
||||||
|
assert(g.vertex_degree()[1] == 2);
|
||||||
|
assert(g.vertex_degree()[2] == 2);
|
||||||
|
assert(g.vertex_degree()[3] == 1);
|
||||||
|
// The asserts below expect RemoveEdge(0) to drop edge (0, 1, 2): vertices
// 0, 1 and 2 each lose one degree while vertex 3 is unchanged.
g.RemoveEdge(0);
|
||||||
|
assert(g.vertex_degree()[0] == 0);
|
||||||
|
assert(g.vertex_degree()[1] == 1);
|
||||||
|
assert(g.vertex_degree()[2] == 1);
|
||||||
|
assert(g.vertex_degree()[3] == 1);
|
||||||
|
// The extracted edges are not inspected; this only exercises the call.
std::vector<TriGraph::Edge> edges;
|
||||||
|
g.ExtractEdgesAndClear(&edges);
|
||||||
|
}
|
Loading…
Reference in New Issue