diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index da7fa84..10bd278 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cmph_hash_map_test mphtable_test +bin_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test lib_LTLIBRARIES = libcxxmph.la libcxxmph_la_SOURCES = stringpiece.h MurmurHash2.h randomly_seeded_hash.h trigragh.h trigraph.cc mphtable.h mphtable.cc @@ -9,3 +9,6 @@ cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc mphtable_test_LDADD = libcxxmph.la mphtable_test_SOURCES = mphtable_test.cc + +trigraph_test_LDADD = libcxxmph.la +trigraph_test_SOURCES = trigraph_test.cc diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h index 81051fe..aa9338f 100644 --- a/cxxmph/MurmurHash2.h +++ b/cxxmph/MurmurHash2.h @@ -1,3 +1,6 @@ +#ifndef __CXXMPH_MURMUR_HASH2__ +#define __CXXMPH_MURMUR_HASH2__ + //----------------------------------------------------------------------------- // MurmurHash2, by Austin Appleby @@ -12,6 +15,8 @@ // 2. It will not produce the same results on little-endian and big-endian // machines. +namespace { + unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) { // 'm' and 'r' are mixing constants generated offline. @@ -62,3 +67,7 @@ unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) return h; } + +} + +#endif // __CXXMPH_MURMUR_HASH2__ diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 2c5ba32..88ab6ed 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -1,49 +1,58 @@ #include +#include + +using std::cerr; +using std::endl; #include "mphtable.h" using std::vector; -namespace cxxmph { +namespace { -template -template -bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { - TableBuilderState st; - m_ = end - begin; - r_ = static_cast(ceil((c_*m_)/3)); - if (r_ % 2) == 0) r_ += 1; - n_ = 3*r_; - k_ = 1U << b_; +static const cmph_uint8 kUnassigned = 3; +// table used for looking up the number of assigned vertices to a 8-bit integer +static cmph_uint8 kBdzLookupTable[] = +{ +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 +}; - int iterations = 1000; - while (1) { - for (int i = 0; i < 3; ++i) hash_function_[i] = hasher(); - vector edges; - vector queue; - if (Mapping(begin, end, &edges, &queue)) break; - else --iterations; - if (iterations == 0) break; - } - if (iterations == 0) return false; - vector& edges; - graph->ExtractEdgesAndClear(&edges); - Assigning(queue, edges); - vector().swap(edges); - Ranking(); - +static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; +void set_2bit_value(vector *d, cmph_uint8 i, cmph_uint8 v) { + (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; +} +cmph_uint8 get_2bit_value(const vector& d, cmph_uint8 i) { + return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; } -template -bool MPHTable::GenerateQueue( +} // anonymous namespace + +namespace cxxmph { + +bool MPHTable::GenerateQueue( TriGraph* graph, vector* queue_output) { cmph_uint32 queue_head = 0, queue_tail = 0; - cmph_uint32 nedges = n_; - cmph_uint32 nvertices = m_; + cmph_uint32 nedges = m_; + cmph_uint32 nvertices = n_; // Relies on vector using 1 bit per element vector marked_edge((nedges >> 3) + 1, false); - Queue queue(nvertices, 0); - for (int i = 0; i < nedges; ++i) { + vector queue(nvertices, 0); + for (cmph_uint32 i = 0; i < nedges; ++i) { const TriGraph::Edge& e = graph->edges()[i]; if (graph->vertex_degree()[e[0]] == 1 || graph->vertex_degree()[e[1]] == 1 || @@ -74,102 +83,56 @@ bool MPHTable::GenerateQueue( return cycles == 0; } -template -template -bool MPHTable::Mapping( - ForwardIterator begin, ForwardIterator end, - vector* edges, vector queue) { - int cycles = 0; - TriGraph graph(m, n); - for (ForwardIterator it = begin; it != end; ++it) { - cmph_uint32 h[3]; - for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); - cmph_uint32 v0 = h[0] % r_; - cmph_uint32 v1 = h[1] % r_ + r_; - cmph_uint32 v2 = h[2] % r_ + (r_ << 1); - graph.AddEdge(Edge(v0, v1, v2)); - } - if (GenerateQueue(&graph, queue)) { - graph.ExtractEdgesAndClear(edges); - return true; - } - return false; -} - -template -void MPHTable::Assigning( - const vector& edges, const vector& queue) { +void MPHTable::Assigning( + const vector& edges, const vector& queue) { cmph_uint32 nedges = n_; cmph_uint32 current_edge = 0; vector marked_vertices(nedges + 1); - // TODO(davi) use half nibbles instead - // vector g(static_cast(ceil(nedges / 4.0)), - // std::numerical_limits::max()); - static const cmph_uint8 kUnassigned = 3; - vector(nedges, kUnassigned).swap(g_); + // Initialize vector of half nibbles with all bits set. + vector(nedges, std::numeric_limits::max()).swap(g_); for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; const TriGraph::Edge& e = edges[current_edge]; if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { - g_[e[1]] = kUnassigned; + set_2bit_value(&g_, e[1], kUnassigned); marked_vertices[e[1]] = true; } if (!marked_vertices[e[2]]) { - g_[e[2]] = kUnassigned; + set_2bit_value(&g_, e[2], kUnassigned); marked_vertices[e[2]] = true; } - g_[e[0]] = (6 - g_[e[1]] + g_[e2]) % 3; + set_2bit_value(&g_, e[0], (6 - (get_2bit_value(g_, e[1]) + get_2bit_value(g_, e[2]))) % 3); marked_vertices[e[0]] = true; - } else if (!marked_vertices[e[1]])) { - if (!marked_vertices[e[2]])) { - g_[e[2]] = kUnassigned; + } else if (!marked_vertices[e[1]]) { + if (!marked_vertices[e[2]]) { + set_2bit_value(&g_, e[2], kUnassigned); marked_vertices[e[2]] = true; } - g_[e[1]] = 7 - (g_[e[0]] + g_[e[2]]) % 3; + set_2bit_value(&g_, e[1], (7 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[2]))) % 3); marked_vertices[e[1]] = true; } else { - g_[e[2]] = (8 - g_[e[0]] + g_[e[1]]) % 3; + set_2bit_value(&g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); marked_vertices[e[2]] = true; } } } -// table used for looking up the number of assigned vertices to a 8-bit integer -static cmph_uint8 kBdzLookupTable[] = -{ -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 -}; - -template -void MPHTable::Ranking() { - cmph_uint32 nbytes_total = static_cast(ceil(st->n / 4.0)); +void MPHTable::Ranking() { + cmph_uint32 nbytes_total = static_cast(ceil(n_ / 4.0)); cmph_uint32 size = k_ >> 2U; - ranktablesize = static_cast(ceil(n_ / static_cast(k_))); - // TODO(davi) Change swap of member classes for resize + memset to avoid fragmentation + cmph_uint32 ranktablesize = static_cast( + ceil(n_ / static_cast(k_))); + // TODO(davi) Change swap of member classes for resize + memset to avoid + // fragmentation vector (ranktablesize).swap(ranktable_);; cmph_uint32 offset = 0; cmph_uint32 count = 0; cmph_uint32 i = 0; while (1) { - if (i == ranktable.size()) break; + if (i == ranktable_.size()) break; cmph_uint32 nbytes = size < nbytes_total ? size : nbytes_total; - for (j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; + for (cmph_uint32 j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; ranktable_[i] = count; offset += nbytes; nbytes_total -= size; @@ -177,36 +140,32 @@ void MPHTable::Ranking() { } } -template -cmph_uint32 MPHTable::Search(const key_type& key) const { - cmph_uint32 vertex; +cmph_uint32 MPHTable::Search(const key_type& key) const { cmph_uint32 h[3]; for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); - h[0] = h[0] % st->r; - h[1] = h[1] % st->r + st->r; - h[2] = h[2] % st->r + (st->r << 1); - cmph_uint32 vertex = h[(h[g_[h[0]] + g_[h[1]] + g_[h[2]]) % 3]; - return Rank(st->b, st->ranktable, vertex); + h[0] = h[0] % r_; + h[1] = h[1] % r_ + r_; + h[2] = h[2] % r_ + (r_ << 1); + cmph_uint32 vertex = h[(g_[h[0]] + g_[h[1]] + g_[h[2]]) % 3]; + return Rank(vertex); } -template -cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { +cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { cmph_uint32 index = vertex >> b_; cmph_uint32 base_rank = ranktable_[index]; - cmph_uint32 beg_idx_v = index << b; - cmph_uint32 beg_idx_b = index >> 2 - cmph_uint32 end_idx_b = index >> 2 + cmph_uint32 beg_idx_v = index << b_; + cmph_uint32 beg_idx_b = index >> 2; + cmph_uint32 end_idx_b = index >> 2; while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; while (beg_idx_v < vertex) { - if (g_[beg_idx_v) != kUnassigned) ++base_rank; + if (g_[beg_idx_v] != kUnassigned) ++base_rank; ++beg_idx_v; } return base_rank; } -template -cmph_uint32 MPHTable::index(const key_type& key) const { +cmph_uint32 MPHTable::index(const key_type& key) const { return Search(key); } diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index eccff61..c0ef402 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -3,21 +3,29 @@ // Minimal perfect hash abstraction implementing the BDZ algorithm +#include #include +#include + +using std::cerr; +using std::endl; + #include "randomly_seeded_hash.h" #include "stringpiece.h" #include "trigraph.h" namespace cxxmph { -template class MPHTable { public: - typedef Key key_type; - typedef NewRandomlySeededHashFcn hasher; + // This class could be a template for both key type and hash function, but we + // chose to go with simplicity. + typedef StringPiece key_type; + typedef RandomlySeededHashFunction hasher_type; + MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b) { } - ~MPHTable(); + ~MPHTable() {} template bool Reset(ForwardIterator begin, ForwardIterator end); @@ -26,21 +34,23 @@ class MPHTable { private: template bool Mapping(ForwardIterator begin, ForwardIterator end, - vector* edges, vector queue); - bool GenerateQueue(TriGraph* graph, vector* queue); - void Assigning(TriGraph* graph_builder, Queue* queue); - void Ranking(TriGraph* graph_builder, Queue* queue); - cmph_uint32 Search(const StringPiece& key); - cmph_uint32 Rank(const StringPiece& key); + std::vector* edges, + std::vector* queue); + bool GenerateQueue(TriGraph* graph, std::vector* queue); + void Assigning(const std::vector& edges, + const std::vector& queue); + void Ranking(); + cmph_uint32 Search(const key_type& key) const; + cmph_uint32 Rank(cmph_uint32 vertex) const; // Algorithm parameters - cmph_uint8 b_; // Number of bits of the kth index in the ranktable double c_; // Number of bits per key (? is it right) + cmph_uint8 b_; // Number of bits of the kth index in the ranktable // Values used during generation cmph_uint32 m_; // edges count cmph_uint32 n_; // vertex count - cmph_uint32 k_ // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ + cmph_uint32 k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ // Values used during search @@ -52,10 +62,59 @@ class MPHTable { std::vector ranktable_; // The selected hash function triplet for finding the edges in the minimal // perfect hash function graph. - hasher hash_function_[3]; + hasher_type hash_function_[3]; }; +// Template method needs to go in the header file. +template +bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { + m_ = end - begin; + r_ = static_cast(ceil((c_*m_)/3)); + if ((r_ % 2) == 0) r_ += 1; + n_ = 3*r_; + k_ = 1U << b_; + + cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; + + int iterations = 1000; + std::vector edges; + std::vector queue; + while (1) { + cerr << "Iterations missing: " << iterations << endl; + for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type(); + if (Mapping(begin, end, &edges, &queue)) break; + else --iterations; + if (iterations == 0) break; + } + if (iterations == 0) return false; + Assigning(edges, queue); + std::vector().swap(edges); + Ranking(); + return true; +} + +template +bool MPHTable::Mapping( + ForwardIterator begin, ForwardIterator end, + std::vector* edges, std::vector* queue) { + TriGraph graph(n_, m_); + for (ForwardIterator it = begin; it != end; ++it) { + cmph_uint32 h[3]; + for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); + cmph_uint32 v0 = h[0] % r_; + cmph_uint32 v1 = h[1] % r_ + r_; + cmph_uint32 v2 = h[2] % r_ + (r_ << 1); + cerr << "Key: " << *it << " vertex " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + graph.AddEdge(TriGraph::Edge(v0, v1, v2)); + } + if (GenerateQueue(&graph, queue)) { + graph.ExtractEdgesAndClear(edges); + return true; + } + return false; +} + } // namespace cxxmph -#define // __CXXMPH_MPHTABLE_H__ +#endif // __CXXMPH_MPHTABLE_H__ diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mphtable_test.cc index e18b34d..b08ffc5 100644 --- a/cxxmph/mphtable_test.cc +++ b/cxxmph/mphtable_test.cc @@ -1,22 +1,30 @@ #include +#include #include #include "mphtable.h" +using std::string; using std::vector; using cxxmph::MPHTable; int main(int argc, char** argv) { - vector keys; - keys.push_back(10); - keys.push_back(4); - keys.push_back(3); + vector keys; + keys.push_back("davi"); + keys.push_back("paulo"); + keys.push_back("joao"); + keys.push_back("maria"); + keys.push_back("bruno"); - MPHTable mphtable; + MPHTable mphtable; assert(mphtable.Reset(keys.begin(), keys.end())); vector ids; - for (int i = 0; i < keys.size(); ++i) ids.push_back(mphtable.index(keys[i])); + for (vector::size_type i = 0; i < keys.size(); ++i) { + ids.push_back(mphtable.index(keys[i])); + cerr << " " << *(ids.end() - 1); + } + cerr << endl; sort(ids.begin(), ids.end()); - for (int i = 0; i < ids.size(); ++i) assert(ids[i] == i); + for (vector::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast::value_type>(i)); } diff --git a/cxxmph/randomly_seeded_hash.h b/cxxmph/randomly_seeded_hash.h index 69db56a..fa382dd 100644 --- a/cxxmph/randomly_seeded_hash.h +++ b/cxxmph/randomly_seeded_hash.h @@ -8,17 +8,35 @@ #include "../src/cmph_types.h" #include "MurmurHash2.h" +#include "stringpiece.h" namespace cxxmph { -struct RandomlySeededMurmur2 { +template +struct RandomlySeededHashFunction { }; + +class Murmur2StringPiece { }; +class Murmur2Pod { }; + +template <> +struct RandomlySeededHashFunction { RandomlySeededHashFunction() : seed(random()) { } - cmph_uint32 operator()(const StringPiece& key) { + cmph_uint32 operator()(const StringPiece& key) const { return MurmurHash2(key.data(), key.length(), seed); } cmph_uint32 seed; }; +template<> +struct RandomlySeededHashFunction { + RandomlySeededHashFunction() : seed(random()) { } + template + cmph_uint32 operator()(const Key& key) const { + return MurmurHash2(&key, sizeof(key), seed); + } + cmph_uint32 seed; +}; + } // namespace cxxmph #endif // __CXXMPH_RANDOMLY_SEEDED_HASH__ diff --git a/cxxmph/trigraph.cc b/cxxmph/trigraph.cc index 63c36e1..ff738a6 100644 --- a/cxxmph/trigraph.cc +++ b/cxxmph/trigraph.cc @@ -1,8 +1,11 @@ #include #include +#include #include "trigraph.h" +using std::cerr; +using std::endl; using std::vector; namespace { @@ -11,9 +14,10 @@ static const cmph_uint8 kInvalidEdge = std::numeric_limits::max(); namespace cxxmph { -TriGraph::TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices) +TriGraph::TriGraph(cmph_uint32 nvertices, cmph_uint32 nedges) : nedges_(0), edges_(nedges), + next_edge_(nedges), first_edge_(nvertices, kInvalidEdge), vertex_degree_(nvertices, 0) { } @@ -25,7 +29,13 @@ void TriGraph::ExtractEdgesAndClear(vector* edges) { edges->swap(edges_); } void TriGraph::AddEdge(const Edge& edge) { - edges_[nedges_] = edge; + edges_[nedges_] = edge; + assert(first_edge_.size() > edge[0]); + assert(first_edge_.size() > edge[1]); + assert(first_edge_.size() > edge[0]); + assert(first_edge_.size() > edge[1]); + assert(first_edge_.size() > edge[2]); + assert(next_edge_.size() > nedges_); next_edge_[nedges_] = Edge( first_edge_[edge[0]], first_edge_[edge[1]], first_edge_[edge[2]]); first_edge_[edge[0]] = first_edge_[edge[1]] = first_edge_[edge[2]] = nedges_; @@ -36,7 +46,7 @@ void TriGraph::AddEdge(const Edge& edge) { } void TriGraph::RemoveEdge(cmph_uint32 current_edge) { - cmph_uint32 vertex, edge1, edge2; + cerr << "Removing edge " << current_edge << " from " << nedges_ << " existing edges " << endl; for (int i = 0; i < 3; ++i) { cmph_uint32 vertex = edges_[current_edge][i]; cmph_uint32 edge1 = first_edge_[vertex]; diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h index 9d60151..18d8d98 100644 --- a/cxxmph/trigraph.h +++ b/cxxmph/trigraph.h @@ -14,9 +14,14 @@ namespace cxxmph { class TriGraph { + public: struct Edge { Edge() { } - Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); + Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2) { + vertices[0] = v0; + vertices[1] = v1; + vertices[2] = v2; + } cmph_uint32& operator[](cmph_uint8 v) { return vertices[v]; } const cmph_uint32& operator[](cmph_uint8 v) const { return vertices[v]; } cmph_uint32 vertices[3];