Added murmur hash and finished porting all c code.
This commit is contained in:
parent
bf0c5892d8
commit
724e716d67
@ -1,8 +1,11 @@
|
|||||||
bin_PROGRAMS = cmph_hash_map_test
|
bin_PROGRAMS = cmph_hash_map_test mphtable_test
|
||||||
lib_LTLIBRARIES = libcxxmph.la
|
lib_LTLIBRARIES = libcxxmph.la
|
||||||
|
|
||||||
libcxxmph_la_SOURCES = trigragh.h trigraph.cc
|
# Sources and headers that make up libcxxmph.
libcxxmph_la_SOURCES = stringpiece.h MurmurHash2.h randomly_seeded_hash.h trigraph.h trigraph.cc mphtable.h mphtable.cc
|
||||||
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
||||||
|
|
||||||
cmph_hash_map_test_LDADD = libcxxmph.la
|
cmph_hash_map_test_LDADD = libcxxmph.la
|
||||||
cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc
|
cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc
|
||||||
|
|
||||||
|
mphtable_test_LDADD = libcxxmph.la
|
||||||
|
mphtable_test_SOURCES = mphtable_test.cc
|
||||||
|
64
cxxmph/MurmurHash2.h
Normal file
64
cxxmph/MurmurHash2.h
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// MurmurHash2, by Austin Appleby
|
||||||
|
|
||||||
|
// Note - This code makes a few assumptions about how your machine behaves -
|
||||||
|
|
||||||
|
// 1. We can read a 4-byte value from any address without crashing
|
||||||
|
// 2. sizeof(int) == 4
|
||||||
|
|
||||||
|
// And it has a few limitations -
|
||||||
|
|
||||||
|
// 1. It will not work incrementally.
|
||||||
|
// 2. It will not produce the same results on little-endian and big-endian
|
||||||
|
// machines.
|
||||||
|
|
||||||
|
// Computes the 32-bit MurmurHash2 of the `len` bytes starting at `key`,
// mixed with `seed`.
//
// The input is consumed as native-endian 4-byte words, so results differ
// between little-endian and big-endian machines. Each word is assembled
// byte by byte rather than read through a casted pointer, which keeps the
// function well-defined for unaligned buffers. It is declared inline because
// it is defined in a header that several translation units may include.
inline unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
    // 'm' and 'r' are mixing constants generated offline.
    // They're not really 'magic', they just happen to work well.
    const unsigned int m = 0x5bd1e995;
    const int r = 24;

    // Initialize the hash to a 'random' value
    unsigned int h = seed ^ len;

    // Mix 4 bytes at a time into the hash
    const unsigned char * data = (const unsigned char *)key;

    while(len >= 4)
    {
        // Copy the next word through unsigned char, preserving native byte
        // order without requiring `data` to be 4-byte aligned.
        unsigned int k = 0;
        unsigned char * k_bytes = reinterpret_cast<unsigned char *>(&k);
        for(int i = 0; i < 4; ++i) k_bytes[i] = data[i];

        k *= m;
        k ^= k >> r;
        k *= m;

        h *= m;
        h ^= k;

        data += 4;
        len -= 4;
    }

    // Handle the last few bytes of the input array
    switch(len)
    {
    case 3: h ^= data[2] << 16;  // fall through
    case 2: h ^= data[1] << 8;   // fall through
    case 1: h ^= data[0];
            h *= m;
    };

    // Do a few final mixes of the hash to ensure the last few
    // bytes are well-incorporated.
    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;

    return h;
}
|
@ -2,8 +2,6 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <utility> // for std::pair
|
#include <utility> // for std::pair
|
||||||
|
|
||||||
#include <cmph.h>
|
|
||||||
|
|
||||||
// Save on repetitive typing.
|
// Save on repetitive typing.
|
||||||
#define CMPH_TMPL_SPEC template <class Key, class Data, class HashFcn, class EqualKey, class Alloc>
|
#define CMPH_TMPL_SPEC template <class Key, class Data, class HashFcn, class EqualKey, class Alloc>
|
||||||
#define CMPH_CLASS_SPEC cmph_hash_map<Key, Data, HashFcn, EqualKey, Alloc>
|
#define CMPH_CLASS_SPEC cmph_hash_map<Key, Data, HashFcn, EqualKey, Alloc>
|
||||||
|
@ -1,105 +1,213 @@
|
|||||||
#include <numerical_limits>
|
#include <cmath>
#include <limits>
|
||||||
|
|
||||||
#include "mphtable.h"
|
#include "mphtable.h"
|
||||||
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
template <class Key, class HashFcn>
|
template <class Key, class HashFcn>
|
||||||
template <class ForwardIterator>
|
template <class ForwardIterator>
|
||||||
|
bool MPHTable<Key, HashFcn>::Reset(ForwardIterator begin, ForwardIterator end) {
|
||||||
void MPHTable::Reset(ForwardIterator begin, ForwardIterator end) {
|
TableBuilderState<ForwardIterator> st;
|
||||||
TableBuilderState st;
|
m_ = end - begin;
|
||||||
st.c = 1.23;
|
r_ = static_cast<cmph_uint32>(ceil((c_*m_)/3));
|
||||||
st.b = 7;
|
if (r_ % 2) == 0) r_ += 1;
|
||||||
st.m = end - begin;
|
n_ = 3*r_;
|
||||||
st.r = static_cast<cmph_uint32>(ceil((st.c*st.m)/3));
|
k_ = 1U << b_;
|
||||||
if ((st.r % 2) == 0) st.r += 1;
|
|
||||||
st.n = 3*st.r;
|
|
||||||
st.k = 1U << st.b;
|
|
||||||
st.ranktablesize = static_cast<cmph_uint32>(
|
|
||||||
ceil(st.n / static_cast<double>(st.k)));
|
|
||||||
st.graph_builder = TriGraph(st.m, st.n); // giant copy
|
|
||||||
st.edges_queue.resize(st.m)
|
|
||||||
|
|
||||||
int iterations = 1000;
|
int iterations = 1000;
|
||||||
while (1) {
|
while (1) {
|
||||||
hasher hasher0 = HashFcn();
|
for (int i = 0; i < 3; ++i) hash_function_[i] = hasher();
|
||||||
ok = Mapping(st.graph_builder, st.edges_queue);
|
vector<Edge> edges;
|
||||||
if (ok) break;
|
vector<cmph_uint32> queue;
|
||||||
|
if (Mapping(begin, end, &edges, &queue)) break;
|
||||||
else --iterations;
|
else --iterations;
|
||||||
if (iterations == 0) break;
|
if (iterations == 0) break;
|
||||||
}
|
}
|
||||||
if (iterations == 0) return false;
|
if (iterations == 0) return false;
|
||||||
vector<ConnectedEdge> graph;
|
vector<Edge>& edges;
|
||||||
st.graph_builder.ExtractEdgesAndClear(&graph);
|
graph->ExtractEdgesAndClear(&edges);
|
||||||
Assigning(graph, st.edges_queue);
|
Assigning(queue, edges);
|
||||||
vector<cmph_uint32>().swap(st.edges_queue);
|
vector<cmph_uint32>().swap(edges);
|
||||||
Ranking(graph);
|
Ranking();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Key, class HashFcn>
|
template <class Key, class HashFcn>
|
||||||
int MPHTable::GenerateQueue(
|
bool MPHTable<Key, HashFcn>::GenerateQueue(
|
||||||
cmph_uint32 nedges, cmph_uint32 nvertices,
|
TriGraph* graph, vector<cmph_uint32>* queue_output) {
|
||||||
TriGraph* graph, Queue* queue) {
|
|
||||||
cmph_uint32 queue_head = 0, queue_tail = 0;
|
cmph_uint32 queue_head = 0, queue_tail = 0;
|
||||||
|
cmph_uint32 nedges = n_;
|
||||||
|
cmph_uint32 nvertices = m_;
|
||||||
// Relies on vector<bool> using 1 bit per element
|
// Relies on vector<bool> using 1 bit per element
|
||||||
vector<bool> marked_edge((nedges >> 3) + 1, false);
|
vector<bool> marked_edge((nedges >> 3) + 1, false);
|
||||||
queue->swap(Queue(nvertices, 0));
|
Queue queue(nvertices, 0);
|
||||||
for (int i = 0; i < nedges; ++i) {
|
for (int i = 0; i < nedges; ++i) {
|
||||||
TriGraph::Edge e = graph.edges[i].vertices;
|
const TriGraph::Edge& e = graph->edges()[i];
|
||||||
if (graph.vertex_degree_[e.vertices[0]] == 1 ||
|
if (graph->vertex_degree()[e[0]] == 1 ||
|
||||||
graph.vertex_degree_[e.vertices[1]] == 1 ||
|
graph->vertex_degree()[e[1]] == 1 ||
|
||||||
graph.vertex_degree[e.vertices[2]] == 1) {
|
graph->vertex_degree()[e[2]] == 1) {
|
||||||
if (!marked_edge[i]) {
|
if (!marked_edge[i]) {
|
||||||
(*queue)[queue_head++] = i;
|
queue[queue_head++] = i;
|
||||||
marked_edge[i] = true;
|
marked_edge[i] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
while (queue_tail != queue_head) {
|
while (queue_tail != queue_head) {
|
||||||
cmph_uint32 current_edge = (*queue)[queue_tail++];
|
cmph_uint32 current_edge = queue[queue_tail++];
|
||||||
graph->RemoveEdge(current_edge);
|
graph->RemoveEdge(current_edge);
|
||||||
TriGraph::Edge e = graph->edges[current_edge];
|
const TriGraph::Edge& e = graph->edges()[current_edge];
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
cmph_uint32 v = e.vertices[i];
|
cmph_uint32 v = e[i];
|
||||||
if (graph->vertex_degree[v] == 1) {
|
if (graph->vertex_degree()[v] == 1) {
|
||||||
cmph_uint32 first_edge = graph->first_edge_[v];
|
cmph_uint32 first_edge = graph->first_edge()[v];
|
||||||
if (!marked_edge[first_edge) {
|
if (!marked_edge[first_edge]) {
|
||||||
queue[queue_head++] = first_edge;
|
queue[queue_head++] = first_edge;
|
||||||
marked_edge[first_edge] = true;
|
marked_edge[first_edge] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
vector<bool>().swap(marked_edge);
|
int cycles = queue_head - nedges;
|
||||||
return queue_head - nedges;
|
if (cycles == 0) queue.swap(*queue_output);
|
||||||
}
|
|
||||||
|
|
||||||
template <class Key, class HashFcn>
|
|
||||||
int MPHTable::Mapping(TriGraph* graph, Queue* queue) {
|
|
||||||
int cycles = 0;
|
|
||||||
graph->Reset(m, n);
|
|
||||||
for (ForwardIterator it = begin_; it != end_; ++it) {
|
|
||||||
cmph_uint32 hash_values[3];
|
|
||||||
for (int i = 0; i < 3; ++i) {
|
|
||||||
hash_values[i] = hasher_(*it);
|
|
||||||
}
|
|
||||||
cmph_uint32 v0 = hash_values[0] % bdz->r;
|
|
||||||
cmph_uint32 v1 = hash_values[1] % bdz->r + bdz->r;
|
|
||||||
cmph_uint32 v2 = hash_values[2] % bdz->r + (bdz->r << 1);
|
|
||||||
graph->AddEdge(Edge(v0, v1, v2));
|
|
||||||
}
|
|
||||||
cycles = GenerateQueue(bdz->m, bdz->n, queue, graph);
|
|
||||||
return cycles == 0;
|
return cycles == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MPHTable::Assigning(TriGraph* graph, Queue* queue) {
|
template <class Key, class HashFcn>
|
||||||
}
|
template <class ForwardIterator>
|
||||||
void MPHTable::Ranking(TriGraph* graph, Queue* queue) {
|
bool MPHTable<Key, HashFcn>::Mapping(
|
||||||
}
|
ForwardIterator begin, ForwardIterator end,
|
||||||
cmph_uint32 MPHTable::Search(const key_type& key) {
|
vector<Edge>* edges, vector<cmph_uint32> queue) {
|
||||||
|
int cycles = 0;
|
||||||
|
TriGraph graph(m, n);
|
||||||
|
for (ForwardIterator it = begin; it != end; ++it) {
|
||||||
|
cmph_uint32 h[3];
|
||||||
|
for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it);
|
||||||
|
cmph_uint32 v0 = h[0] % r_;
|
||||||
|
cmph_uint32 v1 = h[1] % r_ + r_;
|
||||||
|
cmph_uint32 v2 = h[2] % r_ + (r_ << 1);
|
||||||
|
graph.AddEdge(Edge(v0, v1, v2));
|
||||||
|
}
|
||||||
|
if (GenerateQueue(&graph, queue)) {
|
||||||
|
graph.ExtractEdgesAndClear(edges);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
cmph_uint32 MPHTable::Rank(const key_type& key) {
|
template <class Key, class HashFcn>
|
||||||
|
void MPHTable<Key, HashFcn>::Assigning(
|
||||||
|
const vector<Edge>& edges, const vector<cmph_uint32>& queue) {
|
||||||
|
cmph_uint32 nedges = n_;
|
||||||
|
cmph_uint32 current_edge = 0;
|
||||||
|
vector<bool> marked_vertices(nedges + 1);
|
||||||
|
// TODO(davi) use half nibbles instead
|
||||||
|
// vector<cmph_uint8> g(static_cast<cmph_uint32>(ceil(nedges / 4.0)),
|
||||||
|
// std::numerical_limits<cmph_uint8>::max());
|
||||||
|
static const cmph_uint8 kUnassigned = 3;
|
||||||
|
vector<cmph_uint8>(nedges, kUnassigned).swap(g_);
|
||||||
|
for (int i = nedges - 1; i + 1 >= 1; --i) {
|
||||||
|
current_edge = queue[i];
|
||||||
|
const TriGraph::Edge& e = edges[current_edge];
|
||||||
|
if (!marked_vertices[e[0]]) {
|
||||||
|
if (!marked_vertices[e[1]]) {
|
||||||
|
g_[e[1]] = kUnassigned;
|
||||||
|
marked_vertices[e[1]] = true;
|
||||||
|
}
|
||||||
|
if (!marked_vertices[e[2]]) {
|
||||||
|
g_[e[2]] = kUnassigned;
|
||||||
|
marked_vertices[e[2]] = true;
|
||||||
|
}
|
||||||
|
g_[e[0]] = (6 - g_[e[1]] + g_[e2]) % 3;
|
||||||
|
marked_vertices[e[0]] = true;
|
||||||
|
} else if (!marked_vertices[e[1]])) {
|
||||||
|
if (!marked_vertices[e[2]])) {
|
||||||
|
g_[e[2]] = kUnassigned;
|
||||||
|
marked_vertices[e[2]] = true;
|
||||||
|
}
|
||||||
|
g_[e[1]] = 7 - (g_[e[0]] + g_[e[2]]) % 3;
|
||||||
|
marked_vertices[e[1]] = true;
|
||||||
|
} else {
|
||||||
|
g_[e[2]] = (8 - g_[e[0]] + g_[e[1]]) % 3;
|
||||||
|
marked_vertices[e[2]] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// table used for looking up the number of assigned vertices to a 8-bit integer
|
||||||
|
static cmph_uint8 kBdzLookupTable[] =
|
||||||
|
{
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
|
||||||
|
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
|
||||||
|
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
|
||||||
|
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
|
||||||
|
2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class Key, class HashFcn>
|
||||||
|
void MPHTable<Key, HashFcn>::Ranking() {
|
||||||
|
cmph_uint32 nbytes_total = static_cast<cmph_uint32>(ceil(st->n / 4.0));
|
||||||
|
cmph_uint32 size = k_ >> 2U;
|
||||||
|
ranktablesize = static_cast<cmph_uint32>(ceil(n_ / static_cast<double>(k_)));
|
||||||
|
// TODO(davi) Change swap of member classes for resize + memset to avoid fragmentation
|
||||||
|
vector<cmph_uint32> (ranktablesize).swap(ranktable_);;
|
||||||
|
cmph_uint32 offset = 0;
|
||||||
|
cmph_uint32 count = 0;
|
||||||
|
cmph_uint32 i = 0;
|
||||||
|
while (1) {
|
||||||
|
if (i == ranktable.size()) break;
|
||||||
|
cmph_uint32 nbytes = size < nbytes_total ? size : nbytes_total;
|
||||||
|
for (j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]];
|
||||||
|
ranktable_[i] = count;
|
||||||
|
offset += nbytes;
|
||||||
|
nbytes_total -= size;
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Key, class HashFcn>
|
||||||
|
cmph_uint32 MPHTable<Key, HashFcn>::Search(const key_type& key) const {
|
||||||
|
cmph_uint32 vertex;
|
||||||
|
cmph_uint32 h[3];
|
||||||
|
for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key);
|
||||||
|
h[0] = h[0] % st->r;
|
||||||
|
h[1] = h[1] % st->r + st->r;
|
||||||
|
h[2] = h[2] % st->r + (st->r << 1);
|
||||||
|
cmph_uint32 vertex = h[(h[g_[h[0]] + g_[h[1]] + g_[h[2]]) % 3];
|
||||||
|
return Rank(st->b, st->ranktable, vertex);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Key, class HashFcn>
|
||||||
|
cmph_uint32 MPHTable<Key, HashFcn>::Rank(cmph_uint32 vertex) const {
|
||||||
|
cmph_uint32 index = vertex >> b_;
|
||||||
|
cmph_uint32 base_rank = ranktable_[index];
|
||||||
|
cmph_uint32 beg_idx_v = index << b;
|
||||||
|
cmph_uint32 beg_idx_b = index >> 2
|
||||||
|
cmph_uint32 end_idx_b = index >> 2
|
||||||
|
while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]];
|
||||||
|
beg_idx_v = beg_idx_b << 2;
|
||||||
|
while (beg_idx_v < vertex) {
|
||||||
|
if (g_[beg_idx_v) != kUnassigned) ++base_rank;
|
||||||
|
++beg_idx_v;
|
||||||
|
}
|
||||||
|
return base_rank;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Key, class HashFcn>
|
||||||
|
cmph_uint32 MPHTable<Key, HashFcn>::index(const key_type& key) const {
|
||||||
|
return Search(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
@ -1,15 +1,22 @@
|
|||||||
|
#ifndef __CXXMPH_MPHTABLE_H__
|
||||||
|
#define __CXXMPH_MPHTABLE_H__
|
||||||
|
|
||||||
// Minimal perfect hash abstraction implementing the BDZ algorithm
|
// Minimal perfect hash abstraction implementing the BDZ algorithm
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "randomly_seeded_hash.h"
|
||||||
|
#include "stringpiece.h"
|
||||||
#include "trigraph.h"
|
#include "trigraph.h"
|
||||||
|
|
||||||
template <class Key, class NewRandomlySeededHashFcn = __gnu_cxx::hash<Key> >
|
namespace cxxmph {
|
||||||
|
|
||||||
|
template <class Key, class NewRandomlySeededHashFcn = RandomlySeededMurmur2>
|
||||||
class MPHTable {
|
class MPHTable {
|
||||||
public:
|
public:
|
||||||
typedef Key key_type;
|
typedef Key key_type;
|
||||||
typedef NewRandomlySeededHashFcn hasher;
|
typedef NewRandomlySeededHashFcn hasher;
|
||||||
MPHTable();
|
MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b) { }
|
||||||
~MPHTable();
|
~MPHTable();
|
||||||
|
|
||||||
template <class ForwardIterator>
|
template <class ForwardIterator>
|
||||||
@ -17,28 +24,38 @@ class MPHTable {
|
|||||||
cmph_uint32 index(const key_type& x) const;
|
cmph_uint32 index(const key_type& x) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
typedef std::vector<cmph_uint32> Queue;
|
template <class ForwardIterator>
|
||||||
template<class ForwardIterator>
|
bool Mapping(ForwardIterator begin, ForwardIterator end,
|
||||||
struct TableBuilderState {
|
vector<Edge>* edges, vector<cmph_uint32> queue);
|
||||||
ForwardIterator begin;
|
bool GenerateQueue(TriGraph* graph, vector<cmph_uint32>* queue);
|
||||||
ForwardIterator end;
|
void Assigning(TriGraph* graph_builder, Queue* queue);
|
||||||
Queue edges_queue;
|
void Ranking(TriGraph* graph_builder, Queue* queue);
|
||||||
TriGraph graph_builder;
|
cmph_uint32 Search(const StringPiece& key);
|
||||||
double c;
|
cmph_uint32 Rank(const StringPiece& key);
|
||||||
cmph_uint32 m;
|
|
||||||
cmph_uint32 n;
|
|
||||||
cmph_uint32 k;
|
|
||||||
cmph_uint32 ranktablesize;
|
|
||||||
};
|
|
||||||
int GenerateQueue(
|
|
||||||
cmph_uint32 nedges, cmph_uint32 nvertices,
|
|
||||||
TriGraph* graph, Queue* queue);
|
|
||||||
void Assigning(TriGraph* graph, Queue* queue);
|
|
||||||
void Ranking(TriGraph* graph, Queue* queue);
|
|
||||||
cmph_uint32 Search(const StringPiece& key);
|
|
||||||
cmph_uint32 Rank(const StringPiece& key);
|
|
||||||
|
|
||||||
std::vector<ConnectedEdge> graph_;
|
// Algorithm parameters
|
||||||
|
cmph_uint8 b_; // Number of bits of the kth index in the ranktable
|
||||||
|
double c_; // Graph size factor: Reset() computes r_ = ceil(c_ * m_ / 3) and n_ = 3 * r_
|
||||||
|
|
||||||
|
// Values used during generation
|
||||||
|
cmph_uint32 m_; // edges count
|
||||||
|
cmph_uint32 n_; // vertex count
|
||||||
|
cmph_uint32 k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$
|
||||||
|
|
||||||
|
// Values used during search
|
||||||
|
|
||||||
|
// Partition vertex count, derived from c parameter.
|
||||||
|
cmph_uint32 r_;
|
||||||
|
// The array containing the minimal perfect hash function graph.
|
||||||
|
std::vector<cmph_uint8> g_;
|
||||||
|
// The table used for the rank step of the minimal perfect hash function
|
||||||
|
std::vector<cmph_uint32> ranktable_;
|
||||||
|
// The selected hash function triplet for finding the edges in the minimal
|
||||||
|
// perfect hash function graph.
|
||||||
|
hasher hash_function_[3];
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
#define // __CXXMPH_MPHTABLE_H__
|
||||||
|
22
cxxmph/mphtable_test.cc
Normal file
22
cxxmph/mphtable_test.cc
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
#include <cassert>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "mphtable.h"
|
||||||
|
|
||||||
|
using std::vector;
|
||||||
|
using cxxmph::MPHTable;
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
vector<int> keys;
|
||||||
|
keys.push_back(10);
|
||||||
|
keys.push_back(4);
|
||||||
|
keys.push_back(3);
|
||||||
|
|
||||||
|
MPHTable<int> mphtable;
|
||||||
|
assert(mphtable.Reset(keys.begin(), keys.end()));
|
||||||
|
vector<int> ids;
|
||||||
|
for (int i = 0; i < keys.size(); ++i) ids.push_back(mphtable.index(keys[i]));
|
||||||
|
sort(ids.begin(), ids.end());
|
||||||
|
for (int i = 0; i < ids.size(); ++i) assert(ids[i] == i);
|
||||||
|
}
|
||||||
|
|
24
cxxmph/randomly_seeded_hash.h
Normal file
24
cxxmph/randomly_seeded_hash.h
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#ifndef __CXXMPH_RANDOMLY_SEEDED_HASH__
|
||||||
|
#define __CXXMPH_RANDOMLY_SEEDED_HASH__
|
||||||
|
|
||||||
|
// Helper to create randomly seeded hash functions out of existing hash
|
||||||
|
// functions that take a seed as a parameter.
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
|
||||||
|
#include "../src/cmph_types.h"
|
||||||
|
#include "MurmurHash2.h"
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
|
struct RandomlySeededMurmur2 {
|
||||||
|
RandomlySeededHashFunction() : seed(random()) { }
|
||||||
|
cmph_uint32 operator()(const StringPiece& key) {
|
||||||
|
return MurmurHash2(key.data(), key.length(), seed);
|
||||||
|
}
|
||||||
|
cmph_uint32 seed;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
#endif // __CXXMPH_RANDOMLY_SEEDED_HASH__
|
177
cxxmph/stringpiece.h
Normal file
177
cxxmph/stringpiece.h
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// A string-like object that points to a sized piece of memory.
|
||||||
|
//
|
||||||
|
// Functions or methods may use const StringPiece& parameters to accept either
|
||||||
|
// a "const char*" or a "string" value that will be implicitly converted to
|
||||||
|
// a StringPiece. The implicit conversion means that it is often appropriate
|
||||||
|
// to include this .h file in other files rather than forward-declaring
|
||||||
|
// StringPiece as would be appropriate for most other Google classes.
|
||||||
|
//
|
||||||
|
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
|
||||||
|
// conversions from "const char*" to "string" and back again.
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Arghh! I wish C++ literals were "string".
|
||||||
|
|
||||||
|
#ifndef CXXMPH_STRINGPIECE_H__
|
||||||
|
#define CXXMPH_STRINGPIECE_H__
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
#include <iosfwd>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
|
// A non-owning view of `length_` bytes starting at `ptr_`. The referenced
// memory must outlive the StringPiece.
class StringPiece {
 private:
  const char* ptr_;
  int length_;

  // memcmp that never touches its pointers when there is nothing to compare,
  // so empty pieces (whose data pointer may be NULL) are handled safely.
  static int CompareBytes(const char* a, const char* b, int n) {
    return (n > 0) ? memcmp(a, b, static_cast<size_t>(n)) : 0;
  }

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  int size() const { return length_; }
  int length() const { return length_; }
  bool empty() const { return length_ == 0; }

  void clear() { ptr_ = NULL; length_ = 0; }
  void set(const char* data, int len) { ptr_ = data; length_ = len; }
  void set(const char* str) {
    ptr_ = str;
    if (str != NULL)
      length_ = static_cast<int>(strlen(str));
    else
      length_ = 0;
  }
  void set(const void* data, int len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  char operator[](int i) const { return ptr_[i]; }

  // Drops the first n bytes from the view. No bounds checking is done.
  void remove_prefix(int n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes from the view. No bounds checking is done.
  void remove_suffix(int n) {
    length_ -= n;
  }

  // Three-way bytewise comparison; when one piece is a prefix of the other,
  // the shorter one orders first. Returns <0, 0 or >0.
  int compare(const StringPiece& x) const {
    const int common = (length_ < x.length_) ? length_ : x.length_;
    int r = CompareBytes(ptr_, x.ptr_, common);
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  std::string as_string() const {
    return std::string(data(), size());
  }
  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()".  We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    return std::string(data(), size());
  }

  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (CompareBytes(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (CompareBytes(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }
  // STLS says return size_type, but Google says return int
  int max_size() const { return length_; }
  int capacity() const { return length_; }

  int copy(char* buf, size_type n, size_type pos = 0) const;

  int find(const StringPiece& s, size_type pos = 0) const;
  int find(char c, size_type pos = 0) const;
  int rfind(const StringPiece& s, size_type pos = npos) const;
  int rfind(char c, size_type pos = npos) const;

  StringPiece substr(size_type pos, size_type n = npos) const;
};
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
bool operator==(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y);
|
||||||
|
|
||||||
|
// Relational operators for StringPiece. operator< orders bytewise, with a
// proper prefix ordering before the longer piece; the remaining operators are
// derived from operator== and operator<.
inline bool operator!=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) {
  return !(x == y);
}

inline bool operator<(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) {
  const int common = (x.size() < y.size()) ? x.size() : y.size();
  // Skip memcmp when there is nothing to compare: an empty piece may carry a
  // NULL data pointer.
  const int r = (common > 0)
      ? memcmp(x.data(), y.data(), static_cast<size_t>(common)) : 0;
  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
}

inline bool operator>(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) {
  return y < x;
}

inline bool operator<=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) {
  return !(x > y);
}

inline bool operator>=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) {
  return !(x < y);
}
|
||||||
|
|
||||||
|
// allow StringPiece to be logged
|
||||||
|
extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece);
|
||||||
|
|
||||||
|
#endif // CXXMPH_STRINGPIECE_H__
|
@ -1,3 +1,4 @@
|
|||||||
|
#include <cassert>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
|
||||||
#include "trigraph.h"
|
#include "trigraph.h"
|
||||||
@ -8,17 +9,51 @@ namespace {
|
|||||||
static const cmph_uint8 kInvalidEdge = std::numeric_limits<cmph_uint8>::max();
|
static const cmph_uint8 kInvalidEdge = std::numeric_limits<cmph_uint8>::max();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
// Creates an empty graph with room for `nedges` edges over `nvertices`
// vertices. nedges_ counts the edges added so far; next_edge_ is sized
// alongside edges_ because AddEdge() writes both at the same index.
TriGraph::TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices)
    : nedges_(0),
      edges_(nedges),
      next_edge_(nedges),
      first_edge_(nvertices, kInvalidEdge),
      vertex_degree_(nvertices, 0) { }
|
||||||
|
|
||||||
void TriGraph::ExtractEdgesAndClear(vector<ConnectedEdge>* edges) {
|
void TriGraph::ExtractEdgesAndClear(vector<Edge>* edges) {
|
||||||
|
vector<Edge>().swap(next_edge_);
|
||||||
vector<cmph_uint32>().swap(first_edge_);
|
vector<cmph_uint32>().swap(first_edge_);
|
||||||
vector<cmph_uint8>().swap(vertex_degree_);
|
vector<cmph_uint8>().swap(vertex_degree_);
|
||||||
nedges_ = 0;
|
nedges_ = 0;
|
||||||
edges->swap(edges_);
|
edges->swap(edges_);
|
||||||
}
|
}
|
||||||
void TriGraph::AddEdge(const Edge& edge) { }
|
void TriGraph::AddEdge(const Edge& edge) {
|
||||||
void TriGraph::RemoveEdge(cmph_uint32 current_edge) { }
|
edges_[nedges_] = edge;
|
||||||
|
next_edge_[nedges_] = Edge(
|
||||||
|
first_edge_[edge[0]], first_edge_[edge[1]], first_edge_[edge[2]]);
|
||||||
|
first_edge_[edge[0]] = first_edge_[edge[1]] = first_edge_[edge[2]] = nedges_;
|
||||||
|
++vertex_degree_[edge[0]];
|
||||||
|
++vertex_degree_[edge[1]];
|
||||||
|
++vertex_degree_[edge[2]];
|
||||||
|
++nedges_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unlinks `current_edge` from the edge list of each of its three vertices
// and decrements their degrees. The entry in edges_ itself is left intact.
// Asserts that the edge is actually present in each vertex's list.
void TriGraph::RemoveEdge(cmph_uint32 current_edge) {
  for (int i = 0; i < 3; ++i) {
    const cmph_uint32 vertex = edges_[current_edge][i];
    cmph_uint32 edge1 = first_edge_[vertex];   // cursor into the vertex's list
    cmph_uint32 edge2 = kInvalidEdge;          // predecessor of the cursor
    cmph_uint32 j = 0;                         // position of `vertex` in edge2
    while (edge1 != current_edge && edge1 != kInvalidEdge) {
      edge2 = edge1;
      if (edges_[edge1][0] == vertex) j = 0;
      else if (edges_[edge1][1] == vertex) j = 1;
      else j = 2;
      edge1 = next_edge_[edge1][j];
    }
    assert(edge1 != kInvalidEdge);
    // Splice the edge out: either from its predecessor or from the list head.
    if (edge2 != kInvalidEdge) next_edge_[edge2][j] = next_edge_[edge1][i];
    else first_edge_[vertex] = next_edge_[edge1][i];
    --vertex_degree_[vertex];
  }
}
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
@ -1,26 +1,43 @@
|
|||||||
|
#ifndef __CXXMPH_TRIGRAPH_H__
|
||||||
|
#define __CXXMPH_TRIGRAPH_H__
|
||||||
|
// Build a trigraph using a memory efficient representation.
|
||||||
|
//
|
||||||
|
// Prior knowledge of the number of edges and vertices for the graph is
|
||||||
|
// required. For each vertex, we store how many edges touch it (degree) and the
|
||||||
|
// index of the first edge in the vector of triples representing the edges.
|
||||||
|
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "../src/cmph_types.h"
|
#include "../src/cmph_types.h"
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
class TriGraph {
|
class TriGraph {
|
||||||
struct Edge {
|
struct Edge {
|
||||||
Edge() { }
|
Edge() { }
|
||||||
Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2);
|
Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2);
|
||||||
|
cmph_uint32& operator[](cmph_uint8 v) { return vertices[v]; }
|
||||||
|
const cmph_uint32& operator[](cmph_uint8 v) const { return vertices[v]; }
|
||||||
cmph_uint32 vertices[3];
|
cmph_uint32 vertices[3];
|
||||||
};
|
};
|
||||||
struct ConnectedEdge {
|
|
||||||
Edge current;
|
|
||||||
Edge next;
|
|
||||||
};
|
|
||||||
|
|
||||||
TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices);
|
TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices);
|
||||||
void AddEdge(const Edge& edge);
|
void AddEdge(const Edge& edge);
|
||||||
void RemoveEdge(cmph_uint32 current_edge);
|
void RemoveEdge(cmph_uint32 edge_id);
|
||||||
void ExtractEdgesAndClear(std::vector<ConnectedEdge>* edges);
|
void ExtractEdgesAndClear(std::vector<Edge>* edges);
|
||||||
|
|
||||||
|
const std::vector<Edge>& edges() const { return edges_; }
|
||||||
|
const std::vector<cmph_uint8>& vertex_degree() const { return vertex_degree_; }
|
||||||
|
const std::vector<cmph_uint32>& first_edge() const { return first_edge_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
cmph_uint32 nedges_;
|
cmph_uint32 nedges_; // total number of edges
|
||||||
std::vector<ConnectedEdge> edges_;
|
std::vector<Edge> edges_;
|
||||||
std::vector<cmph_uint32> first_edge_;
|
std::vector<Edge> next_edge_; // for implementing removal
|
||||||
std::vector<cmph_uint8> vertex_degree_;
|
std::vector<cmph_uint32> first_edge_; // the first edge for this vertex
|
||||||
|
std::vector<cmph_uint8> vertex_degree_; // number of edges for this vertex
|
||||||
};
|
};
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
#endif // __CXXMPH_TRIGRAPH_H__
|
||||||
|
Loading…
Reference in New Issue
Block a user