Renamed table to index and reorganized benchmarks.

This commit is contained in:
Davi Reis 2011-05-23 11:01:08 -07:00
parent c630eb2a70
commit bb40a4bb00
12 changed files with 367 additions and 353 deletions

View File

@ -1,28 +1,28 @@
AM_CXXFLAGS='-std=c++0x' AM_CXXFLAGS='-std=c++0x'
TESTS = $(check_PROGRAMS) TESTS = $(check_PROGRAMS)
check_PROGRAMS = mph_map_test mph_table_test trigraph_test check_PROGRAMS = mph_map_test mph_index_test trigraph_test
noinst_PROGRAMS = bm_numbers bm_urls noinst_PROGRAMS = bm_index bm_map
bin_PROGRAMS = cxxmph bin_PROGRAMS = cxxmph
lib_LTLIBRARIES = libcxxmph.la lib_LTLIBRARIES = libcxxmph.la
libcxxmph_la_SOURCES = MurmurHash2.h trigraph.h trigraph.cc mph_table.h mph_table.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_SOURCES = MurmurHash2.h trigraph.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
libcxxmph_la_LDFLAGS = -version-info 0:0:0 libcxxmph_la_LDFLAGS = -version-info 0:0:0
cxxmph_includedir = $(includedir)/cxxmph/ cxxmph_includedir = $(includedir)/cxxmph/
cxxmph_include_HEADERS = mph_map.h mph_table.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h
mph_map_test_LDADD = libcxxmph.la mph_map_test_LDADD = libcxxmph.la
mph_map_test_SOURCES = mph_map_test.cc mph_map_test_SOURCES = mph_map_test.cc
mph_table_test_LDADD = libcxxmph.la mph_index_test_LDADD = libcxxmph.la
mph_table_test_SOURCES = mph_table_test.cc mph_index_test_SOURCES = mph_index_test.cc
trigraph_test_LDADD = libcxxmph.la trigraph_test_LDADD = libcxxmph.la
trigraph_test_SOURCES = trigraph_test.cc trigraph_test_SOURCES = trigraph_test.cc
bm_numbers_LDADD = libcxxmph.la bm_index_LDADD = libcxxmph.la
bm_numbers_SOURCES = bm_numbers.cc bm_index_SOURCES = bm_common.cc bm_index.cc
bm_urls_LDADD = libcxxmph.la bm_map_LDADD = libcxxmph.la
bm_urls_SOURCES = bm_urls.cc bm_map_SOURCES = bm_common.cc bm_map.cc
cxxmph_LDADD = libcxxmph.la cxxmph_LDADD = libcxxmph.la
cxxmph_SOURCES = cxxmph.cc cxxmph_SOURCES = cxxmph.cc

View File

@ -1,7 +1,9 @@
#include "benchmark.h" #include "benchmark.h"
#include <cerrno>
#include <cstring> #include <cstring>
#include <cstdio> #include <cstdio>
#include <sys/time.h>
#include <sys/resource.h> #include <sys/resource.h>
#include <iostream> #include <iostream>
@ -50,6 +52,16 @@ struct rusage getrusage_or_die() {
return rs; return rs;
} }
// Returns the current time as filled in by gettimeofday(). If the call fails,
// the errno description is written to stderr and the process exits with -1.
struct timeval gettimeofday_or_die() {
  struct timeval now;
  if (gettimeofday(&now, NULL) == 0) return now;
  std::cerr << "gettimeofday failed: " << strerror(errno) << std::endl;
  exit(-1);
}
#ifdef HAVE_CXA_DEMANGLE #ifdef HAVE_CXA_DEMANGLE
string demangle(const string& name) { string demangle(const string& name) {
char buf[1024]; char buf[1024];
@ -79,25 +91,33 @@ namespace cxxmph {
} }
/* static */ void Benchmark::RunAll() { /* static */ void Benchmark::RunAll() {
for (auto it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { for (int i = 0; i < g_benchmarks.size(); ++i) {
(*it)->MeasureRun(); Benchmark* bm = g_benchmarks[i];
delete *it; bm->SetUp();
bm->MeasureRun();
bm->TearDown();
delete bm;
} }
} }
void Benchmark::MeasureRun() { void Benchmark::MeasureRun() {
struct timeval walltime_begin = gettimeofday_or_die();
struct rusage begin = getrusage_or_die(); struct rusage begin = getrusage_or_die();
Run(iters_); Run();
struct rusage end = getrusage_or_die(); struct rusage end = getrusage_or_die();
struct timeval walltime_end = gettimeofday_or_die();
struct timeval utime; struct timeval utime;
timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime); timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime);
struct timeval stime; struct timeval stime;
timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime); timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime);
struct timeval wtime;
timeval_subtract(&wtime, &walltime_end, &walltime_begin);
printf("Benchmark: %s\n", name().c_str()); printf("Benchmark: %s\n", name().c_str());
printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); printf("CPU User time : %ld.%06ld\n", utime.tv_sec, utime.tv_usec);
printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); printf("CPU System time: %ld.%06ld\n", stime.tv_sec, stime.tv_usec);
printf("Wall clock time: %ld.%06ld\n", wtime.tv_sec, wtime.tv_usec);
printf("\n"); printf("\n");
} }

View File

@ -8,9 +8,9 @@ namespace cxxmph {
class Benchmark { class Benchmark {
public: public:
Benchmark(int iters = 1) : iters_(iters) { } Benchmark() {}
virtual void Run(int iters) = 0;
virtual ~Benchmark() {} virtual ~Benchmark() {}
const std::string& name() { return name_; } const std::string& name() { return name_; }
void set_name(const std::string& name) { name_ = name; } void set_name(const std::string& name) { name_ = name; }
@ -18,10 +18,11 @@ class Benchmark {
static void RunAll(); static void RunAll();
protected: protected:
int iters() { return iters_; } virtual bool SetUp() {};
virtual void Run() = 0;
virtual bool TearDown() {};
private: private:
int iters_;
std::string name_; std::string name_;
void MeasureRun(); void MeasureRun();
}; };

52
cxxmph/bm_map.cc Normal file
View File

@ -0,0 +1,52 @@
#include <string>
#include <unordered_map>
#include "bm_common.h"
#include "mph_map.h"
using cxxmph::mph_map;
using std::string;
using std::unordered_map;
namespace cxxmph {
template <class MapType>
class BM_MapCreate : public UrlsBenchmark {
public:
virtual void Run() {
MapType mymap;
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
mymap[*it] = *it;
}
}
};
// Benchmark that measures lookups on an already populated map. SetUp() fills
// mymap_ with every element of urls_ (key == value); Run() then looks up each
// element of random_ in turn.
// urls_ and random_ are not declared in this class; presumably they are
// inherited from SearchUrlsBenchmark -- confirm against bm_common.h.
template <class MapType>
class BM_MapSearch : public SearchUrlsBenchmark {
 public:
  virtual void Run() {
    for (auto it = random_.begin(); it != random_.end(); ++it) {
      // Only the cost of the lookup matters; the value itself is not used.
      auto value = mymap_[*it];
    }
  }

 protected:
  // Populates the map outside of the measured region. The return type is bool
  // because Benchmark::SetUp() is declared `virtual bool`; an override with a
  // different return type does not compile.
  // NOTE(review): if SearchUrlsBenchmark has its own SetUp() (e.g. to load
  // urls_ / random_), it is not invoked from here -- confirm whether it must be.
  virtual bool SetUp() {
    for (auto it = urls_.begin(); it != urls_.end(); ++it) {
      mymap_[*it] = *it;
    }
    // NOTE(review): std::unordered_map has no resize() member, so this line
    // only compiles for map types that provide one -- confirm the intended
    // "finalize the table" call for each MapType.
    mymap_.resize(mymap_.size());
    return true;
  }

  MapType mymap_;
};
} // namespace cxxmph
using namespace cxxmph;
int main(int argc, char** argv) {
Benchmark::Register(new BM_MapCreate<mph_map<string, string>>("URLS100k"));
Benchmark::Register(new BM_MapCreate<unordered_map<string, string>>("URLS100k"));
Benchmark::Register(new BM_MapSearch<mph_map<string, string>>("URLS100k", 1000 * 1000));
Benchmark::Register(new BM_MapSearch<unordered_map<string, string>>("URLS100k", 1000 * 1000));
Benchmark::RunAll();
}

View File

@ -1,52 +0,0 @@
#include <set>
#include <vector>
#include "benchmark.h"
#include "mph_table.h"
using std::set;
using std::vector;
namespace cxxmph {
class BM_NumbersCreate : public Benchmark {
public:
BM_NumbersCreate(int iters = 1) : Benchmark(iters) {
set<int> unique;
while (unique.size() < 1000 * 1000) {
int v = random();
if (unique.find(v) == unique.end()) {
unique.insert(v);
random_unique_.push_back(v);
}
}
}
protected:
virtual void Run(int iters) {
SimpleMPHTable<int> table;
table.Reset(random_unique_.begin(), random_unique_.end());
}
std::vector<int> random_unique_;
};
// Benchmark that measures lookups in a SimpleMPHTable<int>. The table is built
// once in the constructor from the keys collected by BM_NumbersCreate; Run()
// performs iters * 100 lookups of randomly chosen keys.
class BM_NumbersFind : public BM_NumbersCreate {
 public:
  BM_NumbersFind(int iters) : BM_NumbersCreate(iters) {
    table_.Reset(random_unique_.begin(), random_unique_.end());
  }

  virtual void Run(int iters) {
    for (int i = 0; i < iters * 100; ++i) {
      int pos = random() % random_unique_.size();
      // Query the key stored at pos, not pos itself: the table was built from
      // the values in random_unique_, so only those are keys of the function.
      int h = table_.index(random_unique_[pos]);
      (void)h;  // only the lookup cost matters
    }
  }

 private:
  SimpleMPHTable<int> table_;
};
} // namespace cxxmph
using namespace cxxmph;
// Registers the number-based create and find benchmarks and runs them.
int main(int argc, char** argv) {
  const int kFindIters = 1000 * 1000;

  Benchmark::Register(new BM_NumbersCreate());
  Benchmark::Register(new BM_NumbersFind(kFindIters));
  Benchmark::RunAll();
}

View File

@ -1,70 +0,0 @@
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <vector>
#include <unordered_map>
#include "benchmark.h"
#include "mph_map.h"
using std::ifstream;
using std::set;
using std::string;
using std::vector;
namespace cxxmph {
// Benchmark that measures building an mph_map<string, int> from the lines of
// the file "URLS100k". The file is read once in the constructor; Run() maps
// each URL to its position in urls_ and then packs the map.
class BM_UrlsCreate : public Benchmark {
 public:
  BM_UrlsCreate(int iters = 1) : Benchmark(iters) {
    ReadUrls();
  }

 protected:
  virtual void Run(int iters) {
    BuildTable();
  }

  // Inserts every URL with its index in urls_ as the value, then packs.
  void BuildTable() {
    for (auto it = urls_.begin(); it != urls_.end(); ++it) {
      table_[*it] = it - urls_.begin();
    }
    table_.pack();
  }

  // Loads "URLS100k" (one key per line) into urls_. Exits with -1 if the file
  // cannot be opened or contains a repeated line.
  void ReadUrls() {
    vector<string> urls;
    std::ifstream f("URLS100k");
    if (!f.is_open()) {
      // Without this check a missing file silently yields an empty key set
      // and the benchmark measures nothing.
      cerr << "Failed to open URLS100k." << endl;
      exit(-1);
    }
    string buffer;
    while (std::getline(f, buffer)) urls.push_back(buffer);
    set<string> unique(urls.begin(), urls.end());
    if (unique.size() != urls.size()) {
      cerr << "Input file has repeated keys." << endl;
      exit(-1);
    }
    urls_.swap(urls);
  }

  vector<string> urls_;
  cxxmph::mph_map<string, int> table_;
};
// Benchmark that measures lookups in the URL map. The base constructor loads
// the URLs; this constructor builds the table once so that Run() only times
// iters * 100 lookups of randomly chosen URLs.
class BM_UrlsFind : public BM_UrlsCreate {
 public:
  // BM_UrlsCreate's constructor has already called ReadUrls(), so the file is
  // not read a second time here.
  BM_UrlsFind(int iters = 1) : BM_UrlsCreate(iters) { BuildTable(); }

 protected:
  virtual void Run(int iters) {
    // random() % 0 is undefined; with no keys there is nothing to look up.
    if (urls_.empty()) return;
    for (int i = 0; i < iters * 100; ++i) {
      int pos = random() % urls_.size();
      int h = table_[urls_[pos]];
      // BuildTable() stored each URL's position as its value.
      assert(h == pos);
    }
  }
};
} // namespace cxxmph
using namespace cxxmph;
// Registers the URL-based create and find benchmarks and runs them.
int main(int argc, char** argv) {
  const int kFindIters = 1000 * 1000;

  Benchmark::Register(new BM_UrlsCreate());
  Benchmark::Register(new BM_UrlsFind(kFindIters));
  Benchmark::RunAll();
}

View File

@ -5,7 +5,7 @@
using std::cerr; using std::cerr;
using std::endl; using std::endl;
#include "mph_table.h" #include "mph_index.h"
using std::vector; using std::vector;
@ -13,7 +13,7 @@ namespace {
static const uint8_t kUnassigned = 3; static const uint8_t kUnassigned = 3;
// table used for looking up the number of assigned vertices to a 8-bit integer // table used for looking up the number of assigned vertices to a 8-bit integer
static uint8_t kBdzLookupTable[] = static uint8_t kBdzLookupIndex[] =
{ {
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
@ -37,13 +37,13 @@ static uint8_t kBdzLookupTable[] =
namespace cxxmph { namespace cxxmph {
const uint8_t MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; const uint8_t MPHIndex::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
MPHTable::~MPHTable() { MPHIndex::~MPHIndex() {
clear(); clear();
} }
void MPHTable::clear() { void MPHIndex::clear() {
if (!deserialized_) delete [] g_; if (!deserialized_) delete [] g_;
g_ = NULL; g_ = NULL;
g_size_ = 0; g_size_ = 0;
@ -53,7 +53,7 @@ void MPHTable::clear() {
// TODO(davi) implement me // TODO(davi) implement me
} }
bool MPHTable::GenerateQueue( bool MPHIndex::GenerateQueue(
TriGraph* graph, vector<uint32_t>* queue_output) { TriGraph* graph, vector<uint32_t>* queue_output) {
uint32_t queue_head = 0, queue_tail = 0; uint32_t queue_head = 0, queue_tail = 0;
uint32_t nedges = m_; uint32_t nedges = m_;
@ -109,7 +109,7 @@ bool MPHTable::GenerateQueue(
return cycles == 0; return cycles == 0;
} }
void MPHTable::Assigning( void MPHIndex::Assigning(
const vector<TriGraph::Edge>& edges, const vector<uint32_t>& queue) { const vector<TriGraph::Edge>& edges, const vector<uint32_t>& queue) {
uint32_t current_edge = 0; uint32_t current_edge = 0;
vector<bool> marked_vertices(n_ + 1); vector<bool> marked_vertices(n_ + 1);
@ -164,7 +164,7 @@ void MPHTable::Assigning(
g_ = g; g_ = g;
} }
void MPHTable::Ranking() { void MPHIndex::Ranking() {
uint32_t nbytes_total = static_cast<uint32_t>(ceil(n_ / 4.0)); uint32_t nbytes_total = static_cast<uint32_t>(ceil(n_ / 4.0));
uint32_t size = k_ >> 2U; uint32_t size = k_ >> 2U;
ranktable_size_ = static_cast<uint32_t>( ranktable_size_ = static_cast<uint32_t>(
@ -179,7 +179,7 @@ void MPHTable::Ranking() {
while (1) { while (1) {
if (i == ranktable_size_) break; if (i == ranktable_size_) break;
uint32_t nbytes = size < nbytes_total ? size : nbytes_total; uint32_t nbytes = size < nbytes_total ? size : nbytes_total;
for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupIndex[g_[offset + j]];
ranktable[i] = count; ranktable[i] = count;
offset += nbytes; offset += nbytes;
nbytes_total -= size; nbytes_total -= size;
@ -188,13 +188,13 @@ void MPHTable::Ranking() {
ranktable_ = ranktable; ranktable_ = ranktable;
} }
uint32_t MPHTable::Rank(uint32_t vertex) const { uint32_t MPHIndex::Rank(uint32_t vertex) const {
uint32_t index = vertex >> b_; uint32_t index = vertex >> b_;
uint32_t base_rank = ranktable_[index]; uint32_t base_rank = ranktable_[index];
uint32_t beg_idx_v = index << b_; uint32_t beg_idx_v = index << b_;
uint32_t beg_idx_b = beg_idx_v >> 2; uint32_t beg_idx_b = beg_idx_v >> 2;
uint32_t end_idx_b = vertex >> 2; uint32_t end_idx_b = vertex >> 2;
while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; while (beg_idx_b < end_idx_b) base_rank += kBdzLookupIndex[g_[beg_idx_b++]];
beg_idx_v = beg_idx_b << 2; beg_idx_v = beg_idx_b << 2;
// cerr << "beg_idx_v: " << beg_idx_v << endl; // cerr << "beg_idx_v: " << beg_idx_v << endl;
// cerr << "base rank: " << base_rank << endl; // cerr << "base rank: " << base_rank << endl;
@ -213,21 +213,21 @@ uint32_t MPHTable::Rank(uint32_t vertex) const {
return base_rank; return base_rank;
} }
uint32_t MPHTable::serialize_bytes_needed() const { uint32_t MPHIndex::serialize_bytes_needed() const {
return sizeof(MPHTable) + g_size_ + ranktable_size_*sizeof(uint32_t); return sizeof(MPHIndex) + g_size_ + ranktable_size_*sizeof(uint32_t);
} }
void MPHTable::serialize(char* memory) const { void MPHIndex::serialize(char* memory) const {
memcpy(memory, this, sizeof(MPHTable)); memcpy(memory, this, sizeof(MPHIndex));
memcpy(memory + sizeof(MPHTable), g_, g_size_); memcpy(memory + sizeof(MPHIndex), g_, g_size_);
memcpy(memory + sizeof(MPHTable) + g_size_, memcpy(memory + sizeof(MPHIndex) + g_size_,
ranktable_, ranktable_size_*sizeof(uint32_t)); ranktable_, ranktable_size_*sizeof(uint32_t));
} }
bool MPHTable::deserialize(const char* serialized_memory) { bool MPHIndex::deserialize(const char* serialized_memory) {
memcpy(this, serialized_memory, sizeof(MPHTable)); memcpy(this, serialized_memory, sizeof(MPHIndex));
g_ = reinterpret_cast<const uint8_t*>(serialized_memory + sizeof(MPHTable)); g_ = reinterpret_cast<const uint8_t*>(serialized_memory + sizeof(MPHIndex));
ranktable_ = reinterpret_cast<const uint32_t*>( ranktable_ = reinterpret_cast<const uint32_t*>(
serialized_memory + sizeof(MPHTable) + g_size_); serialized_memory + sizeof(MPHIndex) + g_size_);
deserialized_ = true; deserialized_ = true;
return true; return true;
} }

173
cxxmph/mph_index.h Normal file
View File

@ -0,0 +1,173 @@
#ifndef __CXXMPH_MPH_INDEX_H__
#define __CXXMPH_MPH_INDEX_H__
// Minimal perfect hash abstraction implementing the BDZ algorithm
#include <stdint.h>
#include <cassert>
#include <cmath>
#include <unordered_map> // for std::hash
#include <vector>
#include <iostream>
using std::cerr;
using std::endl;
#include "seeded_hash.h"
#include "trigraph.h"
namespace cxxmph {
// Minimal perfect hash function over a fixed set of keys. Reset() builds the
// function from a key range and index() evaluates it for a single key.
// NOTE(review): g_ is held through a raw pointer that clear() deletes (unless
// deserialized_ is set), but no copy constructor or assignment operator is
// declared here -- confirm that instances are never copied.
// NOTE(review): serialize() copies the object with memcpy(this, sizeof), so
// the order and types of the data members below are part of the serialized
// format; do not reorder them.
class MPHIndex {
public:
// c and b are stored in c_ and b_ (see the member comments below). All
// counters start at zero and both arrays start as NULL; hash_seed_ is not
// initialized here and is first written by Reset().
MPHIndex(double c = 1.23, uint8_t b = 7) :
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0),
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
deserialized_(false) { }
~MPHIndex();
// Builds the function for the keys in [begin, end). Returns false if no
// suitable hash seeds were found within the retry budget.
template <class SeededHashFcn, class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end);
// Evaluates the function for x. SeededHashFcn must be the same functor type
// that was passed to Reset.
template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t index(const Key& x) const;
// Number of keys given to the last Reset() (m_).
uint32_t size() const { return m_; }
void clear();
// Serialization machinery for mmap usage.
// Serialized tables are not guaranteed to work across versions or different
// endianness (although they could easily be made to be).
// After deserialize(), g_ and ranktable_ point into serialized_memory, so
// that buffer must stay alive for as long as this object is used.
uint32_t serialize_bytes_needed() const;
void serialize(char *memory) const;
bool deserialize(const char* serialized_memory);
private:
// Builds the graph for the current hash seeds and, if GenerateQueue()
// succeeds on it, extracts its edges into *edges.
template <class SeededHashFcn, class ForwardIterator>
bool Mapping(ForwardIterator begin, ForwardIterator end,
std::vector<TriGraph::Edge>* edges,
std::vector<uint32_t>* queue);
bool GenerateQueue(TriGraph* graph, std::vector<uint32_t>* queue);
void Assigning(const std::vector<TriGraph::Edge>& edges,
const std::vector<uint32_t>& queue);
void Ranking();
uint32_t Rank(uint32_t vertex) const;
// Algorithm parameters
// Reset() computes r_ = ceil(c_ * m_ / 3) and n_ = 3 * r_, so c_ acts as the
// ratio of graph vertices to keys. The original note read "Number of bits per
// key (? is it right)".
double c_;
uint8_t b_; // Number of bits of the kth index in the ranktable
// Values used during generation
uint32_t m_; // edges count (one edge per key; set to end - begin in Reset)
uint32_t n_; // vertex count (3 * r_)
// kth index in ranktable, $k = log_2(n=3r)\varepsilon$; Reset() sets it to
// 1 << b_.
uint32_t k_;
// Values used during search
// Partition vertex count, derived from c parameter.
uint32_t r_;
// The array containing the minimal perfect hash function graph. Do not use
// c++ vector to make mmap based backing easier.
const uint8_t* g_;
uint32_t g_size_;
// The table used for the rank step of the minimal perfect hash function
const uint32_t* ranktable_;
uint32_t ranktable_size_;
// The selected hash seed triplet for finding the edges in the minimal
// perfect hash function graph.
uint32_t hash_seed_[3];
// True once deserialize() has pointed g_ / ranktable_ at caller-provided
// memory; clear() then does not delete g_.
bool deserialized_;
// Per-slot masks used by set_2bit_value(); defined in the .cc file.
static const uint8_t valuemask[];
// Writes the 2-bit value v into slot i of d (four slots per byte). The update
// is an AND, so it can only clear bits: the slot is expected to hold binary 11
// beforehand -- presumably the kUnassigned (3) fill; confirm in Assigning().
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
}
// Reads the 2-bit value stored in slot i of d.
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
}
};
// Template method needs to go in the header file.
template <class SeededHashFcn, class ForwardIterator>
bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) {
m_ = end - begin;
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
if ((r_ % 2) == 0) r_ += 1;
n_ = 3*r_;
k_ = 1U << b_;
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
int iterations = 10;
std::vector<TriGraph::Edge> edges;
std::vector<uint32_t> queue;
while (1) {
// cerr << "Iterations missing: " << iterations << endl;
for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_;
// for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i;
if (Mapping<SeededHashFcn>(begin, end, &edges, &queue)) break;
else --iterations;
if (iterations == 0) break;
}
if (iterations == 0) return false;
Assigning(edges, queue);
std::vector<TriGraph::Edge>().swap(edges);
Ranking();
deserialized_ = false;
return true;
}
// Builds a TriGraph with one edge per key in [begin, end) using the current
// hash_seed_ triplet. If GenerateQueue() succeeds on that graph, its edges are
// extracted into *edges and true is returned; otherwise returns false.
template <class SeededHashFcn, class ForwardIterator>
bool MPHIndex::Mapping(
    ForwardIterator begin, ForwardIterator end,
    std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
  TriGraph graph(n_, m_);
  for (ForwardIterator key = begin; key != end; ++key) {
    // Each hash is stored as uint32_t before being reduced, then offset into
    // its own third of the vertex range: [0, r_), [r_, 2*r_), [2*r_, 3*r_).
    uint32_t vertex[3];
    for (int part = 0; part < 3; ++part) {
      const uint32_t hash = SeededHashFcn()(*key, hash_seed_[part]);
      vertex[part] = hash % r_ + part * r_;
    }
    graph.AddEdge(TriGraph::Edge(vertex[0], vertex[1], vertex[2]));
  }
  if (!GenerateQueue(&graph, queue)) return false;
  graph.ExtractEdgesAndClear(edges);
  return true;
}
// Evaluates the function for key: hashes it with the three stored seeds,
// maps each hash to a vertex in its third of the graph, uses the 2-bit values
// stored in g_ for those vertices to pick one of them, and returns its rank.
template <class SeededHashFcn, class Key>
uint32_t MPHIndex::index(const Key& key) const {
  uint32_t h[3];
  for (int i = 0; i < 3; ++i) {
    h[i] = SeededHashFcn()(key, hash_seed_[i]);
    h[i] = h[i] % r_ + i * r_;
  }
  assert(g_size_);
  assert((h[0] >> 2) <g_size_);
  assert((h[1] >> 2) <g_size_);
  assert((h[2] >> 2) <g_size_);
  const uint32_t selector = get_2bit_value(g_, h[0]) +
                            get_2bit_value(g_, h[1]) +
                            get_2bit_value(g_, h[2]);
  return Rank(h[selector % 3]);
}
// Convenience wrapper around MPHIndex that fixes the seeded hash functor at
// the class level, so callers of Reset() and index() do not have to repeat it
// (and cannot pass mismatching functors to the two calls).
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
class SimpleMPHIndex : public MPHIndex {
 public:
  // Builds the index for the keys in [begin, end) using HashFcn.
  template <class ForwardIterator>
  bool Reset(ForwardIterator begin, ForwardIterator end) {
    return MPHIndex::Reset<HashFcn>(begin, end);
  }
  // Declared const like MPHIndex::index: this overload hides the base member,
  // so without const the lookup could not be called on a const index.
  uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); }
};
} // namespace cxxmph
#endif // __CXXMPH_MPH_INDEX_H__

View File

@ -3,11 +3,11 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mph_table.h" #include "mph_index.h"
using std::string; using std::string;
using std::vector; using std::vector;
using cxxmph::SimpleMPHTable; using cxxmph::SimpleMPHIndex;
int main(int argc, char** argv) { int main(int argc, char** argv) {
@ -23,20 +23,20 @@ int main(int argc, char** argv) {
keys.push_back("diogo"); keys.push_back("diogo");
keys.push_back("algume"); keys.push_back("algume");
SimpleMPHTable<string> mph_table; SimpleMPHIndex<string> mph_index;
assert(mph_table.Reset(keys.begin(), keys.end())); assert(mph_index.Reset(keys.begin(), keys.end()));
vector<int> ids; vector<int> ids;
for (vector<int>::size_type i = 0; i < keys.size(); ++i) { for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
ids.push_back(mph_table.index(keys[i])); ids.push_back(mph_index.index(keys[i]));
cerr << " " << *(ids.end() - 1); cerr << " " << *(ids.end() - 1);
} }
cerr << endl; cerr << endl;
sort(ids.begin(), ids.end()); sort(ids.begin(), ids.end());
for (vector<int>::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast<vector<int>::value_type>(i)); for (vector<int>::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast<vector<int>::value_type>(i));
char* serialized = new char[mph_table.serialize_bytes_needed()]; char* serialized = new char[mph_index.serialize_bytes_needed()];
mph_table.serialize(serialized); mph_index.serialize(serialized);
SimpleMPHTable<string> other_mph_table; SimpleMPHIndex<string> other_mph_index;
other_mph_table.deserialize(serialized); other_mph_index.deserialize(serialized);
} }

View File

@ -4,7 +4,7 @@
#include <utility> // for std::pair #include <utility> // for std::pair
#include "MurmurHash2.h" #include "MurmurHash2.h"
#include "mph_table.h" #include "mph_index.h"
namespace cxxmph { namespace cxxmph {
@ -70,7 +70,7 @@ class mph_map {
void rehash(); void rehash();
std::vector<value_type> values_; std::vector<value_type> values_;
SimpleMPHTable<Key, typename seeded_hash<HashFcn>::hash_function> table_; SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
// TODO(davi) optimize slack to not hold a copy of the key // TODO(davi) optimize slack to not hold a copy of the key
typedef typename std::unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type; typedef typename std::unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
slack_type slack_; slack_type slack_;
@ -93,8 +93,8 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
if (it != end()) return std::make_pair(it, false); if (it != end()) return std::make_pair(it, false);
values_.push_back(x); values_.push_back(x);
slack_.insert(std::make_pair(x.first, values_.size() - 1)); slack_.insert(std::make_pair(x.first, values_.size() - 1));
if (slack_.size() == table_.size() || if (slack_.size() == index_.size() ||
(slack_.size() >= 256 && table_.size() == 0)) { (slack_.size() >= 256 && index_.size() == 0)) {
rehash(); rehash();
} }
it = find(x.first); it = find(x.first);
@ -104,14 +104,14 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
MPH_MAP_METHOD_DECL(void_type, rehash)() { MPH_MAP_METHOD_DECL(void_type, rehash)() {
if (values_.empty()) return; if (values_.empty()) return;
slack_type().swap(slack_); slack_type().swap(slack_);
bool success = table_.Reset( bool success = index_.Reset(
make_iterator_first(values_.begin()), make_iterator_first(values_.begin()),
make_iterator_first(values_.end())); make_iterator_first(values_.end()));
assert(success); assert(success);
std::vector<value_type> new_values(values_.size()); std::vector<value_type> new_values(values_.size());
for (const_iterator it = values_.begin(), end = values_.end(); for (const_iterator it = values_.begin(), end = values_.end();
it != end; ++it) { it != end; ++it) {
size_type id = table_.index(it->first); size_type id = index_.index(it->first);
assert(id < new_values.size()); assert(id < new_values.size());
new_values[id] = *it; new_values[id] = *it;
} }
@ -127,7 +127,7 @@ MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); }
MPH_MAP_METHOD_DECL(void_type, clear)() { MPH_MAP_METHOD_DECL(void_type, clear)() {
values_.clear(); values_.clear();
slack_.clear(); slack_.clear();
table_.clear(); index_.clear();
} }
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
@ -145,8 +145,8 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
typename slack_type::const_iterator it = slack_.find(k); typename slack_type::const_iterator it = slack_.find(k);
if (it != slack_.end()) return values_.begin() + it->second; if (it != slack_.end()) return values_.begin() + it->second;
} }
if (table_.size() == 0) return end(); if (index_.size() == 0) return end();
size_type id = table_.index(k); size_type id = index_.index(k);
if (key_equal()(values_[id].first, k)) { if (key_equal()(values_[id].first, k)) {
return values_.begin() + id; return values_.begin() + id;
} }
@ -157,8 +157,8 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
typename slack_type::const_iterator it = slack_.find(k); typename slack_type::const_iterator it = slack_.find(k);
if (it != slack_.end()) return values_.begin() + it->second; if (it != slack_.end()) return values_.begin() + it->second;
} }
if (table_.size() == 0) return end(); if (index_.size() == 0) return end();
size_type id = table_.index(k); size_type id = index_.index(k);
if (key_equal()(values_[id].first, k)) { if (key_equal()(values_[id].first, k)) {
return values_.begin() + id; return values_.begin() + id;
} }

View File

@ -1,173 +1,16 @@
#ifndef __CXXMPH_MPH_TABLE_H__ #include "mph_index.h"
#define __CXXMPH_MPH_TABLE_H__
// Minimal perfect hash abstraction implementing the BDZ algorithm // String to string map working on mmap'ed memory
#include <stdint.h>
#include <cassert>
#include <cmath>
#include <unordered_map> // for std::hash
#include <vector>
#include <iostream>
using std::cerr;
using std::endl;
#include "seeded_hash.h"
#include "trigraph.h"
namespace cxxmph {
class MPHTable { class MPHTable {
public: public:
MPHTable(double c = 1.23, uint8_t b = 7) : typedef StringPiece key_type;
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), typedef StringPiece data_type;
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), typedef std::pair<StringPiece, StringPiece> value_type;
deserialized_(false) { }
~MPHTable();
template <class SeededHashFcn, class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end);
template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t index(const Key& x) const;
uint32_t size() const { return m_; }
void clear();
// Serialization machinery for mmap usage.
// Serialized tables are not guaranteed to work across versions or different
// endianness (although they could easily be made to be).
uint32_t serialize_bytes_needed() const;
void serialize(char *memory) const;
bool deserialize(const char* serialized_memory);
private:
template <class SeededHashFcn, class ForwardIterator>
bool Mapping(ForwardIterator begin, ForwardIterator end,
std::vector<TriGraph::Edge>* edges,
std::vector<uint32_t>* queue);
bool GenerateQueue(TriGraph* graph, std::vector<uint32_t>* queue);
void Assigning(const std::vector<TriGraph::Edge>& edges,
const std::vector<uint32_t>& queue);
void Ranking();
uint32_t Rank(uint32_t vertex) const;
// Algorithm parameters
double c_; // Number of bits per key (? is it right)
uint8_t b_; // Number of bits of the kth index in the ranktable
// Values used during generation
uint32_t m_; // edges count
uint32_t n_; // vertex count
uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$
// Values used during search
// Partition vertex count, derived from c parameter.
uint32_t r_;
// The array containing the minimal perfect hash function graph. Do not use
// c++ vector to make mmap based backing easier.
const uint8_t* g_;
uint32_t g_size_;
// The table used for the rank step of the minimal perfect hash function
const uint32_t* ranktable_;
uint32_t ranktable_size_;
// The selected hash seed triplet for finding the edges in the minimal
// perfect hash function graph.
uint32_t hash_seed_[3];
bool deserialized_;
static const uint8_t valuemask[];
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
}
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
}
};
// Template method needs to go in the header file.
template <class SeededHashFcn, class ForwardIterator>
bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) {
m_ = end - begin;
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
if ((r_ % 2) == 0) r_ += 1;
n_ = 3*r_;
k_ = 1U << b_;
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
int iterations = 10;
std::vector<TriGraph::Edge> edges;
std::vector<uint32_t> queue;
while (1) {
// cerr << "Iterations missing: " << iterations << endl;
for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_;
// for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i;
if (Mapping<SeededHashFcn>(begin, end, &edges, &queue)) break;
else --iterations;
if (iterations == 0) break;
}
if (iterations == 0) return false;
Assigning(edges, queue);
std::vector<TriGraph::Edge>().swap(edges);
Ranking();
deserialized_ = false;
return true;
}
// Builds a TriGraph with one edge per key in [begin, end) using the current
// hash_seed_ triplet. If GenerateQueue() succeeds on that graph, its edges are
// extracted into *edges and true is returned; otherwise returns false.
template <class SeededHashFcn, class ForwardIterator>
bool MPHTable::Mapping(
    ForwardIterator begin, ForwardIterator end,
    std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
  TriGraph graph(n_, m_);
  for (ForwardIterator key = begin; key != end; ++key) {
    // Each hash is stored as uint32_t before being reduced, then offset into
    // its own third of the vertex range: [0, r_), [r_, 2*r_), [2*r_, 3*r_).
    uint32_t vertex[3];
    for (int part = 0; part < 3; ++part) {
      const uint32_t hash = SeededHashFcn()(*key, hash_seed_[part]);
      vertex[part] = hash % r_ + part * r_;
    }
    graph.AddEdge(TriGraph::Edge(vertex[0], vertex[1], vertex[2]));
  }
  if (!GenerateQueue(&graph, queue)) return false;
  graph.ExtractEdgesAndClear(edges);
  return true;
}
// Evaluates the function for key: hashes it with the three stored seeds,
// maps each hash to a vertex in its third of the graph, uses the 2-bit values
// stored in g_ for those vertices to pick one of them, and returns its rank.
template <class SeededHashFcn, class Key>
uint32_t MPHTable::index(const Key& key) const {
  uint32_t h[3];
  for (int i = 0; i < 3; ++i) {
    h[i] = SeededHashFcn()(key, hash_seed_[i]);
    h[i] = h[i] % r_ + i * r_;
  }
  assert(g_size_);
  assert((h[0] >> 2) <g_size_);
  assert((h[1] >> 2) <g_size_);
  assert((h[2] >> 2) <g_size_);
  const uint32_t selector = get_2bit_value(g_, h[0]) +
                            get_2bit_value(g_, h[1]) +
                            get_2bit_value(g_, h[2]);
  return Rank(h[selector % 3]);
}
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
class SimpleMPHTable : public MPHTable {
public:
template <class ForwardIterator> template <class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end) { bool Reset(ForwardIterator begin, ForwardIterator end);
return MPHTable::Reset<HashFcn>(begin, end); private:
} char* data_;
uint32_t index(const Key& key) { return MPHTable::index<HashFcn>(key); } vector<uint64_t> offsets_;
MPHIndex index_;
}; };
} // namespace cxxmph
#endif // __CXXMPH_MPH_TABLE_H__

View File

@ -1,6 +1,10 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <ext/hash_set>
using __gnu_cxx::hash_set;
// Name identifying the __gnu_cxx::hash_set benchmark; used as the lsmap key
// in bm_create_ext_hash_set() and as the "%s" argument when formatting keys.
// Must be a pointer: a single `char` cannot be initialized from a string literal.
static const char* const cxx_name = "__gnu_cxx::hash_set";
#include "bitbool.h" #include "bitbool.h"
#include "cmph.h" #include "cmph.h"
#include "cmph_benchmark.h" #include "cmph_benchmark.h"
@ -71,8 +75,8 @@ void bm_search(CMPH_ALGO algo, int iters) {
cmph_t* mphf = NULL; cmph_t* mphf = NULL;
snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); snprintf(mphf_name, 128, "%s:%u", cxx_name, iters);
mphf = lsmap_search(g_created_mphf, mphf_name); mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name);
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
@ -102,6 +106,49 @@ DECLARE_ALGO(CMPH_BRZ);
DECLARE_ALGO(CMPH_FCH); DECLARE_ALGO(CMPH_FCH);
DECLARE_ALGO(CMPH_BDZ); DECLARE_ALGO(CMPH_BDZ);
void bm_create_ext_hash_set(int iters) {
cmph_uint32 i = 0;
if (iters > g_numbers_len) {
fprintf(stderr, "No input with proper size.");
exit(-1);
}
hash_set<cmph_uint32>* ext_hash_set = new hash_set<cmph_uint32>;
for (i = 0; i < iters; ++i) {
ext_hash_set->insert(g_numbers[i]);
}
lsmap_append(g_created_mphf, cxx_name, ext_hash_set);
}
// Search benchmark for the __gnu_cxx::hash_set variant: performs iters * 100
// lookups at random positions of g_numbers and records per-position counters
// in g_expected_probes / g_mphf_probes.
// NOTE(review): this function looks unfinished; the notes below mark the
// places that do not match the rest of the visible code.
void bm_search_ext_hash_set(int iters) {
cmph_uint32 i = 0;
if (iters > g_numbers_len) {
fprintf(stderr, "No input with proper size.");
exit(-1);
}
// NOTE(review): hash_count is passed for "%s" here, before the local
// hash_count (a cmph_uint32*) is declared below; bm_search() formats cxx_name
// at this position. mphf_name has no declaration visible in this function.
// bm_create_ext_hash_set() stores its set under the plain key cxx_name, not
// under a "name:iters" key -- confirm which key is intended.
snprintf(mphf_name, 128, "%s:%u", hash_count, iters);
// NOTE(review): mphf has no declaration visible in this function, and
// __gnu_cxx::hash_set is a class template, so the cast names no complete type.
mphf = (__gnu_cxx::hash_set*)lsmap_search(g_created_mphf, mphf_name);
// NOTE(review): both arrays come from malloc() and are incremented below
// without being zeroed first.
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
for (i = 0; i < iters * 100; ++i) {
cmph_uint32 pos = random() % iters;
const char* buf = (const char*)(g_numbers + pos);
// NOTE(review): cmph_search() is applied to the value cast to a hash_set
// pointer above -- presumably a hash_set lookup was intended; confirm.
cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32));
++count[pos];
// NOTE(review): h is used as an index into an array of iters elements with
// no visible bound check.
++hash_count[h];
}
// Verify correctness later.
// NOTE(review): algo is not a parameter or local of this function.
lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count);
lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count);
}
// NOTE(review): the brace below has no matching opener visible in this hunk --
// confirm it is not a stray closing brace.
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
g_numbers_len = 1000 * 1000; g_numbers_len = 1000 * 1000;
g_numbers = random_numbers_vector_new(g_numbers_len); g_numbers = random_numbers_vector_new(g_numbers_len);