Renamed table to index and reorganized benchmarks.
This commit is contained in:
parent
c630eb2a70
commit
bb40a4bb00
@ -1,28 +1,28 @@
|
||||
AM_CXXFLAGS='-std=c++0x'
|
||||
TESTS = $(check_PROGRAMS)
|
||||
check_PROGRAMS = mph_map_test mph_table_test trigraph_test
|
||||
noinst_PROGRAMS = bm_numbers bm_urls
|
||||
check_PROGRAMS = mph_map_test mph_index_test trigraph_test
|
||||
noinst_PROGRAMS = bm_index bm_map
|
||||
bin_PROGRAMS = cxxmph
|
||||
lib_LTLIBRARIES = libcxxmph.la
|
||||
libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_table.h mph_table.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
|
||||
# Sources and headers that make up libcxxmph (header name fixed: trigraph.h).
libcxxmph_la_SOURCES = MurmurHash2.h trigraph.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
|
||||
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
||||
cxxmph_includedir = $(includedir)/cxxmph/
|
||||
cxxmph_include_HEADERS = mph_map.h mph_table.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h
|
||||
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h
|
||||
|
||||
mph_map_test_LDADD = libcxxmph.la
|
||||
mph_map_test_SOURCES = mph_map_test.cc
|
||||
|
||||
mph_table_test_LDADD = libcxxmph.la
|
||||
mph_table_test_SOURCES = mph_table_test.cc
|
||||
mph_index_test_LDADD = libcxxmph.la
|
||||
mph_index_test_SOURCES = mph_index_test.cc
|
||||
|
||||
trigraph_test_LDADD = libcxxmph.la
|
||||
trigraph_test_SOURCES = trigraph_test.cc
|
||||
|
||||
bm_numbers_LDADD = libcxxmph.la
|
||||
bm_numbers_SOURCES = bm_numbers.cc
|
||||
bm_index_LDADD = libcxxmph.la
|
||||
bm_index_SOURCES = bm_common.cc bm_index.cc
|
||||
|
||||
bm_urls_LDADD = libcxxmph.la
|
||||
bm_urls_SOURCES = bm_urls.cc
|
||||
bm_map_LDADD = libcxxmph.la
|
||||
bm_map_SOURCES = bm_common.cc bm_map.cc
|
||||
|
||||
cxxmph_LDADD = libcxxmph.la
|
||||
cxxmph_SOURCES = cxxmph.cc
|
||||
|
@ -1,7 +1,9 @@
|
||||
#include "benchmark.h"
|
||||
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <cstdio>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
|
||||
#include <iostream>
|
||||
@ -50,6 +52,16 @@ struct rusage getrusage_or_die() {
|
||||
return rs;
|
||||
}
|
||||
|
||||
// Returns the current time as reported by gettimeofday(). If the call fails,
// the errno description is written to stderr and the process exits with -1.
struct timeval gettimeofday_or_die() {
  struct timeval now;
  if (gettimeofday(&now, NULL) == 0) {
    return now;
  }
  std::cerr << "gettimeofday failed: " << strerror(errno) << std::endl;
  exit(-1);
}
|
||||
|
||||
#ifdef HAVE_CXA_DEMANGLE
|
||||
string demangle(const string& name) {
|
||||
char buf[1024];
|
||||
@ -79,25 +91,33 @@ namespace cxxmph {
|
||||
}
|
||||
|
||||
/* static */ void Benchmark::RunAll() {
|
||||
for (auto it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) {
|
||||
(*it)->MeasureRun();
|
||||
delete *it;
|
||||
for (int i = 0; i < g_benchmarks.size(); ++i) {
|
||||
Benchmark* bm = g_benchmarks[i];
|
||||
bm->SetUp();
|
||||
bm->MeasureRun();
|
||||
bm->TearDown();
|
||||
delete bm;
|
||||
}
|
||||
}
|
||||
|
||||
void Benchmark::MeasureRun() {
|
||||
struct timeval walltime_begin = gettimeofday_or_die();
|
||||
struct rusage begin = getrusage_or_die();
|
||||
Run(iters_);
|
||||
Run();
|
||||
struct rusage end = getrusage_or_die();
|
||||
struct timeval walltime_end = gettimeofday_or_die();
|
||||
|
||||
struct timeval utime;
|
||||
timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime);
|
||||
struct timeval stime;
|
||||
timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime);
|
||||
struct timeval wtime;
|
||||
timeval_subtract(&wtime, &walltime_end, &walltime_begin);
|
||||
|
||||
printf("Benchmark: %s\n", name().c_str());
|
||||
printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec);
|
||||
printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec);
|
||||
printf("CPU User time : %ld.%06ld\n", utime.tv_sec, utime.tv_usec);
|
||||
printf("CPU System time: %ld.%06ld\n", stime.tv_sec, stime.tv_usec);
|
||||
printf("Wall clock time: %ld.%06ld\n", wtime.tv_sec, wtime.tv_usec);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
@ -8,9 +8,9 @@ namespace cxxmph {
|
||||
|
||||
class Benchmark {
|
||||
public:
|
||||
Benchmark(int iters = 1) : iters_(iters) { }
|
||||
virtual void Run(int iters) = 0;
|
||||
virtual ~Benchmark() { }
|
||||
Benchmark() {}
|
||||
virtual ~Benchmark() {}
|
||||
|
||||
const std::string& name() { return name_; }
|
||||
void set_name(const std::string& name) { name_ = name; }
|
||||
|
||||
@ -18,10 +18,11 @@ class Benchmark {
|
||||
static void RunAll();
|
||||
|
||||
protected:
|
||||
int iters() { return iters_; }
|
||||
virtual bool SetUp() {};
|
||||
virtual void Run() = 0;
|
||||
virtual bool TearDown() {};
|
||||
|
||||
private:
|
||||
int iters_;
|
||||
std::string name_;
|
||||
void MeasureRun();
|
||||
};
|
||||
|
52
cxxmph/bm_map.cc
Normal file
52
cxxmph/bm_map.cc
Normal file
@ -0,0 +1,52 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "bm_common.h"
|
||||
#include "mph_map.h"
|
||||
|
||||
using cxxmph::mph_map;
|
||||
using std::string;
|
||||
using std::unordered_map;
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
template <class MapType>
|
||||
class BM_MapCreate : public UrlsBenchmark {
|
||||
public:
|
||||
virtual void Run() {
|
||||
MapType mymap;
|
||||
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
|
||||
mymap[*it] = *it;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <class MapType>
|
||||
class BM_MapSearch : public SearchUrlsBenchmark {
|
||||
public:
|
||||
virtual void Run() {
|
||||
for (auto it = random_.begin(); it != random_.end(); ++it) {
|
||||
auto value = mymap[*it];
|
||||
}
|
||||
}
|
||||
protected:
|
||||
virtual void SetUp() {
|
||||
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
|
||||
mymap_[*it] = *it;
|
||||
}
|
||||
mymap_.resize(mymap.size());
|
||||
}
|
||||
MapType mymap_;
|
||||
};
|
||||
|
||||
} // namespace cxxmph
|
||||
|
||||
using namespace cxxmph;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
Benchmark::Register(new BM_MapCreate<mph_map<string, string>>("URLS100k"));
|
||||
Benchmark::Register(new BM_MapCreate<unordered_map<string, string>>("URLS100k"));
|
||||
Benchmark::Register(new BM_MapSearch<mph_map<string, string>>("URLS100k", 1000 * 1000));
|
||||
Benchmark::Register(new BM_MapSearch<unordered_map<string, string>>("URLS100k", 1000 * 1000));
|
||||
Benchmark::RunAll();
|
||||
}
|
@ -1,52 +0,0 @@
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include "benchmark.h"
|
||||
#include "mph_table.h"
|
||||
|
||||
using std::set;
|
||||
using std::vector;
|
||||
|
||||
namespace cxxmph {
|
||||
class BM_NumbersCreate : public Benchmark {
|
||||
public:
|
||||
BM_NumbersCreate(int iters = 1) : Benchmark(iters) {
|
||||
set<int> unique;
|
||||
while (unique.size() < 1000 * 1000) {
|
||||
int v = random();
|
||||
if (unique.find(v) == unique.end()) {
|
||||
unique.insert(v);
|
||||
random_unique_.push_back(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
protected:
|
||||
virtual void Run(int iters) {
|
||||
SimpleMPHTable<int> table;
|
||||
table.Reset(random_unique_.begin(), random_unique_.end());
|
||||
}
|
||||
std::vector<int> random_unique_;
|
||||
};
|
||||
|
||||
class BM_NumbersFind : public BM_NumbersCreate {
|
||||
public:
|
||||
BM_NumbersFind(int iters) : BM_NumbersCreate(iters) { table_.Reset(random_unique_.begin(), random_unique_.end()); }
|
||||
virtual void Run(int iters) {
|
||||
for (int i = 0; i < iters * 100; ++i) {
|
||||
int pos = random() % random_unique_.size();;
|
||||
int h = table_.index(pos);
|
||||
}
|
||||
}
|
||||
private:
|
||||
SimpleMPHTable<int> table_;
|
||||
};
|
||||
|
||||
} // namespace cxxmph
|
||||
|
||||
using namespace cxxmph;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
Benchmark::Register(new BM_NumbersCreate());
|
||||
Benchmark::Register(new BM_NumbersFind(1000 * 1000));
|
||||
Benchmark::RunAll();
|
||||
}
|
@ -1,70 +0,0 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "benchmark.h"
|
||||
#include "mph_map.h"
|
||||
|
||||
using std::ifstream;
|
||||
using std::set;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
class BM_UrlsCreate : public Benchmark {
|
||||
public:
|
||||
BM_UrlsCreate(int iters = 1) : Benchmark(iters) {
|
||||
ReadUrls();
|
||||
}
|
||||
protected:
|
||||
virtual void Run(int iters) {
|
||||
BuildTable();
|
||||
}
|
||||
void BuildTable() {
|
||||
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
|
||||
table_[*it] = it - urls_.begin();
|
||||
}
|
||||
table_.pack();
|
||||
}
|
||||
void ReadUrls() {
|
||||
vector<string> urls;
|
||||
std::ifstream f("URLS100k");
|
||||
string buffer;
|
||||
while(std::getline(f, buffer)) urls.push_back(buffer);
|
||||
set<string> unique(urls.begin(), urls.end());
|
||||
if (unique.size() != urls.size()) {
|
||||
cerr << "Input file has repeated keys." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
urls_.swap(urls);
|
||||
}
|
||||
vector<string> urls_;
|
||||
cxxmph::mph_map<string, int> table_;
|
||||
};
|
||||
|
||||
class BM_UrlsFind : public BM_UrlsCreate {
|
||||
public:
|
||||
BM_UrlsFind(int iters = 1) : BM_UrlsCreate(iters) { ReadUrls(); BuildTable(); }
|
||||
protected:
|
||||
virtual void Run(int iters) {
|
||||
for (int i = 0; i < iters * 100; ++i) {
|
||||
int pos = random() % urls_.size();;
|
||||
int h = table_[urls_[pos]];
|
||||
assert(h == pos);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cxxmph
|
||||
|
||||
using namespace cxxmph;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
Benchmark::Register(new BM_UrlsCreate());
|
||||
Benchmark::Register(new BM_UrlsFind(1000 * 1000));
|
||||
Benchmark::RunAll();
|
||||
}
|
@ -5,7 +5,7 @@
|
||||
using std::cerr;
|
||||
using std::endl;
|
||||
|
||||
#include "mph_table.h"
|
||||
#include "mph_index.h"
|
||||
|
||||
using std::vector;
|
||||
|
||||
@ -13,7 +13,7 @@ namespace {
|
||||
|
||||
static const uint8_t kUnassigned = 3;
|
||||
// table used for looking up the number of assigned vertices in an 8-bit integer
|
||||
static uint8_t kBdzLookupTable[] =
|
||||
static uint8_t kBdzLookupIndex[] =
|
||||
{
|
||||
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||
@ -37,13 +37,13 @@ static uint8_t kBdzLookupTable[] =
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
const uint8_t MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
|
||||
const uint8_t MPHIndex::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
|
||||
|
||||
MPHTable::~MPHTable() {
|
||||
MPHIndex::~MPHIndex() {
|
||||
clear();
|
||||
}
|
||||
|
||||
void MPHTable::clear() {
|
||||
void MPHIndex::clear() {
|
||||
if (!deserialized_) delete [] g_;
|
||||
g_ = NULL;
|
||||
g_size_ = 0;
|
||||
@ -53,7 +53,7 @@ void MPHTable::clear() {
|
||||
// TODO(davi) implement me
|
||||
}
|
||||
|
||||
bool MPHTable::GenerateQueue(
|
||||
bool MPHIndex::GenerateQueue(
|
||||
TriGraph* graph, vector<uint32_t>* queue_output) {
|
||||
uint32_t queue_head = 0, queue_tail = 0;
|
||||
uint32_t nedges = m_;
|
||||
@ -109,7 +109,7 @@ bool MPHTable::GenerateQueue(
|
||||
return cycles == 0;
|
||||
}
|
||||
|
||||
void MPHTable::Assigning(
|
||||
void MPHIndex::Assigning(
|
||||
const vector<TriGraph::Edge>& edges, const vector<uint32_t>& queue) {
|
||||
uint32_t current_edge = 0;
|
||||
vector<bool> marked_vertices(n_ + 1);
|
||||
@ -164,7 +164,7 @@ void MPHTable::Assigning(
|
||||
g_ = g;
|
||||
}
|
||||
|
||||
void MPHTable::Ranking() {
|
||||
void MPHIndex::Ranking() {
|
||||
uint32_t nbytes_total = static_cast<uint32_t>(ceil(n_ / 4.0));
|
||||
uint32_t size = k_ >> 2U;
|
||||
ranktable_size_ = static_cast<uint32_t>(
|
||||
@ -179,7 +179,7 @@ void MPHTable::Ranking() {
|
||||
while (1) {
|
||||
if (i == ranktable_size_) break;
|
||||
uint32_t nbytes = size < nbytes_total ? size : nbytes_total;
|
||||
for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]];
|
||||
for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupIndex[g_[offset + j]];
|
||||
ranktable[i] = count;
|
||||
offset += nbytes;
|
||||
nbytes_total -= size;
|
||||
@ -188,13 +188,13 @@ void MPHTable::Ranking() {
|
||||
ranktable_ = ranktable;
|
||||
}
|
||||
|
||||
uint32_t MPHTable::Rank(uint32_t vertex) const {
|
||||
uint32_t MPHIndex::Rank(uint32_t vertex) const {
|
||||
uint32_t index = vertex >> b_;
|
||||
uint32_t base_rank = ranktable_[index];
|
||||
uint32_t beg_idx_v = index << b_;
|
||||
uint32_t beg_idx_b = beg_idx_v >> 2;
|
||||
uint32_t end_idx_b = vertex >> 2;
|
||||
while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]];
|
||||
while (beg_idx_b < end_idx_b) base_rank += kBdzLookupIndex[g_[beg_idx_b++]];
|
||||
beg_idx_v = beg_idx_b << 2;
|
||||
// cerr << "beg_idx_v: " << beg_idx_v << endl;
|
||||
// cerr << "base rank: " << base_rank << endl;
|
||||
@ -213,21 +213,21 @@ uint32_t MPHTable::Rank(uint32_t vertex) const {
|
||||
return base_rank;
|
||||
}
|
||||
|
||||
uint32_t MPHTable::serialize_bytes_needed() const {
|
||||
return sizeof(MPHTable) + g_size_ + ranktable_size_*sizeof(uint32_t);
|
||||
uint32_t MPHIndex::serialize_bytes_needed() const {
|
||||
return sizeof(MPHIndex) + g_size_ + ranktable_size_*sizeof(uint32_t);
|
||||
}
|
||||
void MPHTable::serialize(char* memory) const {
|
||||
memcpy(memory, this, sizeof(MPHTable));
|
||||
memcpy(memory + sizeof(MPHTable), g_, g_size_);
|
||||
memcpy(memory + sizeof(MPHTable) + g_size_,
|
||||
void MPHIndex::serialize(char* memory) const {
|
||||
memcpy(memory, this, sizeof(MPHIndex));
|
||||
memcpy(memory + sizeof(MPHIndex), g_, g_size_);
|
||||
memcpy(memory + sizeof(MPHIndex) + g_size_,
|
||||
ranktable_, ranktable_size_*sizeof(uint32_t));
|
||||
}
|
||||
|
||||
bool MPHTable::deserialize(const char* serialized_memory) {
|
||||
memcpy(this, serialized_memory, sizeof(MPHTable));
|
||||
g_ = reinterpret_cast<const uint8_t*>(serialized_memory + sizeof(MPHTable));
|
||||
bool MPHIndex::deserialize(const char* serialized_memory) {
|
||||
memcpy(this, serialized_memory, sizeof(MPHIndex));
|
||||
g_ = reinterpret_cast<const uint8_t*>(serialized_memory + sizeof(MPHIndex));
|
||||
ranktable_ = reinterpret_cast<const uint32_t*>(
|
||||
serialized_memory + sizeof(MPHTable) + g_size_);
|
||||
serialized_memory + sizeof(MPHIndex) + g_size_);
|
||||
deserialized_ = true;
|
||||
return true;
|
||||
}
|
173
cxxmph/mph_index.h
Normal file
173
cxxmph/mph_index.h
Normal file
@ -0,0 +1,173 @@
|
||||
#ifndef __CXXMPH_MPH_INDEX_H__
|
||||
#define __CXXMPH_MPH_INDEX_H__
|
||||
|
||||
// Minimal perfect hash abstraction implementing the BDZ algorithm
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <unordered_map> // for std::hash
|
||||
#include <vector>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
using std::cerr;
|
||||
using std::endl;
|
||||
|
||||
#include "seeded_hash.h"
|
||||
#include "trigraph.h"
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
class MPHIndex {
|
||||
public:
|
||||
MPHIndex(double c = 1.23, uint8_t b = 7) :
|
||||
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0),
|
||||
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
|
||||
deserialized_(false) { }
|
||||
~MPHIndex();
|
||||
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool Reset(ForwardIterator begin, ForwardIterator end);
|
||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||
uint32_t index(const Key& x) const;
|
||||
uint32_t size() const { return m_; }
|
||||
void clear();
|
||||
|
||||
// Serialization machinery for mmap usage.
|
||||
// Serialized tables are not guaranteed to work across versions or different
|
||||
// endianness (although they could easily be made to be).
|
||||
uint32_t serialize_bytes_needed() const;
|
||||
void serialize(char *memory) const;
|
||||
bool deserialize(const char* serialized_memory);
|
||||
|
||||
private:
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool Mapping(ForwardIterator begin, ForwardIterator end,
|
||||
std::vector<TriGraph::Edge>* edges,
|
||||
std::vector<uint32_t>* queue);
|
||||
bool GenerateQueue(TriGraph* graph, std::vector<uint32_t>* queue);
|
||||
void Assigning(const std::vector<TriGraph::Edge>& edges,
|
||||
const std::vector<uint32_t>& queue);
|
||||
void Ranking();
|
||||
uint32_t Rank(uint32_t vertex) const;
|
||||
|
||||
// Algorithm parameters
|
||||
double c_; // Number of bits per key (? is it right)
|
||||
uint8_t b_; // Number of bits of the kth index in the ranktable
|
||||
|
||||
// Values used during generation
|
||||
uint32_t m_; // edges count
|
||||
uint32_t n_; // vertex count
|
||||
uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$
|
||||
|
||||
// Values used during search
|
||||
|
||||
// Partition vertex count, derived from c parameter.
|
||||
uint32_t r_;
|
||||
// The array containing the minimal perfect hash function graph. Do not use
|
||||
// c++ vector to make mmap based backing easier.
|
||||
const uint8_t* g_;
|
||||
uint32_t g_size_;
|
||||
// The table used for the rank step of the minimal perfect hash function
|
||||
const uint32_t* ranktable_;
|
||||
uint32_t ranktable_size_;
|
||||
// The selected hash seed triplet for finding the edges in the minimal
|
||||
// perfect hash function graph.
|
||||
uint32_t hash_seed_[3];
|
||||
|
||||
bool deserialized_;
|
||||
|
||||
static const uint8_t valuemask[];
|
||||
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
|
||||
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
|
||||
}
|
||||
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
|
||||
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
// Template method needs to go in the header file.
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) {
|
||||
m_ = end - begin;
|
||||
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
|
||||
if ((r_ % 2) == 0) r_ += 1;
|
||||
n_ = 3*r_;
|
||||
k_ = 1U << b_;
|
||||
|
||||
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
|
||||
|
||||
int iterations = 10;
|
||||
std::vector<TriGraph::Edge> edges;
|
||||
std::vector<uint32_t> queue;
|
||||
while (1) {
|
||||
// cerr << "Iterations missing: " << iterations << endl;
|
||||
for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_;
|
||||
// for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i;
|
||||
if (Mapping<SeededHashFcn>(begin, end, &edges, &queue)) break;
|
||||
else --iterations;
|
||||
if (iterations == 0) break;
|
||||
}
|
||||
if (iterations == 0) return false;
|
||||
Assigning(edges, queue);
|
||||
std::vector<TriGraph::Edge>().swap(edges);
|
||||
Ranking();
|
||||
deserialized_ = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool MPHIndex::Mapping(
|
||||
ForwardIterator begin, ForwardIterator end,
|
||||
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
|
||||
TriGraph graph(n_, m_);
|
||||
for (ForwardIterator it = begin; it != end; ++it) {
|
||||
uint32_t h[3];
|
||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
|
||||
uint32_t v0 = h[0] % r_;
|
||||
uint32_t v1 = h[1] % r_ + r_;
|
||||
uint32_t v2 = h[2] % r_ + (r_ << 1);
|
||||
// cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl;
|
||||
graph.AddEdge(TriGraph::Edge(v0, v1, v2));
|
||||
}
|
||||
if (GenerateQueue(&graph, queue)) {
|
||||
graph.ExtractEdgesAndClear(edges);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <class SeededHashFcn, class Key>
|
||||
uint32_t MPHIndex::index(const Key& key) const {
|
||||
uint32_t h[3];
|
||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
|
||||
h[0] = h[0] % r_;
|
||||
h[1] = h[1] % r_ + r_;
|
||||
h[2] = h[2] % r_ + (r_ << 1);
|
||||
assert(g_size_);
|
||||
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
|
||||
assert((h[0] >> 2) <g_size_);
|
||||
assert((h[1] >> 2) <g_size_);
|
||||
assert((h[2] >> 2) <g_size_);
|
||||
uint32_t vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3];
|
||||
// cerr << "Search found vertex " << vertex << endl;
|
||||
return Rank(vertex);
|
||||
}
|
||||
|
||||
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
|
||||
class SimpleMPHIndex : public MPHIndex {
|
||||
public:
|
||||
template <class ForwardIterator>
|
||||
bool Reset(ForwardIterator begin, ForwardIterator end) {
|
||||
return MPHIndex::Reset<HashFcn>(begin, end);
|
||||
}
|
||||
uint32_t index(const Key& key) { return MPHIndex::index<HashFcn>(key); }
|
||||
};
|
||||
|
||||
} // namespace cxxmph
|
||||
|
||||
#endif // __CXXMPH_MPH_INDEX_H__
|
@ -3,11 +3,11 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "mph_table.h"
|
||||
#include "mph_index.h"
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using cxxmph::SimpleMPHTable;
|
||||
using cxxmph::SimpleMPHIndex;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
@ -23,20 +23,20 @@ int main(int argc, char** argv) {
|
||||
keys.push_back("diogo");
|
||||
keys.push_back("algume");
|
||||
|
||||
SimpleMPHTable<string> mph_table;
|
||||
assert(mph_table.Reset(keys.begin(), keys.end()));
|
||||
SimpleMPHIndex<string> mph_index;
|
||||
assert(mph_index.Reset(keys.begin(), keys.end()));
|
||||
vector<int> ids;
|
||||
for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
|
||||
ids.push_back(mph_table.index(keys[i]));
|
||||
ids.push_back(mph_index.index(keys[i]));
|
||||
cerr << " " << *(ids.end() - 1);
|
||||
}
|
||||
cerr << endl;
|
||||
sort(ids.begin(), ids.end());
|
||||
for (vector<int>::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast<vector<int>::value_type>(i));
|
||||
|
||||
char* serialized = new char[mph_table.serialize_bytes_needed()];
|
||||
mph_table.serialize(serialized);
|
||||
SimpleMPHTable<string> other_mph_table;
|
||||
other_mph_table.deserialize(serialized);
|
||||
char* serialized = new char[mph_index.serialize_bytes_needed()];
|
||||
mph_index.serialize(serialized);
|
||||
SimpleMPHIndex<string> other_mph_index;
|
||||
other_mph_index.deserialize(serialized);
|
||||
}
|
||||
|
@ -4,7 +4,7 @@
|
||||
#include <utility> // for std::pair
|
||||
|
||||
#include "MurmurHash2.h"
|
||||
#include "mph_table.h"
|
||||
#include "mph_index.h"
|
||||
|
||||
namespace cxxmph {
|
||||
|
||||
@ -70,7 +70,7 @@ class mph_map {
|
||||
|
||||
void rehash();
|
||||
std::vector<value_type> values_;
|
||||
SimpleMPHTable<Key, typename seeded_hash<HashFcn>::hash_function> table_;
|
||||
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
|
||||
// TODO(davi) optimize slack to not hold a copy of the key
|
||||
typedef typename std::unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
|
||||
slack_type slack_;
|
||||
@ -93,8 +93,8 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
|
||||
if (it != end()) return std::make_pair(it, false);
|
||||
values_.push_back(x);
|
||||
slack_.insert(std::make_pair(x.first, values_.size() - 1));
|
||||
if (slack_.size() == table_.size() ||
|
||||
(slack_.size() >= 256 && table_.size() == 0)) {
|
||||
if (slack_.size() == index_.size() ||
|
||||
(slack_.size() >= 256 && index_.size() == 0)) {
|
||||
rehash();
|
||||
}
|
||||
it = find(x.first);
|
||||
@ -104,14 +104,14 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
|
||||
MPH_MAP_METHOD_DECL(void_type, rehash)() {
|
||||
if (values_.empty()) return;
|
||||
slack_type().swap(slack_);
|
||||
bool success = table_.Reset(
|
||||
bool success = index_.Reset(
|
||||
make_iterator_first(values_.begin()),
|
||||
make_iterator_first(values_.end()));
|
||||
assert(success);
|
||||
std::vector<value_type> new_values(values_.size());
|
||||
for (const_iterator it = values_.begin(), end = values_.end();
|
||||
it != end; ++it) {
|
||||
size_type id = table_.index(it->first);
|
||||
size_type id = index_.index(it->first);
|
||||
assert(id < new_values.size());
|
||||
new_values[id] = *it;
|
||||
}
|
||||
@ -127,7 +127,7 @@ MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); }
|
||||
MPH_MAP_METHOD_DECL(void_type, clear)() {
|
||||
values_.clear();
|
||||
slack_.clear();
|
||||
table_.clear();
|
||||
index_.clear();
|
||||
}
|
||||
|
||||
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
|
||||
@ -145,8 +145,8 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
|
||||
typename slack_type::const_iterator it = slack_.find(k);
|
||||
if (it != slack_.end()) return values_.begin() + it->second;
|
||||
}
|
||||
if (table_.size() == 0) return end();
|
||||
size_type id = table_.index(k);
|
||||
if (index_.size() == 0) return end();
|
||||
size_type id = index_.index(k);
|
||||
if (key_equal()(values_[id].first, k)) {
|
||||
return values_.begin() + id;
|
||||
}
|
||||
@ -157,8 +157,8 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
|
||||
typename slack_type::const_iterator it = slack_.find(k);
|
||||
if (it != slack_.end()) return values_.begin() + it->second;
|
||||
}
|
||||
if (table_.size() == 0) return end();
|
||||
size_type id = table_.index(k);
|
||||
if (index_.size() == 0) return end();
|
||||
size_type id = index_.index(k);
|
||||
if (key_equal()(values_[id].first, k)) {
|
||||
return values_.begin() + id;
|
||||
}
|
||||
|
@ -1,173 +1,16 @@
|
||||
#ifndef __CXXMPH_MPH_TABLE_H__
|
||||
#define __CXXMPH_MPH_TABLE_H__
|
||||
#include "mph_index.h"
|
||||
|
||||
// Minimal perfect hash abstraction implementing the BDZ algorithm
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <unordered_map> // for std::hash
|
||||
#include <vector>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
using std::cerr;
|
||||
using std::endl;
|
||||
|
||||
#include "seeded_hash.h"
|
||||
#include "trigraph.h"
|
||||
|
||||
namespace cxxmph {
|
||||
// String to string map working on mmap'ed memory
|
||||
|
||||
class MPHTable {
|
||||
public:
|
||||
MPHTable(double c = 1.23, uint8_t b = 7) :
|
||||
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0),
|
||||
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
|
||||
deserialized_(false) { }
|
||||
~MPHTable();
|
||||
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool Reset(ForwardIterator begin, ForwardIterator end);
|
||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||
uint32_t index(const Key& x) const;
|
||||
uint32_t size() const { return m_; }
|
||||
void clear();
|
||||
|
||||
// Serialization machinery for mmap usage.
|
||||
// Serialized tables are not guaranteed to work across versions or different
|
||||
// endianness (although they could easily be made to be).
|
||||
uint32_t serialize_bytes_needed() const;
|
||||
void serialize(char *memory) const;
|
||||
bool deserialize(const char* serialized_memory);
|
||||
|
||||
private:
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool Mapping(ForwardIterator begin, ForwardIterator end,
|
||||
std::vector<TriGraph::Edge>* edges,
|
||||
std::vector<uint32_t>* queue);
|
||||
bool GenerateQueue(TriGraph* graph, std::vector<uint32_t>* queue);
|
||||
void Assigning(const std::vector<TriGraph::Edge>& edges,
|
||||
const std::vector<uint32_t>& queue);
|
||||
void Ranking();
|
||||
uint32_t Rank(uint32_t vertex) const;
|
||||
|
||||
// Algorithm parameters
|
||||
double c_; // Number of bits per key (? is it right)
|
||||
uint8_t b_; // Number of bits of the kth index in the ranktable
|
||||
|
||||
// Values used during generation
|
||||
uint32_t m_; // edges count
|
||||
uint32_t n_; // vertex count
|
||||
uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$
|
||||
|
||||
// Values used during search
|
||||
|
||||
// Partition vertex count, derived from c parameter.
|
||||
uint32_t r_;
|
||||
// The array containing the minimal perfect hash function graph. Do not use
|
||||
// c++ vector to make mmap based backing easier.
|
||||
const uint8_t* g_;
|
||||
uint32_t g_size_;
|
||||
// The table used for the rank step of the minimal perfect hash function
|
||||
const uint32_t* ranktable_;
|
||||
uint32_t ranktable_size_;
|
||||
// The selected hash seed triplet for finding the edges in the minimal
|
||||
// perfect hash function graph.
|
||||
uint32_t hash_seed_[3];
|
||||
|
||||
bool deserialized_;
|
||||
|
||||
static const uint8_t valuemask[];
|
||||
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
|
||||
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
|
||||
}
|
||||
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
|
||||
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
// Template method needs to go in the header file.
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) {
|
||||
m_ = end - begin;
|
||||
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
|
||||
if ((r_ % 2) == 0) r_ += 1;
|
||||
n_ = 3*r_;
|
||||
k_ = 1U << b_;
|
||||
|
||||
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
|
||||
|
||||
int iterations = 10;
|
||||
std::vector<TriGraph::Edge> edges;
|
||||
std::vector<uint32_t> queue;
|
||||
while (1) {
|
||||
// cerr << "Iterations missing: " << iterations << endl;
|
||||
for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_;
|
||||
// for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i;
|
||||
if (Mapping<SeededHashFcn>(begin, end, &edges, &queue)) break;
|
||||
else --iterations;
|
||||
if (iterations == 0) break;
|
||||
}
|
||||
if (iterations == 0) return false;
|
||||
Assigning(edges, queue);
|
||||
std::vector<TriGraph::Edge>().swap(edges);
|
||||
Ranking();
|
||||
deserialized_ = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class SeededHashFcn, class ForwardIterator>
|
||||
bool MPHTable::Mapping(
|
||||
ForwardIterator begin, ForwardIterator end,
|
||||
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
|
||||
TriGraph graph(n_, m_);
|
||||
for (ForwardIterator it = begin; it != end; ++it) {
|
||||
uint32_t h[3];
|
||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
|
||||
uint32_t v0 = h[0] % r_;
|
||||
uint32_t v1 = h[1] % r_ + r_;
|
||||
uint32_t v2 = h[2] % r_ + (r_ << 1);
|
||||
// cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl;
|
||||
graph.AddEdge(TriGraph::Edge(v0, v1, v2));
|
||||
}
|
||||
if (GenerateQueue(&graph, queue)) {
|
||||
graph.ExtractEdgesAndClear(edges);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <class SeededHashFcn, class Key>
|
||||
uint32_t MPHTable::index(const Key& key) const {
|
||||
uint32_t h[3];
|
||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
|
||||
h[0] = h[0] % r_;
|
||||
h[1] = h[1] % r_ + r_;
|
||||
h[2] = h[2] % r_ + (r_ << 1);
|
||||
assert(g_size_);
|
||||
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
|
||||
assert((h[0] >> 2) <g_size_);
|
||||
assert((h[1] >> 2) <g_size_);
|
||||
assert((h[2] >> 2) <g_size_);
|
||||
uint32_t vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3];
|
||||
// cerr << "Search found vertex " << vertex << endl;
|
||||
return Rank(vertex);
|
||||
}
|
||||
|
||||
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
|
||||
class SimpleMPHTable : public MPHTable {
|
||||
public:
|
||||
typedef StringPiece key_type;
|
||||
typedef StringPiece data_type;
|
||||
typedef std::pair<StringPiece, StringPiece> value_type;
|
||||
template <class ForwardIterator>
|
||||
bool Reset(ForwardIterator begin, ForwardIterator end) {
|
||||
return MPHTable::Reset<HashFcn>(begin, end);
|
||||
}
|
||||
uint32_t index(const Key& key) { return MPHTable::index<HashFcn>(key); }
|
||||
bool Reset(ForwardIterator begin, ForwardIterator end);
|
||||
private:
|
||||
char* data_;
|
||||
vector<uint64_t> offsets_;
|
||||
MPHIndex index_;
|
||||
};
|
||||
|
||||
} // namespace cxxmph
|
||||
|
||||
#endif // __CXXMPH_MPH_TABLE_H__
|
||||
|
@ -1,6 +1,10 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <ext/hash_set>
|
||||
using __gnu_cxx::hash_set;
|
||||
// Name under which the __gnu_cxx::hash_set benchmark is identified. Declared
// as a character array (not a single char) so it can hold the string literal
// and be passed where a C string is expected.
static const char cxx_name[] = "__gnu_cxx::hash_set";
|
||||
|
||||
#include "bitbool.h"
|
||||
#include "cmph.h"
|
||||
#include "cmph_benchmark.h"
|
||||
@ -71,8 +75,8 @@ void bm_search(CMPH_ALGO algo, int iters) {
|
||||
cmph_t* mphf = NULL;
|
||||
|
||||
|
||||
snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters);
|
||||
mphf = lsmap_search(g_created_mphf, mphf_name);
|
||||
snprintf(mphf_name, 128, "%s:%u", cxx_name, iters);
|
||||
mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name);
|
||||
|
||||
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
||||
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
||||
@ -102,6 +106,49 @@ DECLARE_ALGO(CMPH_BRZ);
|
||||
DECLARE_ALGO(CMPH_FCH);
|
||||
DECLARE_ALGO(CMPH_BDZ);
|
||||
|
||||
void bm_create_ext_hash_set(int iters) {
|
||||
cmph_uint32 i = 0;
|
||||
|
||||
if (iters > g_numbers_len) {
|
||||
fprintf(stderr, "No input with proper size.");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
hash_set<cmph_uint32>* ext_hash_set = new hash_set<cmph_uint32>;
|
||||
for (i = 0; i < iters; ++i) {
|
||||
ext_hash_set->insert(g_numbers[i]);
|
||||
}
|
||||
lsmap_append(g_created_mphf, cxx_name, ext_hash_set);
|
||||
}
|
||||
|
||||
void bm_search_ext_hash_set(int iters) {
|
||||
cmph_uint32 i = 0;
|
||||
|
||||
if (iters > g_numbers_len) {
|
||||
fprintf(stderr, "No input with proper size.");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
snprintf(mphf_name, 128, "%s:%u", hash_count, iters);
|
||||
mphf = (__gnu_cxx::hash_set*)lsmap_search(g_created_mphf, mphf_name);
|
||||
|
||||
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
||||
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
||||
|
||||
for (i = 0; i < iters * 100; ++i) {
|
||||
cmph_uint32 pos = random() % iters;
|
||||
const char* buf = (const char*)(g_numbers + pos);
|
||||
cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32));
|
||||
++count[pos];
|
||||
++hash_count[h];
|
||||
}
|
||||
|
||||
// Verify correctness later.
|
||||
lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count);
|
||||
lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
g_numbers_len = 1000 * 1000;
|
||||
g_numbers = random_numbers_vector_new(g_numbers_len);
|
||||
|
Loading…
Reference in New Issue
Block a user