Renamed table to index and reorganized benchmarks.
This commit is contained in:
parent
c630eb2a70
commit
bb40a4bb00
|
@ -1,28 +1,28 @@
|
||||||
AM_CXXFLAGS='-std=c++0x'
|
AM_CXXFLAGS='-std=c++0x'
|
||||||
TESTS = $(check_PROGRAMS)
|
TESTS = $(check_PROGRAMS)
|
||||||
check_PROGRAMS = mph_map_test mph_table_test trigraph_test
|
check_PROGRAMS = mph_map_test mph_index_test trigraph_test
|
||||||
noinst_PROGRAMS = bm_numbers bm_urls
|
noinst_PROGRAMS = bm_index bm_map
|
||||||
bin_PROGRAMS = cxxmph
|
bin_PROGRAMS = cxxmph
|
||||||
lib_LTLIBRARIES = libcxxmph.la
|
lib_LTLIBRARIES = libcxxmph.la
|
||||||
libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_table.h mph_table.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
|
# Sources and headers that make up libcxxmph.
libcxxmph_la_SOURCES = MurmurHash2.h trigraph.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
|
||||||
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
libcxxmph_la_LDFLAGS = -version-info 0:0:0
|
||||||
cxxmph_includedir = $(includedir)/cxxmph/
|
cxxmph_includedir = $(includedir)/cxxmph/
|
||||||
cxxmph_include_HEADERS = mph_map.h mph_table.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h
|
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h
|
||||||
|
|
||||||
mph_map_test_LDADD = libcxxmph.la
|
mph_map_test_LDADD = libcxxmph.la
|
||||||
mph_map_test_SOURCES = mph_map_test.cc
|
mph_map_test_SOURCES = mph_map_test.cc
|
||||||
|
|
||||||
mph_table_test_LDADD = libcxxmph.la
|
mph_index_test_LDADD = libcxxmph.la
|
||||||
mph_table_test_SOURCES = mph_table_test.cc
|
mph_index_test_SOURCES = mph_index_test.cc
|
||||||
|
|
||||||
trigraph_test_LDADD = libcxxmph.la
|
trigraph_test_LDADD = libcxxmph.la
|
||||||
trigraph_test_SOURCES = trigraph_test.cc
|
trigraph_test_SOURCES = trigraph_test.cc
|
||||||
|
|
||||||
bm_numbers_LDADD = libcxxmph.la
|
bm_index_LDADD = libcxxmph.la
|
||||||
bm_numbers_SOURCES = bm_numbers.cc
|
bm_index_SOURCES = bm_common.cc bm_index.cc
|
||||||
|
|
||||||
bm_urls_LDADD = libcxxmph.la
|
bm_map_LDADD = libcxxmph.la
|
||||||
bm_urls_SOURCES = bm_urls.cc
|
bm_map_SOURCES = bm_common.cc bm_map.cc
|
||||||
|
|
||||||
cxxmph_LDADD = libcxxmph.la
|
cxxmph_LDADD = libcxxmph.la
|
||||||
cxxmph_SOURCES = cxxmph.cc
|
cxxmph_SOURCES = cxxmph.cc
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
#include "benchmark.h"
|
#include "benchmark.h"
|
||||||
|
|
||||||
|
#include <cerrno>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <sys/time.h>
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
@ -50,6 +52,16 @@ struct rusage getrusage_or_die() {
|
||||||
return rs;
|
return rs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads the current wall-clock time with gettimeofday(). If the call fails,
// the errno description is written to stderr and the process exits with
// status -1; otherwise the filled-in timeval is returned by value.
struct timeval gettimeofday_or_die() {
  struct timeval now;
  if (gettimeofday(&now, NULL) != 0) {
    std::cerr << "gettimeofday failed: " << strerror(errno) << std::endl;
    exit(-1);
  }
  return now;
}
|
||||||
|
|
||||||
#ifdef HAVE_CXA_DEMANGLE
|
#ifdef HAVE_CXA_DEMANGLE
|
||||||
string demangle(const string& name) {
|
string demangle(const string& name) {
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
|
@ -79,25 +91,33 @@ namespace cxxmph {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ void Benchmark::RunAll() {
|
/* static */ void Benchmark::RunAll() {
|
||||||
for (auto it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) {
|
for (int i = 0; i < g_benchmarks.size(); ++i) {
|
||||||
(*it)->MeasureRun();
|
Benchmark* bm = g_benchmarks[i];
|
||||||
delete *it;
|
bm->SetUp();
|
||||||
|
bm->MeasureRun();
|
||||||
|
bm->TearDown();
|
||||||
|
delete bm;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Benchmark::MeasureRun() {
|
void Benchmark::MeasureRun() {
|
||||||
|
struct timeval walltime_begin = gettimeofday_or_die();
|
||||||
struct rusage begin = getrusage_or_die();
|
struct rusage begin = getrusage_or_die();
|
||||||
Run(iters_);
|
Run();
|
||||||
struct rusage end = getrusage_or_die();
|
struct rusage end = getrusage_or_die();
|
||||||
|
struct timeval walltime_end = gettimeofday_or_die();
|
||||||
|
|
||||||
struct timeval utime;
|
struct timeval utime;
|
||||||
timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime);
|
timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime);
|
||||||
struct timeval stime;
|
struct timeval stime;
|
||||||
timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime);
|
timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime);
|
||||||
|
struct timeval wtime;
|
||||||
|
timeval_subtract(&wtime, &walltime_end, &walltime_begin);
|
||||||
|
|
||||||
printf("Benchmark: %s\n", name().c_str());
|
printf("Benchmark: %s\n", name().c_str());
|
||||||
printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec);
|
printf("CPU User time : %ld.%06ld\n", utime.tv_sec, utime.tv_usec);
|
||||||
printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec);
|
printf("CPU System time: %ld.%06ld\n", stime.tv_sec, stime.tv_usec);
|
||||||
|
printf("Wall clock time: %ld.%06ld\n", wtime.tv_sec, wtime.tv_usec);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,9 @@ namespace cxxmph {
|
||||||
|
|
||||||
class Benchmark {
|
class Benchmark {
|
||||||
public:
|
public:
|
||||||
Benchmark(int iters = 1) : iters_(iters) { }
|
Benchmark() {}
|
||||||
virtual void Run(int iters) = 0;
|
virtual ~Benchmark() {}
|
||||||
virtual ~Benchmark() { }
|
|
||||||
const std::string& name() { return name_; }
|
const std::string& name() { return name_; }
|
||||||
void set_name(const std::string& name) { name_ = name; }
|
void set_name(const std::string& name) { name_ = name; }
|
||||||
|
|
||||||
|
@ -18,10 +18,11 @@ class Benchmark {
|
||||||
static void RunAll();
|
static void RunAll();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
int iters() { return iters_; }
|
// Hook invoked by RunAll() before the measured run. The default does nothing
// and reports success (the previous empty body fell off the end of a function
// returning bool, which is undefined behaviour).
virtual bool SetUp() { return true; }
|
||||||
|
virtual void Run() = 0;
|
||||||
|
// Hook invoked by RunAll() after the measured run. The default does nothing
// and reports success (the previous empty body fell off the end of a function
// returning bool, which is undefined behaviour).
virtual bool TearDown() { return true; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int iters_;
|
|
||||||
std::string name_;
|
std::string name_;
|
||||||
void MeasureRun();
|
void MeasureRun();
|
||||||
};
|
};
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "bm_common.h"
|
||||||
|
#include "mph_map.h"
|
||||||
|
|
||||||
|
using cxxmph::mph_map;
|
||||||
|
using std::string;
|
||||||
|
using std::unordered_map;
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
|
template <class MapType>
|
||||||
|
class BM_MapCreate : public UrlsBenchmark {
|
||||||
|
public:
|
||||||
|
virtual void Run() {
|
||||||
|
MapType mymap;
|
||||||
|
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
|
||||||
|
mymap[*it] = *it;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class MapType>
|
||||||
|
class BM_MapSearch : public SearchUrlsBenchmark {
|
||||||
|
public:
|
||||||
|
virtual void Run() {
|
||||||
|
for (auto it = random_.begin(); it != random_.end(); ++it) {
|
||||||
|
auto value = mymap[*it];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
protected:
|
||||||
|
virtual void SetUp() {
|
||||||
|
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
|
||||||
|
mymap_[*it] = *it;
|
||||||
|
}
|
||||||
|
mymap_.resize(mymap.size());
|
||||||
|
}
|
||||||
|
MapType mymap_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
using namespace cxxmph;
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
Benchmark::Register(new BM_MapCreate<mph_map<string, string>>("URLS100k"));
|
||||||
|
Benchmark::Register(new BM_MapCreate<unordered_map<string, string>>("URLS100k"));
|
||||||
|
Benchmark::Register(new BM_MapSearch<mph_map<string, string>>("URLS100k", 1000 * 1000));
|
||||||
|
Benchmark::Register(new BM_MapSearch<unordered_map<string, string>>("URLS100k", 1000 * 1000));
|
||||||
|
Benchmark::RunAll();
|
||||||
|
}
|
|
@ -1,52 +0,0 @@
|
||||||
#include <set>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "benchmark.h"
|
|
||||||
#include "mph_table.h"
|
|
||||||
|
|
||||||
using std::set;
|
|
||||||
using std::vector;
|
|
||||||
|
|
||||||
namespace cxxmph {
|
|
||||||
class BM_NumbersCreate : public Benchmark {
|
|
||||||
public:
|
|
||||||
BM_NumbersCreate(int iters = 1) : Benchmark(iters) {
|
|
||||||
set<int> unique;
|
|
||||||
while (unique.size() < 1000 * 1000) {
|
|
||||||
int v = random();
|
|
||||||
if (unique.find(v) == unique.end()) {
|
|
||||||
unique.insert(v);
|
|
||||||
random_unique_.push_back(v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
protected:
|
|
||||||
virtual void Run(int iters) {
|
|
||||||
SimpleMPHTable<int> table;
|
|
||||||
table.Reset(random_unique_.begin(), random_unique_.end());
|
|
||||||
}
|
|
||||||
std::vector<int> random_unique_;
|
|
||||||
};
|
|
||||||
|
|
||||||
class BM_NumbersFind : public BM_NumbersCreate {
|
|
||||||
public:
|
|
||||||
BM_NumbersFind(int iters) : BM_NumbersCreate(iters) { table_.Reset(random_unique_.begin(), random_unique_.end()); }
|
|
||||||
virtual void Run(int iters) {
|
|
||||||
for (int i = 0; i < iters * 100; ++i) {
|
|
||||||
int pos = random() % random_unique_.size();;
|
|
||||||
int h = table_.index(pos);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
private:
|
|
||||||
SimpleMPHTable<int> table_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace cxxmph
|
|
||||||
|
|
||||||
using namespace cxxmph;
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
Benchmark::Register(new BM_NumbersCreate());
|
|
||||||
Benchmark::Register(new BM_NumbersFind(1000 * 1000));
|
|
||||||
Benchmark::RunAll();
|
|
||||||
}
|
|
|
@ -1,70 +0,0 @@
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <set>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
#include "benchmark.h"
|
|
||||||
#include "mph_map.h"
|
|
||||||
|
|
||||||
using std::ifstream;
|
|
||||||
using std::set;
|
|
||||||
using std::string;
|
|
||||||
using std::vector;
|
|
||||||
|
|
||||||
namespace cxxmph {
|
|
||||||
|
|
||||||
class BM_UrlsCreate : public Benchmark {
|
|
||||||
public:
|
|
||||||
BM_UrlsCreate(int iters = 1) : Benchmark(iters) {
|
|
||||||
ReadUrls();
|
|
||||||
}
|
|
||||||
protected:
|
|
||||||
virtual void Run(int iters) {
|
|
||||||
BuildTable();
|
|
||||||
}
|
|
||||||
void BuildTable() {
|
|
||||||
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
|
|
||||||
table_[*it] = it - urls_.begin();
|
|
||||||
}
|
|
||||||
table_.pack();
|
|
||||||
}
|
|
||||||
void ReadUrls() {
|
|
||||||
vector<string> urls;
|
|
||||||
std::ifstream f("URLS100k");
|
|
||||||
string buffer;
|
|
||||||
while(std::getline(f, buffer)) urls.push_back(buffer);
|
|
||||||
set<string> unique(urls.begin(), urls.end());
|
|
||||||
if (unique.size() != urls.size()) {
|
|
||||||
cerr << "Input file has repeated keys." << endl;
|
|
||||||
exit(-1);
|
|
||||||
}
|
|
||||||
urls_.swap(urls);
|
|
||||||
}
|
|
||||||
vector<string> urls_;
|
|
||||||
cxxmph::mph_map<string, int> table_;
|
|
||||||
};
|
|
||||||
|
|
||||||
class BM_UrlsFind : public BM_UrlsCreate {
|
|
||||||
public:
|
|
||||||
BM_UrlsFind(int iters = 1) : BM_UrlsCreate(iters) { ReadUrls(); BuildTable(); }
|
|
||||||
protected:
|
|
||||||
virtual void Run(int iters) {
|
|
||||||
for (int i = 0; i < iters * 100; ++i) {
|
|
||||||
int pos = random() % urls_.size();;
|
|
||||||
int h = table_[urls_[pos]];
|
|
||||||
assert(h == pos);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace cxxmph
|
|
||||||
|
|
||||||
using namespace cxxmph;
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
Benchmark::Register(new BM_UrlsCreate());
|
|
||||||
Benchmark::Register(new BM_UrlsFind(1000 * 1000));
|
|
||||||
Benchmark::RunAll();
|
|
||||||
}
|
|
|
@ -5,7 +5,7 @@
|
||||||
using std::cerr;
|
using std::cerr;
|
||||||
using std::endl;
|
using std::endl;
|
||||||
|
|
||||||
#include "mph_table.h"
|
#include "mph_index.h"
|
||||||
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ namespace {
|
||||||
|
|
||||||
static const uint8_t kUnassigned = 3;
|
static const uint8_t kUnassigned = 3;
|
||||||
// table used for looking up the number of assigned vertices to a 8-bit integer
|
// table used for looking up the number of assigned vertices to a 8-bit integer
|
||||||
static uint8_t kBdzLookupTable[] =
|
static uint8_t kBdzLookupIndex[] =
|
||||||
{
|
{
|
||||||
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
|
||||||
|
@ -37,13 +37,13 @@ static uint8_t kBdzLookupTable[] =
|
||||||
|
|
||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
|
|
||||||
const uint8_t MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
|
const uint8_t MPHIndex::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
|
||||||
|
|
||||||
MPHTable::~MPHTable() {
|
MPHIndex::~MPHIndex() {
|
||||||
clear();
|
clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
void MPHTable::clear() {
|
void MPHIndex::clear() {
|
||||||
if (!deserialized_) delete [] g_;
|
if (!deserialized_) delete [] g_;
|
||||||
g_ = NULL;
|
g_ = NULL;
|
||||||
g_size_ = 0;
|
g_size_ = 0;
|
||||||
|
@ -53,7 +53,7 @@ void MPHTable::clear() {
|
||||||
// TODO(davi) implement me
|
// TODO(davi) implement me
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MPHTable::GenerateQueue(
|
bool MPHIndex::GenerateQueue(
|
||||||
TriGraph* graph, vector<uint32_t>* queue_output) {
|
TriGraph* graph, vector<uint32_t>* queue_output) {
|
||||||
uint32_t queue_head = 0, queue_tail = 0;
|
uint32_t queue_head = 0, queue_tail = 0;
|
||||||
uint32_t nedges = m_;
|
uint32_t nedges = m_;
|
||||||
|
@ -109,7 +109,7 @@ bool MPHTable::GenerateQueue(
|
||||||
return cycles == 0;
|
return cycles == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MPHTable::Assigning(
|
void MPHIndex::Assigning(
|
||||||
const vector<TriGraph::Edge>& edges, const vector<uint32_t>& queue) {
|
const vector<TriGraph::Edge>& edges, const vector<uint32_t>& queue) {
|
||||||
uint32_t current_edge = 0;
|
uint32_t current_edge = 0;
|
||||||
vector<bool> marked_vertices(n_ + 1);
|
vector<bool> marked_vertices(n_ + 1);
|
||||||
|
@ -164,7 +164,7 @@ void MPHTable::Assigning(
|
||||||
g_ = g;
|
g_ = g;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MPHTable::Ranking() {
|
void MPHIndex::Ranking() {
|
||||||
uint32_t nbytes_total = static_cast<uint32_t>(ceil(n_ / 4.0));
|
uint32_t nbytes_total = static_cast<uint32_t>(ceil(n_ / 4.0));
|
||||||
uint32_t size = k_ >> 2U;
|
uint32_t size = k_ >> 2U;
|
||||||
ranktable_size_ = static_cast<uint32_t>(
|
ranktable_size_ = static_cast<uint32_t>(
|
||||||
|
@ -179,7 +179,7 @@ void MPHTable::Ranking() {
|
||||||
while (1) {
|
while (1) {
|
||||||
if (i == ranktable_size_) break;
|
if (i == ranktable_size_) break;
|
||||||
uint32_t nbytes = size < nbytes_total ? size : nbytes_total;
|
uint32_t nbytes = size < nbytes_total ? size : nbytes_total;
|
||||||
for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]];
|
for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupIndex[g_[offset + j]];
|
||||||
ranktable[i] = count;
|
ranktable[i] = count;
|
||||||
offset += nbytes;
|
offset += nbytes;
|
||||||
nbytes_total -= size;
|
nbytes_total -= size;
|
||||||
|
@ -188,13 +188,13 @@ void MPHTable::Ranking() {
|
||||||
ranktable_ = ranktable;
|
ranktable_ = ranktable;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t MPHTable::Rank(uint32_t vertex) const {
|
uint32_t MPHIndex::Rank(uint32_t vertex) const {
|
||||||
uint32_t index = vertex >> b_;
|
uint32_t index = vertex >> b_;
|
||||||
uint32_t base_rank = ranktable_[index];
|
uint32_t base_rank = ranktable_[index];
|
||||||
uint32_t beg_idx_v = index << b_;
|
uint32_t beg_idx_v = index << b_;
|
||||||
uint32_t beg_idx_b = beg_idx_v >> 2;
|
uint32_t beg_idx_b = beg_idx_v >> 2;
|
||||||
uint32_t end_idx_b = vertex >> 2;
|
uint32_t end_idx_b = vertex >> 2;
|
||||||
while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]];
|
while (beg_idx_b < end_idx_b) base_rank += kBdzLookupIndex[g_[beg_idx_b++]];
|
||||||
beg_idx_v = beg_idx_b << 2;
|
beg_idx_v = beg_idx_b << 2;
|
||||||
// cerr << "beg_idx_v: " << beg_idx_v << endl;
|
// cerr << "beg_idx_v: " << beg_idx_v << endl;
|
||||||
// cerr << "base rank: " << base_rank << endl;
|
// cerr << "base rank: " << base_rank << endl;
|
||||||
|
@ -213,21 +213,21 @@ uint32_t MPHTable::Rank(uint32_t vertex) const {
|
||||||
return base_rank;
|
return base_rank;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t MPHTable::serialize_bytes_needed() const {
|
uint32_t MPHIndex::serialize_bytes_needed() const {
|
||||||
return sizeof(MPHTable) + g_size_ + ranktable_size_*sizeof(uint32_t);
|
return sizeof(MPHIndex) + g_size_ + ranktable_size_*sizeof(uint32_t);
|
||||||
}
|
}
|
||||||
void MPHTable::serialize(char* memory) const {
|
void MPHIndex::serialize(char* memory) const {
|
||||||
memcpy(memory, this, sizeof(MPHTable));
|
memcpy(memory, this, sizeof(MPHIndex));
|
||||||
memcpy(memory + sizeof(MPHTable), g_, g_size_);
|
memcpy(memory + sizeof(MPHIndex), g_, g_size_);
|
||||||
memcpy(memory + sizeof(MPHTable) + g_size_,
|
memcpy(memory + sizeof(MPHIndex) + g_size_,
|
||||||
ranktable_, ranktable_size_*sizeof(uint32_t));
|
ranktable_, ranktable_size_*sizeof(uint32_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MPHTable::deserialize(const char* serialized_memory) {
|
bool MPHIndex::deserialize(const char* serialized_memory) {
|
||||||
memcpy(this, serialized_memory, sizeof(MPHTable));
|
memcpy(this, serialized_memory, sizeof(MPHIndex));
|
||||||
g_ = reinterpret_cast<const uint8_t*>(serialized_memory + sizeof(MPHTable));
|
g_ = reinterpret_cast<const uint8_t*>(serialized_memory + sizeof(MPHIndex));
|
||||||
ranktable_ = reinterpret_cast<const uint32_t*>(
|
ranktable_ = reinterpret_cast<const uint32_t*>(
|
||||||
serialized_memory + sizeof(MPHTable) + g_size_);
|
serialized_memory + sizeof(MPHIndex) + g_size_);
|
||||||
deserialized_ = true;
|
deserialized_ = true;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
|
@ -0,0 +1,173 @@
|
||||||
|
#ifndef __CXXMPH_MPH_INDEX_H__
|
||||||
|
#define __CXXMPH_MPH_INDEX_H__
|
||||||
|
|
||||||
|
// Minimal perfect hash abstraction implementing the BDZ algorithm
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cmath>
|
||||||
|
#include <unordered_map> // for std::hash
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
using std::cerr;
|
||||||
|
using std::endl;
|
||||||
|
|
||||||
|
#include "seeded_hash.h"
|
||||||
|
#include "trigraph.h"
|
||||||
|
|
||||||
|
namespace cxxmph {
|
||||||
|
|
||||||
|
class MPHIndex {
|
||||||
|
public:
|
||||||
|
MPHIndex(double c = 1.23, uint8_t b = 7) :
|
||||||
|
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0),
|
||||||
|
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
|
||||||
|
deserialized_(false) { }
|
||||||
|
~MPHIndex();
|
||||||
|
|
||||||
|
template <class SeededHashFcn, class ForwardIterator>
|
||||||
|
bool Reset(ForwardIterator begin, ForwardIterator end);
|
||||||
|
template <class SeededHashFcn, class Key> // must agree with Reset
|
||||||
|
uint32_t index(const Key& x) const;
|
||||||
|
uint32_t size() const { return m_; }
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
// Serialization machinery for mmap usage.
|
||||||
|
// Serialized tables are not guaranteed to work across versions or different
|
||||||
|
// endianness (although they could easily be made to be).
|
||||||
|
uint32_t serialize_bytes_needed() const;
|
||||||
|
void serialize(char *memory) const;
|
||||||
|
bool deserialize(const char* serialized_memory);
|
||||||
|
|
||||||
|
private:
|
||||||
|
template <class SeededHashFcn, class ForwardIterator>
|
||||||
|
bool Mapping(ForwardIterator begin, ForwardIterator end,
|
||||||
|
std::vector<TriGraph::Edge>* edges,
|
||||||
|
std::vector<uint32_t>* queue);
|
||||||
|
bool GenerateQueue(TriGraph* graph, std::vector<uint32_t>* queue);
|
||||||
|
void Assigning(const std::vector<TriGraph::Edge>& edges,
|
||||||
|
const std::vector<uint32_t>& queue);
|
||||||
|
void Ranking();
|
||||||
|
uint32_t Rank(uint32_t vertex) const;
|
||||||
|
|
||||||
|
// Algorithm parameters
|
||||||
|
double c_; // Number of bits per key (? is it right)
|
||||||
|
uint8_t b_; // Number of bits of the kth index in the ranktable
|
||||||
|
|
||||||
|
// Values used during generation
|
||||||
|
uint32_t m_; // edges count
|
||||||
|
uint32_t n_; // vertex count
|
||||||
|
uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$
|
||||||
|
|
||||||
|
// Values used during search
|
||||||
|
|
||||||
|
// Partition vertex count, derived from c parameter.
|
||||||
|
uint32_t r_;
|
||||||
|
// The array containing the minimal perfect hash function graph. Do not use
|
||||||
|
// c++ vector to make mmap based backing easier.
|
||||||
|
const uint8_t* g_;
|
||||||
|
uint32_t g_size_;
|
||||||
|
// The table used for the rank step of the minimal perfect hash function
|
||||||
|
const uint32_t* ranktable_;
|
||||||
|
uint32_t ranktable_size_;
|
||||||
|
// The selected hash seed triplet for finding the edges in the minimal
|
||||||
|
// perfect hash function graph.
|
||||||
|
uint32_t hash_seed_[3];
|
||||||
|
|
||||||
|
bool deserialized_;
|
||||||
|
|
||||||
|
static const uint8_t valuemask[];
|
||||||
|
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
|
||||||
|
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
|
||||||
|
}
|
||||||
|
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
|
||||||
|
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
// Template method needs to go in the header file.
|
||||||
|
template <class SeededHashFcn, class ForwardIterator>
|
||||||
|
bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) {
|
||||||
|
m_ = end - begin;
|
||||||
|
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
|
||||||
|
if ((r_ % 2) == 0) r_ += 1;
|
||||||
|
n_ = 3*r_;
|
||||||
|
k_ = 1U << b_;
|
||||||
|
|
||||||
|
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
|
||||||
|
|
||||||
|
int iterations = 10;
|
||||||
|
std::vector<TriGraph::Edge> edges;
|
||||||
|
std::vector<uint32_t> queue;
|
||||||
|
while (1) {
|
||||||
|
// cerr << "Iterations missing: " << iterations << endl;
|
||||||
|
for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_;
|
||||||
|
// for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i;
|
||||||
|
if (Mapping<SeededHashFcn>(begin, end, &edges, &queue)) break;
|
||||||
|
else --iterations;
|
||||||
|
if (iterations == 0) break;
|
||||||
|
}
|
||||||
|
if (iterations == 0) return false;
|
||||||
|
Assigning(edges, queue);
|
||||||
|
std::vector<TriGraph::Edge>().swap(edges);
|
||||||
|
Ranking();
|
||||||
|
deserialized_ = false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class SeededHashFcn, class ForwardIterator>
|
||||||
|
bool MPHIndex::Mapping(
|
||||||
|
ForwardIterator begin, ForwardIterator end,
|
||||||
|
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
|
||||||
|
TriGraph graph(n_, m_);
|
||||||
|
for (ForwardIterator it = begin; it != end; ++it) {
|
||||||
|
uint32_t h[3];
|
||||||
|
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
|
||||||
|
uint32_t v0 = h[0] % r_;
|
||||||
|
uint32_t v1 = h[1] % r_ + r_;
|
||||||
|
uint32_t v2 = h[2] % r_ + (r_ << 1);
|
||||||
|
// cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl;
|
||||||
|
graph.AddEdge(TriGraph::Edge(v0, v1, v2));
|
||||||
|
}
|
||||||
|
if (GenerateQueue(&graph, queue)) {
|
||||||
|
graph.ExtractEdgesAndClear(edges);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class SeededHashFcn, class Key>
|
||||||
|
uint32_t MPHIndex::index(const Key& key) const {
|
||||||
|
uint32_t h[3];
|
||||||
|
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
|
||||||
|
h[0] = h[0] % r_;
|
||||||
|
h[1] = h[1] % r_ + r_;
|
||||||
|
h[2] = h[2] % r_ + (r_ << 1);
|
||||||
|
assert(g_size_);
|
||||||
|
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
|
||||||
|
assert((h[0] >> 2) <g_size_);
|
||||||
|
assert((h[1] >> 2) <g_size_);
|
||||||
|
assert((h[2] >> 2) <g_size_);
|
||||||
|
uint32_t vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3];
|
||||||
|
// cerr << "Search found vertex " << vertex << endl;
|
||||||
|
return Rank(vertex);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
|
||||||
|
class SimpleMPHIndex : public MPHIndex {
|
||||||
|
public:
|
||||||
|
template <class ForwardIterator>
|
||||||
|
bool Reset(ForwardIterator begin, ForwardIterator end) {
|
||||||
|
return MPHIndex::Reset<HashFcn>(begin, end);
|
||||||
|
}
|
||||||
|
uint32_t index(const Key& key) { return MPHIndex::index<HashFcn>(key); }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace cxxmph
|
||||||
|
|
||||||
|
#endif // __CXXMPH_MPH_INDEX_H__
|
|
@ -3,11 +3,11 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "mph_table.h"
|
#include "mph_index.h"
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using cxxmph::SimpleMPHTable;
|
using cxxmph::SimpleMPHIndex;
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
|
||||||
|
@ -23,20 +23,20 @@ int main(int argc, char** argv) {
|
||||||
keys.push_back("diogo");
|
keys.push_back("diogo");
|
||||||
keys.push_back("algume");
|
keys.push_back("algume");
|
||||||
|
|
||||||
SimpleMPHTable<string> mph_table;
|
SimpleMPHIndex<string> mph_index;
|
||||||
assert(mph_table.Reset(keys.begin(), keys.end()));
|
assert(mph_index.Reset(keys.begin(), keys.end()));
|
||||||
vector<int> ids;
|
vector<int> ids;
|
||||||
for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
|
for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
|
||||||
ids.push_back(mph_table.index(keys[i]));
|
ids.push_back(mph_index.index(keys[i]));
|
||||||
cerr << " " << *(ids.end() - 1);
|
cerr << " " << *(ids.end() - 1);
|
||||||
}
|
}
|
||||||
cerr << endl;
|
cerr << endl;
|
||||||
sort(ids.begin(), ids.end());
|
sort(ids.begin(), ids.end());
|
||||||
for (vector<int>::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast<vector<int>::value_type>(i));
|
for (vector<int>::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast<vector<int>::value_type>(i));
|
||||||
|
|
||||||
char* serialized = new char[mph_table.serialize_bytes_needed()];
|
char* serialized = new char[mph_index.serialize_bytes_needed()];
|
||||||
mph_table.serialize(serialized);
|
mph_index.serialize(serialized);
|
||||||
SimpleMPHTable<string> other_mph_table;
|
SimpleMPHIndex<string> other_mph_index;
|
||||||
other_mph_table.deserialize(serialized);
|
other_mph_index.deserialize(serialized);
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
#include <utility> // for std::pair
|
#include <utility> // for std::pair
|
||||||
|
|
||||||
#include "MurmurHash2.h"
|
#include "MurmurHash2.h"
|
||||||
#include "mph_table.h"
|
#include "mph_index.h"
|
||||||
|
|
||||||
namespace cxxmph {
|
namespace cxxmph {
|
||||||
|
|
||||||
|
@ -70,7 +70,7 @@ class mph_map {
|
||||||
|
|
||||||
void rehash();
|
void rehash();
|
||||||
std::vector<value_type> values_;
|
std::vector<value_type> values_;
|
||||||
SimpleMPHTable<Key, typename seeded_hash<HashFcn>::hash_function> table_;
|
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
|
||||||
// TODO(davi) optimize slack to not hold a copy of the key
|
// TODO(davi) optimize slack to not hold a copy of the key
|
||||||
typedef typename std::unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
|
typedef typename std::unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
|
||||||
slack_type slack_;
|
slack_type slack_;
|
||||||
|
@ -93,8 +93,8 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
|
||||||
if (it != end()) return std::make_pair(it, false);
|
if (it != end()) return std::make_pair(it, false);
|
||||||
values_.push_back(x);
|
values_.push_back(x);
|
||||||
slack_.insert(std::make_pair(x.first, values_.size() - 1));
|
slack_.insert(std::make_pair(x.first, values_.size() - 1));
|
||||||
if (slack_.size() == table_.size() ||
|
if (slack_.size() == index_.size() ||
|
||||||
(slack_.size() >= 256 && table_.size() == 0)) {
|
(slack_.size() >= 256 && index_.size() == 0)) {
|
||||||
rehash();
|
rehash();
|
||||||
}
|
}
|
||||||
it = find(x.first);
|
it = find(x.first);
|
||||||
|
@ -104,14 +104,14 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
|
||||||
MPH_MAP_METHOD_DECL(void_type, rehash)() {
|
MPH_MAP_METHOD_DECL(void_type, rehash)() {
|
||||||
if (values_.empty()) return;
|
if (values_.empty()) return;
|
||||||
slack_type().swap(slack_);
|
slack_type().swap(slack_);
|
||||||
bool success = table_.Reset(
|
bool success = index_.Reset(
|
||||||
make_iterator_first(values_.begin()),
|
make_iterator_first(values_.begin()),
|
||||||
make_iterator_first(values_.end()));
|
make_iterator_first(values_.end()));
|
||||||
assert(success);
|
assert(success);
|
||||||
std::vector<value_type> new_values(values_.size());
|
std::vector<value_type> new_values(values_.size());
|
||||||
for (const_iterator it = values_.begin(), end = values_.end();
|
for (const_iterator it = values_.begin(), end = values_.end();
|
||||||
it != end; ++it) {
|
it != end; ++it) {
|
||||||
size_type id = table_.index(it->first);
|
size_type id = index_.index(it->first);
|
||||||
assert(id < new_values.size());
|
assert(id < new_values.size());
|
||||||
new_values[id] = *it;
|
new_values[id] = *it;
|
||||||
}
|
}
|
||||||
|
@ -127,7 +127,7 @@ MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); }
|
||||||
MPH_MAP_METHOD_DECL(void_type, clear)() {
|
MPH_MAP_METHOD_DECL(void_type, clear)() {
|
||||||
values_.clear();
|
values_.clear();
|
||||||
slack_.clear();
|
slack_.clear();
|
||||||
table_.clear();
|
index_.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
|
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
|
||||||
|
@ -145,8 +145,8 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
|
||||||
typename slack_type::const_iterator it = slack_.find(k);
|
typename slack_type::const_iterator it = slack_.find(k);
|
||||||
if (it != slack_.end()) return values_.begin() + it->second;
|
if (it != slack_.end()) return values_.begin() + it->second;
|
||||||
}
|
}
|
||||||
if (table_.size() == 0) return end();
|
if (index_.size() == 0) return end();
|
||||||
size_type id = table_.index(k);
|
size_type id = index_.index(k);
|
||||||
if (key_equal()(values_[id].first, k)) {
|
if (key_equal()(values_[id].first, k)) {
|
||||||
return values_.begin() + id;
|
return values_.begin() + id;
|
||||||
}
|
}
|
||||||
|
@ -157,8 +157,8 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
|
||||||
typename slack_type::const_iterator it = slack_.find(k);
|
typename slack_type::const_iterator it = slack_.find(k);
|
||||||
if (it != slack_.end()) return values_.begin() + it->second;
|
if (it != slack_.end()) return values_.begin() + it->second;
|
||||||
}
|
}
|
||||||
if (table_.size() == 0) return end();
|
if (index_.size() == 0) return end();
|
||||||
size_type id = table_.index(k);
|
size_type id = index_.index(k);
|
||||||
if (key_equal()(values_[id].first, k)) {
|
if (key_equal()(values_[id].first, k)) {
|
||||||
return values_.begin() + id;
|
return values_.begin() + id;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,173 +1,16 @@
|
||||||
#ifndef __CXXMPH_MPH_TABLE_H__
|
#include "mph_index.h"
|
||||||
#define __CXXMPH_MPH_TABLE_H__
|
|
||||||
|
|
||||||
// Minimal perfect hash abstraction implementing the BDZ algorithm
|
// String to string map working on mmap'ed memory
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cmath>
|
|
||||||
#include <unordered_map> // for std::hash
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
using std::cerr;
|
|
||||||
using std::endl;
|
|
||||||
|
|
||||||
#include "seeded_hash.h"
|
|
||||||
#include "trigraph.h"
|
|
||||||
|
|
||||||
namespace cxxmph {
|
|
||||||
|
|
||||||
class MPHTable {
|
class MPHTable {
|
||||||
public:
|
public:
|
||||||
MPHTable(double c = 1.23, uint8_t b = 7) :
|
typedef StringPiece key_type;
|
||||||
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0),
|
typedef StringPiece data_type;
|
||||||
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
|
typedef std::pair<StringPiece, StringPiece> value_type;
|
||||||
deserialized_(false) { }
|
|
||||||
~MPHTable();
|
|
||||||
|
|
||||||
template <class SeededHashFcn, class ForwardIterator>
|
|
||||||
bool Reset(ForwardIterator begin, ForwardIterator end);
|
|
||||||
template <class SeededHashFcn, class Key> // must agree with Reset
|
|
||||||
uint32_t index(const Key& x) const;
|
|
||||||
uint32_t size() const { return m_; }
|
|
||||||
void clear();
|
|
||||||
|
|
||||||
// Serialization machinery for mmap usage.
|
|
||||||
// Serialized tables are not guaranteed to work across versions or different
|
|
||||||
// endianness (although they could easily be made to be).
|
|
||||||
uint32_t serialize_bytes_needed() const;
|
|
||||||
void serialize(char *memory) const;
|
|
||||||
bool deserialize(const char* serialized_memory);
|
|
||||||
|
|
||||||
private:
|
|
||||||
template <class SeededHashFcn, class ForwardIterator>
|
|
||||||
bool Mapping(ForwardIterator begin, ForwardIterator end,
|
|
||||||
std::vector<TriGraph::Edge>* edges,
|
|
||||||
std::vector<uint32_t>* queue);
|
|
||||||
bool GenerateQueue(TriGraph* graph, std::vector<uint32_t>* queue);
|
|
||||||
void Assigning(const std::vector<TriGraph::Edge>& edges,
|
|
||||||
const std::vector<uint32_t>& queue);
|
|
||||||
void Ranking();
|
|
||||||
uint32_t Rank(uint32_t vertex) const;
|
|
||||||
|
|
||||||
// Algorithm parameters
|
|
||||||
double c_; // Number of bits per key (? is it right)
|
|
||||||
uint8_t b_; // Number of bits of the kth index in the ranktable
|
|
||||||
|
|
||||||
// Values used during generation
|
|
||||||
uint32_t m_; // edges count
|
|
||||||
uint32_t n_; // vertex count
|
|
||||||
uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$
|
|
||||||
|
|
||||||
// Values used during search
|
|
||||||
|
|
||||||
// Partition vertex count, derived from c parameter.
|
|
||||||
uint32_t r_;
|
|
||||||
// The array containing the minimal perfect hash function graph. Do not use
|
|
||||||
// c++ vector to make mmap based backing easier.
|
|
||||||
const uint8_t* g_;
|
|
||||||
uint32_t g_size_;
|
|
||||||
// The table used for the rank step of the minimal perfect hash function
|
|
||||||
const uint32_t* ranktable_;
|
|
||||||
uint32_t ranktable_size_;
|
|
||||||
// The selected hash seed triplet for finding the edges in the minimal
|
|
||||||
// perfect hash function graph.
|
|
||||||
uint32_t hash_seed_[3];
|
|
||||||
|
|
||||||
bool deserialized_;
|
|
||||||
|
|
||||||
static const uint8_t valuemask[];
|
|
||||||
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
|
|
||||||
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
|
|
||||||
}
|
|
||||||
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
|
|
||||||
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
// Template method needs to go in the header file.
|
|
||||||
template <class SeededHashFcn, class ForwardIterator>
|
|
||||||
bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) {
|
|
||||||
m_ = end - begin;
|
|
||||||
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
|
|
||||||
if ((r_ % 2) == 0) r_ += 1;
|
|
||||||
n_ = 3*r_;
|
|
||||||
k_ = 1U << b_;
|
|
||||||
|
|
||||||
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
|
|
||||||
|
|
||||||
int iterations = 10;
|
|
||||||
std::vector<TriGraph::Edge> edges;
|
|
||||||
std::vector<uint32_t> queue;
|
|
||||||
while (1) {
|
|
||||||
// cerr << "Iterations missing: " << iterations << endl;
|
|
||||||
for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_;
|
|
||||||
// for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i;
|
|
||||||
if (Mapping<SeededHashFcn>(begin, end, &edges, &queue)) break;
|
|
||||||
else --iterations;
|
|
||||||
if (iterations == 0) break;
|
|
||||||
}
|
|
||||||
if (iterations == 0) return false;
|
|
||||||
Assigning(edges, queue);
|
|
||||||
std::vector<TriGraph::Edge>().swap(edges);
|
|
||||||
Ranking();
|
|
||||||
deserialized_ = false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class SeededHashFcn, class ForwardIterator>
|
|
||||||
bool MPHTable::Mapping(
|
|
||||||
ForwardIterator begin, ForwardIterator end,
|
|
||||||
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
|
|
||||||
TriGraph graph(n_, m_);
|
|
||||||
for (ForwardIterator it = begin; it != end; ++it) {
|
|
||||||
uint32_t h[3];
|
|
||||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
|
|
||||||
uint32_t v0 = h[0] % r_;
|
|
||||||
uint32_t v1 = h[1] % r_ + r_;
|
|
||||||
uint32_t v2 = h[2] % r_ + (r_ << 1);
|
|
||||||
// cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl;
|
|
||||||
graph.AddEdge(TriGraph::Edge(v0, v1, v2));
|
|
||||||
}
|
|
||||||
if (GenerateQueue(&graph, queue)) {
|
|
||||||
graph.ExtractEdgesAndClear(edges);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class SeededHashFcn, class Key>
|
|
||||||
uint32_t MPHTable::index(const Key& key) const {
|
|
||||||
uint32_t h[3];
|
|
||||||
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
|
|
||||||
h[0] = h[0] % r_;
|
|
||||||
h[1] = h[1] % r_ + r_;
|
|
||||||
h[2] = h[2] % r_ + (r_ << 1);
|
|
||||||
assert(g_size_);
|
|
||||||
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
|
|
||||||
assert((h[0] >> 2) <g_size_);
|
|
||||||
assert((h[1] >> 2) <g_size_);
|
|
||||||
assert((h[2] >> 2) <g_size_);
|
|
||||||
uint32_t vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3];
|
|
||||||
// cerr << "Search found vertex " << vertex << endl;
|
|
||||||
return Rank(vertex);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
|
|
||||||
class SimpleMPHTable : public MPHTable {
|
|
||||||
public:
|
|
||||||
template <class ForwardIterator>
|
template <class ForwardIterator>
|
||||||
bool Reset(ForwardIterator begin, ForwardIterator end) {
|
bool Reset(ForwardIterator begin, ForwardIterator end);
|
||||||
return MPHTable::Reset<HashFcn>(begin, end);
|
private:
|
||||||
}
|
char* data_;
|
||||||
uint32_t index(const Key& key) { return MPHTable::index<HashFcn>(key); }
|
vector<uint64_t> offsets_;
|
||||||
|
MPHIndex index_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace cxxmph
|
|
||||||
|
|
||||||
#endif // __CXXMPH_MPH_TABLE_H__
|
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
#include <ext/hash_set>
|
||||||
|
using __gnu_cxx::hash_set;
|
||||||
|
// Label identifying the __gnu_cxx::hash_set benchmark; it is passed as a "%s"
// argument and as a map key, so it has to be a NUL-terminated string rather
// than a single char.
static const char cxx_name[] = "__gnu_cxx::hash_set";
|
||||||
|
|
||||||
#include "bitbool.h"
|
#include "bitbool.h"
|
||||||
#include "cmph.h"
|
#include "cmph.h"
|
||||||
#include "cmph_benchmark.h"
|
#include "cmph_benchmark.h"
|
||||||
|
@ -71,8 +75,8 @@ void bm_search(CMPH_ALGO algo, int iters) {
|
||||||
cmph_t* mphf = NULL;
|
cmph_t* mphf = NULL;
|
||||||
|
|
||||||
|
|
||||||
snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters);
|
snprintf(mphf_name, 128, "%s:%u", cxx_name, iters);
|
||||||
mphf = lsmap_search(g_created_mphf, mphf_name);
|
mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name);
|
||||||
|
|
||||||
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
||||||
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
||||||
|
@ -102,6 +106,49 @@ DECLARE_ALGO(CMPH_BRZ);
|
||||||
DECLARE_ALGO(CMPH_FCH);
|
DECLARE_ALGO(CMPH_FCH);
|
||||||
DECLARE_ALGO(CMPH_BDZ);
|
DECLARE_ALGO(CMPH_BDZ);
|
||||||
|
|
||||||
|
void bm_create_ext_hash_set(int iters) {
|
||||||
|
cmph_uint32 i = 0;
|
||||||
|
|
||||||
|
if (iters > g_numbers_len) {
|
||||||
|
fprintf(stderr, "No input with proper size.");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
hash_set<cmph_uint32>* ext_hash_set = new hash_set<cmph_uint32>;
|
||||||
|
for (i = 0; i < iters; ++i) {
|
||||||
|
ext_hash_set->insert(g_numbers[i]);
|
||||||
|
}
|
||||||
|
lsmap_append(g_created_mphf, cxx_name, ext_hash_set);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search benchmark counterpart of bm_create_ext_hash_set: performs
// iters * 100 probes at random positions of g_numbers and records per-position
// counters in g_expected_probes / g_mphf_probes.
//
// Exits with status -1 when `iters` exceeds g_numbers_len.
//
// NOTE(review): this body looks unfinished; confirm each point before use:
//  - `mphf`, `mphf_name` and `algo` are not declared in this function; unless
//    they are file-scope names defined elsewhere, this does not compile.
//  - The snprintf formats `hash_count` with "%s", but `hash_count` is only
//    declared further down as a cmph_uint32*; presumably `cxx_name` was meant.
//  - The cast target `__gnu_cxx::hash_set*` names the template without
//    template arguments.
//  - `count` and `hash_count` come from malloc and are incremented without
//    being zero-initialised first.
//  - cmph_search() is called on the pointer fetched from g_created_mphf, which
//    the cast on the line above treats as a hash_set.
//  - `i` is unsigned while `iters * 100` is a signed int expression.
void bm_search_ext_hash_set(int iters) {
|
||||||
|
cmph_uint32 i = 0;
|
||||||
|
|
||||||
|
if (iters > g_numbers_len) {
|
||||||
|
fprintf(stderr, "No input with proper size.");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
snprintf(mphf_name, 128, "%s:%u", hash_count, iters);
|
||||||
|
mphf = (__gnu_cxx::hash_set*)lsmap_search(g_created_mphf, mphf_name);
|
||||||
|
|
||||||
|
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
||||||
|
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
|
||||||
|
|
||||||
|
for (i = 0; i < iters * 100; ++i) {
|
||||||
|
cmph_uint32 pos = random() % iters;
|
||||||
|
const char* buf = (const char*)(g_numbers + pos);
|
||||||
|
cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32));
|
||||||
|
++count[pos];
|
||||||
|
++hash_count[h];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify correctness later.
|
||||||
|
lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count);
|
||||||
|
lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
g_numbers_len = 1000 * 1000;
|
g_numbers_len = 1000 * 1000;
|
||||||
g_numbers = random_numbers_vector_new(g_numbers_len);
|
g_numbers = random_numbers_vector_new(g_numbers_len);
|
||||||
|
|
Loading…
Reference in New Issue