Renamed table to index and reorganized benchmarks.

This commit is contained in:
Davi Reis 2011-05-23 11:01:08 -07:00
parent c630eb2a70
commit bb40a4bb00
12 changed files with 367 additions and 353 deletions

View File

@ -1,28 +1,28 @@
AM_CXXFLAGS='-std=c++0x'
TESTS = $(check_PROGRAMS)
check_PROGRAMS = mph_map_test mph_table_test trigraph_test
noinst_PROGRAMS = bm_numbers bm_urls
check_PROGRAMS = mph_map_test mph_index_test trigraph_test
noinst_PROGRAMS = bm_index bm_map
bin_PROGRAMS = cxxmph
lib_LTLIBRARIES = libcxxmph.la
libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_table.h mph_table.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
# Sources of libcxxmph. Header names listed here must match the real files
# (trigraph.h, as in cxxmph_include_HEADERS), otherwise automake cannot pick
# them up for distribution.
libcxxmph_la_SOURCES = MurmurHash2.h trigraph.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
libcxxmph_la_LDFLAGS = -version-info 0:0:0
cxxmph_includedir = $(includedir)/cxxmph/
cxxmph_include_HEADERS = mph_map.h mph_table.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h
mph_map_test_LDADD = libcxxmph.la
mph_map_test_SOURCES = mph_map_test.cc
mph_table_test_LDADD = libcxxmph.la
mph_table_test_SOURCES = mph_table_test.cc
mph_index_test_LDADD = libcxxmph.la
mph_index_test_SOURCES = mph_index_test.cc
trigraph_test_LDADD = libcxxmph.la
trigraph_test_SOURCES = trigraph_test.cc
bm_numbers_LDADD = libcxxmph.la
bm_numbers_SOURCES = bm_numbers.cc
bm_index_LDADD = libcxxmph.la
bm_index_SOURCES = bm_common.cc bm_index.cc
bm_urls_LDADD = libcxxmph.la
bm_urls_SOURCES = bm_urls.cc
bm_map_LDADD = libcxxmph.la
bm_map_SOURCES = bm_common.cc bm_map.cc
cxxmph_LDADD = libcxxmph.la
cxxmph_SOURCES = cxxmph.cc

View File

@ -1,7 +1,9 @@
#include "benchmark.h"
#include <cerrno>
#include <cstring>
#include <cstdio>
#include <sys/time.h>
#include <sys/resource.h>
#include <iostream>
@ -50,6 +52,16 @@ struct rusage getrusage_or_die() {
return rs;
}
// Returns the current time as reported by gettimeofday().
// If the call fails, the errno description is written to stderr and the
// process exits with status -1, so callers always receive a filled-in value.
struct timeval gettimeofday_or_die() {
  struct timeval now;
  if (gettimeofday(&now, NULL) == 0) return now;
  cerr << "gettimeofday failed: " << strerror(errno) << endl;
  exit(-1);
}
#ifdef HAVE_CXA_DEMANGLE
string demangle(const string& name) {
char buf[1024];
@ -79,25 +91,33 @@ namespace cxxmph {
}
/* static */ void Benchmark::RunAll() {
for (auto it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) {
(*it)->MeasureRun();
delete *it;
for (int i = 0; i < g_benchmarks.size(); ++i) {
Benchmark* bm = g_benchmarks[i];
bm->SetUp();
bm->MeasureRun();
bm->TearDown();
delete bm;
}
}
void Benchmark::MeasureRun() {
struct timeval walltime_begin = gettimeofday_or_die();
struct rusage begin = getrusage_or_die();
Run(iters_);
Run();
struct rusage end = getrusage_or_die();
struct timeval walltime_end = gettimeofday_or_die();
struct timeval utime;
timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime);
struct timeval stime;
timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime);
struct timeval wtime;
timeval_subtract(&wtime, &walltime_end, &walltime_begin);
printf("Benchmark: %s\n", name().c_str());
printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec);
printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec);
printf("CPU User time : %ld.%06ld\n", utime.tv_sec, utime.tv_usec);
printf("CPU System time: %ld.%06ld\n", stime.tv_sec, stime.tv_usec);
printf("Wall clock time: %ld.%06ld\n", wtime.tv_sec, wtime.tv_usec);
printf("\n");
}

View File

@ -8,9 +8,9 @@ namespace cxxmph {
class Benchmark {
public:
Benchmark(int iters = 1) : iters_(iters) { }
virtual void Run(int iters) = 0;
Benchmark() {}
virtual ~Benchmark() {}
const std::string& name() { return name_; }
void set_name(const std::string& name) { name_ = name; }
@ -18,10 +18,11 @@ class Benchmark {
static void RunAll();
protected:
int iters() { return iters_; }
// Hook invoked by RunAll() before MeasureRun(). The default does nothing and
// returns true; the previous empty body fell off the end of a bool function,
// which is undefined behaviour.
virtual bool SetUp() { return true; }
virtual void Run() = 0;
// Hook invoked by RunAll() after MeasureRun(). The default does nothing and
// returns true; the previous empty body fell off the end of a bool function,
// which is undefined behaviour.
virtual bool TearDown() { return true; }
private:
int iters_;
std::string name_;
void MeasureRun();
};

52
cxxmph/bm_map.cc Normal file
View File

@ -0,0 +1,52 @@
#include <string>
#include <unordered_map>
#include "bm_common.h"
#include "mph_map.h"
using cxxmph::mph_map;
using std::string;
using std::unordered_map;
namespace cxxmph {
template <class MapType>
class BM_MapCreate : public UrlsBenchmark {
public:
virtual void Run() {
MapType mymap;
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
mymap[*it] = *it;
}
}
};
template <class MapType>
class BM_MapSearch : public SearchUrlsBenchmark {
public:
virtual void Run() {
for (auto it = random_.begin(); it != random_.end(); ++it) {
auto value = mymap[*it];
}
}
protected:
virtual void SetUp() {
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
mymap_[*it] = *it;
}
mymap_.resize(mymap.size());
}
MapType mymap_;
};
} // namespace cxxmph
using namespace cxxmph;
int main(int argc, char** argv) {
Benchmark::Register(new BM_MapCreate<mph_map<string, string>>("URLS100k"));
Benchmark::Register(new BM_MapCreate<unordered_map<string, string>>("URLS100k"));
Benchmark::Register(new BM_MapSearch<mph_map<string, string>>("URLS100k", 1000 * 1000));
Benchmark::Register(new BM_MapSearch<unordered_map<string, string>>("URLS100k", 1000 * 1000));
Benchmark::RunAll();
}

View File

@ -1,52 +0,0 @@
#include <set>
#include <vector>
#include "benchmark.h"
#include "mph_table.h"
using std::set;
using std::vector;
namespace cxxmph {
class BM_NumbersCreate : public Benchmark {
public:
BM_NumbersCreate(int iters = 1) : Benchmark(iters) {
set<int> unique;
while (unique.size() < 1000 * 1000) {
int v = random();
if (unique.find(v) == unique.end()) {
unique.insert(v);
random_unique_.push_back(v);
}
}
}
protected:
virtual void Run(int iters) {
SimpleMPHTable<int> table;
table.Reset(random_unique_.begin(), random_unique_.end());
}
std::vector<int> random_unique_;
};
// Benchmark that times lookups in a SimpleMPHTable<int> built once, in the
// constructor, from the keys prepared by BM_NumbersCreate.
class BM_NumbersFind : public BM_NumbersCreate {
 public:
  BM_NumbersFind(int iters) : BM_NumbersCreate(iters) {
    table_.Reset(random_unique_.begin(), random_unique_.end());
  }

  // Performs iters * 100 lookups of randomly chosen keys.
  virtual void Run(int iters) {
    for (int i = 0; i < iters * 100; ++i) {
      int pos = random() % random_unique_.size();
      // Look up the key stored at `pos`. The old code passed `pos` itself,
      // which is a position in the vector and generally not one of the keys
      // the table was built from.
      int h = table_.index(random_unique_[pos]);
      (void)h;  // only the lookup cost matters
    }
  }

 private:
  SimpleMPHTable<int> table_;
};
} // namespace cxxmph
using namespace cxxmph;
// Entry point: registers the table-creation and table-lookup benchmarks and
// then runs every registered benchmark through Benchmark::RunAll().
int main(int argc, char** argv) {
Benchmark::Register(new BM_NumbersCreate());
// 1000 * 1000 is the `iters` constructor argument; BM_NumbersFind::Run
// performs iters * 100 lookups.
Benchmark::Register(new BM_NumbersFind(1000 * 1000));
Benchmark::RunAll();
}

View File

@ -1,70 +0,0 @@
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <vector>
#include <unordered_map>
#include "benchmark.h"
#include "mph_map.h"
using std::ifstream;
using std::set;
using std::string;
using std::vector;
namespace cxxmph {
class BM_UrlsCreate : public Benchmark {
public:
BM_UrlsCreate(int iters = 1) : Benchmark(iters) {
ReadUrls();
}
protected:
virtual void Run(int iters) {
BuildTable();
}
void BuildTable() {
for (auto it = urls_.begin(); it != urls_.end(); ++it) {
table_[*it] = it - urls_.begin();
}
table_.pack();
}
void ReadUrls() {
vector<string> urls;
std::ifstream f("URLS100k");
string buffer;
while(std::getline(f, buffer)) urls.push_back(buffer);
set<string> unique(urls.begin(), urls.end());
if (unique.size() != urls.size()) {
cerr << "Input file has repeated keys." << endl;
exit(-1);
}
urls_.swap(urls);
}
vector<string> urls_;
cxxmph::mph_map<string, int> table_;
};
// Benchmark that times lookups in the URL map. The map is built once in the
// constructor so that Run() measures searches only.
class BM_UrlsFind : public BM_UrlsCreate {
 public:
  // The BM_UrlsCreate constructor has already called ReadUrls(), so the file
  // is not read a second time here; only the table needs to be built.
  BM_UrlsFind(int iters = 1) : BM_UrlsCreate(iters) { BuildTable(); }

 protected:
  // Performs iters * 100 lookups of randomly chosen URLs and checks that each
  // maps back to its own position.
  virtual void Run(int iters) {
    // With no URLs loaded (e.g. missing input file) `random() % 0` would be a
    // division by zero, so there is nothing to measure.
    if (urls_.empty()) return;
    for (int i = 0; i < iters * 100; ++i) {
      int pos = random() % urls_.size();
      int h = table_[urls_[pos]];
      assert(h == pos);
      (void)h;  // silences the unused warning when asserts are compiled out
    }
  }
};
} // namespace cxxmph
using namespace cxxmph;
// Entry point: registers the URL map creation and lookup benchmarks and then
// runs every registered benchmark through Benchmark::RunAll().
int main(int argc, char** argv) {
Benchmark::Register(new BM_UrlsCreate());
// 1000 * 1000 is the `iters` constructor argument; BM_UrlsFind::Run performs
// iters * 100 lookups.
Benchmark::Register(new BM_UrlsFind(1000 * 1000));
Benchmark::RunAll();
}

View File

@ -5,7 +5,7 @@
using std::cerr;
using std::endl;
#include "mph_table.h"
#include "mph_index.h"
using std::vector;
@ -13,7 +13,7 @@ namespace {
static const uint8_t kUnassigned = 3;
// table used for looking up the number of assigned vertices to a 8-bit integer
static uint8_t kBdzLookupTable[] =
static uint8_t kBdzLookupIndex[] =
{
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
@ -37,13 +37,13 @@ static uint8_t kBdzLookupTable[] =
namespace cxxmph {
const uint8_t MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
const uint8_t MPHIndex::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
MPHTable::~MPHTable() {
MPHIndex::~MPHIndex() {
clear();
}
void MPHTable::clear() {
void MPHIndex::clear() {
if (!deserialized_) delete [] g_;
g_ = NULL;
g_size_ = 0;
@ -53,7 +53,7 @@ void MPHTable::clear() {
// TODO(davi) implement me
}
bool MPHTable::GenerateQueue(
bool MPHIndex::GenerateQueue(
TriGraph* graph, vector<uint32_t>* queue_output) {
uint32_t queue_head = 0, queue_tail = 0;
uint32_t nedges = m_;
@ -109,7 +109,7 @@ bool MPHTable::GenerateQueue(
return cycles == 0;
}
void MPHTable::Assigning(
void MPHIndex::Assigning(
const vector<TriGraph::Edge>& edges, const vector<uint32_t>& queue) {
uint32_t current_edge = 0;
vector<bool> marked_vertices(n_ + 1);
@ -164,7 +164,7 @@ void MPHTable::Assigning(
g_ = g;
}
void MPHTable::Ranking() {
void MPHIndex::Ranking() {
uint32_t nbytes_total = static_cast<uint32_t>(ceil(n_ / 4.0));
uint32_t size = k_ >> 2U;
ranktable_size_ = static_cast<uint32_t>(
@ -179,7 +179,7 @@ void MPHTable::Ranking() {
while (1) {
if (i == ranktable_size_) break;
uint32_t nbytes = size < nbytes_total ? size : nbytes_total;
for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]];
for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupIndex[g_[offset + j]];
ranktable[i] = count;
offset += nbytes;
nbytes_total -= size;
@ -188,13 +188,13 @@ void MPHTable::Ranking() {
ranktable_ = ranktable;
}
uint32_t MPHTable::Rank(uint32_t vertex) const {
uint32_t MPHIndex::Rank(uint32_t vertex) const {
uint32_t index = vertex >> b_;
uint32_t base_rank = ranktable_[index];
uint32_t beg_idx_v = index << b_;
uint32_t beg_idx_b = beg_idx_v >> 2;
uint32_t end_idx_b = vertex >> 2;
while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]];
while (beg_idx_b < end_idx_b) base_rank += kBdzLookupIndex[g_[beg_idx_b++]];
beg_idx_v = beg_idx_b << 2;
// cerr << "beg_idx_v: " << beg_idx_v << endl;
// cerr << "base rank: " << base_rank << endl;
@ -213,21 +213,21 @@ uint32_t MPHTable::Rank(uint32_t vertex) const {
return base_rank;
}
uint32_t MPHTable::serialize_bytes_needed() const {
return sizeof(MPHTable) + g_size_ + ranktable_size_*sizeof(uint32_t);
uint32_t MPHIndex::serialize_bytes_needed() const {
return sizeof(MPHIndex) + g_size_ + ranktable_size_*sizeof(uint32_t);
}
void MPHTable::serialize(char* memory) const {
memcpy(memory, this, sizeof(MPHTable));
memcpy(memory + sizeof(MPHTable), g_, g_size_);
memcpy(memory + sizeof(MPHTable) + g_size_,
void MPHIndex::serialize(char* memory) const {
memcpy(memory, this, sizeof(MPHIndex));
memcpy(memory + sizeof(MPHIndex), g_, g_size_);
memcpy(memory + sizeof(MPHIndex) + g_size_,
ranktable_, ranktable_size_*sizeof(uint32_t));
}
bool MPHTable::deserialize(const char* serialized_memory) {
memcpy(this, serialized_memory, sizeof(MPHTable));
g_ = reinterpret_cast<const uint8_t*>(serialized_memory + sizeof(MPHTable));
bool MPHIndex::deserialize(const char* serialized_memory) {
memcpy(this, serialized_memory, sizeof(MPHIndex));
g_ = reinterpret_cast<const uint8_t*>(serialized_memory + sizeof(MPHIndex));
ranktable_ = reinterpret_cast<const uint32_t*>(
serialized_memory + sizeof(MPHTable) + g_size_);
serialized_memory + sizeof(MPHIndex) + g_size_);
deserialized_ = true;
return true;
}

173
cxxmph/mph_index.h Normal file
View File

@ -0,0 +1,173 @@
#ifndef __CXXMPH_MPH_INDEX_H__
#define __CXXMPH_MPH_INDEX_H__
// Minimal perfect hash abstraction implementing the BDZ algorithm
#include <stdint.h>
#include <cassert>
#include <cmath>
#include <unordered_map> // for std::hash
#include <vector>
#include <iostream>
using std::cerr;
using std::endl;
#include "seeded_hash.h"
#include "trigraph.h"
namespace cxxmph {
// Minimal perfect hash index (BDZ algorithm): after Reset() over a set of
// keys, index() maps a key to a number computed by Rank().
// NOTE(review): serialize()/deserialize() memcpy the whole object, so the
// member order and sizes below are part of the serialized format.
// NOTE(review): no copy constructor/assignment is declared although the
// destructor calls clear(), which deletes g_ -- copies would presumably share
// that pointer; confirm before copying instances.
class MPHIndex {
public:
// c: graph size factor (see c_); b: log2 of the rank-table step (see b_).
// hash_seed_ is not initialized here; it is assigned by Reset().
MPHIndex(double c = 1.23, uint8_t b = 7) :
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0),
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
deserialized_(false) { }
~MPHIndex();
// Builds the index for the keys in [begin, end). Tries up to 10 random seed
// triplets and returns false if none of them produced a usable mapping.
// The key count is computed as `end - begin`.
template <class SeededHashFcn, class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end);
// Returns Rank() of the vertex selected for key x.
template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t index(const Key& x) const;
// Number of keys given to the last Reset() (0 for a new object).
uint32_t size() const { return m_; }
void clear();
// Serialization machinery for mmap usage.
// Serialized tables are not guaranteed to work across versions or different
// endianness (although they could easily be made to be).
uint32_t serialize_bytes_needed() const;
// Writes serialize_bytes_needed() bytes to memory: the object itself, then
// the g_ array, then the rank table.
void serialize(char *memory) const;
// Points g_ and ranktable_ into serialized_memory instead of copying, so the
// buffer must stay valid while this object is in use.
bool deserialize(const char* serialized_memory);
private:
// Adds one 3-vertex edge per key to a TriGraph and, when GenerateQueue()
// succeeds, extracts the edges. Returns false otherwise.
template <class SeededHashFcn, class ForwardIterator>
bool Mapping(ForwardIterator begin, ForwardIterator end,
std::vector<TriGraph::Edge>* edges,
std::vector<uint32_t>* queue);
bool GenerateQueue(TriGraph* graph, std::vector<uint32_t>* queue);
void Assigning(const std::vector<TriGraph::Edge>& edges,
const std::vector<uint32_t>& queue);
void Ranking();
uint32_t Rank(uint32_t vertex) const;
// Algorithm parameters
// Graph size factor: Reset() uses r_ = ceil(c_ * m_ / 3) (bumped to odd) and
// n_ = 3 * r_, i.e. roughly c_ vertices per key.
double c_;
uint8_t b_; // Number of bits of the kth index in the ranktable
// Values used during generation
uint32_t m_; // edges count
uint32_t n_; // vertex count
// Set to 1 << b_ by Reset().
uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$
// Values used during search
// Partition vertex count, derived from c parameter.
uint32_t r_;
// The array containing the minimal perfect hash function graph. Do not use
// c++ vector to make mmap based backing easier.
const uint8_t* g_;
uint32_t g_size_;
// The table used for the rank step of the minimal perfect hash function
const uint32_t* ranktable_;
uint32_t ranktable_size_;
// The selected hash seed triplet for finding the edges in the minimal
// perfect hash function graph.
uint32_t hash_seed_[3];
// True after deserialize(): g_ then points into caller-provided memory and
// clear() does not delete it.
bool deserialized_;
static const uint8_t valuemask[];
// Packed 2-bit slots, four per byte. The update is an AND, so it can only
// clear bits of slot i; presumably slots start out as 3 (all ones) -- verify
// in Assigning().
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
}
// Returns the 2-bit value stored in slot i of d.
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
}
};
// Template method needs to go in the header file.
// Builds the minimal perfect hash function for the keys in [begin, end).
//
// Derives the graph dimensions from the key count, then tries up to 10 random
// seed triplets until Mapping() succeeds. On success runs the assigning and
// ranking steps and returns true; returns false if every attempt failed.
// The iterators must support `end - begin`.
template <class SeededHashFcn, class ForwardIterator>
bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) {
  m_ = end - begin;
  r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
  if ((r_ % 2) == 0) r_ += 1;
  n_ = 3*r_;
  k_ = 1U << b_;

  std::vector<TriGraph::Edge> edges;
  std::vector<uint32_t> queue;
  bool mapped = false;
  for (int attempts_left = 10; attempts_left > 0 && !mapped; --attempts_left) {
    for (int i = 0; i < 3; ++i) {
      // Seeds are drawn from [0, m_). With an empty key range `random() % m_`
      // would divide by zero, so fall back to seed 0 in that case.
      // NOTE(review): how the later steps behave for an empty range is not
      // visible here; this only removes the division by zero.
      hash_seed_[i] = (m_ == 0) ? 0 : random() % m_;
    }
    mapped = Mapping<SeededHashFcn>(begin, end, &edges, &queue);
  }
  if (!mapped) return false;

  Assigning(edges, queue);
  std::vector<TriGraph::Edge>().swap(edges);  // release edge storage before ranking
  Ranking();
  deserialized_ = false;
  return true;
}
// Mapping step: turns every key into one edge of a 3-partite graph, using one
// seeded hash per partition, and asks GenerateQueue() to process the graph.
// On success the edges are moved into *edges and true is returned; otherwise
// *edges is left untouched and false is returned.
template <class SeededHashFcn, class ForwardIterator>
bool MPHIndex::Mapping(
    ForwardIterator begin, ForwardIterator end,
    std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
  TriGraph graph(n_, m_);
  for (ForwardIterator key = begin; key != end; ++key) {
    // Hash values are narrowed to 32 bits before being reduced modulo r_.
    const uint32_t hash0 = SeededHashFcn()(*key, hash_seed_[0]);
    const uint32_t hash1 = SeededHashFcn()(*key, hash_seed_[1]);
    const uint32_t hash2 = SeededHashFcn()(*key, hash_seed_[2]);
    // One vertex in each partition: [0, r_), [r_, 2*r_) and [2*r_, 3*r_).
    graph.AddEdge(TriGraph::Edge(hash0 % r_,
                                 hash1 % r_ + r_,
                                 hash2 % r_ + (r_ << 1)));
  }
  if (!GenerateQueue(&graph, queue)) return false;
  graph.ExtractEdgesAndClear(edges);
  return true;
}
// Lookup: hashes the key to one candidate vertex per partition, uses the sum
// of their 2-bit values in g_ (mod 3) to pick one of the three, and returns
// that vertex's rank.
template <class SeededHashFcn, class Key>
uint32_t MPHIndex::index(const Key& key) const {
  // Hash values are narrowed to 32 bits before being reduced modulo r_.
  const uint32_t hash0 = SeededHashFcn()(key, hash_seed_[0]);
  const uint32_t hash1 = SeededHashFcn()(key, hash_seed_[1]);
  const uint32_t hash2 = SeededHashFcn()(key, hash_seed_[2]);
  const uint32_t h[3] = { hash0 % r_, hash1 % r_ + r_, hash2 % r_ + (r_ << 1) };

  // Four 2-bit slots per byte, so vertex v lives in byte v >> 2 of g_.
  assert(g_size_);
  assert((h[0] >> 2) <g_size_);
  assert((h[1] >> 2) <g_size_);
  assert((h[2] >> 2) <g_size_);

  const uint32_t slot_sum =
      get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2]);
  return Rank(h[slot_sum % 3]);
}
// Convenience wrapper that binds MPHIndex to one key type and one seeded hash
// functor, so callers do not have to repeat the hash template argument on
// every Reset()/index() call.
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
class SimpleMPHIndex : public MPHIndex {
 public:
  // Forwards to MPHIndex::Reset with HashFcn; see there for the return value.
  template <class ForwardIterator>
  bool Reset(ForwardIterator begin, ForwardIterator end) {
    return MPHIndex::Reset<HashFcn>(begin, end);
  }
  // Forwards to MPHIndex::index with HashFcn. Declared const, like the method
  // it wraps, so lookups also work on a const SimpleMPHIndex (for instance
  // from a const member function of a class that holds one).
  uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); }
};
} // namespace cxxmph
#endif // __CXXMPH_MPH_INDEX_H__

View File

@ -3,11 +3,11 @@
#include <string>
#include <vector>
#include "mph_table.h"
#include "mph_index.h"
using std::string;
using std::vector;
using cxxmph::SimpleMPHTable;
using cxxmph::SimpleMPHIndex;
int main(int argc, char** argv) {
@ -23,20 +23,20 @@ int main(int argc, char** argv) {
keys.push_back("diogo");
keys.push_back("algume");
SimpleMPHTable<string> mph_table;
assert(mph_table.Reset(keys.begin(), keys.end()));
SimpleMPHIndex<string> mph_index;
assert(mph_index.Reset(keys.begin(), keys.end()));
vector<int> ids;
for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
ids.push_back(mph_table.index(keys[i]));
ids.push_back(mph_index.index(keys[i]));
cerr << " " << *(ids.end() - 1);
}
cerr << endl;
sort(ids.begin(), ids.end());
for (vector<int>::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast<vector<int>::value_type>(i));
char* serialized = new char[mph_table.serialize_bytes_needed()];
mph_table.serialize(serialized);
SimpleMPHTable<string> other_mph_table;
other_mph_table.deserialize(serialized);
char* serialized = new char[mph_index.serialize_bytes_needed()];
mph_index.serialize(serialized);
SimpleMPHIndex<string> other_mph_index;
other_mph_index.deserialize(serialized);
}

View File

@ -4,7 +4,7 @@
#include <utility> // for std::pair
#include "MurmurHash2.h"
#include "mph_table.h"
#include "mph_index.h"
namespace cxxmph {
@ -70,7 +70,7 @@ class mph_map {
void rehash();
std::vector<value_type> values_;
SimpleMPHTable<Key, typename seeded_hash<HashFcn>::hash_function> table_;
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
// TODO(davi) optimize slack to no hold a copy of the key
typedef typename std::unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
slack_type slack_;
@ -93,8 +93,8 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
if (it != end()) return std::make_pair(it, false);
values_.push_back(x);
slack_.insert(std::make_pair(x.first, values_.size() - 1));
if (slack_.size() == table_.size() ||
(slack_.size() >= 256 && table_.size() == 0)) {
if (slack_.size() == index_.size() ||
(slack_.size() >= 256 && index_.size() == 0)) {
rehash();
}
it = find(x.first);
@ -104,14 +104,14 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
MPH_MAP_METHOD_DECL(void_type, rehash)() {
if (values_.empty()) return;
slack_type().swap(slack_);
bool success = table_.Reset(
bool success = index_.Reset(
make_iterator_first(values_.begin()),
make_iterator_first(values_.end()));
assert(success);
std::vector<value_type> new_values(values_.size());
for (const_iterator it = values_.begin(), end = values_.end();
it != end; ++it) {
size_type id = table_.index(it->first);
size_type id = index_.index(it->first);
assert(id < new_values.size());
new_values[id] = *it;
}
@ -127,7 +127,7 @@ MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); }
MPH_MAP_METHOD_DECL(void_type, clear)() {
values_.clear();
slack_.clear();
table_.clear();
index_.clear();
}
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
@ -145,8 +145,8 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
typename slack_type::const_iterator it = slack_.find(k);
if (it != slack_.end()) return values_.begin() + it->second;
}
if (table_.size() == 0) return end();
size_type id = table_.index(k);
if (index_.size() == 0) return end();
size_type id = index_.index(k);
if (key_equal()(values_[id].first, k)) {
return values_.begin() + id;
}
@ -157,8 +157,8 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
typename slack_type::const_iterator it = slack_.find(k);
if (it != slack_.end()) return values_.begin() + it->second;
}
if (table_.size() == 0) return end();
size_type id = table_.index(k);
if (index_.size() == 0) return end();
size_type id = index_.index(k);
if (key_equal()(values_[id].first, k)) {
return values_.begin() + id;
}

View File

@ -1,173 +1,16 @@
#ifndef __CXXMPH_MPH_TABLE_H__
#define __CXXMPH_MPH_TABLE_H__
#include "mph_index.h"
// Minimal perfect hash abstraction implementing the BDZ algorithm
#include <stdint.h>
#include <cassert>
#include <cmath>
#include <unordered_map> // for std::hash
#include <vector>
#include <iostream>
using std::cerr;
using std::endl;
#include "seeded_hash.h"
#include "trigraph.h"
namespace cxxmph {
// String to string map working on mmap'ed memory
class MPHTable {
public:
MPHTable(double c = 1.23, uint8_t b = 7) :
c_(c), b_(b), m_(0), n_(0), k_(0), r_(0),
g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0),
deserialized_(false) { }
~MPHTable();
template <class SeededHashFcn, class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end);
template <class SeededHashFcn, class Key> // must agree with Reset
uint32_t index(const Key& x) const;
uint32_t size() const { return m_; }
void clear();
// Serialization machinery for mmap usage.
// Serialized tables are not guaranteed to work across versions or different
// endianness (although they could easily be made to be).
uint32_t serialize_bytes_needed() const;
void serialize(char *memory) const;
bool deserialize(const char* serialized_memory);
private:
template <class SeededHashFcn, class ForwardIterator>
bool Mapping(ForwardIterator begin, ForwardIterator end,
std::vector<TriGraph::Edge>* edges,
std::vector<uint32_t>* queue);
bool GenerateQueue(TriGraph* graph, std::vector<uint32_t>* queue);
void Assigning(const std::vector<TriGraph::Edge>& edges,
const std::vector<uint32_t>& queue);
void Ranking();
uint32_t Rank(uint32_t vertex) const;
// Algorithm parameters
double c_; // Number of bits per key (? is it right)
uint8_t b_; // Number of bits of the kth index in the ranktable
// Values used during generation
uint32_t m_; // edges count
uint32_t n_; // vertex count
uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$
// Values used during search
// Partition vertex count, derived from c parameter.
uint32_t r_;
// The array containing the minimal perfect hash function graph. Do not use
// c++ vector to make mmap based backing easier.
const uint8_t* g_;
uint32_t g_size_;
// The table used for the rank step of the minimal perfect hash function
const uint32_t* ranktable_;
uint32_t ranktable_size_;
// The selected hash seed triplet for finding the edges in the minimal
// perfect hash function graph.
uint32_t hash_seed_[3];
bool deserialized_;
static const uint8_t valuemask[];
static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) {
d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]);
}
static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) {
return (d[(i >> 2)] >> (((i & 3) << 1)) & 3);
}
};
// Template method needs to go in the header file.
template <class SeededHashFcn, class ForwardIterator>
bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) {
m_ = end - begin;
r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
if ((r_ % 2) == 0) r_ += 1;
n_ = 3*r_;
k_ = 1U << b_;
// cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl;
int iterations = 10;
std::vector<TriGraph::Edge> edges;
std::vector<uint32_t> queue;
while (1) {
// cerr << "Iterations missing: " << iterations << endl;
for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_;
// for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i;
if (Mapping<SeededHashFcn>(begin, end, &edges, &queue)) break;
else --iterations;
if (iterations == 0) break;
}
if (iterations == 0) return false;
Assigning(edges, queue);
std::vector<TriGraph::Edge>().swap(edges);
Ranking();
deserialized_ = false;
return true;
}
template <class SeededHashFcn, class ForwardIterator>
bool MPHTable::Mapping(
ForwardIterator begin, ForwardIterator end,
std::vector<TriGraph::Edge>* edges, std::vector<uint32_t>* queue) {
TriGraph graph(n_, m_);
for (ForwardIterator it = begin; it != end; ++it) {
uint32_t h[3];
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]);
uint32_t v0 = h[0] % r_;
uint32_t v1 = h[1] % r_ + r_;
uint32_t v2 = h[2] % r_ + (r_ << 1);
// cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl;
graph.AddEdge(TriGraph::Edge(v0, v1, v2));
}
if (GenerateQueue(&graph, queue)) {
graph.ExtractEdgesAndClear(edges);
return true;
}
return false;
}
template <class SeededHashFcn, class Key>
uint32_t MPHTable::index(const Key& key) const {
uint32_t h[3];
for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]);
h[0] = h[0] % r_;
h[1] = h[1] % r_ + r_;
h[2] = h[2] % r_ + (r_ << 1);
assert(g_size_);
// cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl;
assert((h[0] >> 2) <g_size_);
assert((h[1] >> 2) <g_size_);
assert((h[2] >> 2) <g_size_);
uint32_t vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3];
// cerr << "Search found vertex " << vertex << endl;
return Rank(vertex);
}
template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash_function>
class SimpleMPHTable : public MPHTable {
public:
typedef StringPiece key_type;
typedef StringPiece data_type;
typedef std::pair<StringPiece, StringPiece> value_type;
template <class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end) {
return MPHTable::Reset<HashFcn>(begin, end);
}
uint32_t index(const Key& key) { return MPHTable::index<HashFcn>(key); }
bool Reset(ForwardIterator begin, ForwardIterator end);
private:
char* data_;
vector<uint64_t> offsets_;
MPHIndex index_;
};
} // namespace cxxmph
#endif // __CXXMPH_MPH_TABLE_H__

View File

@ -1,6 +1,10 @@
#include <stdlib.h>
#include <string.h>
#include <ext/hash_set>
using __gnu_cxx::hash_set;
// Name under which the hash_set based benchmark entries are keyed. Declared as
// a character array: a single `const char` cannot be initialized from a string
// literal.
static const char cxx_name[] = "__gnu_cxx::hash_set";
#include "bitbool.h"
#include "cmph.h"
#include "cmph_benchmark.h"
@ -71,8 +75,8 @@ void bm_search(CMPH_ALGO algo, int iters) {
cmph_t* mphf = NULL;
snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters);
mphf = lsmap_search(g_created_mphf, mphf_name);
snprintf(mphf_name, 128, "%s:%u", cxx_name, iters);
mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name);
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
@ -102,6 +106,49 @@ DECLARE_ALGO(CMPH_BRZ);
DECLARE_ALGO(CMPH_FCH);
DECLARE_ALGO(CMPH_BDZ);
// Builds a __gnu_cxx::hash_set containing the first `iters` entries of
// g_numbers and appends it to g_created_mphf under the key cxx_name.
// Prints a message and exits with status -1 when `iters` exceeds
// g_numbers_len.
// NOTE(review): the set is stored under the bare cxx_name, while the lookups
// in this file build "<name>:<iters>" keys with snprintf -- confirm that the
// key used here is the one lsmap_search will be asked for.
// NOTE(review): the set is allocated with new and handed to lsmap_append;
// who frees it is not visible here.
void bm_create_ext_hash_set(int iters) {
cmph_uint32 i = 0;
if (iters > g_numbers_len) {
fprintf(stderr, "No input with proper size.");
exit(-1);
}
hash_set<cmph_uint32>* ext_hash_set = new hash_set<cmph_uint32>;
for (i = 0; i < iters; ++i) {
ext_hash_set->insert(g_numbers[i]);
}
lsmap_append(g_created_mphf, cxx_name, ext_hash_set);
}
// Search benchmark intended for the hash_set created by
// bm_create_ext_hash_set: performs iters * 100 lookups at random positions of
// g_numbers and records per-position and per-hash counters.
// NOTE(review): this function looks unfinished; points to confirm before use:
//  - mphf_name, mphf and algo are not declared inside this function;
//  - hash_count is passed to snprintf as the "%s" argument before its
//    declaration further down, where cxx_name was presumably intended;
//  - the cast names __gnu_cxx::hash_set without template arguments;
//  - the lookup still goes through cmph_search rather than the hash_set;
//  - count and hash_count come from malloc and are incremented without being
//    zeroed first.
void bm_search_ext_hash_set(int iters) {
cmph_uint32 i = 0;
if (iters > g_numbers_len) {
fprintf(stderr, "No input with proper size.");
exit(-1);
}
snprintf(mphf_name, 128, "%s:%u", hash_count, iters);
mphf = (__gnu_cxx::hash_set*)lsmap_search(g_created_mphf, mphf_name);
// One counter per input position and one per returned hash value.
cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters);
for (i = 0; i < iters * 100; ++i) {
cmph_uint32 pos = random() % iters;
// The key is the raw bytes of the number stored at g_numbers[pos].
const char* buf = (const char*)(g_numbers + pos);
cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32));
++count[pos];
++hash_count[h];
}
// Verify correctness later.
lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count);
lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count);
}
}
int main(int argc, char** argv) {
g_numbers_len = 1000 * 1000;
g_numbers = random_numbers_vector_new(g_numbers_len);