From bb40a4bb00f2ad3caccb5964c3b17b93c16c8386 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 23 May 2011 11:01:08 -0700 Subject: [PATCH] Renamed table to index and reorganized benchmarks. --- cxxmph/Makefile.am | 20 +- cxxmph/benchmark.cc | 32 +++- cxxmph/benchmark.h | 11 +- cxxmph/bm_map.cc | 52 +++++ cxxmph/bm_numbers.cc | 52 ----- cxxmph/bm_urls.cc | 70 ------- cxxmph/{mph_table.cc => mph_index.cc} | 42 ++--- cxxmph/mph_index.h | 173 +++++++++++++++++ .../{mph_table_test.cc => mph_index_test.cc} | 18 +- cxxmph/mph_map.h | 22 +-- cxxmph/mph_table.h | 177 +----------------- src/bm_numbers.c | 51 ++++- 12 files changed, 367 insertions(+), 353 deletions(-) create mode 100644 cxxmph/bm_map.cc delete mode 100644 cxxmph/bm_numbers.cc delete mode 100644 cxxmph/bm_urls.cc rename cxxmph/{mph_table.cc => mph_index.cc} (87%) create mode 100644 cxxmph/mph_index.h rename cxxmph/{mph_table_test.cc => mph_index_test.cc} (66%) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 2d44345..801d2d0 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,28 +1,28 @@ AM_CXXFLAGS='-std=c++0x' TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_table_test trigraph_test -noinst_PROGRAMS = bm_numbers bm_urls +check_PROGRAMS = mph_map_test mph_index_test trigraph_test +noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_table.h mph_table.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc +libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = mph_map.h mph_table.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h mph_map_test_LDADD = libcxxmph.la mph_map_test_SOURCES = mph_map_test.cc -mph_table_test_LDADD = libcxxmph.la -mph_table_test_SOURCES = mph_table_test.cc +mph_index_test_LDADD = libcxxmph.la +mph_index_test_SOURCES = mph_index_test.cc trigraph_test_LDADD = libcxxmph.la trigraph_test_SOURCES = trigraph_test.cc -bm_numbers_LDADD = libcxxmph.la -bm_numbers_SOURCES = bm_numbers.cc +bm_index_LDADD = libcxxmph.la +bm_index_SOURCES = bm_common.cc bm_index.cc -bm_urls_LDADD = libcxxmph.la -bm_urls_SOURCES = bm_urls.cc +bm_map_LDADD = libcxxmph.la +bm_map_SOURCES = bm_common.cc bm_map.cc cxxmph_LDADD = libcxxmph.la cxxmph_SOURCES = cxxmph.cc diff --git a/cxxmph/benchmark.cc b/cxxmph/benchmark.cc index 04e5086..644bdc9 100644 --- a/cxxmph/benchmark.cc +++ b/cxxmph/benchmark.cc @@ -1,7 +1,9 @@ #include "benchmark.h" +#include #include #include +#include #include #include @@ -50,6 +52,16 @@ struct rusage getrusage_or_die() { return rs; } +struct timeval gettimeofday_or_die() { + struct timeval tv; + int ret = gettimeofday(&tv, NULL); + if (ret != 0) { + cerr << "gettimeofday failed: " << strerror(errno) << endl; + exit(-1); + } + return tv; +} + #ifdef HAVE_CXA_DEMANGLE string demangle(const string& name) { char buf[1024]; @@ -79,25 +91,33 @@ namespace cxxmph { } /* static */ void Benchmark::RunAll() { - for (auto it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { - (*it)->MeasureRun(); - delete *it; + for (int i = 0; i < g_benchmarks.size(); ++i) { + Benchmark* bm = g_benchmarks[i]; + bm->SetUp(); + bm->MeasureRun(); + bm->TearDown(); + delete bm; } } void Benchmark::MeasureRun() { + struct timeval walltime_begin = gettimeofday_or_die(); struct rusage begin = getrusage_or_die(); - Run(iters_); + Run(); struct rusage end = getrusage_or_die(); + struct timeval walltime_end = gettimeofday_or_die(); struct timeval utime; timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime); struct timeval stime; timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime); + struct timeval wtime; + timeval_subtract(&wtime, &walltime_end, &walltime_begin); printf("Benchmark: %s\n", name().c_str()); - printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); - printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); + printf("CPU User time : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); + printf("CPU System time: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); + printf("Wall clock time: %ld.%06ld\n", wtime.tv_sec, wtime.tv_usec); printf("\n"); } diff --git a/cxxmph/benchmark.h b/cxxmph/benchmark.h index f0629e4..edd3fb9 100644 --- a/cxxmph/benchmark.h +++ b/cxxmph/benchmark.h @@ -8,9 +8,9 @@ namespace cxxmph { class Benchmark { public: - Benchmark(int iters = 1) : iters_(iters) { } - virtual void Run(int iters) = 0; - virtual ~Benchmark() { } + Benchmark() {} + virtual ~Benchmark() {} + const std::string& name() { return name_; } void set_name(const std::string& name) { name_ = name; } @@ -18,10 +18,11 @@ class Benchmark { static void RunAll(); protected: - int iters() { return iters_; } + virtual bool SetUp() {}; + virtual void Run() = 0; + virtual bool TearDown() {}; private: - int iters_; std::string name_; void MeasureRun(); }; diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc new file mode 100644 index 0000000..1d26847 --- /dev/null +++ b/cxxmph/bm_map.cc @@ -0,0 +1,52 @@ +#include +#include + +#include "bm_common.h" +#include "mph_map.h" + +using cxxmph::mph_map; +using std::string; +using std::unordered_map; + +namespace cxxmph { + +template +class BM_MapCreate : public UrlsBenchmark { + public: + virtual void Run() { + MapType mymap; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + mymap[*it] = *it; + } + } +}; + +template +class BM_MapSearch : public SearchUrlsBenchmark { + public: + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto value = mymap[*it]; + } + } + protected: + virtual void SetUp() { + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + mymap_[*it] = *it; + } + mymap_.resize(mymap.size()); + } + MapType mymap_; +}; + +} // namespace cxxmph + +using namespace cxxmph; + +int main(int argc, char** argv) { + Benchmark::Register(new BM_MapCreate>("URLS100k")); + Benchmark::Register(new BM_MapCreate>("URLS100k")); + Benchmark::Register(new BM_MapSearch>("URLS100k", 1000 * 1000)); + Benchmark::Register(new BM_MapSearch>("URLS100k", 1000 * 1000)); + Benchmark::RunAll(); +} diff --git a/cxxmph/bm_numbers.cc b/cxxmph/bm_numbers.cc deleted file mode 100644 index 85653f5..0000000 --- a/cxxmph/bm_numbers.cc +++ /dev/null @@ -1,52 +0,0 @@ -#include -#include - -#include "benchmark.h" -#include "mph_table.h" - -using std::set; -using std::vector; - -namespace cxxmph { -class BM_NumbersCreate : public Benchmark { - public: - BM_NumbersCreate(int iters = 1) : Benchmark(iters) { - set unique; - while (unique.size() < 1000 * 1000) { - int v = random(); - if (unique.find(v) == unique.end()) { - unique.insert(v); - random_unique_.push_back(v); - } - } - } - protected: - virtual void Run(int iters) { - SimpleMPHTable table; - table.Reset(random_unique_.begin(), random_unique_.end()); - } - std::vector random_unique_; -}; - -class BM_NumbersFind : public BM_NumbersCreate { - public: - BM_NumbersFind(int iters) : BM_NumbersCreate(iters) { table_.Reset(random_unique_.begin(), random_unique_.end()); } - virtual void Run(int iters) { - for (int i = 0; i < iters * 100; ++i) { - int pos = random() % random_unique_.size();; - int h = table_.index(pos); - } - } - private: - SimpleMPHTable table_; -}; - -} // namespace cxxmph - -using namespace cxxmph; - -int main(int argc, char** argv) { - Benchmark::Register(new BM_NumbersCreate()); - Benchmark::Register(new BM_NumbersFind(1000 * 1000)); - Benchmark::RunAll(); -} diff --git a/cxxmph/bm_urls.cc b/cxxmph/bm_urls.cc deleted file mode 100644 index 6424755..0000000 --- a/cxxmph/bm_urls.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "benchmark.h" -#include "mph_map.h" - -using std::ifstream; -using std::set; -using std::string; -using std::vector; - -namespace cxxmph { - -class BM_UrlsCreate : public Benchmark { - public: - BM_UrlsCreate(int iters = 1) : Benchmark(iters) { - ReadUrls(); - } - protected: - virtual void Run(int iters) { - BuildTable(); - } - void BuildTable() { - for (auto it = urls_.begin(); it != urls_.end(); ++it) { - table_[*it] = it - urls_.begin(); - } - table_.pack(); - } - void ReadUrls() { - vector urls; - std::ifstream f("URLS100k"); - string buffer; - while(std::getline(f, buffer)) urls.push_back(buffer); - set unique(urls.begin(), urls.end()); - if (unique.size() != urls.size()) { - cerr << "Input file has repeated keys." << endl; - exit(-1); - } - urls_.swap(urls); - } - vector urls_; - cxxmph::mph_map table_; -}; - -class BM_UrlsFind : public BM_UrlsCreate { - public: - BM_UrlsFind(int iters = 1) : BM_UrlsCreate(iters) { ReadUrls(); BuildTable(); } - protected: - virtual void Run(int iters) { - for (int i = 0; i < iters * 100; ++i) { - int pos = random() % urls_.size();; - int h = table_[urls_[pos]]; - assert(h == pos); - } - } -}; - -} // namespace cxxmph - -using namespace cxxmph; - -int main(int argc, char** argv) { - Benchmark::Register(new BM_UrlsCreate()); - Benchmark::Register(new BM_UrlsFind(1000 * 1000)); - Benchmark::RunAll(); -} diff --git a/cxxmph/mph_table.cc b/cxxmph/mph_index.cc similarity index 87% rename from cxxmph/mph_table.cc rename to cxxmph/mph_index.cc index fae8e98..b1c0176 100644 --- a/cxxmph/mph_table.cc +++ b/cxxmph/mph_index.cc @@ -5,7 +5,7 @@ using std::cerr; using std::endl; -#include "mph_table.h" +#include "mph_index.h" using std::vector; @@ -13,7 +13,7 @@ namespace { static const uint8_t kUnassigned = 3; // table used for looking up the number of assigned vertices to a 8-bit integer -static uint8_t kBdzLookupTable[] = +static uint8_t kBdzLookupIndex[] = { 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, @@ -37,13 +37,13 @@ static uint8_t kBdzLookupTable[] = namespace cxxmph { -const uint8_t MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; +const uint8_t MPHIndex::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; -MPHTable::~MPHTable() { +MPHIndex::~MPHIndex() { clear(); } -void MPHTable::clear() { +void MPHIndex::clear() { if (!deserialized_) delete [] g_; g_ = NULL; g_size_ = 0; @@ -53,7 +53,7 @@ void MPHTable::clear() { // TODO(davi) implement me } -bool MPHTable::GenerateQueue( +bool MPHIndex::GenerateQueue( TriGraph* graph, vector* queue_output) { uint32_t queue_head = 0, queue_tail = 0; uint32_t nedges = m_; @@ -109,7 +109,7 @@ bool MPHTable::GenerateQueue( return cycles == 0; } -void MPHTable::Assigning( +void MPHIndex::Assigning( const vector& edges, const vector& queue) { uint32_t current_edge = 0; vector marked_vertices(n_ + 1); @@ -164,7 +164,7 @@ void MPHTable::Assigning( g_ = g; } -void MPHTable::Ranking() { +void MPHIndex::Ranking() { uint32_t nbytes_total = static_cast(ceil(n_ / 4.0)); uint32_t size = k_ >> 2U; ranktable_size_ = static_cast( @@ -179,7 +179,7 @@ void MPHTable::Ranking() { while (1) { if (i == ranktable_size_) break; uint32_t nbytes = size < nbytes_total ? size : nbytes_total; - for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; + for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupIndex[g_[offset + j]]; ranktable[i] = count; offset += nbytes; nbytes_total -= size; @@ -188,13 +188,13 @@ void MPHTable::Ranking() { ranktable_ = ranktable; } -uint32_t MPHTable::Rank(uint32_t vertex) const { +uint32_t MPHIndex::Rank(uint32_t vertex) const { uint32_t index = vertex >> b_; uint32_t base_rank = ranktable_[index]; uint32_t beg_idx_v = index << b_; uint32_t beg_idx_b = beg_idx_v >> 2; uint32_t end_idx_b = vertex >> 2; - while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; + while (beg_idx_b < end_idx_b) base_rank += kBdzLookupIndex[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; // cerr << "beg_idx_v: " << beg_idx_v << endl; // cerr << "base rank: " << base_rank << endl; @@ -213,21 +213,21 @@ uint32_t MPHTable::Rank(uint32_t vertex) const { return base_rank; } -uint32_t MPHTable::serialize_bytes_needed() const { - return sizeof(MPHTable) + g_size_ + ranktable_size_*sizeof(uint32_t); +uint32_t MPHIndex::serialize_bytes_needed() const { + return sizeof(MPHIndex) + g_size_ + ranktable_size_*sizeof(uint32_t); } -void MPHTable::serialize(char* memory) const { - memcpy(memory, this, sizeof(MPHTable)); - memcpy(memory + sizeof(MPHTable), g_, g_size_); - memcpy(memory + sizeof(MPHTable) + g_size_, +void MPHIndex::serialize(char* memory) const { + memcpy(memory, this, sizeof(MPHIndex)); + memcpy(memory + sizeof(MPHIndex), g_, g_size_); + memcpy(memory + sizeof(MPHIndex) + g_size_, ranktable_, ranktable_size_*sizeof(uint32_t)); } -bool MPHTable::deserialize(const char* serialized_memory) { - memcpy(this, serialized_memory, sizeof(MPHTable)); - g_ = reinterpret_cast(serialized_memory + sizeof(MPHTable)); +bool MPHIndex::deserialize(const char* serialized_memory) { + memcpy(this, serialized_memory, sizeof(MPHIndex)); + g_ = reinterpret_cast(serialized_memory + sizeof(MPHIndex)); ranktable_ = reinterpret_cast( - serialized_memory + sizeof(MPHTable) + g_size_); + serialized_memory + sizeof(MPHIndex) + g_size_); deserialized_ = true; return true; } diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h new file mode 100644 index 0000000..77b6ea4 --- /dev/null +++ b/cxxmph/mph_index.h @@ -0,0 +1,173 @@ +#ifndef __CXXMPH_MPH_INDEX_H__ +#define __CXXMPH_MPH_INDEX_H__ + +// Minimal perfect hash abstraction implementing the BDZ algorithm + +#include + +#include +#include +#include // for std::hash +#include + +#include + +using std::cerr; +using std::endl; + +#include "seeded_hash.h" +#include "trigraph.h" + +namespace cxxmph { + +class MPHIndex { + public: + MPHIndex(double c = 1.23, uint8_t b = 7) : + c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), + g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), + deserialized_(false) { } + ~MPHIndex(); + + template + bool Reset(ForwardIterator begin, ForwardIterator end); + template // must agree with Reset + uint32_t index(const Key& x) const; + uint32_t size() const { return m_; } + void clear(); + + // Serialization machinery for mmap usage. + // Serialized tables are not guaranteed to work across versions or different + // endianness (although they could easily be made to be). + uint32_t serialize_bytes_needed() const; + void serialize(char *memory) const; + bool deserialize(const char* serialized_memory); + + private: + template + bool Mapping(ForwardIterator begin, ForwardIterator end, + std::vector* edges, + std::vector* queue); + bool GenerateQueue(TriGraph* graph, std::vector* queue); + void Assigning(const std::vector& edges, + const std::vector& queue); + void Ranking(); + uint32_t Rank(uint32_t vertex) const; + + // Algorithm parameters + double c_; // Number of bits per key (? is it right) + uint8_t b_; // Number of bits of the kth index in the ranktable + + // Values used during generation + uint32_t m_; // edges count + uint32_t n_; // vertex count + uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ + + // Values used during search + + // Partition vertex count, derived from c parameter. + uint32_t r_; + // The array containing the minimal perfect hash function graph. Do not use + // c++ vector to make mmap based backing easier. + const uint8_t* g_; + uint32_t g_size_; + // The table used for the rank step of the minimal perfect hash function + const uint32_t* ranktable_; + uint32_t ranktable_size_; + // The selected hash seed triplet for finding the edges in the minimal + // perfect hash function graph. + uint32_t hash_seed_[3]; + + bool deserialized_; + + static const uint8_t valuemask[]; + static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { + d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); + } + static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { + return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); + } + + +}; + +// Template method needs to go in the header file. +template +bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { + m_ = end - begin; + r_ = static_cast(ceil((c_*m_)/3)); + if ((r_ % 2) == 0) r_ += 1; + n_ = 3*r_; + k_ = 1U << b_; + + // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; + + int iterations = 10; + std::vector edges; + std::vector queue; + while (1) { + // cerr << "Iterations missing: " << iterations << endl; + for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; + // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; + if (Mapping(begin, end, &edges, &queue)) break; + else --iterations; + if (iterations == 0) break; + } + if (iterations == 0) return false; + Assigning(edges, queue); + std::vector().swap(edges); + Ranking(); + deserialized_ = false; + return true; +} + +template +bool MPHIndex::Mapping( + ForwardIterator begin, ForwardIterator end, + std::vector* edges, std::vector* queue) { + TriGraph graph(n_, m_); + for (ForwardIterator it = begin; it != end; ++it) { + uint32_t h[3]; + for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); + uint32_t v0 = h[0] % r_; + uint32_t v1 = h[1] % r_ + r_; + uint32_t v2 = h[2] % r_ + (r_ << 1); + // cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + graph.AddEdge(TriGraph::Edge(v0, v1, v2)); + } + if (GenerateQueue(&graph, queue)) { + graph.ExtractEdgesAndClear(edges); + return true; + } + return false; +} + +template +uint32_t MPHIndex::index(const Key& key) const { + uint32_t h[3]; + for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); + h[0] = h[0] % r_; + h[1] = h[1] % r_ + r_; + h[2] = h[2] % r_ + (r_ << 1); + assert(g_size_); + // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; + assert((h[0] >> 2) > 2) > 2) >::hash_function> +class SimpleMPHIndex : public MPHIndex { + public: + template + bool Reset(ForwardIterator begin, ForwardIterator end) { + return MPHIndex::Reset(begin, end); + } + uint32_t index(const Key& key) { return MPHIndex::index(key); } +}; + +} // namespace cxxmph + +#endif // __CXXMPH_MPH_INDEX_H__ diff --git a/cxxmph/mph_table_test.cc b/cxxmph/mph_index_test.cc similarity index 66% rename from cxxmph/mph_table_test.cc rename to cxxmph/mph_index_test.cc index c9e91a8..421369c 100644 --- a/cxxmph/mph_table_test.cc +++ b/cxxmph/mph_index_test.cc @@ -3,11 +3,11 @@ #include #include -#include "mph_table.h" +#include "mph_index.h" using std::string; using std::vector; -using cxxmph::SimpleMPHTable; +using cxxmph::SimpleMPHIndex; int main(int argc, char** argv) { @@ -23,20 +23,20 @@ int main(int argc, char** argv) { keys.push_back("diogo"); keys.push_back("algume"); - SimpleMPHTable mph_table; - assert(mph_table.Reset(keys.begin(), keys.end())); + SimpleMPHIndex mph_index; + assert(mph_index.Reset(keys.begin(), keys.end())); vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { - ids.push_back(mph_table.index(keys[i])); + ids.push_back(mph_index.index(keys[i])); cerr << " " << *(ids.end() - 1); } cerr << endl; sort(ids.begin(), ids.end()); for (vector::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast::value_type>(i)); - char* serialized = new char[mph_table.serialize_bytes_needed()]; - mph_table.serialize(serialized); - SimpleMPHTable other_mph_table; - other_mph_table.deserialize(serialized); + char* serialized = new char[mph_index.serialize_bytes_needed()]; + mph_index.serialize(serialized); + SimpleMPHIndex other_mph_index; + other_mph_index.deserialize(serialized); } diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 398db3e..1c01b64 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -4,7 +4,7 @@ #include // for std::pair #include "MurmurHash2.h" -#include "mph_table.h" +#include "mph_index.h" namespace cxxmph { @@ -70,7 +70,7 @@ class mph_map { void rehash(); std::vector values_; - SimpleMPHTable::hash_function> table_; + SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to no hold a copy of the key typedef typename std::unordered_map slack_type; slack_type slack_; @@ -93,8 +93,8 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { if (it != end()) return std::make_pair(it, false); values_.push_back(x); slack_.insert(std::make_pair(x.first, values_.size() - 1)); - if (slack_.size() == table_.size() || - (slack_.size() >= 256 && table_.size() == 0)) { + if (slack_.size() == index_.size() || + (slack_.size() >= 256 && index_.size() == 0)) { rehash(); } it = find(x.first); @@ -104,14 +104,14 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { MPH_MAP_METHOD_DECL(void_type, rehash)() { if (values_.empty()) return; slack_type().swap(slack_); - bool success = table_.Reset( + bool success = index_.Reset( make_iterator_first(values_.begin()), make_iterator_first(values_.end())); assert(success); std::vector new_values(values_.size()); for (const_iterator it = values_.begin(), end = values_.end(); it != end; ++it) { - size_type id = table_.index(it->first); + size_type id = index_.index(it->first); assert(id < new_values.size()); new_values[id] = *it; } @@ -127,7 +127,7 @@ MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } MPH_MAP_METHOD_DECL(void_type, clear)() { values_.clear(); slack_.clear(); - table_.clear(); + index_.clear(); } MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { @@ -145,8 +145,8 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + it->second; } - if (table_.size() == 0) return end(); - size_type id = table_.index(k); + if (index_.size() == 0) return end(); + size_type id = index_.index(k); if (key_equal()(values_[id].first, k)) { return values_.begin() + id; } @@ -157,8 +157,8 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + it->second; } - if (table_.size() == 0) return end(); - size_type id = table_.index(k); + if (index_.size() == 0) return end(); + size_type id = index_.index(k); if (key_equal()(values_[id].first, k)) { return values_.begin() + id; } diff --git a/cxxmph/mph_table.h b/cxxmph/mph_table.h index 51d26ea..234540d 100644 --- a/cxxmph/mph_table.h +++ b/cxxmph/mph_table.h @@ -1,173 +1,16 @@ -#ifndef __CXXMPH_MPH_TABLE_H__ -#define __CXXMPH_MPH_TABLE_H__ +#include "mph_index.h" -// Minimal perfect hash abstraction implementing the BDZ algorithm - -#include - -#include -#include -#include // for std::hash -#include - -#include - -using std::cerr; -using std::endl; - -#include "seeded_hash.h" -#include "trigraph.h" - -namespace cxxmph { +// String to string map working on mmap'ed memory class MPHTable { public: - MPHTable(double c = 1.23, uint8_t b = 7) : - c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), - g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), - deserialized_(false) { } - ~MPHTable(); - - template - bool Reset(ForwardIterator begin, ForwardIterator end); - template // must agree with Reset - uint32_t index(const Key& x) const; - uint32_t size() const { return m_; } - void clear(); - - // Serialization machinery for mmap usage. - // Serialized tables are not guaranteed to work across versions or different - // endianness (although they could easily be made to be). - uint32_t serialize_bytes_needed() const; - void serialize(char *memory) const; - bool deserialize(const char* serialized_memory); - - private: - template - bool Mapping(ForwardIterator begin, ForwardIterator end, - std::vector* edges, - std::vector* queue); - bool GenerateQueue(TriGraph* graph, std::vector* queue); - void Assigning(const std::vector& edges, - const std::vector& queue); - void Ranking(); - uint32_t Rank(uint32_t vertex) const; - - // Algorithm parameters - double c_; // Number of bits per key (? is it right) - uint8_t b_; // Number of bits of the kth index in the ranktable - - // Values used during generation - uint32_t m_; // edges count - uint32_t n_; // vertex count - uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ - - // Values used during search - - // Partition vertex count, derived from c parameter. - uint32_t r_; - // The array containing the minimal perfect hash function graph. Do not use - // c++ vector to make mmap based backing easier. - const uint8_t* g_; - uint32_t g_size_; - // The table used for the rank step of the minimal perfect hash function - const uint32_t* ranktable_; - uint32_t ranktable_size_; - // The selected hash seed triplet for finding the edges in the minimal - // perfect hash function graph. - uint32_t hash_seed_[3]; - - bool deserialized_; - - static const uint8_t valuemask[]; - static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { - d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); - } - static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { - return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); - } - - -}; - -// Template method needs to go in the header file. -template -bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { - m_ = end - begin; - r_ = static_cast(ceil((c_*m_)/3)); - if ((r_ % 2) == 0) r_ += 1; - n_ = 3*r_; - k_ = 1U << b_; - - // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; - - int iterations = 10; - std::vector edges; - std::vector queue; - while (1) { - // cerr << "Iterations missing: " << iterations << endl; - for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; - // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; - if (Mapping(begin, end, &edges, &queue)) break; - else --iterations; - if (iterations == 0) break; - } - if (iterations == 0) return false; - Assigning(edges, queue); - std::vector().swap(edges); - Ranking(); - deserialized_ = false; - return true; -} - -template -bool MPHTable::Mapping( - ForwardIterator begin, ForwardIterator end, - std::vector* edges, std::vector* queue) { - TriGraph graph(n_, m_); - for (ForwardIterator it = begin; it != end; ++it) { - uint32_t h[3]; - for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); - uint32_t v0 = h[0] % r_; - uint32_t v1 = h[1] % r_ + r_; - uint32_t v2 = h[2] % r_ + (r_ << 1); - // cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; - graph.AddEdge(TriGraph::Edge(v0, v1, v2)); - } - if (GenerateQueue(&graph, queue)) { - graph.ExtractEdgesAndClear(edges); - return true; - } - return false; -} - -template -uint32_t MPHTable::index(const Key& key) const { - uint32_t h[3]; - for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); - h[0] = h[0] % r_; - h[1] = h[1] % r_ + r_; - h[2] = h[2] % r_ + (r_ << 1); - assert(g_size_); - // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; - assert((h[0] >> 2) > 2) > 2) >::hash_function> -class SimpleMPHTable : public MPHTable { - public: + typedef StringPiece key_type; + typedef StringPiece data_type; + typedef std::pair value_type; template - bool Reset(ForwardIterator begin, ForwardIterator end) { - return MPHTable::Reset(begin, end); - } - uint32_t index(const Key& key) { return MPHTable::index(key); } + bool Reset(ForwardIterator begin, ForwardIterator end); + private: + char* data_; + vector offsets_; + MPHIndex index_; }; - -} // namespace cxxmph - -#endif // __CXXMPH_MPH_TABLE_H__ diff --git a/src/bm_numbers.c b/src/bm_numbers.c index 7428bc5..7c6abb5 100644 --- a/src/bm_numbers.c +++ b/src/bm_numbers.c @@ -1,6 +1,10 @@ #include #include +#include +using __gnu_cxx::hash_set; +static const char cxx_name = "__gnu_cxx::hash_set"; + #include "bitbool.h" #include "cmph.h" #include "cmph_benchmark.h" @@ -71,8 +75,8 @@ void bm_search(CMPH_ALGO algo, int iters) { cmph_t* mphf = NULL; - snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); - mphf = lsmap_search(g_created_mphf, mphf_name); + snprintf(mphf_name, 128, "%s:%u", cxx_name, iters); + mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name); cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); @@ -102,6 +106,49 @@ DECLARE_ALGO(CMPH_BRZ); DECLARE_ALGO(CMPH_FCH); DECLARE_ALGO(CMPH_BDZ); +void bm_create_ext_hash_set(int iters) { + cmph_uint32 i = 0; + + if (iters > g_numbers_len) { + fprintf(stderr, "No input with proper size."); + exit(-1); + } + + hash_set* ext_hash_set = new hash_set; + for (i = 0; i < iters; ++i) { + ext_hash_set->insert(g_numbers[i]); + } + lsmap_append(g_created_mphf, cxx_name, ext_hash_set); +} + +void bm_search_ext_hash_set(int iters) { + cmph_uint32 i = 0; + + if (iters > g_numbers_len) { + fprintf(stderr, "No input with proper size."); + exit(-1); + } + + snprintf(mphf_name, 128, "%s:%u", hash_count, iters); + mphf = (__gnu_cxx::hash_set*)lsmap_search(g_created_mphf, mphf_name); + + cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); + cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); + + for (i = 0; i < iters * 100; ++i) { + cmph_uint32 pos = random() % iters; + const char* buf = (const char*)(g_numbers + pos); + cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32)); + ++count[pos]; + ++hash_count[h]; + } + + // Verify correctness later. + lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count); + lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count); +} +} + int main(int argc, char** argv) { g_numbers_len = 1000 * 1000; g_numbers = random_numbers_vector_new(g_numbers_len);