diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index 70a5f5e..fc95b21 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -56,7 +56,7 @@ class SearchUint64Benchmark : public Uint64Benchmark { protected: virtual bool SetUp(); const uint32_t nsearches_; - std::vector random_; + std::vector random_; }; } // namespace cxxmph diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 03cb222..f92972b 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -1,15 +1,15 @@ #include #include -#include +#include #include "bm_common.h" -#include "StringPiece.h" +#include "stringpiece.h" #include "mph_index.h" using namespace cxxmph; using std::string; -using std::tr1::unordered_set; +using std::tr1::unordered_map; class BM_MPHIndexCreate : public UrlsBenchmark { public: @@ -28,8 +28,11 @@ class BM_STLIndexCreate : public UrlsBenchmark { : UrlsBenchmark(urls_file) { } protected: virtual void Run() { - unordered_set index; - index.insert(urls_.begin(), urls_.end()); + unordered_map index; + int idx = 0; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + index.insert(make_pair(*it, idx++)); + } } }; @@ -38,10 +41,10 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { BM_MPHIndexSearch(const std::string& urls_file, int nsearches) : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { - while (true) { for (auto it = random_.begin(); it != random_.end(); ++it) { - index_.index(*it); - } + auto idx = index_.index(*it); + // Collision check to be fair with STL + if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; } } protected: @@ -59,23 +62,28 @@ class BM_STLIndexSearch : public SearchUrlsBenchmark { : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { - index_.find(*it); // - index_.begin(); + auto idx = index_.find(*it); } } protected: virtual bool SetUp () { if (!SearchUrlsBenchmark::SetUp()) return false; - std::tr1::unordered_set(urls_.begin(), urls_.end()).swap(index_); + unordered_map index; + int idx = 0; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + index.insert(make_pair(*it, idx++)); + } + index.swap(index_); return true; } - std::tr1::unordered_set index_; + std::tr1::unordered_map index_; }; int main(int argc, char** argv) { Benchmark::Register(new BM_MPHIndexCreate("URLS100k")); Benchmark::Register(new BM_STLIndexCreate("URLS100k")); - Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 1000*1000)); - Benchmark::Register(new BM_STLIndexSearch("URLS100k", 1000*1000)); + Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 100*1000*1000)); + Benchmark::Register(new BM_STLIndexSearch("URLS100k", 100*1000*1000)); Benchmark::RunAll(); return 0; } diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 423e329..12dd2f1 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -10,10 +10,25 @@ using std::tr1::unordered_map; namespace cxxmph { +uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { + return mymap.find(k)->second; +} +uint64_t myfind(const mph_map& mymap, const uint64_t& k) { + return mymap.index(k); +} + +const StringPiece& myfind(const unordered_map& mymap, const StringPiece& k) { + return mymap.find(k)->second; +} +StringPiece myfind(const mph_map& mymap, const StringPiece& k) { + auto it = mymap.find(k); + return it->second; +} + template -class BM_MapCreate : public UrlsBenchmark { +class BM_CreateUrls : public UrlsBenchmark { public: - BM_MapCreate(const string& urls_file) : UrlsBenchmark(urls_file) { } + BM_CreateUrls(const string& urls_file) : UrlsBenchmark(urls_file) { } virtual void Run() { MapType mymap; for (auto it = urls_.begin(); it != urls_.end(); ++it) { @@ -23,13 +38,13 @@ class BM_MapCreate : public UrlsBenchmark { }; template -class BM_MapSearch : public SearchUrlsBenchmark { +class BM_SearchUrls : public SearchUrlsBenchmark { public: - BM_MapSearch(const std::string& urls_file, int nsearches) + BM_SearchUrls(const std::string& urls_file, int nsearches) : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { - mymap_.find(*it); + auto idx = myfind(mymap_, *it); } } protected: @@ -44,14 +59,40 @@ class BM_MapSearch : public SearchUrlsBenchmark { MapType mymap_; }; +template +class BM_SearchUint64 : public SearchUint64Benchmark { + public: + BM_SearchUint64() : SearchUint64Benchmark(1000*1000, 1000*1000) { } + virtual bool SetUp() { + if (!SearchUint64Benchmark::SetUp()) return false; + for (int i = 0; i < values_.size(); ++i) { + mymap_[values_[i]] = values_[i]; + } + mymap_.rehash(mymap_.bucket_count()); + return true; + } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto v = myfind(mymap_, *it); + } + } + MapType mymap_; +}; + } // namespace cxxmph using namespace cxxmph; int main(int argc, char** argv) { - Benchmark::Register(new BM_MapCreate>("URLS100k")); - Benchmark::Register(new BM_MapCreate>("URLS100k")); - Benchmark::Register(new BM_MapSearch>("URLS100k", 1000* 1000)); - Benchmark::Register(new BM_MapSearch>("URLS100k", 1000* 1000)); + /* + Benchmark::Register(new BM_CreateUrls>("URLS100k")); + Benchmark::Register(new BM_CreateUrls>("URLS100k")); + */ + Benchmark::Register(new BM_SearchUrls>("URLS100k", 1000* 1000*100)); + /* + Benchmark::Register(new BM_SearchUrls>("URLS100k", 1000* 1000)); + Benchmark::Register(new BM_SearchUint64>); + Benchmark::Register(new BM_SearchUint64>); + */ Benchmark::RunAll(); } diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index d03dd92..02f7368 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -149,6 +149,7 @@ template uint32_t MPHIndex::index(const Key& key) const { uint32_t h[3]; for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); + assert(r_); h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; h[2] = h[2] % r_ + (r_ << 1); @@ -169,7 +170,7 @@ class SimpleMPHIndex : public MPHIndex { bool Reset(ForwardIterator begin, ForwardIterator end) { return MPHIndex::Reset(begin, end); } - uint32_t index(const Key& key) { return MPHIndex::index(key); } + uint32_t index(const Key& key) const { return MPHIndex::index(key); } }; } // namespace cxxmph diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index 421369c..7a7d036 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -24,7 +24,7 @@ int main(int argc, char** argv) { keys.push_back("algume"); SimpleMPHIndex mph_index; - assert(mph_index.Reset(keys.begin(), keys.end())); + if (!mph_index.Reset(keys.begin(), keys.end())) { exit(-1); } vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { ids.push_back(mph_index.index(keys[i])); @@ -33,7 +33,6 @@ int main(int argc, char** argv) { cerr << endl; sort(ids.begin(), ids.end()); for (vector::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast::value_type>(i)); - char* serialized = new char[mph_index.serialize_bytes_needed()]; mph_index.serialize(serialized); SimpleMPHIndex other_mph_index; diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index d52f617..cd8f684 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -52,11 +52,17 @@ class mph_map { std::pair insert(const value_type& x); iterator find(const key_type& k); const_iterator find(const key_type& k) const; + typedef int32_t my_int32_t; + int32_t index(const key_type& k) const; data_type& operator[](const key_type &k); + const data_type& operator[](const key_type &k) const; size_type bucket_count() const { return size(); } void rehash(size_type nbuckets /*ignored*/) { pack(); } + protected: // mimicking STL implementation + EqualKey equal_; + private: template struct iterator_first : public iterator { @@ -145,30 +151,33 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { } MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { - if (!slack_.empty()) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; - } - if (index_.size() == 0) return end(); - size_type id = index_.index(k); - if (key_equal()(values_[id].first, k)) { - return values_.begin() + id; + if (__builtin_expect(!slack_.empty(), 0)) { + typename slack_type::const_iterator it = slack_.find(k); + if (it != slack_.end()) return values_.begin() + it->second; } + if (__builtin_expect(index_.size() == 0, 0)) return end(); + auto it = values_.begin() + index_.index(k); + if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } + MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { if (!slack_.empty()) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; + typename slack_type::const_iterator it = slack_.find(k); + if (it != slack_.end()) return values_.begin() + it->second; } if (index_.size() == 0) return end(); - size_type id = index_.index(k); - if (key_equal()(values_[id].first, k)) { - return values_.begin() + id; - } + auto it = values_.begin() + index_.index(k); + if (equal_(it->first, k)) return it; return end(); } +MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { + assert(slack_.empty()); + if (index_.size() == 0) return -1; + return index_.index(k); +} + MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(std::make_pair(k, data_type())).first->second; } diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index 99a3ca6..a12d4f8 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -19,10 +19,17 @@ struct seeded_hash_function { } }; +struct seeded_identity_function { + template + uint32_t operator()(const Key& k, uint32_t seed) const { + return k ^ seed; + } +}; + struct Murmur2 { template uint32_t operator()(const Key& k) const { - return MurmurHash2(k, sizeof(Key), 1 /* seed */); + return MurmurHash2(reinterpret_cast(&k), sizeof(Key), 1 /* seed */); } }; struct Murmur2StringPiece {