From 238e384367e635d2fdaf61904446348086351d75 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Sun, 11 Mar 2012 23:21:18 -0300 Subject: [PATCH] Compiles, still need to fix size tracking. --- cxxmph/Makefile.am | 7 ++- cxxmph/bm_index.cc | 4 +- cxxmph/bm_map.cc | 3 +- cxxmph/cxxmph.cc | 4 +- cxxmph/hollow_iterator.h | 69 +++++++++++++++++++++++++ cxxmph/hollow_iterator_test.cc | 35 +++++++++++++ cxxmph/mph_index.h | 11 ++-- cxxmph/mph_index_test.cc | 2 +- cxxmph/mph_map.h | 92 +++++++++++++++++----------------- 9 files changed, 167 insertions(+), 60 deletions(-) create mode 100644 cxxmph/hollow_iterator.h create mode 100644 cxxmph/hollow_iterator_test.cc diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 2e57a18..cec2073 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,12 +1,12 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_index_test trigraph_test +check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h mph_map_test_LDADD = libcxxmph.la mph_map_test_SOURCES = mph_map_test.cc @@ -25,3 +25,6 @@ bm_map_SOURCES = bm_common.cc bm_map.cc cxxmph_LDADD = libcxxmph.la cxxmph_SOURCES = cxxmph.cc + +hollow_iterator_test_SOURCES = hollow_iterator_test.cc + diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 924231c..d1cbc00 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -21,7 +21,7 @@ class BM_MPHIndexCreate : public UrlsBenchmark { protected: virtual void Run() { SimpleMPHIndex index; - index.Reset(urls_.begin(), urls_.end()); + index.Reset(urls_.begin(), urls_.end(), urls_.size()); } }; @@ -53,7 +53,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { protected: virtual bool SetUp () { if (!SearchUrlsBenchmark::SetUp()) return false; - index_.Reset(urls_.begin(), urls_.end()); + index_.Reset(urls_.begin(), urls_.end(), urls_.size()); return true; } SimpleMPHIndex index_; diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 5c0f7a4..e381976 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -13,7 +13,8 @@ namespace cxxmph { template const T* myfind(const MapType& mymap, const T& k) { auto it = mymap.find(k); - if (it == mymap.end()) return NULL; + auto end = mymap.end(); + if (it == end) return NULL; return &it->second; } diff --git a/cxxmph/cxxmph.cc b/cxxmph/cxxmph.cc index 68bb23e..e9bffd0 100644 --- a/cxxmph/cxxmph.cc +++ b/cxxmph/cxxmph.cc @@ -63,8 +63,8 @@ int main(int argc, char** argv) { for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i]; mph_map::const_iterator it = table.begin(); mph_map::const_iterator end = table.end(); - for (; it != end; ++it) { - cout << (it - table.begin()) << ": " << it->first + for (int i = 0; it != end; ++it, ++i) { + cout << i << ": " << it->first <<" -> " << it->second << endl; } } diff --git a/cxxmph/hollow_iterator.h b/cxxmph/hollow_iterator.h new file mode 100644 index 0000000..bbb34bf --- /dev/null +++ b/cxxmph/hollow_iterator.h @@ -0,0 +1,69 @@ +#ifndef __CXXMPH_HOLLOW_ITERATOR_H__ +#define __CXXMPH_HOLLOW_ITERATOR_H__ + +#include + +namespace cxxmph { + +template +struct hollow_iterator_base + : public std::iterator { + typedef presence_type presence; + typedef container_type container; + typedef iterator_type iterator; + typedef hollow_iterator_base& self_reference; + typedef typename iterator::reference reference; + typedef typename iterator::pointer pointer; + + hollow_iterator_base(container* c, presence* p, iterator it) + : c_(c), p_(p), it_(it) { find_present(); } + self_reference operator++() { + ++it_; find_present(); + } + reference operator*() { return *it_; } + pointer operator->() { return &(*it_); } + + // TODO find syntax to make this less permissible at compile time + template + bool operator==(const T& rhs) { return rhs.it_ == this->it_; } + template + bool operator!=(const T& rhs) { return rhs.it_ != this->it_; } + + public: // TODO find syntax to make this friend of const iterator + void find_present() { + while (it_ != c_->end() && !((*p_)[it_-c_->begin()])) ++it_; + } + container* c_; + presence* p_; + iterator it_; +}; + +template +struct hollow_iterator : public hollow_iterator_base< + container_type, std::vector, typename container_type::iterator> { + typedef hollow_iterator_base< + container_type, std::vector, typename container_type::iterator> parent_class; + hollow_iterator(typename parent_class::container* c, + typename parent_class::presence* p, + typename parent_class::iterator it) + : parent_class(c, p, it) { } +}; + +template +struct hollow_const_iterator : public hollow_iterator_base< + const container_type, const std::vector, typename container_type::const_iterator> { + typedef hollow_iterator_base< + const container_type, const std::vector, typename container_type::const_iterator> parent_class; + typedef hollow_const_iterator self_type; + typedef hollow_iterator non_const_type; + hollow_const_iterator(non_const_type rhs) : parent_class(rhs.c_, rhs.p_, typename container_type::const_iterator(rhs.it_)) { } + hollow_const_iterator(const typename parent_class::container* c, + const typename parent_class::presence* p, + typename parent_class::iterator it) + : parent_class(c, p, it) { } +}; + +} // namespace cxxmph + +#endif // __CXXMPH_HOLLOW_ITERATOR_H__ diff --git a/cxxmph/hollow_iterator_test.cc b/cxxmph/hollow_iterator_test.cc new file mode 100644 index 0000000..201b748 --- /dev/null +++ b/cxxmph/hollow_iterator_test.cc @@ -0,0 +1,35 @@ +#include +#include +#include + +#include "hollow_iterator.h" + +using std::vector; +using cxxmph::hollow_iterator; +using cxxmph::hollow_const_iterator; + +int main(int argc, char** argv) { + vector v; + vector p; + for (int i = 0; i < 100; ++i) { + v.push_back(i); + p.push_back(i % 2 == 0); + } + auto begin = hollow_iterator>(&v, &p, v.begin()); + auto end = hollow_iterator>(&v, &p, v.end()); + for (auto it = begin; it != end; ++it) { + if (((*it) % 2) != 0) exit(-1); + } + hollow_const_iterator> const_begin(begin); + hollow_const_iterator> const_end(end); + for (auto it = const_begin; it != const_end; ++it) { + if (((*it) % 2) != 0) exit(-1); + } + vector::iterator vit1 = v.begin(); + vector::const_iterator vit2 = v.begin(); + if (vit1 != vit2) exit(-1); + auto it1 = hollow_iterator>(&v, &p, v.begin()); + auto it2 = hollow_const_iterator>(&v, &p, v.begin()); + if (it1 != it2) exit(-1); +} + diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index d2e4a01..ad5bc6e 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -48,7 +48,7 @@ class MPHIndex { ~MPHIndex(); template - bool Reset(ForwardIterator begin, ForwardIterator end); + bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size); template // must agree with Reset // Get a unique identifier for k, in the range [0;size()). If x wasn't part // of the input in the last Reset call, returns a random value. @@ -120,12 +120,13 @@ class MPHIndex { // Template method needs to go in the header file. template -bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { +bool MPHIndex::Reset( + ForwardIterator begin, ForwardIterator end, uint32_t size) { if (end == begin) { clear(); return true; } - m_ = end - begin; + m_ = size; r_ = static_cast(ceil((c_*m_)/3)); if ((r_ % 2) == 0) r_ += 1; n_ = 3*r_; @@ -204,8 +205,8 @@ template >::hash class SimpleMPHIndex : public MPHIndex { public: template - bool Reset(ForwardIterator begin, ForwardIterator end) { - return MPHIndex::Reset(begin, end); + bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size) { + return MPHIndex::Reset(begin, end, size); } uint32_t index(const Key& key) const { return MPHIndex::index(key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash(key); } diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index 7a7d036..70e01bc 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -24,7 +24,7 @@ int main(int argc, char** argv) { keys.push_back("algume"); SimpleMPHIndex mph_index; - if (!mph_index.Reset(keys.begin(), keys.end())) { exit(-1); } + if (!mph_index.Reset(keys.begin(), keys.end(), keys.size())) { exit(-1); } vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { ids.push_back(mph_index.index(keys[i])); diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 6a09d21..7687ba5 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -14,6 +14,7 @@ #include "MurmurHash2.h" #include "mph_index.h" +#include "hollow_iterator.h" namespace cxxmph { @@ -42,17 +43,8 @@ class mph_map { typedef typename std::vector::size_type size_type; typedef typename std::vector::difference_type difference_type; - template - struct indirect_iterator : public typename slack_type::iterator { - indirect_iterator(T* v, iterator it) : iterator(it), v_(v) { } - const typename iterator::value_type::first_type& operator*() const { - return v->begin() + (this->iterator::operator*())->second; - } - }; - - - typedef indirect_iterator, slack_type>::iterator iterator; - typedef indirect_iterator, slack_type>::const_iterator const_iterator; + typedef hollow_iterator> iterator; + typedef hollow_const_iterator> const_iterator; // For making macros simpler. typedef void void_type; @@ -90,7 +82,7 @@ class mph_map { template struct iterator_first : public iterator { iterator_first(iterator it) : iterator(it) { } - const typename iterator::value_type::first_type& operator*() const { + const typename iterator::value_type::first_type& operator*() { return this->iterator::operator*().first; } }; @@ -100,25 +92,29 @@ class mph_map { return iterator_first(it); } - template - indirect_iterator make_indirect_iterator(T* v, iterator it) { - return indirect_iterator(v, it); + iterator make_iterator(typename std::vector::iterator it) { + return hollow_iterator>(&values_, &present_, it); + } + const_iterator make_iterator(typename std::vector::const_iterator it) const { + return hollow_const_iterator>(&values_, &present_, it); } void pack(); std::vector values_; + std::vector present_; SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to no hold a copy of the key typedef unordered_map slack_type; slack_type slack_; + size_type size_; }; MPH_MAP_TMPL_SPEC bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { - return lhs.values_ == rhs.values_; + return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); } -MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() { +MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) { pack(); } @@ -126,13 +122,15 @@ MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { } MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { - iterator it = find(x.first); - if (it != end()) return make_pair(it, false); - should_pack = false; + auto it = find(x.first); + auto it_end = end(); + if (it != it_end) return make_pair(it, false); + bool should_pack = false; if (values_.capacity() == values_.size() && values_.size() > 256) { should_pack = true; } values_.push_back(x); + present_.push_back(true); slack_.insert(make_pair(x.first, values_.size() - 1)); if (should_pack) pack(); it = find(x.first); @@ -142,43 +140,39 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { MPH_MAP_METHOD_DECL(void_type, pack)() { if (values_.empty()) return; bool success = index_.Reset( - make_iterator_first(slack_.begin())), - make_iterator_first(slack_.end()))); + make_iterator_first(begin()), + make_iterator_first(end()), size_); assert(success); std::vector new_values(index_.size()); - for (const_iterator it = values_.begin(), end = values_.end(); - it != end; ++it) { - size_type id = index_.index((*it)->first); + std::vector new_present(index_.size(), false); + for (iterator it(begin()), it_end(end()); it != it_end; ++it) { + size_type id = index_.index(it->first); assert(id < new_values.size()); new_values[id] = *it; + new_present[id] = true; } values_.swap(new_values); - std::vector new_values_pointer( - index_.perfect_hash_size());; - for (size_type i = 0; i < values_.size(); ++i) { - size_type id = index_.perfect_hash(values_[i].first); - assert(id < new_values_pointer.size()); - new_values_pointer[id] = i; - } - values_pointer_.swap(new_values_pointer); + present_.swap(new_present); + slack_type().swap(slack_); } -MPH_MAP_METHOD_DECL(iterator, begin)() { return values_.begin(); } -MPH_MAP_METHOD_DECL(iterator, end)() { return values_.end(); } -MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } -MPH_MAP_METHOD_DECL(const_iterator, end)() const { return values_.end(); } -MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } -MPH_MAP_METHOD_DECL(size_type, size)() const { return values_.size(); } +MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); } +MPH_MAP_METHOD_DECL(iterator, end)() { return make_iterator(values_.end()); } +MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return make_iterator(values_.begin()); } +MPH_MAP_METHOD_DECL(const_iterator, end)() const { return make_iterator(values_.end()); } +MPH_MAP_METHOD_DECL(bool_type, empty)() const { return size_ == 0; } +MPH_MAP_METHOD_DECL(size_type, size)() const { return size_; } MPH_MAP_METHOD_DECL(void_type, clear)() { values_.clear(); + present_.clear(); slack_.clear(); index_.clear(); } MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { - values_.erase(pos); - pack(); + present_[pos - begin] = false; + *pos = value_type(); } MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { iterator it = find(k); @@ -188,22 +182,26 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { if (__builtin_expect(!slack_.empty(), 0)) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; + auto it = slack_.find(k); + if (it != slack_.end()) return make_iterator(values_.begin() + it->second); } if (__builtin_expect(index_.size() == 0, 0)) return end(); - const_iterator it = values_.begin() + values_pointer_[index_.perfect_hash(k)]; + auto id = index_.perfect_hash(k); + if (!present_[id]) return end(); + auto it = make_iterator(values_.begin() + id); if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { if (__builtin_expect(!slack_.empty(), 0)) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; + auto it = slack_.find(k); + if (it != slack_.end()) return make_iterator(values_.begin() + it->second); } if (__builtin_expect(index_.size() == 0, 0)) return end(); - iterator it = values_.begin() + values_pointer_[index_.perfect_hash(k)]; + auto id = index_.perfect_hash(k); + if (!present_[id]) return end(); + auto it = make_iterator(values_.begin() + id); if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); }