Compiles, still need to fix size tracking.

This commit is contained in:
Davi Reis 2012-03-11 23:21:18 -03:00
parent c057fb882b
commit 238e384367
9 changed files with 167 additions and 60 deletions

View File

@ -1,12 +1,12 @@
TESTS = $(check_PROGRAMS) TESTS = $(check_PROGRAMS)
check_PROGRAMS = mph_map_test mph_index_test trigraph_test check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test
noinst_PROGRAMS = bm_index bm_map noinst_PROGRAMS = bm_index bm_map
bin_PROGRAMS = cxxmph bin_PROGRAMS = cxxmph
lib_LTLIBRARIES = libcxxmph.la lib_LTLIBRARIES = libcxxmph.la
libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc
libcxxmph_la_LDFLAGS = -version-info 0:0:0 libcxxmph_la_LDFLAGS = -version-info 0:0:0
cxxmph_includedir = $(includedir)/cxxmph/ cxxmph_includedir = $(includedir)/cxxmph/
cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h
mph_map_test_LDADD = libcxxmph.la mph_map_test_LDADD = libcxxmph.la
mph_map_test_SOURCES = mph_map_test.cc mph_map_test_SOURCES = mph_map_test.cc
@ -25,3 +25,6 @@ bm_map_SOURCES = bm_common.cc bm_map.cc
cxxmph_LDADD = libcxxmph.la cxxmph_LDADD = libcxxmph.la
cxxmph_SOURCES = cxxmph.cc cxxmph_SOURCES = cxxmph.cc
hollow_iterator_test_SOURCES = hollow_iterator_test.cc

View File

@ -21,7 +21,7 @@ class BM_MPHIndexCreate : public UrlsBenchmark {
protected: protected:
virtual void Run() { virtual void Run() {
SimpleMPHIndex<StringPiece> index; SimpleMPHIndex<StringPiece> index;
index.Reset(urls_.begin(), urls_.end()); index.Reset(urls_.begin(), urls_.end(), urls_.size());
} }
}; };
@ -53,7 +53,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark {
protected: protected:
virtual bool SetUp () { virtual bool SetUp () {
if (!SearchUrlsBenchmark::SetUp()) return false; if (!SearchUrlsBenchmark::SetUp()) return false;
index_.Reset(urls_.begin(), urls_.end()); index_.Reset(urls_.begin(), urls_.end(), urls_.size());
return true; return true;
} }
SimpleMPHIndex<StringPiece> index_; SimpleMPHIndex<StringPiece> index_;

View File

@ -13,7 +13,8 @@ namespace cxxmph {
template<class MapType, class T> template<class MapType, class T>
const T* myfind(const MapType& mymap, const T& k) { const T* myfind(const MapType& mymap, const T& k) {
auto it = mymap.find(k); auto it = mymap.find(k);
if (it == mymap.end()) return NULL; auto end = mymap.end();
if (it == end) return NULL;
return &it->second; return &it->second;
} }

View File

@ -63,8 +63,8 @@ int main(int argc, char** argv) {
for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i]; for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i];
mph_map<string, string>::const_iterator it = table.begin(); mph_map<string, string>::const_iterator it = table.begin();
mph_map<string, string>::const_iterator end = table.end(); mph_map<string, string>::const_iterator end = table.end();
for (; it != end; ++it) { for (int i = 0; it != end; ++it, ++i) {
cout << (it - table.begin()) << ": " << it->first cout << i << ": " << it->first
<<" -> " << it->second << endl; <<" -> " << it->second << endl;
} }
} }

69
cxxmph/hollow_iterator.h Normal file
View File

@ -0,0 +1,69 @@
#ifndef __CXXMPH_HOLLOW_ITERATOR_H__
#define __CXXMPH_HOLLOW_ITERATOR_H__
#include <vector>
namespace cxxmph {
// Forward iterator adaptor over a container in which only some slots hold
// live elements. A parallel "presence" sequence, indexed by slot position,
// says which slots are live; construction and increment both skip forward
// past slots whose presence entry is false.
//
// container_type: the underlying container (may be const-qualified).
// presence_type:  indexable by slot position; each entry is tested as a bool.
// iterator_type:  iterator into container_type. It must support subtraction
//                 against c_->begin(), since that is how a slot position is
//                 derived in find_present().
//
// The iterator stores raw pointers to the container and the presence
// sequence; it does not own either of them.
template <typename container_type, typename presence_type, typename iterator_type>
struct hollow_iterator_base
    : public std::iterator<std::forward_iterator_tag,
                           typename container_type::value_type> {
  typedef presence_type presence;
  typedef container_type container;
  typedef iterator_type iterator;
  typedef hollow_iterator_base<container, presence, iterator>& self_reference;
  typedef typename iterator::reference reference;
  typedef typename iterator::pointer pointer;

  // Wraps `it` and immediately advances it to the first live slot at or
  // after its position (or to c->end() when there is none).
  hollow_iterator_base(container* c, presence* p, iterator it)
      : c_(c), p_(p), it_(it) { find_present(); }

  // Pre-increment: steps to the next live slot and returns this iterator.
  // The original body had no return statement, which is undefined behaviour
  // for a function with a non-void return type.
  self_reference operator++() {
    ++it_;
    find_present();
    return *this;
  }

  // Observers are const so that const iterators can be dereferenced.
  reference operator*() const { return *it_; }
  pointer operator->() const { return &(*it_); }

  // Comparison only looks at the wrapped position, so a mutable and a const
  // hollow iterator over the same container compare as expected.
  // TODO find syntax to make this less permissible at compile time
  template <class T>
  bool operator==(const T& rhs) const { return rhs.it_ == this->it_; }
  template <class T>
  bool operator!=(const T& rhs) const { return rhs.it_ != this->it_; }

 public:  // TODO find syntax to make this friend of const iterator
  // Advances it_ until it points at a live slot or reaches c_->end().
  void find_present() {
    while (it_ != c_->end() && !((*p_)[it_ - c_->begin()])) ++it_;
  }

  container* c_;  // underlying container (not owned)
  presence* p_;   // per-slot liveness flags (not owned)
  iterator it_;   // current position inside *c_
};
// Mutable flavour of the hollow iterator: walks `container_type` through its
// non-const iterator, using a std::vector<bool> as the per-slot presence map.
template <typename container_type>
struct hollow_iterator
    : public hollow_iterator_base<container_type,
                                  std::vector<bool>,
                                  typename container_type::iterator> {
  using parent_class = hollow_iterator_base<container_type,
                                            std::vector<bool>,
                                            typename container_type::iterator>;

  // Forwards the container, the presence map and the starting position
  // straight to the base class constructor.
  hollow_iterator(typename parent_class::container* container_ptr,
                  typename parent_class::presence* presence_ptr,
                  typename parent_class::iterator position)
      : parent_class(container_ptr, presence_ptr, position) {}
};
// Read-only flavour of the hollow iterator: walks a const `container_type`
// through its const_iterator, using a const std::vector<bool> presence map.
template <typename container_type>
struct hollow_const_iterator
    : public hollow_iterator_base<const container_type,
                                  const std::vector<bool>,
                                  typename container_type::const_iterator> {
  using parent_class =
      hollow_iterator_base<const container_type,
                           const std::vector<bool>,
                           typename container_type::const_iterator>;
  using self_type = hollow_const_iterator<container_type>;
  using non_const_type = hollow_iterator<container_type>;

  // Implicit conversion from the mutable iterator: reuses its container and
  // presence pointers and converts its position to a const_iterator.
  hollow_const_iterator(non_const_type other)
      : parent_class(other.c_,
                     other.p_,
                     typename container_type::const_iterator(other.it_)) {}

  // Forwards the container, the presence map and the starting position
  // straight to the base class constructor.
  hollow_const_iterator(const typename parent_class::container* container_ptr,
                        const typename parent_class::presence* presence_ptr,
                        typename parent_class::iterator position)
      : parent_class(container_ptr, presence_ptr, position) {}
};
} // namespace cxxmph
#endif // __CXXMPH_HOLLOW_ITERATOR_H__

View File

@ -0,0 +1,35 @@
#include <cstdlib>
#include <cstdio>
#include <vector>
#include "hollow_iterator.h"
using std::vector;
using cxxmph::hollow_iterator;
using cxxmph::hollow_const_iterator;
// Exercises hollow_iterator and hollow_const_iterator over a vector of
// 0..99 in which only the even positions are marked present.
// Exits with -1 on the first failed expectation, returns 0 otherwise.
int main(int argc, char** argv) {
  const int kNumValues = 100;
  // Every second slot is present, so a full traversal must see half of them.
  const int kExpectedVisited = kNumValues / 2;

  vector<int> v;
  vector<bool> p;
  for (int i = 0; i < kNumValues; ++i) {
    v.push_back(i);
    p.push_back(i % 2 == 0);
  }

  // Mutable iteration: only even values may show up, and all of them must.
  // Counting guards against the loop passing vacuously when the iterator
  // yields nothing or stops early.
  auto begin = hollow_iterator<vector<int>>(&v, &p, v.begin());
  auto end = hollow_iterator<vector<int>>(&v, &p, v.end());
  int visited = 0;
  for (auto it = begin; it != end; ++it) {
    if (((*it) % 2) != 0) exit(-1);
    ++visited;
  }
  if (visited != kExpectedVisited) exit(-1);

  // Same traversal through const iterators converted from the mutable ones.
  hollow_const_iterator<vector<int>> const_begin(begin);
  hollow_const_iterator<vector<int>> const_end(end);
  int const_visited = 0;
  for (auto it = const_begin; it != const_end; ++it) {
    if (((*it) % 2) != 0) exit(-1);
    ++const_visited;
  }
  if (const_visited != kExpectedVisited) exit(-1);

  // Sanity check: plain vector iterators compare across constness...
  vector<int>::iterator vit1 = v.begin();
  vector<int>::const_iterator vit2 = v.begin();
  if (vit1 != vit2) exit(-1);

  // ...and so must the hollow wrappers built on top of them.
  auto it1 = hollow_iterator<vector<int>>(&v, &p, v.begin());
  auto it2 = hollow_const_iterator<vector<int>>(&v, &p, v.begin());
  if (it1 != it2) exit(-1);

  return 0;
}

View File

@ -48,7 +48,7 @@ class MPHIndex {
~MPHIndex(); ~MPHIndex();
template <class SeededHashFcn, class ForwardIterator> template <class SeededHashFcn, class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end); bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size);
template <class SeededHashFcn, class Key> // must agree with Reset template <class SeededHashFcn, class Key> // must agree with Reset
// Get a unique identifier for k, in the range [0;size()). If x wasn't part // Get a unique identifier for k, in the range [0;size()). If x wasn't part
// of the input in the last Reset call, returns a random value. // of the input in the last Reset call, returns a random value.
@ -120,12 +120,13 @@ class MPHIndex {
// Template method needs to go in the header file. // Template method needs to go in the header file.
template <class SeededHashFcn, class ForwardIterator> template <class SeededHashFcn, class ForwardIterator>
bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { bool MPHIndex::Reset(
ForwardIterator begin, ForwardIterator end, uint32_t size) {
if (end == begin) { if (end == begin) {
clear(); clear();
return true; return true;
} }
m_ = end - begin; m_ = size;
r_ = static_cast<uint32_t>(ceil((c_*m_)/3)); r_ = static_cast<uint32_t>(ceil((c_*m_)/3));
if ((r_ % 2) == 0) r_ += 1; if ((r_ % 2) == 0) r_ += 1;
n_ = 3*r_; n_ = 3*r_;
@ -204,8 +205,8 @@ template <class Key, class HashFcn = typename seeded_hash<std::hash<Key> >::hash
class SimpleMPHIndex : public MPHIndex { class SimpleMPHIndex : public MPHIndex {
public: public:
template <class ForwardIterator> template <class ForwardIterator>
bool Reset(ForwardIterator begin, ForwardIterator end) { bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size) {
return MPHIndex::Reset<HashFcn>(begin, end); return MPHIndex::Reset<HashFcn>(begin, end, size);
} }
uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); } uint32_t index(const Key& key) const { return MPHIndex::index<HashFcn>(key); }
uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash<HashFcn>(key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash<HashFcn>(key); }

View File

@ -24,7 +24,7 @@ int main(int argc, char** argv) {
keys.push_back("algume"); keys.push_back("algume");
SimpleMPHIndex<string> mph_index; SimpleMPHIndex<string> mph_index;
if (!mph_index.Reset(keys.begin(), keys.end())) { exit(-1); } if (!mph_index.Reset(keys.begin(), keys.end(), keys.size())) { exit(-1); }
vector<int> ids; vector<int> ids;
for (vector<int>::size_type i = 0; i < keys.size(); ++i) { for (vector<int>::size_type i = 0; i < keys.size(); ++i) {
ids.push_back(mph_index.index(keys[i])); ids.push_back(mph_index.index(keys[i]));

View File

@ -14,6 +14,7 @@
#include "MurmurHash2.h" #include "MurmurHash2.h"
#include "mph_index.h" #include "mph_index.h"
#include "hollow_iterator.h"
namespace cxxmph { namespace cxxmph {
@ -42,17 +43,8 @@ class mph_map {
typedef typename std::vector<value_type>::size_type size_type; typedef typename std::vector<value_type>::size_type size_type;
typedef typename std::vector<value_type>::difference_type difference_type; typedef typename std::vector<value_type>::difference_type difference_type;
template <class T, typename iterator> typedef hollow_iterator<std::vector<value_type>> iterator;
struct indirect_iterator : public typename slack_type::iterator { typedef hollow_const_iterator<std::vector<value_type>> const_iterator;
indirect_iterator(T* v, iterator it) : iterator(it), v_(v) { }
const typename iterator::value_type::first_type& operator*() const {
return v->begin() + (this->iterator::operator*())->second;
}
};
typedef indirect_iterator<std::vector<value_type>, slack_type>::iterator iterator;
typedef indirect_iterator<std::vector<value_type>, slack_type>::const_iterator const_iterator;
// For making macros simpler. // For making macros simpler.
typedef void void_type; typedef void void_type;
@ -90,7 +82,7 @@ class mph_map {
template <typename iterator> template <typename iterator>
struct iterator_first : public iterator { struct iterator_first : public iterator {
iterator_first(iterator it) : iterator(it) { } iterator_first(iterator it) : iterator(it) { }
const typename iterator::value_type::first_type& operator*() const { const typename iterator::value_type::first_type& operator*() {
return this->iterator::operator*().first; return this->iterator::operator*().first;
} }
}; };
@ -100,25 +92,29 @@ class mph_map {
return iterator_first<iterator>(it); return iterator_first<iterator>(it);
} }
template <class T, typename iterator> iterator make_iterator(typename std::vector<value_type>::iterator it) {
indirect_iterator<iterator> make_indirect_iterator(T* v, iterator it) { return hollow_iterator<std::vector<value_type>>(&values_, &present_, it);
return indirect_iterator<iterator>(v, it); }
const_iterator make_iterator(typename std::vector<value_type>::const_iterator it) const {
return hollow_const_iterator<std::vector<value_type>>(&values_, &present_, it);
} }
void pack(); void pack();
std::vector<value_type> values_; std::vector<value_type> values_;
std::vector<bool> present_;
SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_; SimpleMPHIndex<Key, typename seeded_hash<HashFcn>::hash_function> index_;
// TODO(davi) optimize slack to no hold a copy of the key // TODO(davi) optimize slack to no hold a copy of the key
typedef unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type; typedef unordered_map<Key, uint32_t, HashFcn, EqualKey, Alloc> slack_type;
slack_type slack_; slack_type slack_;
size_type size_;
}; };
MPH_MAP_TMPL_SPEC MPH_MAP_TMPL_SPEC
bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) {
return lhs.values_ == rhs.values_; return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin());
} }
MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() { MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) {
pack(); pack();
} }
@ -126,13 +122,15 @@ MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() {
} }
MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
iterator it = find(x.first); auto it = find(x.first);
if (it != end()) return make_pair(it, false); auto it_end = end();
should_pack = false; if (it != it_end) return make_pair(it, false);
bool should_pack = false;
if (values_.capacity() == values_.size() && values_.size() > 256) { if (values_.capacity() == values_.size() && values_.size() > 256) {
should_pack = true; should_pack = true;
} }
values_.push_back(x); values_.push_back(x);
present_.push_back(true);
slack_.insert(make_pair(x.first, values_.size() - 1)); slack_.insert(make_pair(x.first, values_.size() - 1));
if (should_pack) pack(); if (should_pack) pack();
it = find(x.first); it = find(x.first);
@ -142,43 +140,39 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) {
MPH_MAP_METHOD_DECL(void_type, pack)() { MPH_MAP_METHOD_DECL(void_type, pack)() {
if (values_.empty()) return; if (values_.empty()) return;
bool success = index_.Reset( bool success = index_.Reset(
make_iterator_first(slack_.begin())), make_iterator_first(begin()),
make_iterator_first(slack_.end()))); make_iterator_first(end()), size_);
assert(success); assert(success);
std::vector<value_type> new_values(index_.size()); std::vector<value_type> new_values(index_.size());
for (const_iterator it = values_.begin(), end = values_.end(); std::vector<bool> new_present(index_.size(), false);
it != end; ++it) { for (iterator it(begin()), it_end(end()); it != it_end; ++it) {
size_type id = index_.index((*it)->first); size_type id = index_.index(it->first);
assert(id < new_values.size()); assert(id < new_values.size());
new_values[id] = *it; new_values[id] = *it;
new_present[id] = true;
} }
values_.swap(new_values); values_.swap(new_values);
std::vector<size_type> new_values_pointer( present_.swap(new_present);
index_.perfect_hash_size());; slack_type().swap(slack_);
for (size_type i = 0; i < values_.size(); ++i) {
size_type id = index_.perfect_hash(values_[i].first);
assert(id < new_values_pointer.size());
new_values_pointer[id] = i;
}
values_pointer_.swap(new_values_pointer);
} }
MPH_MAP_METHOD_DECL(iterator, begin)() { return values_.begin(); } MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); }
MPH_MAP_METHOD_DECL(iterator, end)() { return values_.end(); } MPH_MAP_METHOD_DECL(iterator, end)() { return make_iterator(values_.end()); }
MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return make_iterator(values_.begin()); }
MPH_MAP_METHOD_DECL(const_iterator, end)() const { return values_.end(); } MPH_MAP_METHOD_DECL(const_iterator, end)() const { return make_iterator(values_.end()); }
MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } MPH_MAP_METHOD_DECL(bool_type, empty)() const { return size_ == 0; }
MPH_MAP_METHOD_DECL(size_type, size)() const { return values_.size(); } MPH_MAP_METHOD_DECL(size_type, size)() const { return size_; }
MPH_MAP_METHOD_DECL(void_type, clear)() { MPH_MAP_METHOD_DECL(void_type, clear)() {
values_.clear(); values_.clear();
present_.clear();
slack_.clear(); slack_.clear();
index_.clear(); index_.clear();
} }
MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) {
values_.erase(pos); present_[pos - begin] = false;
pack(); *pos = value_type();
} }
MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) {
iterator it = find(k); iterator it = find(k);
@ -188,22 +182,26 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) {
MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const {
if (__builtin_expect(!slack_.empty(), 0)) { if (__builtin_expect(!slack_.empty(), 0)) {
typename slack_type::const_iterator it = slack_.find(k); auto it = slack_.find(k);
if (it != slack_.end()) return values_.begin() + it->second; if (it != slack_.end()) return make_iterator(values_.begin() + it->second);
} }
if (__builtin_expect(index_.size() == 0, 0)) return end(); if (__builtin_expect(index_.size() == 0, 0)) return end();
const_iterator it = values_.begin() + values_pointer_[index_.perfect_hash(k)]; auto id = index_.perfect_hash(k);
if (!present_[id]) return end();
auto it = make_iterator(values_.begin() + id);
if (__builtin_expect(equal_(k, it->first), 1)) return it; if (__builtin_expect(equal_(k, it->first), 1)) return it;
return end(); return end();
} }
MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) {
if (__builtin_expect(!slack_.empty(), 0)) { if (__builtin_expect(!slack_.empty(), 0)) {
typename slack_type::const_iterator it = slack_.find(k); auto it = slack_.find(k);
if (it != slack_.end()) return values_.begin() + it->second; if (it != slack_.end()) return make_iterator(values_.begin() + it->second);
} }
if (__builtin_expect(index_.size() == 0, 0)) return end(); if (__builtin_expect(index_.size() == 0, 0)) return end();
iterator it = values_.begin() + values_pointer_[index_.perfect_hash(k)]; auto id = index_.perfect_hash(k);
if (!present_[id]) return end();
auto it = make_iterator(values_.begin() + id);
if (__builtin_expect(equal_(k, it->first), 1)) return it; if (__builtin_expect(equal_(k, it->first), 1)) return it;
return end(); return end();
} }