#ifndef __CXXMPH_MPH_MAP_H__ #define __CXXMPH_MPH_MAP_H__ // Implementation of the unordered associative mapping interface using a // minimal perfect hash function. // // Since these are header-mostly libraries, make sure you compile your code // with -DNDEBUG and -O3. The code requires a modern C++11 compiler. // // The container comes in 3 flavors, all in the cxxmph namespace and drop-in // replacement for the popular classes with the same names. // * dense_hash_map // -> fast, uses more memory, 2.93 bits per bucket, ~50% occupation // * unordered_map (aliases: hash_map, mph_map) // -> middle ground, uses 2.93 bits per bucket, ~81% occupation // * sparse_hash_map -> slower, uses 3.6 bits per bucket // -> less fast, uses 3.6 bits per bucket, 100% occupation // // Those classes are not necessarily faster than their existing counterparts. // Benchmark your code before using it. The larger the key, the larger the // number of elements inserted, and the bigger the number of failed searches, // the more likely those classes will outperform existing code. // // For large sets of urls (>100k), which are a somewhat expensive to compare, I // found those class to be about 10%-50% faster than unordered_map. #include #include #include #include #include #include #include // for std::pair #include "string_util.h" #include "hollow_iterator.h" #include "mph_bits.h" #include "mph_index.h" #include "seeded_hash.h" namespace cxxmph { using std::pair; using std::make_pair; using std::vector; // Save on repetitive typing. #define MPH_MAP_TMPL_SPEC \ template #define MPH_MAP_CLASS_SPEC mph_map_base #define MPH_MAP_METHOD_DECL(r, m) MPH_MAP_TMPL_SPEC typename MPH_MAP_CLASS_SPEC::r MPH_MAP_CLASS_SPEC::m #define MPH_MAP_INLINE_METHOD_DECL(r, m) MPH_MAP_TMPL_SPEC inline typename MPH_MAP_CLASS_SPEC::r MPH_MAP_CLASS_SPEC::m template , class EqualKey = std::equal_to, class Alloc = std::allocator > class mph_map_base { public: typedef Key key_type; typedef Data data_type; typedef pair value_type; typedef HashFcn hasher; typedef EqualKey key_equal; typedef typename vector::pointer pointer; typedef typename vector::reference reference; typedef typename vector::const_reference const_reference; typedef typename vector::size_type size_type; typedef typename vector::difference_type difference_type; typedef is_empty> is_empty_type; typedef hollow_iterator_base::iterator, is_empty_type> iterator; typedef hollow_iterator_base::const_iterator, is_empty_type> const_iterator; // For making macros simpler. typedef void void_type; typedef bool bool_type; typedef pair insert_return_type; mph_map_base(); ~mph_map_base(); iterator begin(); iterator end(); const_iterator begin() const; const_iterator end() const; size_type size() const; bool empty() const; void clear(); void erase(iterator pos); void erase(const key_type& k); pair insert(const value_type& x); inline iterator find(const key_type& k); inline const_iterator find(const key_type& k) const; typedef int32_t my_int32_t; // help macros inline int32_t index(const key_type& k) const; data_type& operator[](const key_type &k); const data_type& operator[](const key_type &k) const; size_type bucket_count() const { return index_.size() + slack_.bucket_count(); } void rehash(size_type nbuckets /*ignored*/); protected: // mimicking STL implementation EqualKey equal_; private: template struct iterator_first : public iterator { iterator_first(iterator it) : iterator(it) { } const typename iterator::value_type::first_type& operator*() { return this->iterator::operator*().first; } }; template iterator_first make_iterator_first(iterator it) { return iterator_first(it); } void pack(); vector values_; vector present_; FlexibleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to use hash from index rather than calculate its own typedef std::unordered_map slack_type; slack_type slack_; size_type size_; typename seeded_hash::hash_function hasher128_; }; MPH_MAP_TMPL_SPEC bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map_base() : size_(0) { clear(); pack(); } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map_base() { } MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { auto it = find(x.first); auto it_end = end(); if (it != it_end) return make_pair(it, false); bool should_pack = false; if (values_.capacity() == values_.size() && values_.size() > 256) { should_pack = true; } values_.push_back(x); present_.push_back(true); ++size_; h128 h = hasher128_.hash128(x.first, 0); if (slack_.find(h) != slack_.end()) should_pack = true; // unavoidable pack else slack_.insert(std::make_pair(h, values_.size() - 1)); if (should_pack) pack(); it = find(x.first); return make_pair(it, true); } MPH_MAP_METHOD_DECL(void_type, pack)() { // CXXMPH_DEBUGLN("Packing %v values")(values_.size()); if (values_.empty()) return; assert(std::unordered_set(make_iterator_first(begin()), make_iterator_first(end())).size() == size()); bool success = index_.Reset( make_iterator_first(begin()), make_iterator_first(end()), size_); if (!success) { exit(-1); } vector new_values(index_.size()); new_values.reserve(new_values.size() * 2); vector new_present(index_.size(), false); new_present.reserve(new_present.size() * 2); for (iterator it = begin(), it_end = end(); it != it_end; ++it) { size_type id = index_.index(it->first); assert(id < index_.size()); assert(id < new_values.size()); new_values[id] = *it; new_present[id] = true; } // fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); values_.swap(new_values); present_.swap(new_present); slack_type().swap(slack_); } MPH_MAP_METHOD_DECL(iterator, begin)() { return make_hollow(&values_, &present_, values_.begin()); } MPH_MAP_METHOD_DECL(iterator, end)() { return make_solid(&values_, &present_, values_.end()); } MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return make_hollow(&values_, &present_, values_.begin()); } MPH_MAP_METHOD_DECL(const_iterator, end)() const { return make_solid(&values_, &present_, values_.end()); } MPH_MAP_METHOD_DECL(bool_type, empty)() const { return size_ == 0; } MPH_MAP_METHOD_DECL(size_type, size)() const { return size_; } MPH_MAP_METHOD_DECL(void_type, clear)() { values_.clear(); present_.clear(); slack_.clear(); index_.clear(); size_ = 0; } MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { assert(pos.it_ - values_.begin() < present_.size()); assert(present_[pos.it_ - values_.begin()]); present_[pos.it_ - values_.begin()] = false; *pos = value_type(); --size_; } MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { iterator it = find(k); if (it == end()) return; erase(it); } MPH_MAP_INLINE_METHOD_DECL(const_iterator, find)(const key_type& k) const { auto idx = index(k); typename vector::const_iterator vit = values_.begin() + idx; if (idx == -1 || !equal_(vit->first, k)) return end(); return make_solid(&values_, &present_, vit);; } MPH_MAP_INLINE_METHOD_DECL(iterator, find)(const key_type& k) { auto idx = index(k); typename vector::iterator vit = values_.begin() + idx; if (idx == -1 || !equal_(vit->first, k)) return end(); return make_solid(&values_, &present_, vit);; } MPH_MAP_INLINE_METHOD_DECL(my_int32_t, index)(const key_type& k) const { if (__builtin_expect(!slack_.empty(), 0)) { auto sit = slack_.find(hasher128_.hash128(k, 0)); if (sit != slack_.end()) return sit->second; } if (__builtin_expect(index_.size(), 1)) { auto id = index_.index(k); if (__builtin_expect(present_[id], true)) return id; } return -1; } MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(make_pair(k, data_type())).first->second; } MPH_MAP_METHOD_DECL(void_type, rehash)(size_type /*nbuckets*/) { pack(); vector(values_.begin(), values_.end()).swap(values_); vector(present_.begin(), present_.end()).swap(present_); slack_type().swap(slack_); } #define MPH_MAP_PREAMBLE template , class EqualKey = std::equal_to,\ class Alloc = std::allocator > MPH_MAP_PREAMBLE class mph_map : public mph_map_base< false, false, Key, Data, HashFcn, EqualKey, Alloc> {}; MPH_MAP_PREAMBLE class unordered_map : public mph_map_base< false, false, Key, Data, HashFcn, EqualKey, Alloc> {}; MPH_MAP_PREAMBLE class hash_map : public mph_map_base< false, false, Key, Data, HashFcn, EqualKey, Alloc> {}; MPH_MAP_PREAMBLE class dense_hash_map : public mph_map_base< false, true, Key, Data, HashFcn, EqualKey, Alloc> {}; MPH_MAP_PREAMBLE class sparse_hash_map : public mph_map_base< true, false, Key, Data, HashFcn, EqualKey, Alloc> {}; #undef MPH_MAP_TMPL_SPEC #undef MPH_MAP_CLASS_SPEC #undef MPH_MAP_METHOD_DECL #undef MPH_MAP_INLINE_METHOD_DECL #undef MPH_MAP_PREAMBLE } // namespace cxxmph #endif // __CXXMPH_MPH_MAP_H__