commit 70dca0a0c6fd27bc39ac3a37edd2a6908bc0198f (tree)
parent f281b928d995ff68f4115fb5a9b7aa10f8c60322
Author: Andrew Kelley <andrew@ziglang.org>
Date: Fri, 3 Jul 2020 17:11:54 +0000
Merge pull request #5779 from ziglang/stage1-hash-map
stage1 HashMap: store hash & do robin hood hashing
Diffstat:
2 files changed, 148 insertions(+), 36 deletions(-)
diff --git a/src/hash_map.hpp b/src/hash_map.hpp
@@ -25,6 +25,8 @@ public:
}
struct Entry {
+ uint32_t hash; // HashFunction(key), cached so lookups can compare hashes before calling EqualFn
+ uint32_t distance_from_start_index; // how far this entry's index slot is from hash_to_index(hash); entries appended before an index table exists store 0
K key;
V value;
};
@@ -43,6 +45,26 @@ public:
void put(const K &key, const V &value) {
_modification_count += 1;
+ // This allows us to take a pointer to an entry in `internal_put` which
+ // will not become a dead pointer when the array list is appended to.
+ _entries.ensure_capacity(_entries.length + 1);
+
+ if (_index_bytes == nullptr) {
+ if (_entries.length < 16) {
+ _entries.append({HashFunction(key), 0, key, value});
+ return;
+ } else {
+ _indexes_len = 32;
+ _index_bytes = heap::c_allocator.allocate<uint8_t>(_indexes_len);
+ _max_distance_from_start_index = 0;
+ for (size_t i = 0; i < _entries.length; i += 1) {
+ Entry *entry = &_entries.items[i];
+ put_index(entry, i, _index_bytes);
+ }
+ return internal_put(key, value, _index_bytes);
+ }
+ }
+
// if we would get too full (60%), double the indexes size
if ((_entries.length + 1) * 5 >= _indexes_len * 3) {
heap::c_allocator.deallocate(_index_bytes,
@@ -56,22 +78,21 @@ public:
Entry *entry = &_entries.items[i];
switch (sz) {
case 1:
- put_index(key_to_index(entry->key), i, (uint8_t*)_index_bytes);
+ put_index(entry, i, (uint8_t*)_index_bytes);
continue;
case 2:
- put_index(key_to_index(entry->key), i, (uint16_t*)_index_bytes);
+ put_index(entry, i, (uint16_t*)_index_bytes);
continue;
case 4:
- put_index(key_to_index(entry->key), i, (uint32_t*)_index_bytes);
+ put_index(entry, i, (uint32_t*)_index_bytes);
continue;
default:
- put_index(key_to_index(entry->key), i, (size_t*)_index_bytes);
+ put_index(entry, i, (size_t*)_index_bytes);
continue;
}
}
}
-
switch (capacity_index_size(_indexes_len)) {
case 1: return internal_put(key, value, (uint8_t*)_index_bytes);
case 2: return internal_put(key, value, (uint16_t*)_index_bytes);
@@ -109,6 +130,16 @@ public:
bool maybe_remove(const K &key) {
_modification_count += 1;
+ if (_index_bytes == nullptr) {
+ uint32_t hash = HashFunction(key);
+ for (size_t i = 0; i < _entries.length; i += 1) {
+ if (_entries.items[i].hash == hash && EqualFn(_entries.items[i].key, key)) {
+ _entries.swap_remove(i);
+ return true;
+ }
+ }
+ return false;
+ }
switch (capacity_index_size(_indexes_len)) {
case 1: return internal_remove(key, (uint8_t*)_index_bytes);
case 2: return internal_remove(key, (uint16_t*)_index_bytes);
@@ -165,11 +196,16 @@ private:
void init_capacity(size_t capacity) {
_entries = {};
_entries.ensure_capacity(capacity);
- // So that at capacity it will only be 60% full.
- _indexes_len = capacity * 5 / 3;
- size_t sz = capacity_index_size(_indexes_len);
- // This zero initializes _index_bytes which sets them all to empty.
- _index_bytes = heap::c_allocator.allocate<uint8_t>(_indexes_len * sz);
+ _indexes_len = 0;
+ if (capacity >= 16) {
+ // So that at capacity it will only be 60% full.
+ _indexes_len = capacity * 5 / 3;
+ size_t sz = capacity_index_size(_indexes_len);
+ // This zero initializes _index_bytes which sets them all to empty.
+ _index_bytes = heap::c_allocator.allocate<uint8_t>(_indexes_len * sz);
+ } else {
+ _index_bytes = nullptr;
+ }
_max_distance_from_start_index = 0;
_modification_count = 0;
@@ -187,47 +223,113 @@ private:
template <typename I>
void internal_put(const K &key, const V &value, I *indexes) { // inserts key/value, or overwrites the entry if key is already present
- size_t start_index = key_to_index(key);
- for (size_t roll_over = 0, distance_from_start_index = 0;
- roll_over < _indexes_len; roll_over += 1, distance_from_start_index += 1)
+ uint32_t hash = HashFunction(key); // hashed once; stored in the entry and compared before EqualFn
+ uint32_t distance_from_start_index = 0;
+ size_t start_index = hash_to_index(hash);
+ for (size_t roll_over = 0; roll_over < _indexes_len;
+ roll_over += 1, distance_from_start_index += 1)
{
size_t index_index = (start_index + roll_over) % _indexes_len;
I index_data = indexes[index_index];
if (index_data == 0) { // 0 means empty slot; occupied slots hold entry index + 1
- _entries.append({key, value});
+ _entries.append_assuming_capacity({ hash, distance_from_start_index, key, value });
indexes[index_index] = _entries.length; // length after the append, i.e. the new entry's index + 1
if (distance_from_start_index > _max_distance_from_start_index)
_max_distance_from_start_index = distance_from_start_index;
return;
}
+ // This pointer stays valid across the append below because put()
+ // calls _entries.ensure_capacity before calling internal_put.
Entry *entry = &_entries.items[index_data - 1];
- if (EqualFn(entry->key, key)) {
- *entry = {key, value};
+ if (entry->hash == hash && EqualFn(entry->key, key)) {
+ *entry = {hash, distance_from_start_index, key, value}; // key already present: overwrite the entry in place; the index table is untouched
if (distance_from_start_index > _max_distance_from_start_index)
_max_distance_from_start_index = distance_from_start_index;
return;
}
+ if (entry->distance_from_start_index < distance_from_start_index) {
+ // The resident entry is closer to its start index than the new key
+ // would be. We did not find the item, so we will put a new entry.
+ // The new entry takes this slot and the resident's index is moved
+ // down the line, to keep _max_distance_from_start_index small.
+ _entries.append_assuming_capacity({ hash, distance_from_start_index, key, value });
+ indexes[index_index] = _entries.length;
+ if (distance_from_start_index > _max_distance_from_start_index)
+ _max_distance_from_start_index = distance_from_start_index;
+
+ distance_from_start_index = entry->distance_from_start_index; // from here on this tracks the displaced entry, not the new key
+
+ // Find a new slot for the index we displaced, moving it (and any
+ // index it displaces in turn) further along the probe sequence.
+ roll_over += 1;
+ distance_from_start_index += 1;
+ for (; roll_over < _indexes_len; roll_over += 1, distance_from_start_index += 1) {
+ size_t index_index = (start_index + roll_over) % _indexes_len; // shadows the outer index_index
+ I next_index_data = indexes[index_index];
+ if (next_index_data == 0) {
+ if (distance_from_start_index > _max_distance_from_start_index)
+ _max_distance_from_start_index = distance_from_start_index;
+ entry->distance_from_start_index = distance_from_start_index;
+ indexes[index_index] = index_data;
+ return;
+ }
+ Entry *next_entry = &_entries.items[next_index_data - 1];
+ if (next_entry->distance_from_start_index < distance_from_start_index) { // swap: drop the carried index here and carry next_entry's index onward
+ if (distance_from_start_index > _max_distance_from_start_index)
+ _max_distance_from_start_index = distance_from_start_index;
+ entry->distance_from_start_index = distance_from_start_index;
+ indexes[index_index] = index_data;
+ distance_from_start_index = next_entry->distance_from_start_index;
+ entry = next_entry;
+ index_data = next_index_data;
+ }
+ }
+ zig_unreachable(); // reached only if the loop above never hit an empty slot
+ }
}
zig_unreachable();
}
template <typename I>
- void put_index(size_t start_index, size_t entry_index, I *indexes) {
+ void put_index(Entry *entry, size_t entry_index, I *indexes) { // records an already-stored entry in the index table, using its cached hash
+ size_t start_index = hash_to_index(entry->hash);
+ size_t index_data = entry_index + 1; // slots store entry index + 1 so that 0 can mean empty
for (size_t roll_over = 0, distance_from_start_index = 0;
roll_over < _indexes_len; roll_over += 1, distance_from_start_index += 1)
{
size_t index_index = (start_index + roll_over) % _indexes_len;
- if (indexes[index_index] == 0) {
- indexes[index_index] = entry_index + 1;
+ size_t next_index_data = indexes[index_index];
+ if (next_index_data == 0) { // empty slot: the carried index lands here
if (distance_from_start_index > _max_distance_from_start_index)
_max_distance_from_start_index = distance_from_start_index;
+ entry->distance_from_start_index = distance_from_start_index;
+ indexes[index_index] = index_data;
return;
}
+ Entry *next_entry = &_entries.items[next_index_data - 1];
+ if (next_entry->distance_from_start_index < distance_from_start_index) { // resident is closer to its start index: take its slot and carry its index onward
+ if (distance_from_start_index > _max_distance_from_start_index)
+ _max_distance_from_start_index = distance_from_start_index;
+ entry->distance_from_start_index = distance_from_start_index;
+ indexes[index_index] = index_data;
+ distance_from_start_index = next_entry->distance_from_start_index; // continue with the displaced entry's distance; the loop increment adds 1
+ entry = next_entry;
+ index_data = next_index_data;
+ }
}
zig_unreachable();
}
Entry *internal_get(const K &key) const {
+ if (_index_bytes == nullptr) {
+ uint32_t hash = HashFunction(key);
+ for (size_t i = 0; i < _entries.length; i += 1) {
+ if (_entries.items[i].hash == hash && EqualFn(_entries.items[i].key, key)) {
+ return &_entries.items[i];
+ }
+ }
+ return nullptr;
+ }
switch (capacity_index_size(_indexes_len)) {
case 1: return internal_get2(key, (uint8_t*)_index_bytes);
case 2: return internal_get2(key, (uint16_t*)_index_bytes);
@@ -238,7 +340,8 @@ private:
template <typename I>
Entry *internal_get2(const K &key, I *indexes) const {
- size_t start_index = key_to_index(key);
+ uint32_t hash = HashFunction(key);
+ size_t start_index = hash_to_index(hash);
for (size_t roll_over = 0; roll_over <= _max_distance_from_start_index; roll_over += 1) {
size_t index_index = (start_index + roll_over) % _indexes_len;
size_t index_data = indexes[index_index];
@@ -246,19 +349,20 @@ private:
return nullptr;
Entry *entry = &_entries.items[index_data - 1];
- if (EqualFn(entry->key, key))
+ if (entry->hash == hash && EqualFn(entry->key, key))
return entry;
}
return nullptr;
}
- size_t key_to_index(const K &key) const {
- return ((size_t)HashFunction(key)) % _indexes_len;
+ size_t hash_to_index(uint32_t hash) const { // maps an already-computed hash to its starting slot in the index table
+ return ((size_t)hash) % _indexes_len; // init_capacity sets _indexes_len to 0 when no index table is allocated, so only call this once the table exists
}
template <typename I>
bool internal_remove(const K &key, I *indexes) {
- size_t start_index = key_to_index(key);
+ uint32_t hash = HashFunction(key);
+ size_t start_index = hash_to_index(hash);
for (size_t roll_over = 0; roll_over <= _max_distance_from_start_index; roll_over += 1) {
size_t index_index = (start_index + roll_over) % _indexes_len;
size_t index_data = indexes[index_index];
@@ -267,10 +371,10 @@ private:
size_t index = index_data - 1;
Entry *entry = &_entries.items[index];
- if (!EqualFn(entry->key, key))
+ if (entry->hash != hash || !EqualFn(entry->key, key))
continue;
- indexes[index_index] = 0;
+ size_t prev_index = index_index;
_entries.swap_remove(index);
if (_entries.length > 0 && _entries.length != index) {
// Because of the swap remove, now we need to update the index that was
@@ -280,24 +384,29 @@ private:
// Now we have to shift over the following indexes.
roll_over += 1;
- for (; roll_over <= _max_distance_from_start_index; roll_over += 1) {
+ for (; roll_over < _indexes_len; roll_over += 1) {
size_t next_index = (start_index + roll_over) % _indexes_len;
- if (indexes[next_index] == 0)
- break;
- size_t next_start_index = key_to_index(_entries.items[indexes[next_index]].key);
- if (next_start_index != start_index)
- break;
- indexes[next_index - 1] = indexes[next_index];
+ if (indexes[next_index] == 0) {
+ indexes[prev_index] = 0;
+ return true;
+ }
+ Entry *next_entry = &_entries.items[indexes[next_index] - 1];
+ if (next_entry->distance_from_start_index == 0) {
+ indexes[prev_index] = 0;
+ return true;
+ }
+ indexes[prev_index] = indexes[next_index];
+ prev_index = next_index;
+ next_entry->distance_from_start_index -= 1;
}
-
- return true;
+ zig_unreachable();
}
return false;
}
template <typename I>
void update_entry_index(size_t old_entry_index, size_t new_entry_index, I *indexes) {
- size_t start_index = key_to_index(_entries.items[new_entry_index].key);
+ size_t start_index = hash_to_index(_entries.items[new_entry_index].hash);
for (size_t roll_over = 0; roll_over <= _max_distance_from_start_index; roll_over += 1) {
size_t index_index = (start_index + roll_over) % _indexes_len;
if (indexes[index_index] == old_entry_index + 1) {
diff --git a/src/list.hpp b/src/list.hpp
@@ -19,6 +19,9 @@ struct ZigList {
ensure_capacity(length + 1);
items[length++] = item;
}
+ void append_assuming_capacity(const T& item) { // unlike append, does not call ensure_capacity: the caller must have reserved room already
+ items[length++] = item;
+ }
// remember that the pointer to this item is invalid after you
// modify the length of the list
const T & at(size_t index) const {