From b0d23fd9bc916086c5e3b71330e6049dcebd5555 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= <motiejus@jakstys.lt>
Date: Tue, 8 Feb 2022 09:52:47 +0200
Subject: [PATCH] Let it be so.

---
 README.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0746737
--- /dev/null
+++ b/README.md
@@ -0,0 +1,74 @@
+Turbo NSS
+---------
+
+glibc nss library for passwd and group.
+
+Steps
+-----
+
+A known implementation runs id(1) at ~250 rps sequentially. Our goal is 10k
+id(1) invocations per second.
+
+id(1) works as follows:
+- lookup user by name.
+- get all additional gids (an array attached to a member).
+- for each additional gid, return the group name.
+
+Assuming a member is in ~100 groups on average, that's 1M group lookups per
+second. We need to convert gid to a group index quickly.
+
+Data structures
+---------------
+
+Basic data structures that allow efficient storage:
+
+```c
+// reminder:
+typedef uint32_t uid_t;
+typedef uint32_t gid_t;
+
+// 6*32b = 6*4B = 24B/user
+typedef struct {
+  uid_t uid;
+  gid_t gid;
+  uint32_t name_offset; // offset into *usernames
+  uint32_t gecos_offset; // offset into *gecoss
+  uint32_t shell_offset; // offset into *shells
+  uint32_t additional_groups_offset; // offset into additional_groups
+} user;
+
+const char* usernames; // all concatenated usernames, fsst-compressed
+const char* gecoss; // all concatenated gecos, fsst-compressed
+const char* shells; // all concatenated shells, fsst-compressed
+const uint8_t* additional_groups; // all additional_groups, turbo compressed
+
+typedef struct {
+  gid_t gid;
+  uint32_t name_offset; // offset into *groupnames
+  uint32_t members_offset; // offset into members
+} group;
+
+const char* groupnames; // all concatenated group names, fsst-compressed
+const uint8_t* members; // all concatenated members, turbo compressed
+```
+
+"turbo compression" encodes a list of uids/gids with this algorithm:
+1. sort ascending.
+2. extract deltas and subtract 1: `awk '{diff=$0-prev; prev=$0; print
+   diff-1}'`.
+3. varint-encode each delta (a uint32), like protobuf or UTF-8.
+
+With typical group memberships (as of writing) this requires ~1.3-1.5 bytes
+per entry.
+
+Indexes
+-------
+
+The following operations need to be fast, in order of importance:
+
+1. lookup gid -> group (this is on the hot path in id(1)).
+2. lookup uid -> user.
+3. lookup username -> user.
+4. lookup groupname -> group.
+5. (optional) iterate users using a defined order (`getent passwd`).
+6. (optional) iterate groups using a defined order (`getent group`).