Move docs around; finish it

2022-08-21 06:08:21 +03:00
parent 8bfc4a30cd
commit fbd449b21f
4 changed files with 458 additions and 710 deletions
--- a/README.md
+++ b/README.md
@@ -1,442 +1,131 @@
 Turbo NSS
 ---------

-Turbonss is a plugin for GNU Name Service Switch (NSS) functionality of GNU C
-Library (glibc). Turbonss implements lookup for `user` and `passwd` database
-entries (i.e. system users, groups, and group memberships). It's main goal is
-performance, with focus on making [`id(1)`][id] run as fast as possible.
+Turbonss is a plugin for GNU Name Service Switch ([NSS][nsswitch])
+functionality of GNU C Library (glibc). Turbonss implements lookup for `passwd`
+and `group` database entries (i.e. system users, groups, and group
+memberships). Its main goal is to run [`id(1)`][id] as fast as possible.

 Turbonss is optimized for reading. If the data changes in any way, the whole
-file will need to be regenerated (and tooling only supports only full
-generation). It was created, and best suited, for environments that have a
-central user & group database which then needs to be distributed to many
-servers/services, and the data does not change very often (e.g. hourly).
+file will need to be regenerated. Therefore, it was created for, and is best
+suited to, environments that have a central user & group database which then
+needs to be distributed to many servers/services, and where the data does not
+change very often (e.g. hourly).

-To understand more about name service switch, start with
-[`nsswitch.conf(5)`][nsswitch].
+This is the fastest known NSS passwd/group implementation for *reads*. On a
+corpus with 10k users, 10k groups and 500 average members per group, `id` takes
+17 seconds with the glibc default implementation, 10-17 milliseconds with a
+pre-cached `nscd`, and ~8 milliseconds with `turbonss`.

-Design & constraints
--------------------
+Project status
+--------------

-To be fast, the user/group database (later: DB) has to be small
-([background][data-oriented-design]). It encodes user & group information in a
-way that minimizes the DB size, and reduces jumping across the DB ("chasing
-pointers and thrashing CPU cache").
+The project is finished and is not recommended for production; just use nscd.
+Turbonss duly implements the full user/group API in `src/libnss.zig`: feel free
+to copy that.

-To understand how this is done efficiently, let's analyze the
-[`getpwnam_r(3)`][getpwnam_r] in high level. This API call accepts a username
-and returns the following user information:
-
-```
-struct passwd {
-   char   *pw_name;       /* username */
-   char   *pw_passwd;     /* user password */
-   uid_t   pw_uid;        /* user ID */
-   gid_t   pw_gid;        /* group ID */
-   char   *pw_gecos;      /* user information */
-   char   *pw_dir;        /* home directory */
-   char   *pw_shell;      /* shell program */
-};
-```
-
-Turbonss, among others, implements this call, and takes the following steps to
-resolve a username to a `struct passwd*`:
-
- Open the DB (using `mmap`) and interpret it's first 64 bytes as a `*struct
-  Header`. The header stores offsets to the sections of the file. This needs to
-  be done once, when the NSS library is loaded.
- Hash the username using a perfect hash function. Perfect hash function
-  returns a number `n ∈ [0,N-1]`, where N is the total number of users.
- Jump to the `n`'th location in the `idx_name2user` section, which contains
-  the index `i` to the user's information.
- Jump to the location `i` of section `Users`, which stores the full user
-  information.
- Decode the user information (which is all in a continuous memory block) and
-  return it to the caller.
-
-In total, that's one hash for the username (~150ns), two pointer jumps within
-the group file (to sections `idx_name2user` and `Users`), and, now that the
-user record is found, `memcpy` for each field.
-
-The turbonss DB file is be `mmap`-ed, making it simple to jump across the file
-using pointer arithmetic. This also reduces memory usage, as the mmap'ed
-regions are shared. Turbonss reads do not consume any heap space.
-
-Tight packing places some constraints on the underlying data:
-
- Permitted length of username and groupname: 1-32 bytes.
- Permitted length of shell and home: 1-256 bytes.
- Permitted comment ("gecos") length: 0-255 bytes.
- User name, groupname, gecos and shell must be utf8-encoded.
- User and Groups sections are up to 2^35B (~34GB) large. Assuming an "average"
-  user record takes 50 bytes, this section would fit ~660M users. The
-  worst-case upper bound is left as an exercise to the reader.
-
-Sorting is stable. In v0:
- Groups are sorted by gid, ascending.
- Users are sorted by their name, ascending by the unicode codepoints
-  (locale-independent).
-
-Checking out and building
-------------------------
-
-```
-$ git clone --recursive https://git.sr.ht/~motiejus/turbonss
-```
-
-Alternatively, if you forgot `--recursive`:
-
-```
-$ git submodule update --init
-```
-
-And run tests:
-
-```
-$ zig build test
-```
-
-Test the so
-----------
-
-Build:
-
-    zig build -Dtarget=x86_64-linux-gnu.2.31 -Dcpu=x86_64_v3 -Drelease-fast=true -Dstrip=true
-
-Generate `db.turbo`:
-
-    zig-out/bin/turbonss-unix2db --passwd /etc/passwd --group /etc/group
-    zig-out/bin/turbonss-analyze db.turbo
-    <...>
-
-Run a test container:
-
-    $ docker run -ti --rm --privileged -v `pwd`:/etc/turbonss -w /etc/turbonss debian:bullseye
-    # cp zig-out/lib/libnss_turbo.so.2 /lib/x86_64-linux-gnu
-    # sed -i 's/\(\(passwd\|group\).*files\)$/\1 turbo/' /etc/nsswitch.conf
-
-And knock yourself out:
-
-    getent passwd
-    getent group
-    id root
-
-This is probably not very interesting; you may want to take a larger corpus of
-/etc/passwd and /etc/group for more interesting results.
+Yours truly (the author) worked on this for about 7 months. And when this was
+finished it turned out that just slapping nscd on top of the existing NSS
+implementation is almost as fast as this.

 Dependencies
 ------------

-This project uses [git subtrac][git-subtrac] for managing dependencies. They
-work just like regular submodules, except all the refs of the submodules are in
-this repository. Repeat after me: all the submodules are in this repository.
-So if you have a copy of this repo, dependencies will not disappear.
+1. Stage1 of the nightly zig compiler.
+2. [cmph][cmph]: bundled with this repository.

-remarks on `id(1)`
------------------
+Trying it out
+-------------

-A known implementation runs id(1) at ~250 rps sequentially on ~20k users and
-~10k groups. Our rps target is much higher.
+Clone, compile and test first:

-To better reason about the trade-offs, it is useful to understand how `id(1)`
-is implemented, in rough terms:
- lookup user by name ([`getpwent_r(3)`][getpwent]).
- get all gids for the user ([`getgrouplist(3)`][getgrouplist]). Note: it is
-  actually using `initgroups_dyn`, accepts a uid, and is very poorly
-  documented.
- for each additional gid, get the `struct group*`
-  ([`getgrgid_r(3)`][getgrgid_r]).
+    $ git clone --recursive https://git.sr.ht/~motiejus/turbonss
+    $ zig build -fstage1 test
+    $ zig build -fstage1 -Dtarget=x86_64-linux-gnu.2.31 -Dcpu=x86_64_v3 -Drelease-safe=true

-Assuming a member is in ~100 groups on average, to reach 10k id/s translates to
-1M group lookups per second. We need to convert gid to a group index, and group
-index to a group gid/name quickly.
+One may choose different options, depending on requirements. Here are some
+hints:

-Caveat: `struct group` contains an array of pointers to names of group members
-(`char **gr_mem`). However, `id` does not use that information, resulting in
-read amplification, sometimes by 10-100x. Therefore, if `argv[0] == "id"`, our
-implementation of [`getgrid_r(3)`][getgrid] returns the `struct group*` without
-the members. This speeds up `id` by about 10x on a known NSS implementation.
+1. `-Dcpu=<...>` for the CPU
+   [microarchitecture](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels).
+2. `-Drelease-fast=true` for max speed.
+3. `-Drelease-small=true` for smallest binary sizes.
+4. `-Dstrip=true` to strip debug symbols.

-Relatedly, because [`getgrid_r(3)`][getgrid] does not need the group members,
-the group members are stored in a different DB section, reducing the `Groups`
-section and making more of it fit the CPU caches.
+Test it on a real system
+------------------------

-Turbonss header
---------------
+`db.turbo` is the TurboNSS database file. To create one from `/etc/group` and
+`/etc/passwd`, use `turbonss-unix2db`:

-The turbonss header looks like this:
+    $ zig-out/bin/turbonss-unix2db --passwd /etc/passwd --group /etc/group
+    $ zig-out/bin/turbonss-analyze db.turbo
+    <...>

-```
-OFFSET     TYPE     NAME                      DESCRIPTION
-   0      [4]u8     magic                     f0 9f a4 b7
-   4         u8     version                   0
-   5         u8     endian                    0 for little, 1 for big
-   6         u8     nblocks_shell_blob        max value: 63
-   7         u8     num_shells                max value: 63
-   8        u32     num_groups                number of group entries
-  12        u32     num_users                 number of passwd entries
-  16        u32     nblocks_bdz_gid           bdz_gid section block count
-  20        u32     nblocks_bdz_groupname
-  24        u32     nblocks_bdz_uid
-  28        u32     nblocks_bdz_username
-  32        u64     nblocks_groups
-  40        u64     nblocks_users
-  48        u64     nblocks_groupmembers
-  56        u64     nblocks_additional_gids
-  64        u64     getgr_bufsize
-  72        u64     getpw_bufsize
-  80     [48]u8     padding
-```
+Run and configure a test container that uses `turbonss` instead of the default
+`files`:

-`magic` is 0xf09fa4b7, and `version` must be `0`. All integers are
-native-endian. `nblocks_*` is the count of blocks of a particular section; this
-helps calculate the offsets to all sections.
+    $ docker run -ti --rm -v `pwd`:/etc/turbonss -w /etc/turbonss debian:bullseye
+    # cp zig-out/lib/libnss_turbo.so.2 /lib/x86_64-linux-gnu/
+    # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf

-Some numbers, like `nblocks_shell_blob`, `num_shells`, would fit to smaller
-number of bytes. However, interpreting `[2]u6` with `xxd(1)` is harder than
-interpreting `[2]u8`. Therefore we are using the space we have to make these
-integers byte-wide.
+And run the commands:

-`getgr_bufsize` and `getpw_bufsize` is a hint for the caller of `getgr*` and
-`getpw*`-family calls. This is the recommended size of the buffer, so the
-caller does not receive `ENOMEM`.
+    $ getent passwd
+    $ getent group
+    $ id root

-Primitive types
---------------
+More users and groups
+---------------------

-`User` and `Group` entries are sorted by the order they were received in the input
-file. All entries are aligned to 8 bytes. All `User` and `Group` entries are
-referred by their byte offset in the `Users` and `Groups` section relative to
-the beginning of the section.
+`turbonss-makecorpus` can synthesize more `users` and `groups`:

-```
-const PackedGroup = packed struct {
-    gid: u32,
-    padding: u3,
-    groupname_len: u5,
-}
-```
+    # ./zig-out/bin/turbonss-makecorpus 
+    wrote users=10000 groups=10000 avg-members=1000 to .
+    # cat group >> /etc/group
+    # cat passwd >> /etc/passwd
+    # time id u_1000000
+    <...>
+    real    0m17.380s
+    user    0m13.117s
+    sys     0m4.263s

-PackedGroup is followed by the group name (of length `groupname_len`), followed
-by a varint-compressed offset to the groupmembers section, followed by 8b padding.
+17 seconds for an `id` command! Well, there are indeed many users and groups.
+Let's see how turbonss fares with it:

-PackedUser is a bit more involved:
+    # zig-out/bin/turbonss-unix2db --group /etc/group --passwd /etc/passwd
+    total 10968512 bytes. groups=10019 users=10039
+    # ls -hs /etc/group /etc/passwd db.turbo
+    48M /etc/group  668K /etc/passwd   11M db.turbo
+    # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf
+    # time id u_1000000
+    real    0m0.008s
+    user    0m0.000s
+    sys     0m0.008s

-```
-pub const PackedUser = packed struct {
-    uid: u32,
-    gid: u32,
-    shell_len_or_idx: u8,
-    shell_here: bool,
-    name_is_a_suffix: bool,
-    home_len: u6,
-    name_len: u5,
-    gecos_len: u11,
-}
-```
+That's a ~2000x improvement for the `id` command (and notice the roughly 4x
+compression ratio compared to plain files). If the number of users and groups
+is increased by 10x (to 100k each), the difference becomes even crazier:

-... followed by `userdata: []u8`:
- home.
- name (optional).
- gecos.
- shell (optional).
- `additional_gids_offset`: varint.
+    # time id u_1000000
+    <...>
+    real    3m42.281s
+    user    2m30.482s
+    sys     0m55.840s
+    # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf
+    # time id u_1000000
+    <...>
+    real    0m0.008s
+    user    0m0.000s
+    sys     0m0.008s

-First byte of home is stored right after the `gecos_len` field, and its length
-is `home_len`. The same logic applies to all the `stringdata` fields: there is
-a way to calculate their relative position from the length of the fields before
-them.
+Documentation
+-------------

-PackedUser employs two data-oriented compression techniques:
- shells are often shared across different users, see the "Shells" section.
- `name` is frequently a suffix of `home`. For example, `/home/vidmantas` and
-  `vidmantas`. In this case storing both name and home is wasteful. Therefore
-  name has two options:
-  1. `name_is_a_suffix=true`: name is a suffix of the home dir. Then `name`
-  starts at the `home_len - name_len`'th byte of `home`, and ends at the same
-  place as `home`.
-  2. `name_is_a_suffix=false`: name begins one byte after home, and it's length
-  is `name_len`.
-
-The last field `additional_gids_offset: varint` points to the `additional_gids`
-section for this user.
-
-Shells
------
-
-Normally there is a limited number of separate shells even in huge user
-databases. A few examples: `/bin/bash`, `/usr/bin/nologin`, `/bin/zsh` among
-others. Therefore, "shells" have an optimization: they can be pointed by in the
-external list, or, if they are unique to the user, reside among the user's
-data.
-
-255 most popular shells (i.e. referred to by at least two User entries) are
-stored externally in "Shells" area. The less popular ones are stored with
-userdata.
-
-Shells section consists of two sub-sections: the index and the blob. The index
-is an array of offsets: the i'th shell starts at `offsets[i]` byte, and ends at
-`offsets[i+1]` byte. If there is at least one shell in the shell section, the
-index contains a sentinel index as the last element, which signifies the position
-of the last byte of the shell blob.
-
-`shell_here=true` in the User struct means the shell is stored with userdata,
-and it's length is `shell_len_or_idx`. `shell_here=false` means it is stored in
-the `Shells` section, and it's index is `shell_len_or_idx` (and the actual
-string start and end offsets are resolved as described in the paragraph above).
-
-Variable-length integers (varints)
----------------------------------
-
-Varint is an efficiently encoded integer (packed for small values). Same as
-[protocol buffer varints][varint], except the largest possible value is `u64`.
-They compress integers well. Varints are stored for group memberships.
-
-Group memberships
-----------------
-
-There are two group memberships at play:
-
-1. Given a group (gid/name), resolve the members' names (e.g. `getgrgid`).
-2. Given a username, resolve user's group gids (for `initgroups(3)`).
-
-When group's memberships are resolved in (1), the same call also requires other
-group information: gid and group name. Therefore it makes sense to store a
-pointer to the group members in the group information itself. However, the
-memberships are not *always* necessary (see remarks about `id(1)`), therefore
-the memberships will be stored separately, outside of the groups section.
-
-Similarly, when user's groups are resolved in (2), they are not always necessary
-(i.e. not part of `struct user*`), therefore the memberships themselves are
-stored out of bound.
-
-`groupmembers` and `additional_gids` store group and user memberships
-respectively. Membership IDs are packed — not necessitating random access, thus
-suitable for compression.
-
- `groupmembers` consists of a number X followed by a list of offsets to User
-  records, because `getgr*` returns pointers to membernames, thus a name has to
-  be immediately resolvable.
- `additional_gids` is a list of gids, because `initgroups_dyn` (and friends)
-  returns an array of gids.
-
-Each entry of `groupmembers` and `additional_gids` starts with a varint N,
-which is the number of upcoming elements. Then N delta-compressed varints,
-which are:
-
- **additional_gids** a list of gids.
- **groupmembers** byte-offsets to the User records in the `users` section.
-
-Indices
-------
-
-Now that we've sketched the implementation of `id(3)`, it's clearer to
-understand which operations need to be fast; in order of importance:
-
-1. lookup gid -> group info (this is on hot path in id) without members.
-2. lookup username -> user's groups.
-3. lookup uid -> user.
-4. lookup groupname -> group.
-5. lookup username -> user.
-
-These indices can use perfect hashing like [bdz from cmph][cmph]: a perfect
-hash hashes a list of bytes to a sequential list of integers. Perfect hashing
-algorithms require some space, and take some time to calculate ("hashing
-duration"). I've tested BDZ, which hashes `[][]u8` to a sequential list of
-integers (not preserving order) and CHM, preserves order. BDZ accepts an
-optional argument `3 <= b <= 10`.
-
-* BDZ algorithm requires (b=3, 900KB, b=7, 338KB, b=10, 306KB) for 1M values.
-* Latency to resolve 1M keys: (170ms, 180ms, 230ms, respectively).
-* Packed vs non-packed latency differences are not meaningful.
-
-CHM retains order, however, 1M keys weigh 8MB. 10k keys are ~20x larger with
-CHM than with BDZ, eliminating the benefit of preserved ordering: we can just
-have a separate index.
-
-None of the tested perfect hashing algorithms makes the distinction between
-existing (in the initial dictionary) and new keys. In other words, HASH(value)
-will be pointing to a number `n ∈ [0,N-1]`, regardless whether the value was in
-the initial dictionary. Therefore one must always confirm, after calculating
-the hash, that the key matches what's been hashed.
-
-`idx_*` sections are of type `[]u32` and are pointing from `hash(key)` to the
-respective `Groups` and `Users` entries (from the beginning of the respective
-section). Since User and Group records are 8-byte aligned, the actual offset to
-the record is acquired by right-shifting this value by 3 bits.
-
-Database file structure
-----------------------
-
-Each section is padded to 64 bytes.
-
-```
-SECTION               SIZE             DESCRIPTION
-header                128              see "Turbonss header" section
-bdz_gid               ?                bdz(gid)
-bdz_groupname         ?                bdz(groupname)
-bdz_uid               ?                bdz(uid)
-bdz_username          ?                bdz(username)
-idx_gid2group         len(group)*4     bdz->offset Groups
-idx_groupname2group   len(group)*4     bdz->offset Groups
-idx_uid2user          len(user)*4      bdz->offset Users
-idx_name2user         len(user)*4      bdz->offset Users
-shell_index           len(shells)*2    shell index array
-shell_blob            <= 65280         shell data blob (max 255*256 bytes)
-groups                ?                packed Group entries (8b padding)
-users                 ?                packed User entries (8b padding)
-groupmembers          ?                per-group delta varint memberlist (no padding)
-additional_gids       ?                per-user delta varint gidlist (no padding)
-```
-
-Section creation order:
-
-1. ✅ `bdz_*`.
-1. ✅ `shell_index`, `shell_blob`.
-1. ✅ `additional_gids`.
-1. ✅ `users` requires `additional_gids` and shell.
-1. ✅ `groupmembers` requires `users`.
-1. ✅ `groups` requires `groupmembers`.
-1. ✅ `idx_*`. requires offsets to `groups` and `users`.
-1. ✅ Header.
-
-For v2
------
-
-These are desired for the next DB format:
- Compress strings with fsst.
- Trim first 4 bytes from the cmph headers.
-
-Profiling
---------
-
-Prepare `profile.data`:
-
-```
-zig build -Drelease-small=true && \
-    perf record --call-graph=dwarf \
-        zig-out/bin/turbonss-unix2db --passwd passwd2 --group group2
-```
-
-Perf interactive:
-
-```
-perf report -i perf.data
-```
-
-Flame graph:
-
-```
-perf script | inferno-collapse-perf | inferno-flamegraph > profile.svg
-```
+- Architecture is detailed in `docs/architecture.md`.
+- Development notes are in `docs/development.md`.

 [git-subtrac]: https://apenwarr.ca/log/20191109
-[cmph]: http://cmph.sourceforge.net/
-[id]: https://linux.die.net/man/1/id
 [nsswitch]: https://linux.die.net/man/5/nsswitch.conf
-[data-oriented-design]: https://media.handmade-seattle.com/practical-data-oriented-design/
-[getpwnam_r]: https://linux.die.net/man/3/getpwnam_r
-[varint]: https://developers.google.com/protocol-buffers/docs/encoding#varints
-[getpwent]: https://www.man7.org/linux/man-pages/man3/getpwent_r.3.html
-[getgrouplist]: https://www.man7.org/linux/man-pages/man3/getgrouplist.3.html
-[getgrid]: https://www.man7.org/linux/man-pages/man3/getgrid_r.3.html
+[id]: https://linux.die.net/man/1/id
+[cmph]: http://cmph.sourceforge.net/
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -0,0 +1,327 @@
+Design & constraints
+--------------------
+
+To be fast, the user/group database (later: DB) has to be small
+([background][data-oriented-design]). It encodes user & group information in a
+way that minimizes the DB size, and reduces jumping across the DB ("chasing
+pointers and thrashing CPU cache").
+
+To understand how this is done efficiently, let's analyze the
+[`getpwnam_r(3)`][getpwnam_r] in high level. This API call accepts a username
+and returns the following user information:
+
+```
+struct passwd {
+   char   *pw_name;       /* username */
+   char   *pw_passwd;     /* user password */
+   uid_t   pw_uid;        /* user ID */
+   gid_t   pw_gid;        /* group ID */
+   char   *pw_gecos;      /* user information */
+   char   *pw_dir;        /* home directory */
+   char   *pw_shell;      /* shell program */
+};
+```
+
+Turbonss, among others, implements this call, and takes the following steps to
+resolve a username to a `struct passwd*`:
+
+- Open the DB (using `mmap`) and interpret its first 64 bytes as a `*struct
+  Header`. The header stores offsets to the sections of the file. This needs to
+  be done once, when the NSS library is loaded.
+- Hash the username using a perfect hash function. A perfect hash function
+  returns a number `n ∈ [0,N-1]`, where N is the total number of users.
+- Jump to the `n`'th location in the `idx_name2user` section, which contains
+  the index `i` to the user's information.
+- Jump to the location `i` of section `Users`, which stores the full user
+  information.
+- Decode the user information (which is all in a continuous memory block) and
+  return it to the caller.
+
+In total, that's one hash for the username (~150ns), two pointer jumps within
+the DB file (to sections `idx_name2user` and `Users`), and, now that the
+user record is found, `memcpy` for each field.
+
+The turbonss DB file is `mmap`-ed, making it simple to jump across the file
+using pointer arithmetic. This also reduces memory usage, as the mmap'ed
+regions are shared. Turbonss reads do not consume any heap space.
+
+Tight packing places some constraints on the underlying data:
+
+- Permitted length of username and groupname: 1-32 bytes.
+- Permitted length of shell and home: 1-256 bytes.
+- Permitted comment ("gecos") length: 0-255 bytes.
+- User name, groupname, gecos and shell must be utf8-encoded.
+- User and Groups sections are up to 2^35B (~34GB) large. Assuming an "average"
+  user record takes 50 bytes, this section would fit ~660M users. The
+  worst-case upper bound is left as an exercise to the reader.
+
+Sorting is stable. In v0:
+- Groups are sorted by gid, ascending.
+- Users are sorted by their name, ascending by the unicode codepoints
+  (locale-independent).
+
+Remarks on `id(1)`
+------------------
+
+A known implementation runs id(1) at ~250 rps sequentially on ~20k users and
+~10k groups. Our rps target is much higher.
+
+To better reason about the trade-offs, it is useful to understand how `id(1)`
+is implemented, in rough terms:
+- lookup user by name ([`getpwent_r(3)`][getpwent]).
+- get all gids for the user ([`getgrouplist(3)`][getgrouplist]). Note: it is
+  actually using `initgroups_dyn`, accepts a uid, and is very poorly
+  documented.
+- for each additional gid, get the `struct group*`
+  ([`getgrgid_r(3)`][getgrgid_r]).
+
+Assuming a user is in ~100 groups on average, reaching 10k id/s translates to
+1M group lookups per second. We need to convert gid to a group index, and group
+index to a group gid/name quickly.
+
+Caveat: `struct group` contains an array of pointers to names of group members
+(`char **gr_mem`). However, `id` does not use that information, resulting in
+read amplification, sometimes by 10-100x. Therefore, if `argv[0] == "id"`, our
+implementation of [`getgrgid_r(3)`][getgrid] returns the `struct group*` without
+the members. This speeds up `id` by about 10x on a known NSS implementation.
+
+Relatedly, because [`getgrgid_r(3)`][getgrid] does not need the group members,
+the group members are stored in a different DB section, reducing the `Groups`
+section and making more of it fit the CPU caches.
+
+Turbonss header
+---------------
+
+The turbonss header looks like this:
+
+```
+OFFSET     TYPE     NAME                      DESCRIPTION
+   0      [4]u8     magic                     f0 9f a4 b7
+   4         u8     version                   0
+   5         u8     endian                    0 for little, 1 for big
+   6         u8     nblocks_shell_blob        max value: 63
+   7         u8     num_shells                max value: 63
+   8        u32     num_groups                number of group entries
+  12        u32     num_users                 number of passwd entries
+  16        u32     nblocks_bdz_gid           bdz_gid section block count
+  20        u32     nblocks_bdz_groupname
+  24        u32     nblocks_bdz_uid
+  28        u32     nblocks_bdz_username
+  32        u64     nblocks_groups
+  40        u64     nblocks_users
+  48        u64     nblocks_groupmembers
+  56        u64     nblocks_additional_gids
+  64        u64     getgr_bufsize
+  72        u64     getpw_bufsize
+  80     [48]u8     padding
+```
+
+`magic` is 0xf09fa4b7, and `version` must be `0`. All integers are
+native-endian. `nblocks_*` is the count of blocks of a particular section; this
+helps calculate the offsets to all sections.
+
+Some numbers, like `nblocks_shell_blob`, `num_shells`, would fit to smaller
+number of bytes. However, interpreting `[2]u6` with `xxd(1)` is harder than
+interpreting `[2]u8`. Therefore we are using the space we have to make these
+integers byte-wide.
+
+`getgr_bufsize` and `getpw_bufsize` are hints for the caller of `getgr*` and
+`getpw*`-family calls. This is the recommended size of the buffer, so the
+caller does not receive `ENOMEM`.
+
+Primitive types
+---------------
+
+`User` and `Group` entries are sorted by the order they were received in the input
+file. All entries are aligned to 8 bytes. All `User` and `Group` entries are
+referred by their byte offset in the `Users` and `Groups` section relative to
+the beginning of the section.
+
+```
+const PackedGroup = packed struct {
+    gid: u32,
+    padding: u3,
+    groupname_len: u5,
+}
+```
+
+PackedGroup is followed by the group name (of length `groupname_len`), followed
+by a varint-compressed offset to the groupmembers section, followed by 8b padding.
+
+PackedUser is a bit more involved:
+
+```
+pub const PackedUser = packed struct {
+    uid: u32,
+    gid: u32,
+    shell_len_or_idx: u8,
+    shell_here: bool,
+    name_is_a_suffix: bool,
+    home_len: u6,
+    name_len: u5,
+    gecos_len: u11,
+}
+```
+
+... followed by `userdata: []u8`:
+- home.
+- name (optional).
+- gecos.
+- shell (optional).
+- `additional_gids_offset`: varint.
+
+First byte of home is stored right after the `gecos_len` field, and its length
+is `home_len`. The same logic applies to all the `userdata` fields: there is
+a way to calculate their relative position from the length of the fields before
+them.
+
+PackedUser employs two data-oriented compression techniques:
+- shells are often shared across different users, see the "Shells" section.
+- `name` is frequently a suffix of `home`. For example, `/home/vidmantas` and
+  `vidmantas`. In this case storing both name and home is wasteful. Therefore
+  name has two options:
+  1. `name_is_a_suffix=true`: name is a suffix of the home dir. Then `name`
+  starts at the `home_len - name_len`'th byte of `home`, and ends at the same
+  place as `home`.
+  2. `name_is_a_suffix=false`: name begins one byte after home, and its length
+  is `name_len`.
+
+The last field `additional_gids_offset: varint` points to the `additional_gids`
+section for this user.
+
+Shells
+------
+
+Normally there is a limited number of separate shells even in huge user
+databases. A few examples: `/bin/bash`, `/usr/bin/nologin`, `/bin/zsh` among
+others. Therefore, "shells" have an optimization: they can be pointed to in the
+external list, or, if they are unique to the user, reside among the user's
+data.
+
+255 most popular shells (i.e. referred to by at least two User entries) are
+stored externally in "Shells" area. The less popular ones are stored with
+userdata.
+
+Shells section consists of two sub-sections: the index and the blob. The index
+is an array of offsets: the i'th shell starts at `offsets[i]` byte, and ends at
+`offsets[i+1]` byte. If there is at least one shell in the shell section, the
+index contains a sentinel index as the last element, which signifies the position
+of the last byte of the shell blob.
+
+`shell_here=true` in the User struct means the shell is stored with userdata,
+and its length is `shell_len_or_idx`. `shell_here=false` means it is stored in
+the `Shells` section, and its index is `shell_len_or_idx` (and the actual
+string start and end offsets are resolved as described in the paragraph above).
+
+Variable-length integers (varints)
+----------------------------------
+
+Varint is an efficiently encoded integer (packed for small values). Same as
+[protocol buffer varints][varint], except the largest possible value is `u64`.
+They compress integers well. Varints are used to store group memberships.
+
+Group memberships
+-----------------
+
+There are two group memberships at play:
+
+1. Given a group (gid/name), resolve the members' names (e.g. `getgrgid`).
+2. Given a username, resolve user's group gids (for `initgroups(3)`).
+
+When group's memberships are resolved in (1), the same call also requires other
+group information: gid and group name. Therefore it makes sense to store a
+pointer to the group members in the group information itself. However, the
+memberships are not *always* necessary (see remarks about `id(1)`), therefore
+the memberships will be stored separately, outside of the groups section.
+
+Similarly, when user's groups are resolved in (2), they are not always necessary
+(i.e. not part of `struct passwd*`), therefore the memberships themselves are
+stored out of band.
+
+`groupmembers` and `additional_gids` store group and user memberships
+respectively. Membership IDs are packed — not necessitating random access, thus
+suitable for compression.
+
+- `groupmembers` consists of a number X followed by a list of offsets to User
+  records, because `getgr*` returns pointers to membernames, thus a name has to
+  be immediately resolvable.
+- `additional_gids` is a list of gids, because `initgroups_dyn` (and friends)
+  returns an array of gids.
+
+Each entry of `groupmembers` and `additional_gids` starts with a varint N,
+which is the number of upcoming elements. Then N delta-compressed varints,
+which are:
+
+- **additional_gids**: a list of gids.
+- **groupmembers**: byte-offsets to the User records in the `users` section.
+
+Indices
+-------
+
+Now that we've sketched the implementation of `id(1)`, it's easier to see
+which operations need to be fast; in order of importance:
+
+1. lookup gid -> group info (this is on hot path in id) without members.
+2. lookup username -> user's groups.
+3. lookup uid -> user.
+4. lookup groupname -> group.
+5. lookup username -> user.
+
+These indices can use perfect hashing like [bdz from cmph][cmph]: a perfect
+hash hashes a list of bytes to a sequential list of integers. Perfect hashing
+algorithms require some space, and take some time to calculate ("hashing
+duration"). I've tested BDZ, which hashes `[][]u8` to a sequential list of
+integers (not preserving order), and CHM, which preserves order. BDZ accepts
+an optional argument `3 <= b <= 10`.
+
+* BDZ algorithm requires (b=3, 900KB, b=7, 338KB, b=10, 306KB) for 1M values.
+* Latency to resolve 1M keys: (170ms, 180ms, 230ms, respectively).
+* Packed vs non-packed latency differences are not meaningful.
+
+CHM retains order; however, 1M keys weigh 8MB. 10k keys are ~20x larger with
+CHM than with BDZ, eliminating the benefit of preserved ordering: we can just
+have a separate index.
+
+None of the tested perfect hashing algorithms makes the distinction between
+existing (in the initial dictionary) and new keys. In other words, HASH(value)
+will be pointing to a number `n ∈ [0,N-1]`, regardless of whether the value was in
+the initial dictionary. Therefore one must always confirm, after calculating
+the hash, that the key matches what's been hashed.
+
+`idx_*` sections are of type `[]u32` and are pointing from `hash(key)` to the
+respective `Groups` and `Users` entries (from the beginning of the respective
+section). Since User and Group records are 8-byte aligned, the actual offset to
+the record is acquired by left-shifting this value by 3 bits.
+
+Database file structure
+-----------------------
+
+Each section is padded to 64 bytes.
+
+```
+SECTION               SIZE             DESCRIPTION
+header                128              see "Turbonss header" section
+bdz_gid               ?                bdz(gid)
+bdz_groupname         ?                bdz(groupname)
+bdz_uid               ?                bdz(uid)
+bdz_username          ?                bdz(username)
+idx_gid2group         len(group)*4     bdz->offset Groups
+idx_groupname2group   len(group)*4     bdz->offset Groups
+idx_uid2user          len(user)*4      bdz->offset Users
+idx_name2user         len(user)*4      bdz->offset Users
+shell_index           len(shells)*2    shell index array
+shell_blob            <= 65280         shell data blob (max 255*256 bytes)
+groups                ?                packed Group entries (8b padding)
+users                 ?                packed User entries (8b padding)
+groupmembers          ?                per-group delta varint memberlist (no padding)
+additional_gids       ?                per-user delta varint gidlist (no padding)
+```
+
+[cmph]: http://cmph.sourceforge.net/
+[id]: https://linux.die.net/man/1/id
+[data-oriented-design]: https://media.handmade-seattle.com/practical-data-oriented-design/
+[getpwnam_r]: https://linux.die.net/man/3/getpwnam_r
+[varint]: https://developers.google.com/protocol-buffers/docs/encoding#varints
+[getpwent]: https://www.man7.org/linux/man-pages/man3/getpwent_r.3.html
+[getgrouplist]: https://www.man7.org/linux/man-pages/man3/getgrouplist.3.html
+[getgrid]: https://www.man7.org/linux/man-pages/man3/getgrgid_r.3.html
--- a/docs/development.md
+++ b/docs/development.md
@@ -0,0 +1,37 @@
+Profiling
+---------
+
+Prepare `perf.data`:
+
+```
+zig build -Drelease-small=true && \
+    perf record --call-graph=dwarf \
+        zig-out/bin/turbonss-unix2db --passwd passwd --group group
+```
+
+Perf interactive:
+
+```
+perf report -i perf.data
+```
+
+Flame graph:
+
+```
+perf script | inferno-collapse-perf | inferno-flamegraph > profile.svg
+```
+
+For v2
+------
+
+These are desired for the next DB format:
+- Compress strings with fsst.
+- Trim first 4 bytes from the cmph headers.
+
+Dependencies
+------------
+
+This project uses [git subtrac](https://github.com/apenwarr/git-subtrac) for managing dependencies. They
+work just like regular submodules, except all the refs of the submodules are in
+this repository. Repeat after me: all the submodules are in this repository.
+So if you have a copy of this repo, dependencies will not disappear.
--- a/src/turbonss-unix2systemd.zig
+++ b/src/turbonss-unix2systemd.zig
@@ -1,305 +0,0 @@
-const std = @import("std");
-const fs = std.fs;
-const io = std.io;
-const mem = std.mem;
-const os = std.os;
-const heap = std.heap;
-const math = std.math;
-const fmt = std.fmt;
-const json = std.json;
-const ArrayList = std.ArrayList;
-const ArrayListUnmanaged = std.ArrayListUnmanaged;
-const Allocator = std.mem.Allocator;
-const StringArrayHashMap = std.StringArrayHashMap;
-
-const flags = @import("flags.zig");
-const User = @import("User.zig");
-const PackedUser = @import("PackedUser.zig");
-const Group = @import("Group.zig");
-const Corpus = @import("Corpus.zig");
-const DB = @import("DB.zig");
-const ErrCtx = @import("ErrCtx.zig");
-
-const usage =
-    \\usage: turbonss-unix2systemd [OPTION]...
-    \\
-    \\Options:
-    \\  -h          Print this help message and exit
-    \\  --passwd    Path to passwd file (default: passwd)
-    \\  --group     Path to group file (default: group)
-    \\  --outdir    Path to output directory (default: ./userdb)
-    \\
-;
-
-pub fn main() !void {
-    // This line is here because of https://github.com/ziglang/zig/issues/7807
-    const argv: []const [*:0]const u8 = os.argv;
-    const gpa = heap.raw_c_allocator;
-
-    const stderr = io.getStdErr().writer();
-    const stdout = io.getStdOut().writer();
-
-    const return_code = execute(gpa, stdout, stderr, argv[1..]);
-    os.exit(return_code);
-}
-
-fn execute(
-    allocator: Allocator,
-    stdout: anytype,
-    stderr: anytype,
-    argv: []const [*:0]const u8,
-) u8 {
-    const result = flags.parse(argv, &[_]flags.Flag{
-        .{ .name = "-h", .kind = .boolean },
-        .{ .name = "--passwd", .kind = .arg },
-        .{ .name = "--group", .kind = .arg },
-        .{ .name = "--outdir", .kind = .arg },
-    }) catch {
-        stderr.writeAll(usage) catch {};
-        return 1;
-    };
-
-    if (result.boolFlag("-h")) {
-        stdout.writeAll(usage) catch return 1;
-        return 0;
-    }
-
-    if (result.args.len != 0) {
-        stderr.print("ERROR: unknown option '{s}'\n", .{result.args[0]}) catch {};
-        stderr.writeAll(usage) catch {};
-        return 1;
-    }
-
-    const passwd_fname = result.argFlag("--passwd") orelse "passwd";
-    const group_fname = result.argFlag("--group") orelse "group";
-    const outdir = result.argFlag("--outdir") orelse "./userdb";
-
-    // to catch an error set file.OpenError, wait for
-    // https://github.com/ziglang/zig/issues/2473
-    var errc = ErrCtx{};
-    var passwd_file = fs.cwd().openFile(passwd_fname, .{ .mode = .read_only }) catch |err|
-        return fail(errc.wrapf("open '{s}'", .{passwd_fname}), stderr, err);
-    defer passwd_file.close();
-
-    var group_file = fs.cwd().openFile(group_fname, .{ .mode = .read_only }) catch |err|
-        return fail(errc.wrapf("open '{s}'", .{group_fname}), stderr, err);
-    defer group_file.close();
-
-    var passwdReader = io.bufferedReader(passwd_file.reader()).reader();
-    var users = User.fromReader(allocator, &errc, passwdReader) catch |err|
-        return fail(errc.wrap("read users"), stderr, err);
-    defer {
-        for (users) |*user| user.deinit(allocator);
-        allocator.free(users);
-    }
-
-    var groupReader = io.bufferedReader(group_file.reader()).reader();
-    var groups = Group.fromReader(allocator, groupReader) catch |err|
-        return fail(errc.wrap("read groups"), stderr, err);
-    defer {
-        for (groups) |*group| group.deinit(allocator);
-        allocator.free(groups);
-    }
-
-    const user2groups = StringArrayHashMap(ArrayListUnmanaged([]const u8)).init(allocator);
-    defer {
-        var it = user2groups.iterator();
-        while (it.next()) |entry|
-            entry.value_ptr.*.deinit(allocator);
-        user2groups.deinit();
-    }
-    fillMemberships(allocator, groups, &user2groups);
-
-    try os.mkdirZ(outdir, 0o755) catch |err| switch (err) {
-        error.PathAlreadyExists => {},
-        else => |err| return err,
-    };
-
-    var dir = try fs.cwd().openDir(outdir, .{});
-
-    try makePasswd(dir, users.items);
-    try makeGroups(dir, groups.items);
-
-    return 0;
-}
-
-const JSONPasswd = struct {
-    uid: u32,
-    gid: u32,
-    userName: []const u8, // pw_name
-    realName: []const u8, // pw_gecos
-    homeDirectory: []const u8,
-    shell: []const u8,
-    memberOf: []const []const u8,
-};
-
-fn makePasswd(
-    dir: fs.Dir,
-    users: []User,
-    memberships: *const StringArrayHashMap(ArrayListUnmanaged([]const u8)),
-) !void {
-    var namebuf: [PackedUser.max_name_len + ".user".len:0]u8 = undefined;
-    var symlinkbuf: [fmt.count("{d}.user", math.maxInt(u32)):0]u8 = undefined;
-
-    for (users) |user| {
-        const member_of = if (memberships.get(user.name)) |m|
-            m.items
-        else
-            []const []const u8{};
-
-        const u = JSONPasswd{
-            .uid = user.uid,
-            .gid = user.gid,
-            .userName = user.name,
-            .realName = user.gecos,
-            .homeDirectory = user.home,
-            .shell = user.shell,
-            .memberOf = member_of,
-        };
-
-        const fname = try fmt.bufPrintZ(namebuf, "{s}.user", user.name);
-        var f = try dir.createFileZ(fname, .{});
-        defer f.close();
-
-        var wr = io.bufferedWriter(f.writer());
-        try json.stringify(u, .{}, wr);
-        try wr.flush();
-
-        const symlinkname = try fmt.bufPrintZ(symlinkbuf, "{d}.user", user.uid);
-        try os.symlinkatZ(fname, dir.fd, symlinkname);
-    }
-}
-
-fn makeGroups(dir: fs.Dir, groups: []Group) !void {
-    _ = dir;
-    _ = groups;
-}
-
-fn fail(errc: *ErrCtx, stderr: anytype, err: anytype) u8 {
-    const err_chain = errc.unwrap().constSlice();
-    stderr.print("ERROR {s}: {s}\n", .{ @errorName(err), err_chain }) catch {};
-    return 1;
-}
-
-fn fillMemberships(
-    allocator: Allocator,
-    groups: ArrayList(Group),
-    user2groups: *StringArrayHashMap(ArrayListUnmanaged([]const u8)),
-) void {
-    for (groups) |group| {
-        for (group.members) |member| {
-            const member_groups = try user2groups.getOrPut(allocator, member.name);
-            if (!member_groups.found_existing)
-                member_groups.value_ptr.* = ArrayListUnmanaged([]const u8){};
-            member_groups.value_ptr.*.append(group.name);
-        }
-    }
-}
-
-const testing = std.testing;
-
-test "turbonss-unix2systemd invalid argument" {
-    const allocator = testing.allocator;
-    const args = &[_][*:0]const u8{"--invalid-argument"};
-    var stderr = ArrayList(u8).init(allocator);
-    defer stderr.deinit();
-    var stdout = ArrayList(u8).init(allocator);
-    defer stdout.deinit();
-
-    const exit_code = execute(allocator, stdout.writer(), stderr.writer(), args[0..]);
-    try testing.expectEqual(@as(u8, 1), exit_code);
-    try testing.expect(mem.startsWith(
-        u8,
-        stderr.items,
-        "ERROR: unknown option '--invalid-argument'",
-    ));
-}
-
-test "turbonss-unix2systemd trivial error: missing passwd file" {
-    const allocator = testing.allocator;
-    const args = &[_][*:0]const u8{
-        "--passwd",
-        "/does/not/exist/passwd",
-        "--group",
-        "/does/not/exist/group",
-    };
-    var stderr = ArrayList(u8).init(allocator);
-    defer stderr.deinit();
-    var stdout = ArrayList(u8).init(allocator);
-    defer stdout.deinit();
-
-    const exit_code = execute(allocator, stdout.writer(), stderr.writer(), args[0..]);
-    try testing.expectEqual(@as(u8, 1), exit_code);
-    try testing.expectEqualStrings(stderr.items, "ERROR FileNotFound: open '/does/not/exist/passwd'\n");
-}
-
-test "turbonss-unix2systemd fail" {
-    var errc = ErrCtx{};
-    var buf = ArrayList(u8).init(testing.allocator);
-    defer buf.deinit();
-    var wr = buf.writer();
-    const exit_code = fail(errc.wrapf("invalid user 'foo'", .{}), wr, error.NotSure);
-    try testing.expectEqual(exit_code, 1);
-    try testing.expectEqualStrings(buf.items, "ERROR NotSure: invalid user 'foo'\n");
-}
-
-test "turbonss-unix2db smoke test" {
-    const allocator = testing.allocator;
-    var stderr = ArrayList(u8).init(allocator);
-    defer stderr.deinit();
-    var stdout = ArrayList(u8).init(allocator);
-    defer stdout.deinit();
-
-    var corpus = try Corpus.testCorpus(allocator);
-    defer corpus.deinit();
-
-    var tmp = testing.tmpDir(.{});
-    // TODO: defer
-    errdefer tmp.cleanup();
-
-    const tmp_path = blk: {
-        const relative_path = try fs.path.join(allocator, &[_][]const u8{
-            "zig-cache",
-            "tmp",
-            tmp.sub_path[0..],
-        });
-        const real_path = try fs.realpathAlloc(allocator, relative_path);
-        allocator.free(relative_path);
-        break :blk real_path;
-    };
-    defer allocator.free(tmp_path);
-
-    const passwdPath = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "passwd" });
-    defer allocator.free(passwdPath);
-    const groupPath = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "group" });
-    defer allocator.free(groupPath);
-    const outDir = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "outdir" });
-    defer allocator.free(outDir);
-
-    const passwd_fd = try os.open(passwdPath, os.O.CREAT | os.O.WRONLY, 0o644);
-    const group_fd = try os.open(groupPath, os.O.CREAT | os.O.WRONLY, 0o644);
-
-    var i: usize = 0;
-    while (i < corpus.users.len) : (i += 1) {
-        const user = corpus.users.get(i);
-        const line = user.toLine().constSlice();
-        _ = try os.write(passwd_fd, line);
-    }
-    os.close(passwd_fd);
-
-    var group_writer = (fs.File{ .handle = group_fd }).writer();
-    i = 0;
-    while (i < corpus.groups.len) : (i += 1)
-        try corpus.groups.get(i).writeTo(group_writer);
-    os.close(group_fd);
-
-    const args = &[_][*:0]const u8{
-        "--passwd", passwdPath,
-        "--group",  groupPath,
-        "--outdir", outDir,
-    };
-
-    const exit_code = execute(allocator, stdout.writer(), stderr.writer(), args);
-    try testing.expectEqualStrings("total 1664 bytes. groups=5 users=4\n", stderr.items);
-    try testing.expectEqual(@as(u8, 0), exit_code);
-}