Move docs around; finish it

2022-08-21 06:08:21 +03:00
parent 8bfc4a30cd
commit fbd449b21f
4 changed files with 458 additions and 710 deletions
--- a/README.md
+++ b/README.md
@@ -1,442 +1,131 @@
 Turbo NSS
 ---------
-Turbonss is a plugin for GNU Name Service Switch (NSS) functionality of GNU C
+Turbonss is a plugin for GNU Name Service Switch ([NSS][nsswitch])
-Library (glibc). Turbonss implements lookup for `user` and `passwd` database
+functionality of GNU C Library (glibc). Turbonss implements lookup for `passwd`
-entries (i.e. system users, groups, and group memberships). It's main goal is
+and `group` database entries (i.e. system users, groups, and group
-performance, with focus on making [`id(1)`][id] run as fast as possible.
+memberships). Its main goal is to run [`id(1)`][id] as fast as possible.
 Turbonss is optimized for reading. If the data changes in any way, the whole
-file will need to be regenerated (and tooling only supports only full
+file will need to be regenerated. Therefore, it was created, and is best suited,
-generation). It was created, and best suited, for environments that have a
+for environments that have a central user & group database which then needs to
-central user & group database which then needs to be distributed to many
+be distributed to many servers/services, and the data does not change very
-servers/services, and the data does not change very often (e.g. hourly).
+often (e.g. hourly).
-To understand more about name service switch, start with
+This is the fastest known NSS passwd/group implementation for *reads*. On a
-[`nsswitch.conf(5)`][nsswitch].
+corpus with 10k users, 10k groups and 500 average members per group, `id` takes
 17 seconds with the glibc default implementation, 10-17 milliseconds with a
 pre-cached `nscd`, ~8 milliseconds with `turbonss`.
-Design & constraints
+Project status
--------------------
+--------------
-To be fast, the user/group database (later: DB) has to be small
+The project is finished and is not recommended for production; just use nscd.
-([background][data-oriented-design]). It encodes user & group information in a
+Turbonss duly implements the full user/group API in `src/libnss.zig`: feel free
-way that minimizes the DB size, and reduces jumping across the DB ("chasing
+to copy that.
 pointers and thrashing CPU cache").
-To understand how this is done efficiently, let's analyze the
+Yours truly (the author) worked on this for about 7 months. And when this was
-[`getpwnam_r(3)`][getpwnam_r] in high level. This API call accepts a username
+finished it turned out that just slapping nscd on top of the existing NSS
-and returns the following user information:
+implementation is almost as fast as this.
 ```
 struct passwd {
   char   *pw_name;       /* username */
   char   *pw_passwd;     /* user password */
   uid_t   pw_uid;        /* user ID */
   gid_t   pw_gid;        /* group ID */
   char   *pw_gecos;      /* user information */
   char   *pw_dir;        /* home directory */
   char   *pw_shell;      /* shell program */
 };
 ```
 Turbonss, among others, implements this call, and takes the following steps to
 resolve a username to a `struct passwd*`:
 - Open the DB (using `mmap`) and interpret it's first 64 bytes as a `*struct
  Header`. The header stores offsets to the sections of the file. This needs to
  be done once, when the NSS library is loaded.
 - Hash the username using a perfect hash function. Perfect hash function
  returns a number `n ∈ [0,N-1]`, where N is the total number of users.
 - Jump to the `n`'th location in the `idx_name2user` section, which contains
  the index `i` to the user's information.
 - Jump to the location `i` of section `Users`, which stores the full user
  information.
 - Decode the user information (which is all in a continuous memory block) and
  return it to the caller.
 In total, that's one hash for the username (~150ns), two pointer jumps within
 the group file (to sections `idx_name2user` and `Users`), and, now that the
 user record is found, `memcpy` for each field.
 The turbonss DB file is be `mmap`-ed, making it simple to jump across the file
 using pointer arithmetic. This also reduces memory usage, as the mmap'ed
 regions are shared. Turbonss reads do not consume any heap space.
 Tight packing places some constraints on the underlying data:
 - Permitted length of username and groupname: 1-32 bytes.
 - Permitted length of shell and home: 1-256 bytes.
 - Permitted comment ("gecos") length: 0-255 bytes.
 - User name, groupname, gecos and shell must be utf8-encoded.
 - User and Groups sections are up to 2^35B (~34GB) large. Assuming an "average"
  user record takes 50 bytes, this section would fit ~660M users. The
  worst-case upper bound is left as an exercise to the reader.
 Sorting is stable. In v0:
 - Groups are sorted by gid, ascending.
 - Users are sorted by their name, ascending by the unicode codepoints
  (locale-independent).
 Checking out and building
 -------------------------
 ```
 $ git clone --recursive https://git.sr.ht/~motiejus/turbonss
 ```
 Alternatively, if you forgot `--recursive`:
 ```
 $ git submodule update --init
 ```
 And run tests:
 ```
 $ zig build test
 ```
 Test the so
 -----------
 Build:
    zig build -Dtarget=x86_64-linux-gnu.2.31 -Dcpu=x86_64_v3 -Drelease-fast=true -Dstrip=true
 Generate `db.turbo`:
    zig-out/bin/turbonss-unix2db --passwd /etc/passwd --group /etc/group
    zig-out/bin/turbonss-analyze db.turbo
    <...>
 Run a test container:
    $ docker run -ti --rm --privileged -v `pwd`:/etc/turbonss -w /etc/turbonss debian:bullseye
    # cp zig-out/lib/libnss_turbo.so.2 /lib/x86_64-linux-gnu
    # sed -i 's/\(\(passwd\|group\).*files\)$/\1 turbo/' /etc/nsswitch.conf
 And knock yourself out:
    getent passwd
    getent group
    id root
 This is probably not very interesting; you may want to take a larger corpus of
 /etc/passwd and /etc/group for more interesting results.
 Dependencies
 ------------
-This project uses [git subtrac][git-subtrac] for managing dependencies. They
+1. Stage1 of the nightly zig compiler.
-work just like regular submodules, except all the refs of the submodules are in
+2. [cmph][cmph]: bundled with this repository.
 this repository. Repeat after me: all the submodules are in this repository.
 So if you have a copy of this repo, dependencies will not disappear.
-remarks on `id(1)`
+Trying it out
------------------
+-------------
-A known implementation runs id(1) at ~250 rps sequentially on ~20k users and
+Clone, compile and test first:
 ~10k groups. Our rps target is much higher.
-To better reason about the trade-offs, it is useful to understand how `id(1)`
+    $ git clone --recursive https://git.sr.ht/~motiejus/turbonss
-is implemented, in rough terms:
+    $ zig build -fstage1 test
- lookup user by name ([`getpwent_r(3)`][getpwent]).
+    $ zig build -fstage1 -Dtarget=x86_64-linux-gnu.2.31 -Dcpu=x86_64_v3 -Drelease-safe=true
 - get all gids for the user ([`getgrouplist(3)`][getgrouplist]). Note: it is
  actually using `initgroups_dyn`, accepts a uid, and is very poorly
  documented.
 - for each additional gid, get the `struct group*`
  ([`getgrgid_r(3)`][getgrgid_r]).
-Assuming a member is in ~100 groups on average, to reach 10k id/s translates to
+One may choose different options, depending on requirements. Here are some
-1M group lookups per second. We need to convert gid to a group index, and group
+hints:
 index to a group gid/name quickly.
-Caveat: `struct group` contains an array of pointers to names of group members
+1. `-Dcpu=<...>` for the CPU
-(`char **gr_mem`). However, `id` does not use that information, resulting in
+   [microarchitecture](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels).
-read amplification, sometimes by 10-100x. Therefore, if `argv[0] == "id"`, our
+2. `-Drelease-fast=true` for max speed
-implementation of [`getgrid_r(3)`][getgrid] returns the `struct group*` without
+3. `-Drelease-small=true` for smallest binary sizes.
-the members. This speeds up `id` by about 10x on a known NSS implementation.
+4. `-Dstrip=true` to strip debug symbols.
-Relatedly, because [`getgrid_r(3)`][getgrid] does not need the group members,
+Test it on a real system
-the group members are stored in a different DB section, reducing the `Groups`
+------------------------
 section and making more of it fit the CPU caches.
-Turbonss header
+`db.turbo` is the TurboNSS database file. To create one from `/etc/group` and
---------------
+`/etc/passwd`, use `turbonss-unix2db`:
-The turbonss header looks like this:
+    $ zig-out/bin/turbonss-unix2db --passwd /etc/passwd --group /etc/group
    $ zig-out/bin/turbonss-analyze db.turbo
    <...>
-```
+Run and configure a test container that uses `turbonss` instead of the default
-OFFSET     TYPE     NAME                      DESCRIPTION
+`files`:
   0      [4]u8     magic                     f0 9f a4 b7
   4         u8     version                   0
   5         u8     endian                    0 for little, 1 for big
   6         u8     nblocks_shell_blob        max value: 63
   7         u8     num_shells                max value: 63
   8        u32     num_groups                number of group entries
  12        u32     num_users                 number of passwd entries
  16        u32     nblocks_bdz_gid           bdz_gid section block count
  20        u32     nblocks_bdz_groupname
  24        u32     nblocks_bdz_uid
  28        u32     nblocks_bdz_username
  32        u64     nblocks_groups
  40        u64     nblocks_users
  48        u64     nblocks_groupmembers
  56        u64     nblocks_additional_gids
  64        u64     getgr_bufsize
  72        u64     getpw_bufsize
  80     [48]u8     padding
 ```
-`magic` is 0xf09fa4b7, and `version` must be `0`. All integers are
+    $ docker run -ti --rm -v `pwd`:/etc/turbonss -w /etc/turbonss debian:bullseye
-native-endian. `nblocks_*` is the count of blocks of a particular section; this
+    # cp zig-out/lib/libnss_turbo.so.2 /lib/x86_64-linux-gnu/
-helps calculate the offsets to all sections.
+    # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf
-Some numbers, like `nblocks_shell_blob`, `num_shells`, would fit to smaller
+And run the commands:
 number of bytes. However, interpreting `[2]u6` with `xxd(1)` is harder than
 interpreting `[2]u8`. Therefore we are using the space we have to make these
 integers byte-wide.
-`getgr_bufsize` and `getpw_bufsize` is a hint for the caller of `getgr*` and
+    $ getent passwd
-`getpw*`-family calls. This is the recommended size of the buffer, so the
+    $ getent group
-caller does not receive `ENOMEM`.
+    $ id root
-Primitive types
+More users and groups
---------------
+---------------------
-`User` and `Group` entries are sorted by the order they were received in the input
+`turbonss-makecorpus` can synthesize more `users` and `groups`:
 file. All entries are aligned to 8 bytes. All `User` and `Group` entries are
 referred by their byte offset in the `Users` and `Groups` section relative to
 the beginning of the section.
-```
+    # ./zig-out/bin/turbonss-makecorpus 
-const PackedGroup = packed struct {
+    wrote users=10000 groups=10000 avg-members=1000 to .
-    gid: u32,
+    # cat group >> /etc/group
-    padding: u3,
+    # cat passwd >> /etc/passwd
-    groupname_len: u5,
+    # time id u_1000000
-}
+    <...>
-```
+    real    0m17.380s
    user    0m13.117s
    sys     0m4.263s
-PackedGroup is followed by the group name (of length `groupname_len`), followed
+17 seconds for an `id` command! Well, there are indeed many users and groups.
-by a varint-compressed offset to the groupmembers section, followed by 8b padding.
+Let's see how turbonss fares with it:
-PackedUser is a bit more involved:
+    # zig-out/bin/turbonss-unix2db --group /etc/group --passwd /etc/passwd
    total 10968512 bytes. groups=10019 users=10039
    # ls -hs /etc/group /etc/passwd db.turbo
    48M /etc/group  668K /etc/passwd   11M db.turbo
    # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf
    # time id u_1000000
    real    0m0.008s
    user    0m0.000s
    sys     0m0.008s
-```
+That's a ~1500x improvement for the `id` command (and notice the roughly 4x compression
-pub const PackedUser = packed struct {
+ratio compared to plain files). If the number of users and groups is increased
-    uid: u32,
+by 10x (to 100k each), the difference becomes even crazier:
    gid: u32,
    shell_len_or_idx: u8,
    shell_here: bool,
    name_is_a_suffix: bool,
    home_len: u6,
    name_len: u5,
    gecos_len: u11,
 }
 ```
-... followed by `userdata: []u8`:
+    # time id u_1000000
- home.
+    <...>
- name (optional).
+    real    3m42.281s
- gecos.
+    user    2m30.482s
- shell (optional).
+    sys     0m55.840s
- `additional_gids_offset`: varint.
+    # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf
    # time id u_1000000
    <...>
    real    0m0.008s
    user    0m0.000s
    sys     0m0.008s
-First byte of home is stored right after the `gecos_len` field, and its length
+Documentation
-is `home_len`. The same logic applies to all the `stringdata` fields: there is
+-------------
 a way to calculate their relative position from the length of the fields before
 them.
-PackedUser employs two data-oriented compression techniques:
+Architecture is detailed in `docs/architecture.md`
- shells are often shared across different users, see the "Shells" section.
+Development notes are in `docs/development.md`
 - `name` is frequently a suffix of `home`. For example, `/home/vidmantas` and
  `vidmantas`. In this case storing both name and home is wasteful. Therefore
  name has two options:
  1. `name_is_a_suffix=true`: name is a suffix of the home dir. Then `name`
  starts at the `home_len - name_len`'th byte of `home`, and ends at the same
  place as `home`.
  2. `name_is_a_suffix=false`: name begins one byte after home, and it's length
  is `name_len`.
 The last field `additional_gids_offset: varint` points to the `additional_gids`
 section for this user.
 Shells
 ------
 Normally there is a limited number of separate shells even in huge user
 databases. A few examples: `/bin/bash`, `/usr/bin/nologin`, `/bin/zsh` among
 others. Therefore, "shells" have an optimization: they can be pointed by in the
 external list, or, if they are unique to the user, reside among the user's
 data.
 255 most popular shells (i.e. referred to by at least two User entries) are
 stored externally in "Shells" area. The less popular ones are stored with
 userdata.
 Shells section consists of two sub-sections: the index and the blob. The index
 is an array of offsets: the i'th shell starts at `offsets[i]` byte, and ends at
 `offsets[i+1]` byte. If there is at least one shell in the shell section, the
 index contains a sentinel index as the last element, which signifies the position
 of the last byte of the shell blob.
 `shell_here=true` in the User struct means the shell is stored with userdata,
 and it's length is `shell_len_or_idx`. `shell_here=false` means it is stored in
 the `Shells` section, and it's index is `shell_len_or_idx` (and the actual
 string start and end offsets are resolved as described in the paragraph above).
 Variable-length integers (varints)
 ----------------------------------
 Varint is an efficiently encoded integer (packed for small values). Same as
 [protocol buffer varints][varint], except the largest possible value is `u64`.
 They compress integers well. Varints are stored for group memberships.
 Group memberships
 -----------------
 There are two group memberships at play:
 1. Given a group (gid/name), resolve the members' names (e.g. `getgrgid`).
 2. Given a username, resolve user's group gids (for `initgroups(3)`).
 When group's memberships are resolved in (1), the same call also requires other
 group information: gid and group name. Therefore it makes sense to store a
 pointer to the group members in the group information itself. However, the
 memberships are not *always* necessary (see remarks about `id(1)`), therefore
 the memberships will be stored separately, outside of the groups section.
 Similarly, when user's groups are resolved in (2), they are not always necessary
 (i.e. not part of `struct user*`), therefore the memberships themselves are
 stored out of bound.
 `groupmembers` and `additional_gids` store group and user memberships
 respectively. Membership IDs are packed — not necessitating random access, thus
 suitable for compression.
 - `groupmembers` consists of a number X followed by a list of offsets to User
  records, because `getgr*` returns pointers to membernames, thus a name has to
  be immediately resolvable.
 - `additional_gids` is a list of gids, because `initgroups_dyn` (and friends)
  returns an array of gids.
 Each entry of `groupmembers` and `additional_gids` starts with a varint N,
 which is the number of upcoming elements. Then N delta-compressed varints,
 which are:
 - **additional_gids** a list of gids.
 - **groupmembers** byte-offsets to the User records in the `users` section.
 Indices
 -------
 Now that we've sketched the implementation of `id(3)`, it's clearer to
 understand which operations need to be fast; in order of importance:
 1. lookup gid -> group info (this is on hot path in id) without members.
 2. lookup username -> user's groups.
 3. lookup uid -> user.
 4. lookup groupname -> group.
 5. lookup username -> user.
 These indices can use perfect hashing like [bdz from cmph][cmph]: a perfect
 hash hashes a list of bytes to a sequential list of integers. Perfect hashing
 algorithms require some space, and take some time to calculate ("hashing
 duration"). I've tested BDZ, which hashes `[][]u8` to a sequential list of
 integers (not preserving order) and CHM, preserves order. BDZ accepts an
 optional argument `3 <= b <= 10`.
 * BDZ algorithm requires (b=3, 900KB, b=7, 338KB, b=10, 306KB) for 1M values.
 * Latency to resolve 1M keys: (170ms, 180ms, 230ms, respectively).
 * Packed vs non-packed latency differences are not meaningful.
 CHM retains order, however, 1M keys weigh 8MB. 10k keys are ~20x larger with
 CHM than with BDZ, eliminating the benefit of preserved ordering: we can just
 have a separate index.
 None of the tested perfect hashing algorithms makes the distinction between
 existing (in the initial dictionary) and new keys. In other words, HASH(value)
 will be pointing to a number `n ∈ [0,N-1]`, regardless whether the value was in
 the initial dictionary. Therefore one must always confirm, after calculating
 the hash, that the key matches what's been hashed.
 `idx_*` sections are of type `[]u32` and are pointing from `hash(key)` to the
 respective `Groups` and `Users` entries (from the beginning of the respective
 section). Since User and Group records are 8-byte aligned, the actual offset to
 the record is acquired by right-shifting this value by 3 bits.
 Database file structure
 -----------------------
 Each section is padded to 64 bytes.
 ```
 SECTION               SIZE             DESCRIPTION
 header                128              see "Turbonss header" section
 bdz_gid               ?                bdz(gid)
 bdz_groupname         ?                bdz(groupname)
 bdz_uid               ?                bdz(uid)
 bdz_username          ?                bdz(username)
 idx_gid2group         len(group)*4     bdz->offset Groups
 idx_groupname2group   len(group)*4     bdz->offset Groups
 idx_uid2user          len(user)*4      bdz->offset Users
 idx_name2user         len(user)*4      bdz->offset Users
 shell_index           len(shells)*2    shell index array
 shell_blob            <= 65280         shell data blob (max 255*256 bytes)
 groups                ?                packed Group entries (8b padding)
 users                 ?                packed User entries (8b padding)
 groupmembers          ?                per-group delta varint memberlist (no padding)
 additional_gids       ?                per-user delta varint gidlist (no padding)
 ```
 Section creation order:
 1. ✅ `bdz_*`.
 1. ✅ `shell_index`, `shell_blob`.
 1. ✅ `additional_gids`.
 1. ✅ `users` requires `additional_gids` and shell.
 1. ✅ `groupmembers` requires `users`.
 1. ✅ `groups` requires `groupmembers`.
 1. ✅ `idx_*`. requires offsets to `groups` and `users`.
 1. ✅ Header.
 For v2
 ------
 These are desired for the next DB format:
 - Compress strings with fsst.
 - Trim first 4 bytes from the cmph headers.
 Profiling
 ---------
 Prepare `profile.data`:
 ```
 zig build -Drelease-small=true && \
    perf record --call-graph=dwarf \
        zig-out/bin/turbonss-unix2db --passwd passwd2 --group group2
 ```
 Perf interactive:
 ```
 perf report -i perf.data
 ```
 Flame graph:
 ```
 perf script | inferno-collapse-perf | inferno-flamegraph > profile.svg
 ```
 [git-subtrac]: https://apenwarr.ca/log/20191109
 [cmph]: http://cmph.sourceforge.net/
 [id]: https://linux.die.net/man/1/id
 [nsswitch]: https://linux.die.net/man/5/nsswitch.conf
-[data-oriented-design]: https://media.handmade-seattle.com/practical-data-oriented-design/
+[id]: https://linux.die.net/man/1/id
-[getpwnam_r]: https://linux.die.net/man/3/getpwnam_r
+[cmph]: http://cmph.sourceforge.net/
 [varint]: https://developers.google.com/protocol-buffers/docs/encoding#varints
 [getpwent]: https://www.man7.org/linux/man-pages/man3/getpwent_r.3.html
 [getgrouplist]: https://www.man7.org/linux/man-pages/man3/getgrouplist.3.html
 [getgrid]: https://www.man7.org/linux/man-pages/man3/getgrid_r.3.html
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -0,0 +1,327 @@
 Design & constraints
 --------------------
 To be fast, the user/group database (later: DB) has to be small
 ([background][data-oriented-design]). It encodes user & group information in a
 way that minimizes the DB size, and reduces jumping across the DB ("chasing
 pointers and thrashing CPU cache").
 To understand how this is done efficiently, let's analyze
 [`getpwnam_r(3)`][getpwnam_r] at a high level. This API call accepts a username
 and returns the following user information:
 ```
 struct passwd {
   char   *pw_name;       /* username */
   char   *pw_passwd;     /* user password */
   uid_t   pw_uid;        /* user ID */
   gid_t   pw_gid;        /* group ID */
   char   *pw_gecos;      /* user information */
   char   *pw_dir;        /* home directory */
   char   *pw_shell;      /* shell program */
 };
 ```
 Turbonss, among others, implements this call, and takes the following steps to
 resolve a username to a `struct passwd*`:
 - Open the DB (using `mmap`) and interpret its first 64 bytes as a `*struct
  Header`. The header stores offsets to the sections of the file. This needs to
  be done once, when the NSS library is loaded.
 - Hash the username using a perfect hash function. Perfect hash function
  returns a number `n ∈ [0,N-1]`, where N is the total number of users.
 - Jump to the `n`'th location in the `idx_name2user` section, which contains
  the index `i` to the user's information.
 - Jump to the location `i` of section `Users`, which stores the full user
  information.
 - Decode the user information (which is all in a continuous memory block) and
  return it to the caller.
 In total, that's one hash for the username (~150ns), two pointer jumps within
 the group file (to sections `idx_name2user` and `Users`), and, now that the
 user record is found, `memcpy` for each field.
 The turbonss DB file is `mmap`-ed, making it simple to jump across the file
 using pointer arithmetic. This also reduces memory usage, as the mmap'ed
 regions are shared. Turbonss reads do not consume any heap space.
 Tight packing places some constraints on the underlying data:
 - Permitted length of username and groupname: 1-32 bytes.
 - Permitted length of shell and home: 1-256 bytes.
 - Permitted comment ("gecos") length: 0-255 bytes.
 - User name, groupname, gecos and shell must be utf8-encoded.
 - User and Groups sections are up to 2^35B (~34GB) large. Assuming an "average"
  user record takes 50 bytes, this section would fit ~660M users. The
  worst-case upper bound is left as an exercise to the reader.
 Sorting is stable. In v0:
 - Groups are sorted by gid, ascending.
 - Users are sorted by their name, ascending by the unicode codepoints
  (locale-independent).
 remarks on `id(1)`
 ------------------
 A known implementation runs id(1) at ~250 rps sequentially on ~20k users and
 ~10k groups. Our rps target is much higher.
 To better reason about the trade-offs, it is useful to understand how `id(1)`
 is implemented, in rough terms:
 - lookup user by name ([`getpwent_r(3)`][getpwent]).
 - get all gids for the user ([`getgrouplist(3)`][getgrouplist]). Note: it is
  actually using `initgroups_dyn`, accepts a uid, and is very poorly
  documented.
 - for each additional gid, get the `struct group*`
  ([`getgrgid_r(3)`][getgrid]).
 Assuming a member is in ~100 groups on average, to reach 10k id/s translates to
 1M group lookups per second. We need to convert gid to a group index, and group
 index to a group gid/name quickly.
 Caveat: `struct group` contains an array of pointers to names of group members
 (`char **gr_mem`). However, `id` does not use that information, resulting in
 read amplification, sometimes by 10-100x. Therefore, if `argv[0] == "id"`, our
 implementation of [`getgrgid_r(3)`][getgrid] returns the `struct group*` without
 the members. This speeds up `id` by about 10x on a known NSS implementation.
 Relatedly, because [`getgrgid_r(3)`][getgrid] does not need the group members,
 the group members are stored in a different DB section, reducing the `Groups`
 section and making more of it fit the CPU caches.
 Turbonss header
 ---------------
 The turbonss header looks like this:
 ```
 OFFSET     TYPE     NAME                      DESCRIPTION
   0      [4]u8     magic                     f0 9f a4 b7
   4         u8     version                   0
   5         u8     endian                    0 for little, 1 for big
   6         u8     nblocks_shell_blob        max value: 63
   7         u8     num_shells                max value: 63
   8        u32     num_groups                number of group entries
  12        u32     num_users                 number of passwd entries
  16        u32     nblocks_bdz_gid           bdz_gid section block count
  20        u32     nblocks_bdz_groupname
  24        u32     nblocks_bdz_uid
  28        u32     nblocks_bdz_username
  32        u64     nblocks_groups
  40        u64     nblocks_users
  48        u64     nblocks_groupmembers
  56        u64     nblocks_additional_gids
  64        u64     getgr_bufsize
  72        u64     getpw_bufsize
  80     [48]u8     padding
 ```
 `magic` is 0xf09fa4b7, and `version` must be `0`. All integers are
 native-endian. `nblocks_*` is the count of blocks of a particular section; this
 helps calculate the offsets to all sections.
 Some numbers, like `nblocks_shell_blob`, `num_shells`, would fit into a smaller
 number of bits. However, interpreting `[2]u6` with `xxd(1)` is harder than
 interpreting `[2]u8`. Therefore we are using the space we have to make these
 integers byte-wide.
 `getgr_bufsize` and `getpw_bufsize` are hints for the caller of `getgr*` and
 `getpw*`-family calls. This is the recommended size of the buffer, so the
 caller does not receive `ENOMEM`.
 Primitive types
 ---------------
 `User` and `Group` entries are sorted by the order they were received in the input
 file. All entries are aligned to 8 bytes. All `User` and `Group` entries are
 referred to by their byte offset in the `Users` and `Groups` sections relative to
 the beginning of the section.
 ```
 const PackedGroup = packed struct {
    gid: u32,
    padding: u3,
    groupname_len: u5,
 }
 ```
 PackedGroup is followed by the group name (of length `groupname_len`), followed
 by a varint-compressed offset to the groupmembers section, followed by 8b padding.
 PackedUser is a bit more involved:
 ```
 pub const PackedUser = packed struct {
    uid: u32,
    gid: u32,
    shell_len_or_idx: u8,
    shell_here: bool,
    name_is_a_suffix: bool,
    home_len: u6,
    name_len: u5,
    gecos_len: u11,
 }
 ```
 ... followed by `userdata: []u8`:
 - home.
 - name (optional).
 - gecos.
 - shell (optional).
 - `additional_gids_offset`: varint.
 First byte of home is stored right after the `gecos_len` field, and its length
 is `home_len`. The same logic applies to all the `stringdata` fields: there is
 a way to calculate their relative position from the length of the fields before
 them.
 PackedUser employs two data-oriented compression techniques:
 - shells are often shared across different users, see the "Shells" section.
 - `name` is frequently a suffix of `home`. For example, `/home/vidmantas` and
  `vidmantas`. In this case storing both name and home is wasteful. Therefore
  name has two options:
  1. `name_is_a_suffix=true`: name is a suffix of the home dir. Then `name`
  starts at the `home_len - name_len`'th byte of `home`, and ends at the same
  place as `home`.
  2. `name_is_a_suffix=false`: name begins one byte after home, and its length
  is `name_len`.
 The last field `additional_gids_offset: varint` points to the `additional_gids`
 section for this user.
 Shells
 ------
 Normally there is a limited number of separate shells even in huge user
 databases. A few examples: `/bin/bash`, `/usr/bin/nologin`, `/bin/zsh` among
 others. Therefore, "shells" have an optimization: they can be pointed to in an
 external list, or, if they are unique to the user, reside among the user's
 data.
 255 most popular shells (i.e. referred to by at least two User entries) are
 stored externally in "Shells" area. The less popular ones are stored with
 userdata.
 Shells section consists of two sub-sections: the index and the blob. The index
 is an array of offsets: the i'th shell starts at `offsets[i]` byte, and ends at
 `offsets[i+1]` byte. If there is at least one shell in the shell section, the
 index contains a sentinel index as the last element, which signifies the position
 of the last byte of the shell blob.
 `shell_here=true` in the User struct means the shell is stored with userdata,
 and its length is `shell_len_or_idx`. `shell_here=false` means it is stored in
 the `Shells` section, and its index is `shell_len_or_idx` (and the actual
 string start and end offsets are resolved as described in the paragraph above).
 Variable-length integers (varints)
 ----------------------------------
 Varint is an efficiently encoded integer (packed for small values). Same as
 [protocol buffer varints][varint], except the largest possible value is `u64`.
 They compress integers well. Varints are stored for group memberships.
 Group memberships
 -----------------
 There are two group memberships at play:
 1. Given a group (gid/name), resolve the members' names (e.g. `getgrgid`).
 2. Given a username, resolve user's group gids (for `initgroups(3)`).
 When group's memberships are resolved in (1), the same call also requires other
 group information: gid and group name. Therefore it makes sense to store a
 pointer to the group members in the group information itself. However, the
 memberships are not *always* necessary (see remarks about `id(1)`), therefore
 the memberships will be stored separately, outside of the groups section.
 Similarly, when user's groups are resolved in (2), they are not always necessary
 (i.e. not part of `struct passwd*`), therefore the memberships themselves are
 stored out of band.
 `groupmembers` and `additional_gids` store group and user memberships
 respectively. Membership IDs are packed — not necessitating random access, thus
 suitable for compression.
 - `groupmembers` consists of a number X followed by a list of offsets to User
  records, because `getgr*` returns pointers to membernames, thus a name has to
  be immediately resolvable.
 - `additional_gids` is a list of gids, because `initgroups_dyn` (and friends)
  returns an array of gids.
 Each entry of `groupmembers` and `additional_gids` starts with a varint N,
 which is the number of upcoming elements. Then N delta-compressed varints,
 which are:
 - **additional_gids** a list of gids.
 - **groupmembers** byte-offsets to the User records in the `users` section.
 Indices
 -------
 Now that we've sketched the implementation of `id(1)`, it's easier to
 understand which operations need to be fast; in order of importance:
 1. lookup gid -> group info (this is on hot path in id) without members.
 2. lookup username -> user's groups.
 3. lookup uid -> user.
 4. lookup groupname -> group.
 5. lookup username -> user.
 These indices can use perfect hashing like [bdz from cmph][cmph]: a perfect
 hash hashes a list of bytes to a sequential list of integers. Perfect hashing
 algorithms require some space, and take some time to calculate ("hashing
 duration"). I've tested BDZ, which hashes `[][]u8` to a sequential list of
 integers (not preserving order) and CHM, which preserves order. BDZ accepts an
 optional argument `3 <= b <= 10`.
 * BDZ algorithm requires (b=3, 900KB, b=7, 338KB, b=10, 306KB) for 1M values.
 * Latency to resolve 1M keys: (170ms, 180ms, 230ms, respectively).
 * Packed vs non-packed latency differences are not meaningful.
 CHM retains order, however, 1M keys weigh 8MB. 10k keys are ~20x larger with
 CHM than with BDZ, eliminating the benefit of preserved ordering: we can just
 have a separate index.
 None of the tested perfect hashing algorithms makes the distinction between
 existing (in the initial dictionary) and new keys. In other words, HASH(value)
 will be pointing to a number `n ∈ [0,N-1]`, regardless of whether the value was in
 the initial dictionary. Therefore one must always confirm, after calculating
 the hash, that the key matches what's been hashed.
 `idx_*` sections are of type `[]u32` and are pointing from `hash(key)` to the
 respective `Groups` and `Users` entries (from the beginning of the respective
 section). Since User and Group records are 8-byte aligned, the actual offset to
 the record is acquired by left-shifting this value by 3 bits.
 Database file structure
 -----------------------
 Each section is padded to 64 bytes.
 ```
 SECTION               SIZE             DESCRIPTION
 header                128              see "Turbonss header" section
 bdz_gid               ?                bdz(gid)
 bdz_groupname         ?                bdz(groupname)
 bdz_uid               ?                bdz(uid)
 bdz_username          ?                bdz(username)
 idx_gid2group         len(group)*4     bdz->offset Groups
 idx_groupname2group   len(group)*4     bdz->offset Groups
 idx_uid2user          len(user)*4      bdz->offset Users
 idx_name2user         len(user)*4      bdz->offset Users
 shell_index           len(shells)*2    shell index array
 shell_blob            <= 65280         shell data blob (max 255*256 bytes)
 groups                ?                packed Group entries (8b padding)
 users                 ?                packed User entries (8b padding)
 groupmembers          ?                per-group delta varint memberlist (no padding)
 additional_gids       ?                per-user delta varint gidlist (no padding)
 ```
 [cmph]: http://cmph.sourceforge.net/
 [id]: https://linux.die.net/man/1/id
 [data-oriented-design]: https://media.handmade-seattle.com/practical-data-oriented-design/
 [getpwnam_r]: https://linux.die.net/man/3/getpwnam_r
 [varint]: https://developers.google.com/protocol-buffers/docs/encoding#varints
 [getpwent]: https://www.man7.org/linux/man-pages/man3/getpwent_r.3.html
 [getgrouplist]: https://www.man7.org/linux/man-pages/man3/getgrouplist.3.html
 [getgrid]: https://www.man7.org/linux/man-pages/man3/getgrgid_r.3.html
--- a/docs/development.md
+++ b/docs/development.md
@@ -0,0 +1,37 @@
 Profiling
 ---------
 Prepare `perf.data`:
 ```
 zig build -Drelease-small=true && \
    perf record --call-graph=dwarf \
        zig-out/bin/turbonss-unix2db --passwd passwd --group group
 ```
 Perf interactive:
 ```
 perf report -i perf.data
 ```
 Flame graph:
 ```
 perf script | inferno-collapse-perf | inferno-flamegraph > profile.svg
 ```
 For v2
 ------
 These are desired for the next DB format:
 - Compress strings with fsst.
 - Trim first 4 bytes from the cmph headers.
 Dependencies
 ------------
 This project uses [git subtrac][git-subtrac] for managing dependencies.
 Subtrac-managed dependencies work just like regular submodules, except all the
 refs of the submodules are in this repository. Repeat after me: all the
 submodules are in this repository. So if you have a copy of this repo,
 dependencies will not disappear.

 [git-subtrac]: https://github.com/apenwarr/git-subtrac
--- a/src/turbonss-unix2systemd.zig
+++ b/src/turbonss-unix2systemd.zig
@@ -1,305 +0,0 @@
 const std = @import("std");
 const fs = std.fs;
 const io = std.io;
 const mem = std.mem;
 const os = std.os;
 const heap = std.heap;
 const math = std.math;
 const fmt = std.fmt;
 const json = std.json;
 const ArrayList = std.ArrayList;
 const ArrayListUnmanaged = std.ArrayListUnmanaged;
 const Allocator = std.mem.Allocator;
 const StringArrayHashMap = std.StringArrayHashMap;
 const flags = @import("flags.zig");
 const User = @import("User.zig");
 const PackedUser = @import("PackedUser.zig");
 const Group = @import("Group.zig");
 const Corpus = @import("Corpus.zig");
 const DB = @import("DB.zig");
 const ErrCtx = @import("ErrCtx.zig");
 // Help text: written to stdout for `-h`, and to stderr when flag parsing
 // fails or an unexpected positional argument is given.
 const usage =
    \\usage: turbonss-unix2systemd [OPTION]...
    \\
    \\Options:
    \\  -h          Print this help message and exit
    \\  --passwd    Path to passwd file (default: passwd)
    \\  --group     Path to group file (default: group)
    \\  --outdir    Path to output directory (default: ./userdb)
    \\
 ;
 /// Process entry point: forwards the command-line arguments (minus the
 /// program name) to `execute` and exits with the code it returns.
 pub fn main() !void {
    // The explicitly typed binding is kept because of
    // https://github.com/ziglang/zig/issues/7807
    const args: []const [*:0]const u8 = os.argv;
    const err_out = io.getStdErr().writer();
    const out = io.getStdOut().writer();
    os.exit(execute(heap.raw_c_allocator, out, err_out, args[1..]));
 }
 /// Runs the tool on already-split arguments and returns the process exit
 /// code: 0 on success, 1 on any failure.
 ///
 /// Reads the passwd and group files, builds the user -> groups membership
 /// table and writes the per-user records into the output directory. The usage
 /// text goes to `stdout` for `-h`; usage and error messages for failures go
 /// to `stderr`.
 fn execute(
    allocator: Allocator,
    stdout: anytype,
    stderr: anytype,
    argv: []const [*:0]const u8,
 ) u8 {
    const result = flags.parse(argv, &[_]flags.Flag{
        .{ .name = "-h", .kind = .boolean },
        .{ .name = "--passwd", .kind = .arg },
        .{ .name = "--group", .kind = .arg },
        .{ .name = "--outdir", .kind = .arg },
    }) catch {
        stderr.writeAll(usage) catch {};
        return 1;
    };
    if (result.boolFlag("-h")) {
        stdout.writeAll(usage) catch return 1;
        return 0;
    }
    if (result.args.len != 0) {
        stderr.print("ERROR: unknown option '{s}'\n", .{result.args[0]}) catch {};
        stderr.writeAll(usage) catch {};
        return 1;
    }
    const passwd_fname = result.argFlag("--passwd") orelse "passwd";
    const group_fname = result.argFlag("--group") orelse "group";
    const outdir = result.argFlag("--outdir") orelse "./userdb";
    // to catch an error set file.OpenError, wait for
    // https://github.com/ziglang/zig/issues/2473
    var errc = ErrCtx{};
    var passwd_file = fs.cwd().openFile(passwd_fname, .{ .mode = .read_only }) catch |err|
        return fail(errc.wrapf("open '{s}'", .{passwd_fname}), stderr, err);
    defer passwd_file.close();
    var group_file = fs.cwd().openFile(group_fname, .{ .mode = .read_only }) catch |err|
        return fail(errc.wrapf("open '{s}'", .{group_fname}), stderr, err);
    defer group_file.close();
    var passwdReader = io.bufferedReader(passwd_file.reader()).reader();
    var users = User.fromReader(allocator, &errc, passwdReader) catch |err|
        return fail(errc.wrap("read users"), stderr, err);
    defer {
        for (users) |*user| user.deinit(allocator);
        allocator.free(users);
    }
    var groupReader = io.bufferedReader(group_file.reader()).reader();
    var groups = Group.fromReader(allocator, groupReader) catch |err|
        return fail(errc.wrap("read groups"), stderr, err);
    defer {
        for (groups) |*group| group.deinit(allocator);
        allocator.free(groups);
    }
    // Must be `var`: fillMemberships inserts into the map.
    var user2groups = StringArrayHashMap(ArrayListUnmanaged([]const u8)).init(allocator);
    defer {
        var it = user2groups.iterator();
        while (it.next()) |entry|
            entry.value_ptr.*.deinit(allocator);
        user2groups.deinit();
    }
    // This function returns an exit code, not an error union, so every
    // fallible step below is reported through fail() instead of `try`.
    fillMemberships(allocator, groups, &user2groups) catch |err|
        return fail(errc.wrap("fill memberships"), stderr, err);
    // An already existing output directory is fine.
    os.mkdirZ(outdir, 0o755) catch |err| switch (err) {
        error.PathAlreadyExists => {},
        else => return fail(errc.wrapf("mkdir '{s}'", .{outdir}), stderr, err),
    };
    var dir = fs.cwd().openDir(outdir, .{}) catch |err|
        return fail(errc.wrapf("open directory '{s}'", .{outdir}), stderr, err);
    defer dir.close();
    makePasswd(dir, users, &user2groups) catch |err|
        return fail(errc.wrap("write users"), stderr, err);
    makeGroups(dir, groups) catch |err|
        return fail(errc.wrap("write groups"), stderr, err);
    return 0;
 }
 // JSON document that makePasswd serializes with json.stringify into
 // `<userName>.user`; the field names below are the emitted JSON keys.
 // NOTE(review): the keys look like systemd's JSON user record fields; confirm against that spec.
 const JSONPasswd = struct {
    uid: u32,
    gid: u32,
    userName: []const u8, // pw_name
    realName: []const u8, // pw_gecos
    homeDirectory: []const u8, // filled from User.home
    shell: []const u8, // filled from User.shell
    memberOf: []const []const u8, // group names from the memberships table; may be empty
 };
 /// Writes one JSON record per user into `dir` as `<name>.user`, plus a
 /// `<uid>.user` symlink that points at it.
 ///
 /// `memberships` maps a user name to the names of the groups that list that
 /// user; a user without an entry gets an empty `memberOf` array.
 /// Returns the first formatting, file-system or serialization error.
 fn makePasswd(
    dir: fs.Dir,
    users: []User,
    memberships: *const StringArrayHashMap(ArrayListUnmanaged([]const u8)),
 ) !void {
    // bufPrintZ writes the NUL terminator inside the buffer it is handed, so
    // both buffers reserve one extra byte beyond the longest file name.
    var namebuf: [PackedUser.max_name_len + ".user".len + 1]u8 = undefined;
    var symlinkbuf: [fmt.count("{d}.user", .{math.maxInt(u32)}) + 1]u8 = undefined;
    for (users) |user| {
        // The explicit type lets both branches coerce to the same slice type.
        const member_of: []const []const u8 = if (memberships.get(user.name)) |m|
            m.items
        else
            &[_][]const u8{};
        const u = JSONPasswd{
            .uid = user.uid,
            .gid = user.gid,
            .userName = user.name,
            .realName = user.gecos,
            .homeDirectory = user.home,
            .shell = user.shell,
            .memberOf = member_of,
        };
        const fname = try fmt.bufPrintZ(&namebuf, "{s}.user", .{user.name});
        var f = try dir.createFileZ(fname, .{});
        defer f.close();
        var wr = io.bufferedWriter(f.writer());
        try json.stringify(u, .{}, wr.writer());
        try wr.flush();
        const symlinkname = try fmt.bufPrintZ(&symlinkbuf, "{d}.user", .{user.uid});
        // NOTE(review): this fails with PathAlreadyExists when the symlink is
        // left over from an earlier run into the same directory.
        try os.symlinkatZ(fname, dir.fd, symlinkname);
    }
 }
 /// Placeholder for writing group records: both arguments are discarded and
 /// nothing is written.
 fn makeGroups(dir: fs.Dir, groups: []Group) !void {
    _ = groups;
    _ = dir;
 }
 /// Prints "ERROR <error name>: <context>" to `stderr`, where the context is
 /// whatever `errc.unwrap()` yields, and returns exit code 1. A failure to
 /// write the message is ignored.
 fn fail(errc: *ErrCtx, stderr: anytype, err: anytype) u8 {
    const context = errc.unwrap().constSlice();
    const name = @errorName(err);
    stderr.print("ERROR {s}: {s}\n", .{ name, context }) catch {};
    return 1;
 }
 /// Populates `user2groups` so that every member name maps to the list of
 /// names of the groups that include it.
 ///
 /// The per-user lists are allocated with `allocator`; map keys and list items
 /// are the slices found in `groups` (no strings are copied). The caller owns
 /// the map and must deinit every list with the same allocator.
 /// Returns `error.OutOfMemory` if the map or a list cannot grow.
 fn fillMemberships(
    allocator: Allocator,
    groups: []const Group,
    user2groups: *StringArrayHashMap(ArrayListUnmanaged([]const u8)),
 ) Allocator.Error!void {
    for (groups) |group| {
        for (group.members) |member| {
            // NOTE(review): assumes each member exposes a `.name` string; confirm against Group.zig.
            // The map is the managed variant, so getOrPut takes only the key.
            const member_groups = try user2groups.getOrPut(member.name);
            if (!member_groups.found_existing)
                member_groups.value_ptr.* = ArrayListUnmanaged([]const u8){};
            // The list is unmanaged: append needs the allocator and can fail.
            try member_groups.value_ptr.append(allocator, group.name);
        }
    }
 }
 const testing = std.testing;
 // An unrecognized argument must yield exit code 1 and an "unknown option"
 // message at the start of stderr.
 test "turbonss-unix2systemd invalid argument" {
    const gpa = testing.allocator;
    var err_buf = ArrayList(u8).init(gpa);
    defer err_buf.deinit();
    var out_buf = ArrayList(u8).init(gpa);
    defer out_buf.deinit();
    const argv = [_][*:0]const u8{"--invalid-argument"};
    const rc = execute(gpa, out_buf.writer(), err_buf.writer(), argv[0..]);
    try testing.expectEqual(@as(u8, 1), rc);
    const want_prefix = "ERROR: unknown option '--invalid-argument'";
    try testing.expect(mem.startsWith(u8, err_buf.items, want_prefix));
 }
 // A nonexistent passwd path must yield exit code 1 and a single
 // "FileNotFound" line on stderr naming that path.
 test "turbonss-unix2systemd trivial error: missing passwd file" {
    const allocator = testing.allocator;
    const args = &[_][*:0]const u8{
        "--passwd",
        "/does/not/exist/passwd",
        "--group",
        "/does/not/exist/group",
    };
    var stderr = ArrayList(u8).init(allocator);
    defer stderr.deinit();
    var stdout = ArrayList(u8).init(allocator);
    defer stdout.deinit();
    const exit_code = execute(allocator, stdout.writer(), stderr.writer(), args[0..]);
    try testing.expectEqual(@as(u8, 1), exit_code);
    // expectEqualStrings takes (expected, actual); keeping that order makes
    // the failure report label the two strings correctly.
    try testing.expectEqualStrings(
        "ERROR FileNotFound: open '/does/not/exist/passwd'\n",
        stderr.items,
    );
 }
 // fail() must print "ERROR <name>: <context>\n" and return exit code 1.
 test "turbonss-unix2systemd fail" {
    var errc = ErrCtx{};
    var buf = ArrayList(u8).init(testing.allocator);
    defer buf.deinit();
    const wr = buf.writer();
    const exit_code = fail(errc.wrapf("invalid user 'foo'", .{}), wr, error.NotSure);
    // Both helpers take (expected, actual), matching the other tests in this file.
    try testing.expectEqual(@as(u8, 1), exit_code);
    try testing.expectEqualStrings("ERROR NotSure: invalid user 'foo'\n", buf.items);
 }
 // End-to-end run: dumps the test corpus to passwd/group files in a scratch
 // directory, runs execute() on them and expects a clean exit.
 test "turbonss-unix2systemd smoke test" {
    const allocator = testing.allocator;
    var stderr = ArrayList(u8).init(allocator);
    defer stderr.deinit();
    var stdout = ArrayList(u8).init(allocator);
    defer stdout.deinit();
    var corpus = try Corpus.testCorpus(allocator);
    defer corpus.deinit();
    var tmp = testing.tmpDir(.{});
    // Remove the scratch directory on success as well as on failure.
    defer tmp.cleanup();
    const tmp_path = blk: {
        const relative_path = try fs.path.join(allocator, &[_][]const u8{
            "zig-cache",
            "tmp",
            tmp.sub_path[0..],
        });
        // Freed even when realpathAlloc fails.
        defer allocator.free(relative_path);
        break :blk try fs.realpathAlloc(allocator, relative_path);
    };
    defer allocator.free(tmp_path);
    const passwdPath = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "passwd" });
    defer allocator.free(passwdPath);
    const groupPath = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "group" });
    defer allocator.free(groupPath);
    const outDir = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "outdir" });
    defer allocator.free(outDir);
    {
        const passwd_fd = try os.open(passwdPath, os.O.CREAT | os.O.WRONLY, 0o644);
        const passwd_file = fs.File{ .handle = passwd_fd };
        // Closed on every path, including a failed write.
        defer passwd_file.close();
        var i: usize = 0;
        while (i < corpus.users.len) : (i += 1) {
            const user = corpus.users.get(i);
            const line = user.toLine().constSlice();
            // writeAll retries partial writes, unlike a single os.write.
            try passwd_file.writeAll(line);
        }
    }
    {
        const group_fd = try os.open(groupPath, os.O.CREAT | os.O.WRONLY, 0o644);
        const group_file = fs.File{ .handle = group_fd };
        defer group_file.close();
        const group_writer = group_file.writer();
        var i: usize = 0;
        while (i < corpus.groups.len) : (i += 1)
            try corpus.groups.get(i).writeTo(group_writer);
    }
    const args = &[_][*:0]const u8{
        "--passwd", passwdPath,
        "--group",  groupPath,
        "--outdir", outDir,
    };
    const exit_code = execute(allocator, stdout.writer(), stderr.writer(), args);
    // execute() writes to stderr only on failure, so a clean run leaves it
    // empty; checking it first surfaces the error text when the run fails.
    try testing.expectEqualStrings("", stderr.items);
    try testing.expectEqual(@as(u8, 0), exit_code);
 }