From fbd449b21f846607e4c3bf094985cb5e27b383e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Sun, 21 Aug 2022 06:08:21 +0300 Subject: [PATCH] Move docs around; finish it --- README.md | 499 +++++++--------------------------- docs/architecture.md | 327 ++++++++++++++++++++++ docs/development.md | 37 +++ src/turbonss-unix2systemd.zig | 305 --------------------- 4 files changed, 458 insertions(+), 710 deletions(-) create mode 100644 docs/architecture.md create mode 100644 docs/development.md delete mode 100644 src/turbonss-unix2systemd.zig diff --git a/README.md b/README.md index b05c357..41a62bf 100644 --- a/README.md +++ b/README.md @@ -1,442 +1,131 @@ Turbo NSS --------- -Turbonss is a plugin for GNU Name Service Switch (NSS) functionality of GNU C -Library (glibc). Turbonss implements lookup for `user` and `passwd` database -entries (i.e. system users, groups, and group memberships). It's main goal is -performance, with focus on making [`id(1)`][id] run as fast as possible. +Turbonss is a plugin for GNU Name Service Switch ([NSS][nsswitch]) +functionality of GNU C Library (glibc). Turbonss implements lookup for `passwd` +and `group` database entries (i.e. system users, groups, and group +memberships). Its main goal is to run [`id(1)`][id] as fast as possible. Turbonss is optimized for reading. If the data changes in any way, the whole -file will need to be regenerated (and tooling only supports only full -generation). It was created, and best suited, for environments that have a -central user & group database which then needs to be distributed to many -servers/services, and the data does not change very often (e.g. hourly). +file will need to be regenerated. Therefore, it was created, and is best suited, +for environments that have a central user & group database which then needs to +be distributed to many servers/services, and the data does not change very +often (e.g. hourly). 
-To understand more about name service switch, start with -[`nsswitch.conf(5)`][nsswitch]. +This is the fastest known NSS passwd/group implementation for *reads*. On a +corpus with 10k users, 10k groups and 500 average members per group, `id` takes +17 seconds with the glibc default implementation, 10-17 milliseconds with a +pre-cached `nscd`, ~8 milliseconds with `turbonss`. -Design & constraints --------------------- +Project status +-------------- -To be fast, the user/group database (later: DB) has to be small -([background][data-oriented-design]). It encodes user & group information in a -way that minimizes the DB size, and reduces jumping across the DB ("chasing -pointers and thrashing CPU cache"). +The project is finished and is not recommended for production; just use nscd. +Turbonss duly implements the full user/group API in `src/libnss.zig`: feel free +to copy that. -To understand how this is done efficiently, let's analyze the -[`getpwnam_r(3)`][getpwnam_r] in high level. This API call accepts a username -and returns the following user information: - -``` -struct passwd { - char *pw_name; /* username */ - char *pw_passwd; /* user password */ - uid_t pw_uid; /* user ID */ - gid_t pw_gid; /* group ID */ - char *pw_gecos; /* user information */ - char *pw_dir; /* home directory */ - char *pw_shell; /* shell program */ -}; -``` - -Turbonss, among others, implements this call, and takes the following steps to -resolve a username to a `struct passwd*`: - -- Open the DB (using `mmap`) and interpret it's first 64 bytes as a `*struct - Header`. The header stores offsets to the sections of the file. This needs to - be done once, when the NSS library is loaded. -- Hash the username using a perfect hash function. Perfect hash function - returns a number `n ∈ [0,N-1]`, where N is the total number of users. -- Jump to the `n`'th location in the `idx_name2user` section, which contains - the index `i` to the user's information. 
-- Jump to the location `i` of section `Users`, which stores the full user - information. -- Decode the user information (which is all in a continuous memory block) and - return it to the caller. - -In total, that's one hash for the username (~150ns), two pointer jumps within -the group file (to sections `idx_name2user` and `Users`), and, now that the -user record is found, `memcpy` for each field. - -The turbonss DB file is be `mmap`-ed, making it simple to jump across the file -using pointer arithmetic. This also reduces memory usage, as the mmap'ed -regions are shared. Turbonss reads do not consume any heap space. - -Tight packing places some constraints on the underlying data: - -- Permitted length of username and groupname: 1-32 bytes. -- Permitted length of shell and home: 1-256 bytes. -- Permitted comment ("gecos") length: 0-255 bytes. -- User name, groupname, gecos and shell must be utf8-encoded. -- User and Groups sections are up to 2^35B (~34GB) large. Assuming an "average" - user record takes 50 bytes, this section would fit ~660M users. The - worst-case upper bound is left as an exercise to the reader. - -Sorting is stable. In v0: -- Groups are sorted by gid, ascending. -- Users are sorted by their name, ascending by the unicode codepoints - (locale-independent). 
- -Checking out and building -------------------------- - -``` -$ git clone --recursive https://git.sr.ht/~motiejus/turbonss -``` - -Alternatively, if you forgot `--recursive`: - -``` -$ git submodule update --init -``` - -And run tests: - -``` -$ zig build test -``` - -Test the so ------------ - -Build: - - zig build -Dtarget=x86_64-linux-gnu.2.31 -Dcpu=x86_64_v3 -Drelease-fast=true -Dstrip=true - -Generate `db.turbo`: - - zig-out/bin/turbonss-unix2db --passwd /etc/passwd --group /etc/group - zig-out/bin/turbonss-analyze db.turbo - <...> - -Run a test container: - - $ docker run -ti --rm --privileged -v `pwd`:/etc/turbonss -w /etc/turbonss debian:bullseye - # cp zig-out/lib/libnss_turbo.so.2 /lib/x86_64-linux-gnu - # sed -i 's/\(\(passwd\|group\).*files\)$/\1 turbo/' /etc/nsswitch.conf - -And knock yourself out: - - getent passwd - getent group - id root - -This is probably not very interesting; you may want to take a larger corpus of -/etc/passwd and /etc/group for more interesting results. +Yours truly (the author) worked on this for about 7 months. And when this was +finished it turned out that just slapping nscd on top of the existing NSS +implementation is almost as fast as this. Dependencies ------------ -This project uses [git subtrac][git-subtrac] for managing dependencies. They -work just like regular submodules, except all the refs of the submodules are in -this repository. Repeat after me: all the submodules are in this repository. -So if you have a copy of this repo, dependencies will not disappear. +1. Stage1 of the nightly zig compiler. +2. [cmph][cmph]: bundled with this repository. -remarks on `id(1)` ------------------- +Trying it out +------------- -A known implementation runs id(1) at ~250 rps sequentially on ~20k users and -~10k groups. Our rps target is much higher. 
+Clone, compile and test first: -To better reason about the trade-offs, it is useful to understand how `id(1)` -is implemented, in rough terms: -- lookup user by name ([`getpwent_r(3)`][getpwent]). -- get all gids for the user ([`getgrouplist(3)`][getgrouplist]). Note: it is - actually using `initgroups_dyn`, accepts a uid, and is very poorly - documented. -- for each additional gid, get the `struct group*` - ([`getgrgid_r(3)`][getgrgid_r]). + $ git clone --recursive https://git.sr.ht/~motiejus/turbonss + $ zig build -fstage1 test + $ zig build -fstage1 -Dtarget=x86_64-linux-gnu.2.31 -Dcpu=x86_64_v3 -Drelease-safe=true -Assuming a member is in ~100 groups on average, to reach 10k id/s translates to -1M group lookups per second. We need to convert gid to a group index, and group -index to a group gid/name quickly. +One may choose different options, depending on requirements. Here are some +hints: -Caveat: `struct group` contains an array of pointers to names of group members -(`char **gr_mem`). However, `id` does not use that information, resulting in -read amplification, sometimes by 10-100x. Therefore, if `argv[0] == "id"`, our -implementation of [`getgrid_r(3)`][getgrid] returns the `struct group*` without -the members. This speeds up `id` by about 10x on a known NSS implementation. +1. `-Dcpu=<...>` for the CPU + [microarchitecture](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels). +2. `-Drelease-fast=true` for max speed +3. `-Drelease-small=true` for smallest binary sizes. +4. `-Dstrip=true` to strip debug symbols. -Relatedly, because [`getgrid_r(3)`][getgrid] does not need the group members, -the group members are stored in a different DB section, reducing the `Groups` -section and making more of it fit the CPU caches. +Test it on a real system +------------------------ -Turbonss header ---------------- +`db.turbo` is the TurboNSS database file. 
To create one from `/etc/group` and +`/etc/passwd`, use `turbonss-unix2db`: -The turbonss header looks like this: + $ zig-out/bin/turbonss-unix2db --passwd /etc/passwd --group /etc/group + $ zig-out/bin/turbonss-analyze db.turbo + <...> -``` -OFFSET TYPE NAME DESCRIPTION - 0 [4]u8 magic f0 9f a4 b7 - 4 u8 version 0 - 5 u8 endian 0 for little, 1 for big - 6 u8 nblocks_shell_blob max value: 63 - 7 u8 num_shells max value: 63 - 8 u32 num_groups number of group entries - 12 u32 num_users number of passwd entries - 16 u32 nblocks_bdz_gid bdz_gid section block count - 20 u32 nblocks_bdz_groupname - 24 u32 nblocks_bdz_uid - 28 u32 nblocks_bdz_username - 32 u64 nblocks_groups - 40 u64 nblocks_users - 48 u64 nblocks_groupmembers - 56 u64 nblocks_additional_gids - 64 u64 getgr_bufsize - 72 u64 getpw_bufsize - 80 [48]u8 padding -``` +Run and configure a test container that uses `turbonss` instead of the default +`files`: -`magic` is 0xf09fa4b7, and `version` must be `0`. All integers are -native-endian. `nblocks_*` is the count of blocks of a particular section; this -helps calculate the offsets to all sections. + $ docker run -ti --rm -v `pwd`:/etc/turbonss -w /etc/turbonss debian:bullseye + # cp zig-out/lib/libnss_turbo.so.2 /lib/x86_64-linux-gnu/ + # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf -Some numbers, like `nblocks_shell_blob`, `num_shells`, would fit to smaller -number of bytes. However, interpreting `[2]u6` with `xxd(1)` is harder than -interpreting `[2]u8`. Therefore we are using the space we have to make these -integers byte-wide. +And run the commands: -`getgr_bufsize` and `getpw_bufsize` is a hint for the caller of `getgr*` and -`getpw*`-family calls. This is the recommended size of the buffer, so the -caller does not receive `ENOMEM`. 
+ $ getent passwd + $ getent group + $ id root -Primitive types ---------------- +More users and groups +--------------------- -`User` and `Group` entries are sorted by the order they were received in the input -file. All entries are aligned to 8 bytes. All `User` and `Group` entries are -referred by their byte offset in the `Users` and `Groups` section relative to -the beginning of the section. +`turbonss-makecorpus` can synthesize more `users` and `groups`: -``` -const PackedGroup = packed struct { - gid: u32, - padding: u3, - groupname_len: u5, -} -``` + # ./zig-out/bin/turbonss-makecorpus + wrote users=10000 groups=10000 avg-members=1000 to . + # cat group >> /etc/group + # cat passwd >> /etc/passwd + # time id u_1000000 + <...> + real 0m17.380s + user 0m13.117s + sys 0m4.263s -PackedGroup is followed by the group name (of length `groupname_len`), followed -by a varint-compressed offset to the groupmembers section, followed by 8b padding. +17 seconds for an `id` command! Well, there are indeed many users and groups. +Let's see how turbonss fares with it: -PackedUser is a bit more involved: + # zig-out/bin/turbonss-unix2db --group /etc/group --passwd /etc/passwd + total 10968512 bytes. groups=10019 users=10039 + # ls -hs /etc/group /etc/passwd db.turbo + 48M /etc/group 668K /etc/passwd 11M db.turbo + # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf + # time id u_1000000 + real 0m0.008s + user 0m0.000s + sys 0m0.008s -``` -pub const PackedUser = packed struct { - uid: u32, - gid: u32, - shell_len_or_idx: u8, - shell_here: bool, - name_is_a_suffix: bool, - home_len: u6, - name_len: u5, - gecos_len: u11, -} -``` +That's ~1500x improvement for the `id` command (and notice about 4X compression +ratio compared to plain files). If the number of users and groups is increased +by 10x (to 100k each), the difference becomes even crazier: -... followed by `userdata: []u8`: -- home. -- name (optional). -- gecos. -- shell (optional). 
-- `additional_gids_offset`: varint. + # time id u_1000000 + <...> + real 3m42.281s + user 2m30.482s + sys 0m55.840s + # sed -i '/passwd\|group/ s/files/turbo/' /etc/nsswitch.conf + # time id u_1000000 + <...> + real 0m0.008s + user 0m0.000s + sys 0m0.008s -First byte of home is stored right after the `gecos_len` field, and its length -is `home_len`. The same logic applies to all the `stringdata` fields: there is -a way to calculate their relative position from the length of the fields before -them. +Documentation +------------- -PackedUser employs two data-oriented compression techniques: -- shells are often shared across different users, see the "Shells" section. -- `name` is frequently a suffix of `home`. For example, `/home/vidmantas` and - `vidmantas`. In this case storing both name and home is wasteful. Therefore - name has two options: - 1. `name_is_a_suffix=true`: name is a suffix of the home dir. Then `name` - starts at the `home_len - name_len`'th byte of `home`, and ends at the same - place as `home`. - 2. `name_is_a_suffix=false`: name begins one byte after home, and it's length - is `name_len`. - -The last field `additional_gids_offset: varint` points to the `additional_gids` -section for this user. - -Shells ------- - -Normally there is a limited number of separate shells even in huge user -databases. A few examples: `/bin/bash`, `/usr/bin/nologin`, `/bin/zsh` among -others. Therefore, "shells" have an optimization: they can be pointed by in the -external list, or, if they are unique to the user, reside among the user's -data. - -255 most popular shells (i.e. referred to by at least two User entries) are -stored externally in "Shells" area. The less popular ones are stored with -userdata. - -Shells section consists of two sub-sections: the index and the blob. The index -is an array of offsets: the i'th shell starts at `offsets[i]` byte, and ends at -`offsets[i+1]` byte. 
If there is at least one shell in the shell section, the -index contains a sentinel index as the last element, which signifies the position -of the last byte of the shell blob. - -`shell_here=true` in the User struct means the shell is stored with userdata, -and it's length is `shell_len_or_idx`. `shell_here=false` means it is stored in -the `Shells` section, and it's index is `shell_len_or_idx` (and the actual -string start and end offsets are resolved as described in the paragraph above). - -Variable-length integers (varints) ----------------------------------- - -Varint is an efficiently encoded integer (packed for small values). Same as -[protocol buffer varints][varint], except the largest possible value is `u64`. -They compress integers well. Varints are stored for group memberships. - -Group memberships ------------------ - -There are two group memberships at play: - -1. Given a group (gid/name), resolve the members' names (e.g. `getgrgid`). -2. Given a username, resolve user's group gids (for `initgroups(3)`). - -When group's memberships are resolved in (1), the same call also requires other -group information: gid and group name. Therefore it makes sense to store a -pointer to the group members in the group information itself. However, the -memberships are not *always* necessary (see remarks about `id(1)`), therefore -the memberships will be stored separately, outside of the groups section. - -Similarly, when user's groups are resolved in (2), they are not always necessary -(i.e. not part of `struct user*`), therefore the memberships themselves are -stored out of bound. - -`groupmembers` and `additional_gids` store group and user memberships -respectively. Membership IDs are packed — not necessitating random access, thus -suitable for compression. - -- `groupmembers` consists of a number X followed by a list of offsets to User - records, because `getgr*` returns pointers to membernames, thus a name has to - be immediately resolvable. 
-- `additional_gids` is a list of gids, because `initgroups_dyn` (and friends) - returns an array of gids. - -Each entry of `groupmembers` and `additional_gids` starts with a varint N, -which is the number of upcoming elements. Then N delta-compressed varints, -which are: - -- **additional_gids** a list of gids. -- **groupmembers** byte-offsets to the User records in the `users` section. - -Indices -------- - -Now that we've sketched the implementation of `id(3)`, it's clearer to -understand which operations need to be fast; in order of importance: - -1. lookup gid -> group info (this is on hot path in id) without members. -2. lookup username -> user's groups. -3. lookup uid -> user. -4. lookup groupname -> group. -5. lookup username -> user. - -These indices can use perfect hashing like [bdz from cmph][cmph]: a perfect -hash hashes a list of bytes to a sequential list of integers. Perfect hashing -algorithms require some space, and take some time to calculate ("hashing -duration"). I've tested BDZ, which hashes `[][]u8` to a sequential list of -integers (not preserving order) and CHM, preserves order. BDZ accepts an -optional argument `3 <= b <= 10`. - -* BDZ algorithm requires (b=3, 900KB, b=7, 338KB, b=10, 306KB) for 1M values. -* Latency to resolve 1M keys: (170ms, 180ms, 230ms, respectively). -* Packed vs non-packed latency differences are not meaningful. - -CHM retains order, however, 1M keys weigh 8MB. 10k keys are ~20x larger with -CHM than with BDZ, eliminating the benefit of preserved ordering: we can just -have a separate index. - -None of the tested perfect hashing algorithms makes the distinction between -existing (in the initial dictionary) and new keys. In other words, HASH(value) -will be pointing to a number `n ∈ [0,N-1]`, regardless whether the value was in -the initial dictionary. Therefore one must always confirm, after calculating -the hash, that the key matches what's been hashed. 
- -`idx_*` sections are of type `[]u32` and are pointing from `hash(key)` to the -respective `Groups` and `Users` entries (from the beginning of the respective -section). Since User and Group records are 8-byte aligned, the actual offset to -the record is acquired by right-shifting this value by 3 bits. - -Database file structure ------------------------ - -Each section is padded to 64 bytes. - -``` -SECTION SIZE DESCRIPTION -header 128 see "Turbonss header" section -bdz_gid ? bdz(gid) -bdz_groupname ? bdz(groupname) -bdz_uid ? bdz(uid) -bdz_username ? bdz(username) -idx_gid2group len(group)*4 bdz->offset Groups -idx_groupname2group len(group)*4 bdz->offset Groups -idx_uid2user len(user)*4 bdz->offset Users -idx_name2user len(user)*4 bdz->offset Users -shell_index len(shells)*2 shell index array -shell_blob <= 65280 shell data blob (max 255*256 bytes) -groups ? packed Group entries (8b padding) -users ? packed User entries (8b padding) -groupmembers ? per-group delta varint memberlist (no padding) -additional_gids ? per-user delta varint gidlist (no padding) -``` - -Section creation order: - -1. ✅ `bdz_*`. -1. ✅ `shell_index`, `shell_blob`. -1. ✅ `additional_gids`. -1. ✅ `users` requires `additional_gids` and shell. -1. ✅ `groupmembers` requires `users`. -1. ✅ `groups` requires `groupmembers`. -1. ✅ `idx_*`. requires offsets to `groups` and `users`. -1. ✅ Header. - -For v2 ------- - -These are desired for the next DB format: -- Compress strings with fsst. -- Trim first 4 bytes from the cmph headers. 
- -Profiling ---------- - -Prepare `profile.data`: - -``` -zig build -Drelease-small=true && \ - perf record --call-graph=dwarf \ - zig-out/bin/turbonss-unix2db --passwd passwd2 --group group2 -``` - -Perf interactive: - -``` -perf report -i perf.data -``` - -Flame graph: - -``` -perf script | inferno-collapse-perf | inferno-flamegraph > profile.svg -``` +Architecture is detailed in `docs/architecture.md` +Development notes are in `docs/development.md` [git-subtrac]: https://apenwarr.ca/log/20191109 -[cmph]: http://cmph.sourceforge.net/ -[id]: https://linux.die.net/man/1/id [nsswitch]: https://linux.die.net/man/5/nsswitch.conf -[data-oriented-design]: https://media.handmade-seattle.com/practical-data-oriented-design/ -[getpwnam_r]: https://linux.die.net/man/3/getpwnam_r -[varint]: https://developers.google.com/protocol-buffers/docs/encoding#varints -[getpwent]: https://www.man7.org/linux/man-pages/man3/getpwent_r.3.html -[getgrouplist]: https://www.man7.org/linux/man-pages/man3/getgrouplist.3.html -[getgrid]: https://www.man7.org/linux/man-pages/man3/getgrid_r.3.html +[id]: https://linux.die.net/man/1/id +[cmph]: http://cmph.sourceforge.net/ diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..88b2857 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,327 @@ +Design & constraints +-------------------- + +To be fast, the user/group database (later: DB) has to be small +([background][data-oriented-design]). It encodes user & group information in a +way that minimizes the DB size, and reduces jumping across the DB ("chasing +pointers and thrashing CPU cache"). + +To understand how this is done efficiently, let's analyze the +[`getpwnam_r(3)`][getpwnam_r] in high level. 
This API call accepts a username +and returns the following user information: + +``` +struct passwd { + char *pw_name; /* username */ + char *pw_passwd; /* user password */ + uid_t pw_uid; /* user ID */ + gid_t pw_gid; /* group ID */ + char *pw_gecos; /* user information */ + char *pw_dir; /* home directory */ + char *pw_shell; /* shell program */ +}; +``` + +Turbonss, among others, implements this call, and takes the following steps to +resolve a username to a `struct passwd*`: + +- Open the DB (using `mmap`) and interpret its first 128 bytes as a `*struct + Header`. The header stores offsets to the sections of the file. This needs to + be done once, when the NSS library is loaded. +- Hash the username using a perfect hash function. A perfect hash function + returns a number `n ∈ [0,N-1]`, where N is the total number of users. +- Jump to the `n`'th location in the `idx_name2user` section, which contains + the index `i` to the user's information. +- Jump to the location `i` of section `Users`, which stores the full user + information. +- Decode the user information (which is all in a contiguous memory block) and + return it to the caller. + +In total, that's one hash for the username (~150ns), two pointer jumps within +the DB file (to sections `idx_name2user` and `Users`), and, now that the +user record is found, `memcpy` for each field. + +The turbonss DB file is `mmap`-ed, making it simple to jump across the file +using pointer arithmetic. This also reduces memory usage, as the mmap'ed +regions are shared. Turbonss reads do not consume any heap space. + +Tight packing places some constraints on the underlying data: + +- Permitted length of username and groupname: 1-32 bytes. +- Permitted length of shell and home: 1-256 bytes. +- Permitted comment ("gecos") length: 0-255 bytes. +- User name, groupname, gecos and shell must be utf8-encoded. +- User and Groups sections are up to 2^35B (~34GB) large. 
Assuming an "average" + user record takes 50 bytes, this section would fit ~660M users. The + worst-case upper bound is left as an exercise to the reader. + +Sorting is stable. In v0: +- Groups are sorted by gid, ascending. +- Users are sorted by their name, ascending by the unicode codepoints + (locale-independent). + +Remarks on `id(1)` +------------------ + +A known implementation runs id(1) at ~250 rps sequentially on ~20k users and +~10k groups. Our rps target is much higher. + +To better reason about the trade-offs, it is useful to understand how `id(1)` +is implemented, in rough terms: +- lookup user by name ([`getpwent_r(3)`][getpwent]). +- get all gids for the user ([`getgrouplist(3)`][getgrouplist]). Note: it is + actually using `initgroups_dyn`, accepts a uid, and is very poorly + documented. +- for each additional gid, get the `struct group*` + ([`getgrgid_r(3)`][getgrid]). + +Assuming a user is in ~100 groups on average, reaching 10k id/s translates to +1M group lookups per second. We need to convert gid to a group index, and group +index to a group gid/name quickly. + +Caveat: `struct group` contains an array of pointers to names of group members +(`char **gr_mem`). However, `id` does not use that information, resulting in +read amplification, sometimes by 10-100x. Therefore, if `argv[0] == "id"`, our +implementation of [`getgrgid_r(3)`][getgrid] returns the `struct group*` without +the members. This speeds up `id` by about 10x on a known NSS implementation. + +Relatedly, because [`getgrgid_r(3)`][getgrid] does not need the group members, +the group members are stored in a different DB section, reducing the `Groups` +section and making more of it fit the CPU caches. 
+ +Turbonss header +--------------- + +The turbonss header looks like this: + +``` +OFFSET TYPE NAME DESCRIPTION + 0 [4]u8 magic f0 9f a4 b7 + 4 u8 version 0 + 5 u8 endian 0 for little, 1 for big + 6 u8 nblocks_shell_blob max value: 63 + 7 u8 num_shells max value: 63 + 8 u32 num_groups number of group entries + 12 u32 num_users number of passwd entries + 16 u32 nblocks_bdz_gid bdz_gid section block count + 20 u32 nblocks_bdz_groupname + 24 u32 nblocks_bdz_uid + 28 u32 nblocks_bdz_username + 32 u64 nblocks_groups + 40 u64 nblocks_users + 48 u64 nblocks_groupmembers + 56 u64 nblocks_additional_gids + 64 u64 getgr_bufsize + 72 u64 getpw_bufsize + 80 [48]u8 padding +``` + +`magic` is 0xf09fa4b7, and `version` must be `0`. All integers are +native-endian. `nblocks_*` is the count of blocks of a particular section; this +helps calculate the offsets to all sections. + +Some numbers, like `nblocks_shell_blob`, `num_shells`, would fit to smaller +number of bytes. However, interpreting `[2]u6` with `xxd(1)` is harder than +interpreting `[2]u8`. Therefore we are using the space we have to make these +integers byte-wide. + +`getgr_bufsize` and `getpw_bufsize` is a hint for the caller of `getgr*` and +`getpw*`-family calls. This is the recommended size of the buffer, so the +caller does not receive `ENOMEM`. + +Primitive types +--------------- + +`User` and `Group` entries are sorted by the order they were received in the input +file. All entries are aligned to 8 bytes. All `User` and `Group` entries are +referred by their byte offset in the `Users` and `Groups` section relative to +the beginning of the section. + +``` +const PackedGroup = packed struct { + gid: u32, + padding: u3, + groupname_len: u5, +} +``` + +PackedGroup is followed by the group name (of length `groupname_len`), followed +by a varint-compressed offset to the groupmembers section, followed by 8b padding. 
+ +PackedUser is a bit more involved: + +``` +pub const PackedUser = packed struct { + uid: u32, + gid: u32, + shell_len_or_idx: u8, + shell_here: bool, + name_is_a_suffix: bool, + home_len: u6, + name_len: u5, + gecos_len: u11, +} +``` + +... followed by `userdata: []u8`: +- home. +- name (optional). +- gecos. +- shell (optional). +- `additional_gids_offset`: varint. + +First byte of home is stored right after the `gecos_len` field, and its length +is `home_len`. The same logic applies to all the `userdata` fields: there is +a way to calculate their relative position from the length of the fields before +them. + +PackedUser employs two data-oriented compression techniques: +- shells are often shared across different users, see the "Shells" section. +- `name` is frequently a suffix of `home`. For example, `/home/vidmantas` and + `vidmantas`. In this case storing both name and home is wasteful. Therefore + name has two options: + 1. `name_is_a_suffix=true`: name is a suffix of the home dir. Then `name` + starts at the `home_len - name_len`'th byte of `home`, and ends at the same + place as `home`. + 2. `name_is_a_suffix=false`: name begins one byte after home, and its length + is `name_len`. + +The last field `additional_gids_offset: varint` points to the `additional_gids` +section for this user. + +Shells +------ + +Normally there is a limited number of separate shells even in huge user +databases. A few examples: `/bin/bash`, `/usr/bin/nologin`, `/bin/zsh` among +others. Therefore, "shells" have an optimization: they can be pointed to in an +external list, or, if they are unique to the user, reside among the user's +data. + +The 255 most popular shells (i.e. referred to by at least two User entries) are +stored externally in the "Shells" area. The less popular ones are stored with +userdata. + +The Shells section consists of two sub-sections: the index and the blob. 
The index +is an array of offsets: the i'th shell starts at `offsets[i]` byte, and ends at +`offsets[i+1]` byte. If there is at least one shell in the shell section, the +index contains a sentinel index as the last element, which signifies the position +of the last byte of the shell blob. + +`shell_here=true` in the User struct means the shell is stored with userdata, +and its length is `shell_len_or_idx`. `shell_here=false` means it is stored in +the `Shells` section, and its index is `shell_len_or_idx` (and the actual +string start and end offsets are resolved as described in the paragraph above). + +Variable-length integers (varints) +---------------------------------- + +Varint is an efficiently encoded integer (packed for small values). Same as +[protocol buffer varints][varint], except the largest possible value fits in a `u64`. +They compress integers well. Varints are used to store group memberships. + +Group memberships +----------------- + +There are two kinds of group memberships at play: + +1. Given a group (gid/name), resolve the members' names (e.g. `getgrgid`). +2. Given a username, resolve user's group gids (for `initgroups(3)`). + +When a group's memberships are resolved in (1), the same call also requires other +group information: gid and group name. Therefore it makes sense to store a +pointer to the group members in the group information itself. However, the +memberships are not *always* necessary (see remarks about `id(1)`), therefore +the memberships will be stored separately, outside of the groups section. + +Similarly, when a user's groups are resolved in (2), they are not always necessary +(i.e. not part of `struct passwd*`), therefore the memberships themselves are +stored out of band. + +`groupmembers` and `additional_gids` store group and user memberships +respectively. Membership IDs are packed — not necessitating random access, thus +suitable for compression. 
+ +- `groupmembers` consists of a number X followed by a list of offsets to User + records, because `getgr*` returns pointers to membernames, thus a name has to + be immediately resolvable. +- `additional_gids` is a list of gids, because `initgroups_dyn` (and friends) + returns an array of gids. + +Each entry of `groupmembers` and `additional_gids` starts with a varint N, +which is the number of upcoming elements. Then follow N delta-compressed varints, +which are: + +- **additional_gids**: a list of gids. +- **groupmembers**: byte-offsets to the User records in the `users` section. + +Indices +------- + +Now that we've sketched the implementation of `id(1)`, it's easier to +understand which operations need to be fast; in order of importance: + +1. lookup gid -> group info (this is on hot path in id) without members. +2. lookup username -> user's groups. +3. lookup uid -> user. +4. lookup groupname -> group. +5. lookup username -> user. + +These indices can use perfect hashing like [bdz from cmph][cmph]: a perfect +hash hashes a list of bytes to a sequential list of integers. Perfect hashing +algorithms require some space, and take some time to calculate ("hashing +duration"). I've tested BDZ, which hashes `[][]u8` to a sequential list of +integers (not preserving order) and CHM, which preserves order. BDZ accepts an +optional argument `3 <= b <= 10`. + +* BDZ algorithm requires (b=3, 900KB, b=7, 338KB, b=10, 306KB) for 1M values. +* Latency to resolve 1M keys: (170ms, 180ms, 230ms, respectively). +* Packed vs non-packed latency differences are not meaningful. + +CHM retains order, however, 1M keys weigh 8MB. 10k keys are ~20x larger with +CHM than with BDZ, eliminating the benefit of preserved ordering: we can just +have a separate index. + +None of the tested perfect hashing algorithms makes the distinction between +existing (in the initial dictionary) and new keys. 
In other words, HASH(value) +will be pointing to a number `n ∈ [0,N-1]`, regardless of whether the value was in +the initial dictionary. Therefore one must always confirm, after calculating +the hash, that the key matches what's been hashed. + +`idx_*` sections are of type `[]u32` and are pointing from `hash(key)` to the +respective `Groups` and `Users` entries (from the beginning of the respective +section). Since User and Group records are 8-byte aligned, the actual offset to +the record is acquired by left-shifting this value by 3 bits. + +Database file structure +----------------------- + +Each section is padded to 64 bytes. + +``` +SECTION SIZE DESCRIPTION +header 128 see "Turbonss header" section +bdz_gid ? bdz(gid) +bdz_groupname ? bdz(groupname) +bdz_uid ? bdz(uid) +bdz_username ? bdz(username) +idx_gid2group len(group)*4 bdz->offset Groups +idx_groupname2group len(group)*4 bdz->offset Groups +idx_uid2user len(user)*4 bdz->offset Users +idx_name2user len(user)*4 bdz->offset Users +shell_index len(shells)*2 shell index array +shell_blob <= 65280 shell data blob (max 255*256 bytes) +groups ? packed Group entries (8b padding) +users ? packed User entries (8b padding) +groupmembers ? per-group delta varint memberlist (no padding) +additional_gids ? 
per-user delta varint gidlist (no padding) +``` + +[cmph]: http://cmph.sourceforge.net/ +[id]: https://linux.die.net/man/1/id +[data-oriented-design]: https://media.handmade-seattle.com/practical-data-oriented-design/ +[getpwnam_r]: https://linux.die.net/man/3/getpwnam_r +[varint]: https://developers.google.com/protocol-buffers/docs/encoding#varints +[getpwent]: https://www.man7.org/linux/man-pages/man3/getpwent_r.3.html +[getgrouplist]: https://www.man7.org/linux/man-pages/man3/getgrouplist.3.html +[getgrid]: https://www.man7.org/linux/man-pages/man3/getgrid_r.3.html diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 0000000..21d4b51 --- /dev/null +++ b/docs/development.md @@ -0,0 +1,37 @@ +Profiling +--------- + +Prepare `perf.data`: + +``` +zig build -Drelease-small=true && \ + perf record --call-graph=dwarf \ + zig-out/bin/turbonss-unix2db --passwd passwd --group group +``` + +Perf interactive: + +``` +perf report -i perf.data +``` + +Flame graph: + +``` +perf script | inferno-collapse-perf | inferno-flamegraph > profile.svg +``` + +For v2 +------ + +These are desired for the next DB format: +- Compress strings with fsst. +- Trim first 4 bytes from the cmph headers. + +Dependencies +------------ + +This project uses [git subtrac][git-subtrac] for managing dependencies. They +work just like regular submodules, except all the refs of the submodules are in +this repository. Repeat after me: all the submodules are in this repository. +So if you have a copy of this repo, dependencies will not disappear. 
diff --git a/src/turbonss-unix2systemd.zig b/src/turbonss-unix2systemd.zig deleted file mode 100644 index bb9aded..0000000 --- a/src/turbonss-unix2systemd.zig +++ /dev/null @@ -1,305 +0,0 @@ -const std = @import("std"); -const fs = std.fs; -const io = std.io; -const mem = std.mem; -const os = std.os; -const heap = std.heap; -const math = std.math; -const fmt = std.fmt; -const json = std.json; -const ArrayList = std.ArrayList; -const ArrayListUnmanaged = std.ArrayListUnmanaged; -const Allocator = std.mem.Allocator; -const StringArrayHashMap = std.StringArrayHashMap; - -const flags = @import("flags.zig"); -const User = @import("User.zig"); -const PackedUser = @import("PackedUser.zig"); -const Group = @import("Group.zig"); -const Corpus = @import("Corpus.zig"); -const DB = @import("DB.zig"); -const ErrCtx = @import("ErrCtx.zig"); - -const usage = - \\usage: turbonss-unix2systemd [OPTION]... - \\ - \\Options: - \\ -h Print this help message and exit - \\ --passwd Path to passwd file (default: passwd) - \\ --group Path to group file (default: group) - \\ --outdir Path to output directory (default: ./userdb) - \\ -; - -pub fn main() !void { - // This line is here because of https://github.com/ziglang/zig/issues/7807 - const argv: []const [*:0]const u8 = os.argv; - const gpa = heap.raw_c_allocator; - - const stderr = io.getStdErr().writer(); - const stdout = io.getStdOut().writer(); - - const return_code = execute(gpa, stdout, stderr, argv[1..]); - os.exit(return_code); -} - -fn execute( - allocator: Allocator, - stdout: anytype, - stderr: anytype, - argv: []const [*:0]const u8, -) u8 { - const result = flags.parse(argv, &[_]flags.Flag{ - .{ .name = "-h", .kind = .boolean }, - .{ .name = "--passwd", .kind = .arg }, - .{ .name = "--group", .kind = .arg }, - .{ .name = "--outdir", .kind = .arg }, - }) catch { - stderr.writeAll(usage) catch {}; - return 1; - }; - - if (result.boolFlag("-h")) { - stdout.writeAll(usage) catch return 1; - return 0; - } - - if (result.args.len 
!= 0) { - stderr.print("ERROR: unknown option '{s}'\n", .{result.args[0]}) catch {}; - stderr.writeAll(usage) catch {}; - return 1; - } - - const passwd_fname = result.argFlag("--passwd") orelse "passwd"; - const group_fname = result.argFlag("--group") orelse "group"; - const outdir = result.argFlag("--outdir") orelse "./userdb"; - - // to catch an error set file.OpenError, wait for - // https://github.com/ziglang/zig/issues/2473 - var errc = ErrCtx{}; - var passwd_file = fs.cwd().openFile(passwd_fname, .{ .mode = .read_only }) catch |err| - return fail(errc.wrapf("open '{s}'", .{passwd_fname}), stderr, err); - defer passwd_file.close(); - - var group_file = fs.cwd().openFile(group_fname, .{ .mode = .read_only }) catch |err| - return fail(errc.wrapf("open '{s}'", .{group_fname}), stderr, err); - defer group_file.close(); - - var passwdReader = io.bufferedReader(passwd_file.reader()).reader(); - var users = User.fromReader(allocator, &errc, passwdReader) catch |err| - return fail(errc.wrap("read users"), stderr, err); - defer { - for (users) |*user| user.deinit(allocator); - allocator.free(users); - } - - var groupReader = io.bufferedReader(group_file.reader()).reader(); - var groups = Group.fromReader(allocator, groupReader) catch |err| - return fail(errc.wrap("read groups"), stderr, err); - defer { - for (groups) |*group| group.deinit(allocator); - allocator.free(groups); - } - - const user2groups = StringArrayHashMap(ArrayListUnmanaged([]const u8)).init(allocator); - defer { - var it = user2groups.iterator(); - while (it.next()) |entry| - entry.value_ptr.*.deinit(allocator); - user2groups.deinit(); - } - fillMemberships(allocator, groups, &user2groups); - - try os.mkdirZ(outdir, 0o755) catch |err| switch (err) { - error.PathAlreadyExists => {}, - else => |err| return err, - }; - - var dir = try fs.cwd().openDir(outdir, .{}); - - try makePasswd(dir, users.items); - try makeGroups(dir, groups.items); - - return 0; -} - -const JSONPasswd = struct { - uid: u32, - 
gid: u32, - userName: []const u8, // pw_name - realName: []const u8, // pw_gecos - homeDirectory: []const u8, - shell: []const u8, - memberOf: []const []const u8, -}; - -fn makePasswd( - dir: fs.Dir, - users: []User, - memberships: *const StringArrayHashMap(ArrayListUnmanaged([]const u8)), -) !void { - var namebuf: [PackedUser.max_name_len + ".user".len:0]u8 = undefined; - var symlinkbuf: [fmt.count("{d}.user", math.maxInt(u32)):0]u8 = undefined; - - for (users) |user| { - const member_of = if (memberships.get(user.name)) |m| - m.items - else - []const []const u8{}; - - const u = JSONPasswd{ - .uid = user.uid, - .gid = user.gid, - .userName = user.name, - .realName = user.gecos, - .homeDirectory = user.home, - .shell = user.shell, - .memberOf = member_of, - }; - - const fname = try fmt.bufPrintZ(namebuf, "{s}.user", user.name); - var f = try dir.createFileZ(fname, .{}); - defer f.close(); - - var wr = io.bufferedWriter(f.writer()); - try json.stringify(u, .{}, wr); - try wr.flush(); - - const symlinkname = try fmt.bufPrintZ(symlinkbuf, "{d}.user", user.uid); - try os.symlinkatZ(fname, dir.fd, symlinkname); - } -} - -fn makeGroups(dir: fs.Dir, groups: []Group) !void { - _ = dir; - _ = groups; -} - -fn fail(errc: *ErrCtx, stderr: anytype, err: anytype) u8 { - const err_chain = errc.unwrap().constSlice(); - stderr.print("ERROR {s}: {s}\n", .{ @errorName(err), err_chain }) catch {}; - return 1; -} - -fn fillMemberships( - allocator: Allocator, - groups: ArrayList(Group), - user2groups: *StringArrayHashMap(ArrayListUnmanaged([]const u8)), -) void { - for (groups) |group| { - for (group.members) |member| { - const member_groups = try user2groups.getOrPut(allocator, member.name); - if (!member_groups.found_existing) - member_groups.value_ptr.* = ArrayListUnmanaged([]const u8){}; - member_groups.value_ptr.*.append(group.name); - } - } -} - -const testing = std.testing; - -test "turbonss-unix2systemd invalid argument" { - const allocator = testing.allocator; - const args = 
&[_][*:0]const u8{"--invalid-argument"}; - var stderr = ArrayList(u8).init(allocator); - defer stderr.deinit(); - var stdout = ArrayList(u8).init(allocator); - defer stdout.deinit(); - - const exit_code = execute(allocator, stdout.writer(), stderr.writer(), args[0..]); - try testing.expectEqual(@as(u8, 1), exit_code); - try testing.expect(mem.startsWith( - u8, - stderr.items, - "ERROR: unknown option '--invalid-argument'", - )); -} - -test "turbonss-unix2systemd trivial error: missing passwd file" { - const allocator = testing.allocator; - const args = &[_][*:0]const u8{ - "--passwd", - "/does/not/exist/passwd", - "--group", - "/does/not/exist/group", - }; - var stderr = ArrayList(u8).init(allocator); - defer stderr.deinit(); - var stdout = ArrayList(u8).init(allocator); - defer stdout.deinit(); - - const exit_code = execute(allocator, stdout.writer(), stderr.writer(), args[0..]); - try testing.expectEqual(@as(u8, 1), exit_code); - try testing.expectEqualStrings(stderr.items, "ERROR FileNotFound: open '/does/not/exist/passwd'\n"); -} - -test "turbonss-unix2systemd fail" { - var errc = ErrCtx{}; - var buf = ArrayList(u8).init(testing.allocator); - defer buf.deinit(); - var wr = buf.writer(); - const exit_code = fail(errc.wrapf("invalid user 'foo'", .{}), wr, error.NotSure); - try testing.expectEqual(exit_code, 1); - try testing.expectEqualStrings(buf.items, "ERROR NotSure: invalid user 'foo'\n"); -} - -test "turbonss-unix2db smoke test" { - const allocator = testing.allocator; - var stderr = ArrayList(u8).init(allocator); - defer stderr.deinit(); - var stdout = ArrayList(u8).init(allocator); - defer stdout.deinit(); - - var corpus = try Corpus.testCorpus(allocator); - defer corpus.deinit(); - - var tmp = testing.tmpDir(.{}); - // TODO: defer - errdefer tmp.cleanup(); - - const tmp_path = blk: { - const relative_path = try fs.path.join(allocator, &[_][]const u8{ - "zig-cache", - "tmp", - tmp.sub_path[0..], - }); - const real_path = try fs.realpathAlloc(allocator, 
relative_path); - allocator.free(relative_path); - break :blk real_path; - }; - defer allocator.free(tmp_path); - - const passwdPath = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "passwd" }); - defer allocator.free(passwdPath); - const groupPath = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "group" }); - defer allocator.free(groupPath); - const outDir = try fs.path.joinZ(allocator, &[_][]const u8{ tmp_path, "outdir" }); - defer allocator.free(outDir); - - const passwd_fd = try os.open(passwdPath, os.O.CREAT | os.O.WRONLY, 0o644); - const group_fd = try os.open(groupPath, os.O.CREAT | os.O.WRONLY, 0o644); - - var i: usize = 0; - while (i < corpus.users.len) : (i += 1) { - const user = corpus.users.get(i); - const line = user.toLine().constSlice(); - _ = try os.write(passwd_fd, line); - } - os.close(passwd_fd); - - var group_writer = (fs.File{ .handle = group_fd }).writer(); - i = 0; - while (i < corpus.groups.len) : (i += 1) - try corpus.groups.get(i).writeTo(group_writer); - os.close(group_fd); - - const args = &[_][*:0]const u8{ - "--passwd", passwdPath, - "--group", groupPath, - "--outdir", outDir, - }; - - const exit_code = execute(allocator, stdout.writer(), stderr.writer(), args); - try testing.expectEqualStrings("total 1664 bytes. groups=5 users=4\n", stderr.items); - try testing.expectEqual(@as(u8, 0), exit_code); -}