From 84bb5f7fd75f1c5414e08f1adc05a558e6823513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Sat, 12 Feb 2022 10:13:10 +0200 Subject: [PATCH] update user record --- README.md | 101 ++++++++++++++++-------------------------------------- 1 file changed, 30 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 40dbd1a..1b62099 100644 --- a/README.md +++ b/README.md @@ -35,66 +35,28 @@ ID/s. id(1) works as follows: - lookup user by name. - get all additional gids (an array attached to a member). -- for each additional gid, return the group name. +- for each additional gid, get the group name. Assuming a member is in ~100 groups on average, that's 1M group lookups per -second. We need to convert gid to a group index quickly. +second (cmph can do 1M in <200ms). We need to convert gid to a group index +quickly. -Data structures ---------------- - -Basic data structures that allow efficient storage: - -```lang=c -// reminder: -typedef uid_t uint32; -typedef gid_t uint32; - -// 6*32b = 6*4B = 24B/user -typedef struct { - uid_t uid; - gid_t gid; - name_offset uint32; // offset into *usernames - gecos_offset uint32; // offset into *gecos - shell_offset uint32; // offset into *shells - additional_groups_offset uint32; // offset into additional_groups -} user; - -const char* usernames; // all concatenated usernames, fsst-compressed -const char* gecoss; // all concatenated gecos, fsst-compressed -const char* shells; // all concatenated home directories, fsst-compressed -const uint8_t additional_groups; // all additional_groups, turbo compressed - -typedef struct { - gid_t gid; - name_offset uint32; // offset into *groupnames - members_offset uint32; // offset into members -} - -const char* groupnames; // all concatenated group names, fsst-compressed -const uint8_8 members; // all concatenated members, turbo compressed -``` - -"turbo compression" encodes a list of uids/gids with this algorithm: -1. sort ascending. -2. 
extract deltas and subtract 1: `awk '{diff=$0-prev; prev=$0; print - diff-1}'`. -3. varint-encode these deltas into an uint32, like protobuf or utf8. - -With typical group memberships (as of writing) this requires ~1.3-1.5 byte per -entry. - -Indexes -------- +API +--- The following operations need to be fast, in order of importance: -1. lookup gid -> group (this is on hot path in id). +1. lookup gid -> group (this is on hot path in id) with or without members (2 + separate calls). 2. lookup uid -> user. 3. lookup groupname -> group. 4. lookup username -> user. -5. (optional) iterate users using a defined order (`getent passwd`). -6. (optional) iterate groups using a defined order (`getent group`). +5. lookup uid -> list of gids. +6. (optional) iterate users using a defined order (`getent passwd`). +7. (optional) iterate groups using a defined order (`getent group`). + +Indexes +------- Preliminary results of playing with [cmph][cmph]: @@ -104,34 +66,31 @@ BDZ: tried b=3, b=7 (default), and b=10. * Latency for 1M keys: (170ms, 180ms, 230ms). * Packed vs non-packed latency differences are not meaningful. -CHM retains order, however, 1M keys weigh 8MB. 10k keys are ~20x larger with +CHM retains order, however, 1M keys weigh 8MB. 10k keys are ~20x larger with CHM than with BDZ, eliminating the benefit of preserved ordering. Full file structure ------------------- -The file structure stars with the metadata field. All indexes are number of -bytes, relative to the beginning of the file. +The file structure starts with magic and version number, followed by a list of +User, Group records and their indices. All indices are number of bytes, +relative to the beginning of the file. 
``` -const Offsets = struct { - magic: [4]u32, - version: u32, - - num_users, size_num_users: u32, - num_groups, size_num_groups: u32, - cmph_gid2group: u32, - size_cmph_gid2group: u32, - cmph_uid2user, size_cmph_uid2user: u32, - cmph_groupname2group, size_cmph_groupname2group: u32, - cmph_username2user, size_cmph_username2user: u32, - structs_group, size_structs_group: u32, - structs_user, size_structs_user: u32, - fsst_usernames_homes, size_fsst_usernames_homes: u32, - fsst_groupnames, size_fsst_usernames_homes: u32, - fsst_shells, size_fsst_shells: u32, +// /home/motiejusMotiejus.Jakstys is 16+30=46b for "my" record. +const User = struct { + uid: u32, + gid: u32, + additional_gids_offset: u29, + shell_here: u1, // whether it's stored "here" or in another place. Docs TBD + shell_len: u6, + home_len: u6, + username_len: u6, + gecos_len: u8, + // a variable-sized array that will be stored immediately after this + // struct. + stringdata []u8; } -``` `magic` must be 0xf09fa4b7, and `version` must be `0x00`. The remaining fields are indexes to further sections of the file with their sizes in bytes. All