From 4c7afb8afa5232c6d51722b024afaf1f261d2c20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Mon, 24 May 2021 00:11:58 +0300 Subject: [PATCH] doc --- main.go | 8 +- rootfs/BUILD | 5 +- rootfs/doc.go | 70 +++++++++++++++++ rootfs/rootfs.go | 80 +++++++++---------- rootfs/rootfs_test.go | 176 +++++++++++++++++++++++++++++++----------- 5 files changed, 249 insertions(+), 90 deletions(-) create mode 100644 rootfs/doc.go diff --git a/main.go b/main.go index 9fe7281..0b20280 100644 --- a/main.go +++ b/main.go @@ -50,9 +50,7 @@ func (r *cmdRootFS) Execute(args []string) (err error) { if err != nil { return err } - defer func() { - err = multierr.Append(err, in.Close()) - }() + defer func() { err = multierr.Append(err, in.Close()) }() var out *os.File outf := string(r.PositionalArgs.Outfile) @@ -64,9 +62,7 @@ func (r *cmdRootFS) Execute(args []string) (err error) { return err } } - defer func() { - err = multierr.Append(err, out.Close()) - }() + defer func() { err = multierr.Append(err, out.Close()) }() return rootfs.RootFS(in, out) } diff --git a/rootfs/BUILD b/rootfs/BUILD index 385fdaa..ba78de1 100644 --- a/rootfs/BUILD +++ b/rootfs/BUILD @@ -2,7 +2,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "go_default_library", - srcs = ["rootfs.go"], + srcs = [ + "doc.go", + "rootfs.go", + ], importpath = "github.com/motiejus/code/undocker/rootfs", visibility = ["//visibility:public"], deps = ["@org_uber_go_multierr//:go_default_library"], diff --git a/rootfs/doc.go b/rootfs/doc.go new file mode 100644 index 0000000..14b18ce --- /dev/null +++ b/rootfs/doc.go @@ -0,0 +1,70 @@ +// Package rootfs extracts all layers of a Docker container image to a single +// tarball. It will go through all layers in order and copy every file to the +// destination archive. +// +// Except it will also reasonably process those files.
+// +// == Non-directory will be copied only once == +// A non-directory will be copied only once, only from within its last +// occurrence. I.e. if file /a/b was found in layers 0 and 2, only the file +// from layer 2 will be used. +// Directories will always be copied, even if there are duplicates. This is +// to avoid a situation like this: +// layer0: +// - ./dir/ +// - ./dir/file +// layer1: +// - ./dir/ +// - ./dir/file +// In theory, the directory from layer 1 takes precedence, so a tarball like +// this could be created: +// - ./dir/ (from layer1) +// - ./dir/file (from layer1) +// However, imagine the following: +// layer0: +// - ./dir/ +// - ./dir/file1 +// layer1: +// - ./dir/ +// Then the resulting tarball would have: +// - ./dir/file1 (from layer0) +// - ./dir/ (from layer1) +// Which would mean `untar` would try to untar a file to a directory which +// was not yet created. Therefore directories will be copied to the resulting +// tar in the order they appear in the layers. +// +// == Special files: .dockerenv == +// +// .dockerenv is present in all docker containers, and is likely to remain +// such. So if you do `docker export <container>`, the resulting tarball will +// have this file. rootfs will not add it. You are welcome to append one +// yourself. +// +// == Special files: opaque files and dirs (.wh.*) == +// +// From mount.aufs(8)[1]: +// +// The whiteout is for hiding files on lower branches. Also it is applied to +// stop readdir going lower branches. The latter case is called ‘opaque +// directory.’ Any whiteout is an empty file, it means whiteout is just an +// mark. In the case of hiding lower files, the name of whiteout is +// ‘.wh.<filename>.’ And in the case of stopping readdir, the name is +// ‘.wh..wh..opq’. All whiteouts are hardlinked, including ‘/.wh..wh.aufs`. +// +// My interpretation: +// - a hardlink called `.wh..wh..opq` means that directory contents from the +// layers below the mentioned file should be ignored.
Higher layers may add +// files on top. +// - if hardlink `.wh.([^/]+)` is found, $1 should be deleted from the current +// and lower layers. +// +// == Tar format == +// +// Since we do care about long filenames and large file sizes (>8GB), we are +// using "classic" GNU Tar. However, at least NetBSD pax is known to have +// problems reading it[2]. +// +// [1]: https://manpages.debian.org/unstable/aufs-tools/mount.aufs.8.en.html +// [2]: https://mgorny.pl/articles/portability-of-tar-features.html +package rootfs diff --git a/rootfs/rootfs.go b/rootfs/rootfs.go index d6341f2..9826652 100644 --- a/rootfs/rootfs.go +++ b/rootfs/rootfs.go @@ -4,6 +4,7 @@ import ( "archive/tar" "encoding/json" "errors" + "fmt" "io" "path/filepath" "strings" @@ -13,10 +14,11 @@ import ( const ( _manifestJSON = "manifest.json" + _layerSuffix = "/layer.tar" ) var ( - ErrBadManifest = errors.New("bad or missing manifest.json") + errBadManifest = errors.New("bad or missing manifest.json") ) type dockerManifestJSON []struct { @@ -24,7 +26,7 @@ type dockerManifestJSON []struct { Layers []string `json:"Layers"` } -// Rootfs accepts a docker layer tarball and writes it to outfile. +// RootFS accepts a docker layer tarball and writes it to outfile. // 1. create map[string]io.ReadSeeker for each layer. // 2. parse manifest.json and get the layer order. // 3. 
go through each layer in order and write: @@ -36,9 +38,7 @@ type dockerManifestJSON []struct { func RootFS(in io.ReadSeeker, out io.Writer) (err error) { tr := tar.NewReader(in) tw := tar.NewWriter(out) - defer func() { - err = multierr.Append(err, tw.Close()) - }() + defer func() { err = multierr.Append(err, tw.Close()) }() // layerOffsets maps a layer name (a9b123c0daa/layer.tar) to it's offset layerOffsets := map[string]int64{} @@ -60,9 +60,9 @@ func RootFS(in io.ReadSeeker, out io.Writer) (err error) { case filepath.Clean(hdr.Name) == _manifestJSON: dec := json.NewDecoder(tr) if err := dec.Decode(&manifest); err != nil { - return err + return fmt.Errorf("parse %s: %w", _manifestJSON, err) } - case strings.HasSuffix(hdr.Name, "/layer.tar"): + case strings.HasSuffix(hdr.Name, _layerSuffix): here, err := in.Seek(0, io.SeekCurrent) if err != nil { return err @@ -71,15 +71,11 @@ func RootFS(in io.ReadSeeker, out io.Writer) (err error) { } } - if len(manifest) == 0 { - return ErrBadManifest + if len(manifest) == 0 || len(layerOffsets) != len(manifest[0].Layers) { + return errBadManifest } - if len(layerOffsets) != len(manifest[0].Layers) { - return ErrBadManifest - } - - // phase 1.5: enumerate layers + // enumerate layers the way they would be laid down in the image layers := make([]int64, len(layerOffsets)) for i, name := range manifest[0].Layers { layers[i] = layerOffsets[name] @@ -88,8 +84,7 @@ func RootFS(in io.ReadSeeker, out io.Writer) (err error) { // file2layer maps a filename to layer number (index in "layers") file2layer := map[string]int{} - // phase 2: iterate through all layers and save filenames - // for all kinds of files. + // iterate through all layers and save filenames for all kinds of files. 
for i, offset := range layers { if _, err := in.Seek(offset, io.SeekStart); err != nil { return err @@ -124,37 +119,44 @@ func RootFS(in io.ReadSeeker, out io.Writer) (err error) { return err } - // only directories can have multiple entries with the same name. + // Only directories can have multiple entries with the same name. // all other file types cannot. if hdr.Typeflag != tar.TypeDir && file2layer[hdr.Name] != i { continue } - hdrOut := &tar.Header{ - Typeflag: hdr.Typeflag, - Name: hdr.Name, - Linkname: hdr.Linkname, - Size: hdr.Size, - Mode: int64(hdr.Mode & 0777), - Uid: hdr.Uid, - Gid: hdr.Gid, - Uname: hdr.Uname, - Gname: hdr.Gname, - ModTime: hdr.ModTime, - Devmajor: hdr.Devmajor, - Devminor: hdr.Devminor, - Format: tar.FormatPAX, - } - - if err := tw.WriteHeader(hdrOut); err != nil { + if err := writeFile(tr, tw, hdr); err != nil { return err } + } + } + return nil +} - if hdr.Typeflag == tar.TypeReg { - if _, err := io.Copy(tw, tr); err != nil { - return err - } - } +func writeFile(tr *tar.Reader, tw *tar.Writer, hdr *tar.Header) error { + hdrOut := &tar.Header{ + Typeflag: hdr.Typeflag, + Name: hdr.Name, + Linkname: hdr.Linkname, + Size: hdr.Size, + Mode: int64(hdr.Mode & 0777), + Uid: hdr.Uid, + Gid: hdr.Gid, + Uname: hdr.Uname, + Gname: hdr.Gname, + ModTime: hdr.ModTime, + Devmajor: hdr.Devmajor, + Devminor: hdr.Devminor, + Format: tar.FormatGNU, + } + + if err := tw.WriteHeader(hdrOut); err != nil { + return err + } + + if hdr.Typeflag == tar.TypeReg { + if _, err := io.Copy(tw, tr); err != nil { + return err } } diff --git a/rootfs/rootfs_test.go b/rootfs/rootfs_test.go index 990356f..e6e6c02 100644 --- a/rootfs/rootfs_test.go +++ b/rootfs/rootfs_test.go @@ -14,11 +14,11 @@ import ( func TestRootFS(t *testing.T) { layer0 := tarball{ dir{name: "/", uid: 0}, - file{name: "/file", uid: 0, contents: []byte("from 0")}, + file{name: "/file", uid: 0, contents: bytes.NewBufferString("from 0")}, } layer1 := tarball{ - file{name: "/file", uid: 1, contents: 
[]byte("from 1")}, + file{name: "/file", uid: 1, contents: bytes.NewBufferString("from 1")}, } layer2 := tarball{ @@ -44,34 +44,95 @@ func TestRootFS(t *testing.T) { { name: "basic file overwrite, layer order mixed", image: tarball{ - file{name: "layer1/layer.tar", contents: layer1.bytes(t)}, - file{name: "layer0/layer.tar", contents: layer0.bytes(t)}, + file{name: "layer1/layer.tar", contents: layer1}, + file{name: "layer0/layer.tar", contents: layer0}, manifest{"layer0/layer.tar", "layer1/layer.tar"}, }, want: []extractable{ dir{name: "/", uid: 0}, - file{name: "/file", uid: 1, contents: []byte("from 1")}, + file{name: "/file", uid: 1, contents: bytes.NewBufferString("from 1")}, }, }, { name: "directory overwrite retains original dir", image: tarball{ - file{name: "layer2/layer.tar", contents: layer2.bytes(t)}, - file{name: "layer0/layer.tar", contents: layer0.bytes(t)}, - file{name: "layer1/layer.tar", contents: layer1.bytes(t)}, + file{name: "layer2/layer.tar", contents: layer2}, + file{name: "layer0/layer.tar", contents: layer0}, + file{name: "layer1/layer.tar", contents: layer1}, manifest{"layer0/layer.tar", "layer1/layer.tar", "layer2/layer.tar"}, }, want: []extractable{ dir{name: "/", uid: 0}, - file{name: "/file", uid: 1, contents: []byte("from 1")}, + file{name: "/file", uid: 1, contents: bytes.NewBufferString("from 1")}, dir{name: "/", uid: 2}, }, }, + { + name: "simple whiteout", + image: tarball{ + file{name: "layer0/layer.tar", contents: tarball{ + file{name: "filea"}, + file{name: "fileb"}, + dir{name: "dira"}, + dir{name: "dirb"}, + }}, + file{name: "layer1/layer.tar", contents: tarball{ + hardlink{name: ".wh.filea"}, + hardlink{name: ".wh.dira"}, + }}, + manifest{"layer0/layer.tar", "layer1/layer.tar"}, + }, + want: []extractable{ + file{name: "fileb"}, + dir{name: "dirb"}, + }, + }, + { + name: "whiteout with override", + image: tarball{ + file{name: "layer0/layer.tar", contents: tarball{ + file{name: "filea", contents: bytes.NewBufferString("from 
0")}, + }}, + file{name: "layer1/layer.tar", contents: tarball{ + hardlink{name: ".wh.filea"}, + }}, + file{name: "layer2/layer.tar", contents: tarball{ + file{name: "filea", contents: bytes.NewBufferString("from 3")}, + }}, + manifest{ + "layer0/layer.tar", + "layer1/layer.tar", + "layer2/layer.tar", + }, + }, + want: []extractable{ + file{name: "filea", contents: bytes.NewBufferString("from 3")}, + }, + }, + { + name: "files and directories do not whiteout", + image: tarball{ + file{name: "layer0/layer.tar", contents: tarball{ + dir{name: "dir"}, + file{name: "file"}, + }}, + file{name: "layer1/layer.tar", contents: tarball{ + dir{name: ".wh.dir"}, + file{name: ".wh.file"}, + }}, + }, + want: []extractable{ + dir{name: "dir"}, + dir{name: ".wh.dir"}, + file{name: "file"}, + file{name: ".wh.file"}, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - in := bytes.NewReader(tt.image.bytes(t)) + in := bytes.NewReader(tt.image.Bytes()) out := bytes.Buffer{} err := RootFS(in, &out) @@ -81,15 +142,31 @@ func TestRootFS(t *testing.T) { } require.NoError(t, err) got := extract(t, &out) - assert.Equal(t, got, tt.want) + assert.Equal(t, tt.want, got) }) } } // Helpers -type tarrable interface { - tar(*testing.T, *tar.Writer) +type tarrer interface { + tar(*tar.Writer) +} + +type byter interface { + Bytes() []byte +} + +type tarball []tarrer + +func (tb tarball) Bytes() []byte { + buf := bytes.Buffer{} + tw := tar.NewWriter(&buf) + for _, member := range tb { + member.tar(tw) + } + tw.Close() + return buf.Bytes() } // extractable is an empty interface for comparing extracted outputs in tests. 
@@ -101,63 +178,70 @@ type dir struct { uid int } -func (d dir) tar(t *testing.T, tw *tar.Writer) { - t.Helper() +func (d dir) tar(tw *tar.Writer) { hdr := &tar.Header{ Typeflag: tar.TypeDir, Name: d.name, Mode: 0644, Uid: d.uid, } - require.NoError(t, tw.WriteHeader(hdr)) + tw.WriteHeader(hdr) } type file struct { name string uid int - contents []byte + contents byter } -func (f file) tar(t *testing.T, tw *tar.Writer) { - t.Helper() +func (f file) tar(tw *tar.Writer) { + var contentbytes []byte + if f.contents != nil { + contentbytes = f.contents.Bytes() + } hdr := &tar.Header{ Typeflag: tar.TypeReg, Name: f.name, Mode: 0644, Uid: f.uid, - Size: int64(len(f.contents)), + Size: int64(len(contentbytes)), } - require.NoError(t, tw.WriteHeader(hdr)) - _, err := tw.Write(f.contents) - require.NoError(t, err) + tw.WriteHeader(hdr) + tw.Write(contentbytes) } type manifest []string -func (m manifest) tar(t *testing.T, tw *tar.Writer) { - t.Helper() +func (m manifest) tar(tw *tar.Writer) { b, err := json.Marshal(dockerManifestJSON{{Layers: m}}) - require.NoError(t, err) - file{name: "manifest.json", uid: 0, contents: b}.tar(t, tw) -} - -type tarball []tarrable - -func (tb tarball) bytes(t *testing.T) []byte { - t.Helper() - buf := bytes.Buffer{} - tw := tar.NewWriter(&buf) - for _, member := range tb { - member.tar(t, tw) + if err != nil { + panic("testerr") } - require.NoError(t, tw.Close()) - return buf.Bytes() + file{ + name: "manifest.json", + uid: 0, + contents: bytes.NewBuffer(b), + }.tar(tw) } -func extract(t *testing.T, f io.Reader) []extractable { +type hardlink struct { + name string + uid int +} + +func (h hardlink) tar(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{ + Typeflag: tar.TypeLink, + Name: h.name, + Mode: 0644, + Uid: h.uid, + }) +} + +func extract(t *testing.T, r io.Reader) []extractable { t.Helper() ret := []extractable{} - tr := tar.NewReader(f) + tr := tar.NewReader(r) for { hdr, err := tr.Next() if err == io.EOF { @@ -170,9 +254,13 @@ func 
extract(t *testing.T, f io.Reader) []extractable { case tar.TypeDir: elem = dir{name: hdr.Name, uid: hdr.Uid} case tar.TypeReg: - buf := bytes.Buffer{} - io.Copy(&buf, tr) - elem = file{name: hdr.Name, uid: hdr.Uid, contents: buf.Bytes()} + f := file{name: hdr.Name, uid: hdr.Uid} + if hdr.Size > 0 { + var buf bytes.Buffer + io.Copy(&buf, tr) + f.contents = &buf + } + elem = f } ret = append(ret, elem) }