From 7ff42eff914e2e501f570bb8c530719bb3a2521a Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 10 Dec 2024 20:44:00 -0800 Subject: [PATCH] std.Build.Cache.hit: work around macOS kernel bug The previous commit cast doubt upon the initial report about macOS kernel behavior, identifying another reason that ENOENT could be returned from file creation. However, it is demonstrable that ENOENT can be returned for both cases: 1. create file race 2. handle refers to deleted directory This commit re-introduces the workaround for the file creation race on macOS however it does not unconditionally retry - it first tries again with O_EXCL to disambiguate the error condition that has occurred. --- lib/std/Build/Cache.zig | 37 +++++++++++++++++++++++++++++++++++++ lib/std/Build/Step.zig | 2 +- lib/std/posix.zig | 2 ++ src/Compilation.zig | 2 +- src/Zcu/PerThread.zig | 34 ++++++++++++++++++++++++++++------ 5 files changed, 69 insertions(+), 8 deletions(-) diff --git a/lib/std/Build/Cache.zig b/lib/std/Build/Cache.zig index 0e763bee63..13c08f93a1 100644 --- a/lib/std/Build/Cache.zig +++ b/lib/std/Build/Cache.zig @@ -528,6 +528,43 @@ pub const Manifest = struct { }; break; }, + error.FileNotFound => { + // There are no dir components, so the only possibility + // should be that the directory behind the handle has been + // deleted, however we have observed on macOS two processes + // racing to do openat() with O_CREAT manifest in ENOENT. + // + // As a workaround, we retry with exclusive=true which + // disambiguates by returning EEXIST, indicating original + // failure was a race, or ENOENT, indicating deletion of + // the directory of our open handle. + if (builtin.os.tag != .macos) { + self.diagnostic = .{ .manifest_create = error.FileNotFound }; + return error.CacheCheckFailed; + } + + if (self.cache.manifest_dir.createFile(&manifest_file_path, .{ + .read = true, + .truncate = false, + .lock = .exclusive, + .lock_nonblocking = self.want_shared_lock, + .exclusive = true, + })) |manifest_file| { + self.manifest_file = manifest_file; + self.have_exclusive_lock = true; + break; + } else |excl_err| switch (excl_err) { + error.WouldBlock, error.PathAlreadyExists => continue, + error.FileNotFound => { + self.diagnostic = .{ .manifest_create = error.FileNotFound }; + return error.CacheCheckFailed; + }, + else => |e| { + self.diagnostic = .{ .manifest_create = e }; + return error.CacheCheckFailed; + }, + } + }, else => |e| { self.diagnostic = .{ .manifest_create = e }; return error.CacheCheckFailed; diff --git a/lib/std/Build/Step.zig b/lib/std/Build/Step.zig index 5e5672755c..5c51296581 100644 --- a/lib/std/Build/Step.zig +++ b/lib/std/Build/Step.zig @@ -770,7 +770,7 @@ fn failWithCacheError(s: *Step, man: *const Build.Cache.Manifest, err: Build.Cac }, }, error.OutOfMemory => return error.OutOfMemory, - error.InvalidFormat => return s.fail("failed check cache: invalid manifest file format", .{}), + error.InvalidFormat => return s.fail("failed to check cache: invalid manifest file format", .{}), } } diff --git a/lib/std/posix.zig b/lib/std/posix.zig index 5e5f6073e6..100500bec4 100644 --- a/lib/std/posix.zig +++ b/lib/std/posix.zig @@ -1541,6 +1541,8 @@ pub const OpenError = error{ /// * One of the path components does not exist. /// * Cwd was used, but cwd has been deleted. /// * The path associated with the open directory handle has been deleted. + /// * On macOS, multiple processes or threads raced to create the same file + /// with `O.EXCL` set to `false`. FileNotFound, /// The path exceeded `max_path_bytes` bytes. diff --git a/src/Compilation.zig b/src/Compilation.zig index 12e8a87831..5f4404ff99 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -2069,7 +2069,7 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) !void { error.OutOfMemory => return error.OutOfMemory, error.InvalidFormat => return comp.setMiscFailure( .check_whole_cache, - "failed check cache: invalid manifest file format", + "failed to check cache: invalid manifest file format", .{}, ), }; diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig index 360e479f3c..c9b6d68de4 100644 --- a/src/Zcu/PerThread.zig +++ b/src/Zcu/PerThread.zig @@ -136,12 +136,34 @@ pub fn astGenFile( error.NoDevice => unreachable, // it's not a pipe error.WouldBlock => unreachable, // not asking for non-blocking I/O error.FileNotFound => { - // Since there are no dir components this could only occur if - // `zir_dir` is deleted after the compiler process obtains an - // open directory handle. - std.process.fatal("cache directory '{}' unexpectedly removed during compiler execution", .{ - cache_directory, - }); + // There are no dir components, so the only possibility should + // be that the directory behind the handle has been deleted, + // however we have observed on macOS two processes racing to do + // openat() with O_CREAT manifest in ENOENT. + // + // As a workaround, we retry with exclusive=true which + // disambiguates by returning EEXIST, indicating original + // failure was a race, or ENOENT, indicating deletion of the + // directory of our open handle. + if (builtin.os.tag != .macos) { + std.process.fatal("cache directory '{}' unexpectedly removed during compiler execution", .{ + cache_directory, + }); + } + break zir_dir.createFile(&hex_digest, .{ + .read = true, + .truncate = false, + .lock = lock, + .exclusive = true, + }) catch |excl_err| switch (excl_err) { + error.PathAlreadyExists => continue, + error.FileNotFound => { + std.process.fatal("cache directory '{}' unexpectedly removed during compiler execution", .{ + cache_directory, + }); + }, + else => |e| return e, + }; }, else => |e| return e, // Retryable errors are handled at callsite.