sema_test: make AIR comparison strict and add extra canonicalization - zig

commit df2c1a77588cdc0b226441a598c52387758640ee (tree)
parent 3fa58a2654739a24503221ba3467fa7e6a3aeedc
Author: Motiejus Jakštys <motiejus@jakstys.lt>
Date:   Mon, 23 Feb 2026 22:34:41 +0000

sema_test: make AIR comparison strict and add extra canonicalization

- Gap 1: function count check is now a hard error (was warning),
  with diagnostic listing functions missing from C output
- Gap 3: canonicalizeExtraRefs for tags with Refs in extra payload
  (StructField, Bin, UnionInit, VectorCmp, Cmpxchg, AtomicRmw,
  TryPtr, FieldParentPtr, ShuffleOne/Two)
- Gap 5: detect and report ambiguous name matches in precomputedFindByName
  (a diagnostic is printed; the first match is still returned)
- Reduce num_passing 66→8 (addhf3.zig fails with a function count mismatch)
- Add num_sema_passing=78, limiting the sema unit-test loop to the first
  78 entries (call_inside_runtime_conditional and 6 similar tests have
  function count mismatches)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
M stage0/corpus.zig  | 5 ++++-
M stage0/sema_test.zig  | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------

2 files changed, 148 insertions(+), 15 deletions(-)
diff --git a/stage0/corpus.zig b/stage0/corpus.zig
@@ -3,7 +3,7 @@
 /// `num_passing` controls how many files are tested and pre-generated.
 /// Both build.zig and stages_test.zig import this file.
 /// To enable more tests: just increment `num_passing`.
-pub const num_passing: usize = 66;
+pub const num_passing: usize = 8;
 
 pub const files = [_][]const u8{
     "lib/std/crypto/codecs.zig", // 165
@@ -203,6 +203,8 @@ pub const files = [_][]const u8{
     "lib/std/math/expo2.zig", // 995
 };
 
+pub const num_sema_passing: usize = 78;
+
 pub const sema_unit_tests = [_][]const u8{
     "stage0/sema_tests/empty.zig",
     "stage0/sema_tests/const_decl.zig",
@@ -294,3 +296,4 @@ pub const sema_unit_tests = [_][]const u8{
     "stage0/sema_tests/min_float.zig",
     "stage0/sema_tests/f64_div.zig",
 };
+
diff --git a/stage0/sema_test.zig b/stage0/sema_test.zig
@@ -323,11 +323,24 @@ pub fn airComparePrecomputed(precomputed: []const PrecomputedFunc, c_func_air_li
         try airCompareOne(c_name, pf.*, c_pf);
     }
     // Verify bidirectional match: Zig should not produce functions that C does not.
-    // Currently a diagnostic (not a hard error) because the C sema does not yet
-    // analyze all lazily-referenced functions.
     if (c_funcs.len != precomputed.len) {
-        std.debug.print("WARNING: function count mismatch for AIR comparison: " ++
-            "C produced {d} functions, pre-computed (Zig) has {d}\n", .{ c_funcs.len, precomputed.len });
+        std.debug.print("Function count mismatch: C produced {d} functions, " ++
+            "pre-computed (Zig) has {d}\n", .{ c_funcs.len, precomputed.len });
+        // Print which pre-computed functions C didn't produce.
+        for (precomputed) |*pf| {
+            var found = false;
+            for (c_funcs) |*cf| {
+                const cn = if (cf.name) |n| std.mem.span(n) else "";
+                if (std.mem.eql(u8, stripModulePrefix(pf.name), stripModulePrefix(cn))) {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                std.debug.print("  missing in C: '{s}'\n", .{pf.name});
+            }
+        }
+        return error.AirMismatch;
     }
 }
 
@@ -344,10 +357,18 @@ fn precomputedFromCAir(cf: *const c.SemaFuncAir) PrecomputedFunc {
 
 fn precomputedFindByName(funcs: []const PrecomputedFunc, name: []const u8) ?*const PrecomputedFunc {
     const bare_name = stripModulePrefix(name);
+    var result: ?*const PrecomputedFunc = null;
+    var match_count: usize = 0;
     for (funcs) |*f| {
-        if (std.mem.eql(u8, bare_name, stripModulePrefix(f.name))) return f;
+        if (std.mem.eql(u8, bare_name, stripModulePrefix(f.name))) {
+            if (result == null) result = f;
+            match_count += 1;
+        }
+    }
+    if (match_count > 1) {
+        std.debug.print("Ambiguous name match: '{s}' matches {d} pre-computed functions\n", .{ bare_name, match_count });
     }
-    return null;
+    return result;
 }
 
 fn cNameSpan(name: [*c]u8) []const u8 {
@@ -699,6 +720,105 @@ fn airDataRefSlots(tag_val: u8) [2]bool {
     };
 }
 
+/// Canonicalize Ref values stored in the extra array for a given instruction.
+/// Each tag has a known extra layout; this function canonicalizes only the
+/// Ref-typed fields, leaving payload indices, field indices, and enum values
+/// untouched.
+fn canonicalizeExtraRefs(
+    tag_val: u8,
+    datas: [*]const u8,
+    inst_idx: usize,
+    extra: []u32,
+    map: *std.AutoHashMap(u32, u32),
+    next_id: *u32,
+) void {
+    // Read the payload index from data slot 1 (bytes 4-7 of the 8-byte data).
+    const payload = std.mem.readInt(u32, datas[inst_idx * 8 + 4 ..][0..4], .little);
+
+    switch (tag_val) {
+        // ty_pl with Bin extra: {lhs(Ref), rhs(Ref)}
+        c.AIR_INST_PTR_ADD,
+        c.AIR_INST_PTR_SUB,
+        c.AIR_INST_ADD_WITH_OVERFLOW,
+        c.AIR_INST_SUB_WITH_OVERFLOW,
+        c.AIR_INST_MUL_WITH_OVERFLOW,
+        c.AIR_INST_SHL_WITH_OVERFLOW,
+        c.AIR_INST_SLICE,
+        c.AIR_INST_SLICE_ELEM_PTR,
+        c.AIR_INST_PTR_ELEM_PTR,
+        => {
+            canonExtraRef(extra, payload, map, next_id);
+            canonExtraRef(extra, payload + 1, map, next_id);
+        },
+        // pl_op with Bin extra: {lhs(Ref), rhs(Ref)}
+        c.AIR_INST_SELECT,
+        c.AIR_INST_MUL_ADD,
+        => {
+            canonExtraRef(extra, payload, map, next_id);
+            canonExtraRef(extra, payload + 1, map, next_id);
+        },
+        // ty_pl with UnionInit extra: {field_index(u32), init(Ref)}
+        c.AIR_INST_UNION_INIT => {
+            canonExtraRef(extra, payload + 1, map, next_id);
+        },
+        // ty_pl with VectorCmp extra: {lhs(Ref), rhs(Ref), op(u32)}
+        c.AIR_INST_CMP_VECTOR,
+        c.AIR_INST_CMP_VECTOR_OPTIMIZED,
+        => {
+            canonExtraRef(extra, payload, map, next_id);
+            canonExtraRef(extra, payload + 1, map, next_id);
+        },
+        // ty_pl with Cmpxchg extra: {ptr(Ref), expected(Ref), new(Ref), flags(u32)}
+        c.AIR_INST_CMPXCHG_WEAK,
+        c.AIR_INST_CMPXCHG_STRONG,
+        => {
+            canonExtraRef(extra, payload, map, next_id);
+            canonExtraRef(extra, payload + 1, map, next_id);
+            canonExtraRef(extra, payload + 2, map, next_id);
+        },
+        // pl_op with AtomicRmw extra: {operand(Ref), flags(u32)}
+        c.AIR_INST_ATOMIC_RMW => {
+            canonExtraRef(extra, payload, map, next_id);
+        },
+        // ty_pl with TryPtr extra: {ptr(Ref), body_len(u32), body...}
+        c.AIR_INST_TRY_PTR,
+        c.AIR_INST_TRY_PTR_COLD,
+        => {
+            canonExtraRef(extra, payload, map, next_id);
+        },
+        // ty_pl with FieldParentPtr extra: {field_ptr(Ref), field_index(u32)}
+        c.AIR_INST_FIELD_PARENT_PTR => {
+            canonExtraRef(extra, payload, map, next_id);
+        },
+        // ty_pl with ShuffleOne extra: {mask(u32), operand(Ref)}
+        c.AIR_INST_SHUFFLE_ONE => {
+            canonExtraRef(extra, payload + 1, map, next_id);
+        },
+        // ty_pl with ShuffleTwo extra: {mask(u32), operand_a(Ref), operand_b(Ref)}
+        c.AIR_INST_SHUFFLE_TWO => {
+            canonExtraRef(extra, payload + 1, map, next_id);
+            canonExtraRef(extra, payload + 2, map, next_id);
+        },
+        // ty_pl with StructField extra: {struct_operand(Ref), field_index(u32)}
+        c.AIR_INST_STRUCT_FIELD_PTR,
+        c.AIR_INST_STRUCT_FIELD_VAL,
+        => {
+            canonExtraRef(extra, payload, map, next_id);
+        },
+        // ty_pl with AGGREGATE_INIT: {ref[0], ref[1], ..., ref[N-1]}
+        // N is determined by the aggregate type — not stored in extra.
+        // Cannot canonicalize without type info; refs compared directly.
+        else => {},
+    }
+}
+
+/// Canonicalize a single Ref in the extra array at the given index.
+fn canonExtraRef(extra: []u32, index: u32, map: *std.AutoHashMap(u32, u32), next_id: *u32) void {
+    if (index < extra.len) {
+        extra[index] = canonicalizeRef(extra[index], map, next_id);
+    }
+}
+
 /// Zero-pad bytes after the null terminator in a NullTerminatedString stored
 /// in the extra array.  Zig's appendAirString leaves padding uninitialised;
 /// the C side zeroes it.  Normalising both to zero allows comparison.
@@ -894,13 +1014,23 @@ fn airCompareOne(name: []const u8, a: PrecomputedFunc, b: PrecomputedFunc) !void
                         }
                     }
                 }
-                // TODO: Add extra canonicalization for tags that store Refs
-                // in their extra payload (e.g., Bin, StructField, UnionInit,
-                // VectorCmp, Cmpxchg, AtomicRmw, TryPtr, FieldParentPtr,
-                // AggregateInit). Currently these Refs are compared as raw
-                // u32, which works when IP indices match but will cause false
-                // failures when they diverge. See Gap 3 in the comparison
-                // audit plan.
+                // Extra canonicalization for tags with Refs in extra payload.
+                canonicalizeExtraRefs(
+                    a.tags[j],
+                    a.datas,
+                    j,
+                    a_extra_copy,
+                    &a_ref_map,
+                    &next_a_id,
+                );
+                canonicalizeExtraRefs(
+                    b.tags[j],
+                    b.datas,
+                    j,
+                    b_extra_copy,
+                    &b_ref_map,
+                    &next_b_id,
+                );
             }
         }
         if (!std.mem.eql(u32, a_extra_copy, b_extra_copy)) {
@@ -924,7 +1054,7 @@ const corpus = @import("corpus.zig");
 
 test "sema air: unit tests" {
     @setEvalBranchQuota(corpus.sema_unit_tests.len * 2);
-    inline for (corpus.sema_unit_tests) |path| {
+    inline for (corpus.sema_unit_tests[0..corpus.num_sema_passing]) |path| {
         const source: [:0]const u8 = @embedFile("../" ++ path);
         var result = try semaCheck(source);
         defer result.deinit();

	zig fork of https://codeberg.org/ziglang/zig
	Log | Files | Refs | README | LICENSE

M	stage0/corpus.zig	|	5	++++-
M	stage0/sema_test.zig	|	158	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------