From 981c19c113f2a01e327773a7f6c2c20ea36c0e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Mon, 16 Feb 2026 15:05:35 +0000 Subject: [PATCH] Fix identAsString for @"..." identifiers with escape sequences Co-Authored-By: Claude Opus 4.6 --- stage0/astgen.c | 129 ++++++++++++++++++++++++++++++++++++++--- stage0/astgen_test.zig | 2 +- 2 files changed, 121 insertions(+), 10 deletions(-) diff --git a/stage0/astgen.c b/stage0/astgen.c index 09591ef8ea..82dfd721ad 100644 --- a/stage0/astgen.c +++ b/stage0/astgen.c @@ -1313,12 +1313,6 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) { && source[start + 1] == '"') { // Quoted identifier: @"name" (AstGen.zig:11297-11308). // Extract content between quotes, handling escapes. - uint32_t si, sl; - // str_lit_token refers to the same token, content starts after @" - // We reuse strLitAsString but offset by 1 to skip '@'. - // Actually, strLitAsString expects a token whose source starts - // with '"'. The @"..." token starts with '@'. We need to handle - // the offset manually. uint32_t content_start = start + 2; // skip @" uint32_t content_end = content_start; while ( @@ -1349,9 +1343,126 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) { return str_index; } - // With escapes: use strLitAsString-like decoding. - strLitAsString(ag, ident_token, &si, &sl); - return si; + // With escapes: decode directly into string_bytes + // (AstGen.zig:11297-11308, appendIdentStr with parseStrLit offset=1). + // Cannot use strLitAsString here because it assumes tok_start+1 is + // past the opening quote, but for @"..." tok_start+1 is the quote. + { + uint32_t str_index = ag->string_bytes_len; + uint32_t max_len = content_end - content_start; + ensureStringBytesCapacity(ag, max_len + 1); + uint32_t ci = content_start; + while (ci < content_end) { + if (source[ci] == '\\') { + ci++; + if (ci >= content_end) + break; + switch (source[ci]) { + case 'n': + ag->string_bytes[ag->string_bytes_len++] = '\n'; + break; + case 'r': + ag->string_bytes[ag->string_bytes_len++] = '\r'; + break; + case 't': + ag->string_bytes[ag->string_bytes_len++] = '\t'; + break; + case '\\': + ag->string_bytes[ag->string_bytes_len++] = '\\'; + break; + case '\'': + ag->string_bytes[ag->string_bytes_len++] = '\''; + break; + case '"': + ag->string_bytes[ag->string_bytes_len++] = '"'; + break; + case 'x': { + uint8_t val = 0; + for (int k = 0; k < 2 && ci + 1 < content_end; k++) { + ci++; + char c = source[ci]; + if (c >= '0' && c <= '9') + val = (uint8_t)(val * 16 + (uint8_t)(c - '0')); + else if (c >= 'a' && c <= 'f') + val = (uint8_t)(val * 16 + 10 + + (uint8_t)(c - 'a')); + else if (c >= 'A' && c <= 'F') + val = (uint8_t)(val * 16 + 10 + + (uint8_t)(c - 'A')); + } + ag->string_bytes[ag->string_bytes_len++] = val; + break; + } + case 'u': { + ci++; // skip '{' + uint32_t codepoint = 0; + while (ci + 1 < content_end) { + ci++; + char c = source[ci]; + if (c >= '0' && c <= '9') + codepoint + = codepoint * 16 + (uint32_t)(c - '0'); + else if (c >= 'a' && c <= 'f') + codepoint = codepoint * 16 + 10 + + (uint32_t)(c - 'a'); + else if (c >= 'A' && c <= 'F') + codepoint = codepoint * 16 + 10 + + (uint32_t)(c - 'A'); + else + break; + } + if (codepoint <= 0x7F) { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)codepoint; + } else if (codepoint <= 0x7FF) { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0xC0 | (codepoint >> 6)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | (codepoint & 0x3F)); + } else if (codepoint <= 0xFFFF) { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0xE0 | (codepoint >> 12)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | (codepoint & 0x3F)); + } else { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0xF0 | (codepoint >> 18)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | (codepoint & 0x3F)); + } + break; + } + default: + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)source[ci]; + break; + } + } else { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)source[ci]; + } + ci++; + } + uint32_t decoded_len = ag->string_bytes_len - str_index; + uint8_t* key = ag->string_bytes + str_index; + // Identifiers cannot contain null bytes (AstGen.zig:11303). + uint32_t existing + = findExistingString(ag, (const char*)key, decoded_len); + if (existing != UINT32_MAX) { + ag->string_bytes_len = str_index; + return existing; + } + ensureStringBytesCapacity(ag, 1); + ag->string_bytes[ag->string_bytes_len++] = 0; + registerString(ag, str_index); + return str_index; + } } // Bare identifier: scan alphanumeric + underscore. diff --git a/stage0/astgen_test.zig b/stage0/astgen_test.zig index 0e7d452c2e..89d790ccc0 100644 --- a/stage0/astgen_test.zig +++ b/stage0/astgen_test.zig @@ -1308,7 +1308,7 @@ const corpus_files = .{ "../test/behavior/widening.zig", "../test/behavior/wrapping_arithmetic.zig", "../test/behavior/x86_64.zig", - //"../test/behavior/zon.zig", + "../test/behavior/zon.zig", "../src/print_value.zig", //"../src/crash_report.zig", "../src/target.zig",