From 0b12e027a3628e26a765126d9937a2366b638ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Fri, 13 Feb 2026 22:07:23 +0000 Subject: [PATCH] astgen: add \u{...} unicode escape sequence handling Port the \u{NNNNNN} unicode escape parsing from upstream Zig's string_literal.zig:parseEscapeSequence into both strLitAsString (string literal decoding with UTF-8 encoding) and char_literal (codepoint value extraction). Without this, \u escapes fell through to the default branch which wrote a literal 'u' character, producing incorrect ZIR string bytes. Co-Authored-By: Claude Opus 4.6 --- astgen.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/astgen.c b/astgen.c index 5fa717f394..4342bd7a38 100644 --- a/astgen.c +++ b/astgen.c @@ -1089,6 +1089,54 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token, ag->string_bytes[ag->string_bytes_len++] = val; break; } + case 'u': { + // \u{NNNNNN} unicode escape (string_literal.zig:194-231). + // Skip past '{'. + i++; + // Parse hex digits until '}'. + uint32_t codepoint = 0; + while (i + 1 < raw_end) { + i++; + char c = source[i]; + if (c >= '0' && c <= '9') { + codepoint = codepoint * 16 + (uint32_t)(c - '0'); + } else if (c >= 'a' && c <= 'f') { + codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'a'); + } else if (c >= 'A' && c <= 'F') { + codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'A'); + } else { + // Must be '}', done. + break; + } + } + // Encode codepoint as UTF-8 (unicode.zig:53-82). + if (codepoint <= 0x7F) { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)codepoint; + } else if (codepoint <= 0x7FF) { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0xC0 | (codepoint >> 6)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | (codepoint & 0x3F)); + } else if (codepoint <= 0xFFFF) { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0xE0 | (codepoint >> 12)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | (codepoint & 0x3F)); + } else { + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0xF0 | (codepoint >> 18)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F)); + ag->string_bytes[ag->string_bytes_len++] + = (uint8_t)(0x80 | (codepoint & 0x3F)); + } + break; + } default: ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i]; break; @@ -4251,6 +4299,26 @@ static uint32_t exprRl(GenZir* gz, Scope* scope, ResultLoc rl, uint32_t node) { char_val = val; break; } + case 'u': { + // \u{NNNNNN} unicode escape (string_literal.zig:194-231). + // Skip past '{'. + ci++; + uint32_t codepoint = 0; + while (true) { + ci++; + char c = src[ci]; + if (c >= '0' && c <= '9') + codepoint = codepoint * 16 + (uint32_t)(c - '0'); + else if (c >= 'a' && c <= 'f') + codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'a'); + else if (c >= 'A' && c <= 'F') + codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'A'); + else + break; // Must be '}'. + } + char_val = codepoint; + break; + } default: char_val = (uint8_t)src[ci]; break;