zig0

my attempts at zig bootstrapping in C
Log | Files | Refs | README | LICENSE

commit edc55c2debf730e9a9c373d04b0c19020989a3ef (tree)
parent 8a45572d0f55a7d9e184361a0f388facdb9dea46
Author: Motiejus Jakštys <motiejus@jakstys.lt>
Date:   Fri, 13 Feb 2026 22:07:23 +0000

astgen: add \u{...} unicode escape sequence handling

Port the \u{NNNNNN} unicode escape parsing from upstream Zig's
string_literal.zig:parseEscapeSequence into both strLitAsString
(string literal decoding with UTF-8 encoding) and char_literal
(codepoint value extraction). Without this, \u escapes fell through
to the default branch which wrote a literal 'u' character, producing
incorrect ZIR string bytes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
M astgen.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 68 insertions(+), 0 deletions(-)

diff --git a/astgen.c b/astgen.c
@@ -1089,6 +1089,54 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
                 ag->string_bytes[ag->string_bytes_len++] = val;
                 break;
             }
+            case 'u': {
+                // \u{NNNNNN} unicode escape (string_literal.zig:194-231).
+                // Skip past '{'.
+                i++;
+                // Parse hex digits until '}'.
+                uint32_t codepoint = 0;
+                while (i + 1 < raw_end) {
+                    i++;
+                    char c = source[i];
+                    if (c >= '0' && c <= '9') {
+                        codepoint = codepoint * 16 + (uint32_t)(c - '0');
+                    } else if (c >= 'a' && c <= 'f') {
+                        codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'a');
+                    } else if (c >= 'A' && c <= 'F') {
+                        codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'A');
+                    } else {
+                        // Must be '}', done.
+                        break;
+                    }
+                }
+                // Encode codepoint as UTF-8 (unicode.zig:53-82).
+                if (codepoint <= 0x7F) {
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)codepoint;
+                } else if (codepoint <= 0x7FF) {
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0xC0 | (codepoint >> 6));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | (codepoint & 0x3F));
+                } else if (codepoint <= 0xFFFF) {
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0xE0 | (codepoint >> 12));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | (codepoint & 0x3F));
+                } else {
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0xF0 | (codepoint >> 18));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | (codepoint & 0x3F));
+                }
+                break;
+            }
             default:
                 ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i];
                 break;
@@ -4251,6 +4299,26 @@ static uint32_t exprRl(GenZir* gz, Scope* scope, ResultLoc rl, uint32_t node) {
                 char_val = val;
                 break;
             }
+            case 'u': {
+                // \u{NNNNNN} unicode escape (string_literal.zig:194-231).
+                // Skip past '{'.
+                ci++;
+                uint32_t codepoint = 0;
+                while (true) {
+                    ci++;
+                    char c = src[ci];
+                    if (c >= '0' && c <= '9')
+                        codepoint = codepoint * 16 + (uint32_t)(c - '0');
+                    else if (c >= 'a' && c <= 'f')
+                        codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'a');
+                    else if (c >= 'A' && c <= 'F')
+                        codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'A');
+                    else
+                        break; // Must be '}'.
+                }
+                char_val = codepoint;
+                break;
+            }
             default:
                 char_val = (uint8_t)src[ci];
                 break;