astgen: add \u{...} unicode escape sequence handling - zig0 - my attempts at zig bootstrapping in C

commit edc55c2debf730e9a9c373d04b0c19020989a3ef (tree)
parent 8a45572d0f55a7d9e184361a0f388facdb9dea46
Author: Motiejus Jakštys <motiejus@jakstys.lt>
Date:   Fri, 13 Feb 2026 22:07:23 +0000

astgen: add \u{...} unicode escape sequence handling

Port the \u{NNNNNN} Unicode escape parsing from upstream Zig's
string_literal.zig:parseEscapeSequence into both strLitAsString
(string literal decoding with UTF-8 encoding) and char_literal
(codepoint value extraction). Without this, \u escapes fell through
to the default branch, which wrote a literal 'u' character, producing
incorrect ZIR string bytes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
M astgen.c  | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 68 insertions(+), 0 deletions(-)
diff --git a/astgen.c b/astgen.c
@@ -1089,6 +1089,54 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
                 ag->string_bytes[ag->string_bytes_len++] = val;
                 break;
             }
+            case 'u': {
+                // \u{NNNNNN} unicode escape (string_literal.zig:194-231).
+                // Skip past '{'.
+                i++;
+                // Parse hex digits until '}'.
+                uint32_t codepoint = 0;
+                while (i + 1 < raw_end) {
+                    i++;
+                    char c = source[i];
+                    if (c >= '0' && c <= '9') {
+                        codepoint = codepoint * 16 + (uint32_t)(c - '0');
+                    } else if (c >= 'a' && c <= 'f') {
+                        codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'a');
+                    } else if (c >= 'A' && c <= 'F') {
+                        codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'A');
+                    } else {
+                        // Must be '}', done.
+                        break;
+                    }
+                }
+                // Encode codepoint as UTF-8 (unicode.zig:53-82).
+                if (codepoint <= 0x7F) {
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)codepoint;
+                } else if (codepoint <= 0x7FF) {
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0xC0 | (codepoint >> 6));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | (codepoint & 0x3F));
+                } else if (codepoint <= 0xFFFF) {
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0xE0 | (codepoint >> 12));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | (codepoint & 0x3F));
+                } else {
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0xF0 | (codepoint >> 18));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
+                    ag->string_bytes[ag->string_bytes_len++]
+                        = (uint8_t)(0x80 | (codepoint & 0x3F));
+                }
+                break;
+            }
             default:
                 ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i];
                 break;
@@ -4251,6 +4299,26 @@ static uint32_t exprRl(GenZir* gz, Scope* scope, ResultLoc rl, uint32_t node) {
                 char_val = val;
                 break;
             }
+            case 'u': {
+                // \u{NNNNNN} unicode escape (string_literal.zig:194-231).
+                // Skip past '{'.
+                ci++;
+                uint32_t codepoint = 0;
+                while (true) {
+                    ci++;
+                    char c = src[ci];
+                    if (c >= '0' && c <= '9')
+                        codepoint = codepoint * 16 + (uint32_t)(c - '0');
+                    else if (c >= 'a' && c <= 'f')
+                        codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'a');
+                    else if (c >= 'A' && c <= 'F')
+                        codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'A');
+                    else
+                        break; // Must be '}'.
+                }
+                char_val = codepoint;
+                break;
+            }
             default:
                 char_val = (uint8_t)src[ci];
                 break;

	zig0 my attempts at zig bootstrapping in C
	Log | Files | Refs | README | LICENSE