astgen: add \u{...} unicode escape sequence handling
Port the \u{NNNNNN} unicode escape parsing from upstream Zig's
string_literal.zig:parseEscapeSequence into both strLitAsString
(string literal decoding with UTF-8 encoding) and char_literal
(codepoint value extraction). Without this, \u escapes fell through
to the default branch, which emitted a literal 'u' character, producing
incorrect ZIR string bytes and incorrect char literal values.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
68
astgen.c
68
astgen.c
@@ -1089,6 +1089,54 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
|
|||||||
ag->string_bytes[ag->string_bytes_len++] = val;
|
ag->string_bytes[ag->string_bytes_len++] = val;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case 'u': {
|
||||||
|
// \u{NNNNNN} unicode escape (string_literal.zig:194-231).
|
||||||
|
// Skip past '{'.
|
||||||
|
i++;
|
||||||
|
// Parse hex digits until '}'.
|
||||||
|
uint32_t codepoint = 0;
|
||||||
|
while (i + 1 < raw_end) {
|
||||||
|
i++;
|
||||||
|
char c = source[i];
|
||||||
|
if (c >= '0' && c <= '9') {
|
||||||
|
codepoint = codepoint * 16 + (uint32_t)(c - '0');
|
||||||
|
} else if (c >= 'a' && c <= 'f') {
|
||||||
|
codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'a');
|
||||||
|
} else if (c >= 'A' && c <= 'F') {
|
||||||
|
codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'A');
|
||||||
|
} else {
|
||||||
|
// Must be '}', done.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Encode codepoint as UTF-8 (unicode.zig:53-82).
|
||||||
|
if (codepoint <= 0x7F) {
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)codepoint;
|
||||||
|
} else if (codepoint <= 0x7FF) {
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0xC0 | (codepoint >> 6));
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0x80 | (codepoint & 0x3F));
|
||||||
|
} else if (codepoint <= 0xFFFF) {
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0xE0 | (codepoint >> 12));
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0x80 | (codepoint & 0x3F));
|
||||||
|
} else {
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0xF0 | (codepoint >> 18));
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F));
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
|
||||||
|
ag->string_bytes[ag->string_bytes_len++]
|
||||||
|
= (uint8_t)(0x80 | (codepoint & 0x3F));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i];
|
ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i];
|
||||||
break;
|
break;
|
||||||
@@ -4251,6 +4299,26 @@ static uint32_t exprRl(GenZir* gz, Scope* scope, ResultLoc rl, uint32_t node) {
|
|||||||
char_val = val;
|
char_val = val;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case 'u': {
|
||||||
|
// \u{NNNNNN} unicode escape (string_literal.zig:194-231).
|
||||||
|
// Skip past '{'.
|
||||||
|
ci++;
|
||||||
|
uint32_t codepoint = 0;
|
||||||
|
while (true) {
|
||||||
|
ci++;
|
||||||
|
char c = src[ci];
|
||||||
|
if (c >= '0' && c <= '9')
|
||||||
|
codepoint = codepoint * 16 + (uint32_t)(c - '0');
|
||||||
|
else if (c >= 'a' && c <= 'f')
|
||||||
|
codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'a');
|
||||||
|
else if (c >= 'A' && c <= 'F')
|
||||||
|
codepoint = codepoint * 16 + 10 + (uint32_t)(c - 'A');
|
||||||
|
else
|
||||||
|
break; // Must be '}'.
|
||||||
|
}
|
||||||
|
char_val = codepoint;
|
||||||
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
char_val = (uint8_t)src[ci];
|
char_val = (uint8_t)src[ci];
|
||||||
break;
|
break;
|
||||||
|
|||||||
Reference in New Issue
Block a user