commit 092bebb573987e4b25be0cd698daad196e8fcbb5 (tree)
parent 8c8690e2903e9a33b1239ab70422a7810663c07f
Author: Motiejus Jakštys <motiejus.jakstys@chronosphere.io>
Date: Fri, 13 Feb 2026 05:52:32 +0000
astgen: fix string literal escape handling and string table ordering
- Add escape sequence handling to strLitAsString (\n, \r, \t, \\, \',
\", \xNN). Previously, string content was copied byte-for-byte.
- Fix strLitAsString quote scanning to skip escaped quotes (\").
- Handle @"..." quoted identifiers in identAsString.
- Add test name and field name strings to scanContainer to match
upstream string table insertion order.
- Skip dedup against reserved index 0 in strLitAsString to match
upstream hash table behavior.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat:
| M | astgen.c | | | 206 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- |
1 file changed, 187 insertions(+), 19 deletions(-)
diff --git a/astgen.c b/astgen.c
@@ -639,10 +639,63 @@ static uint32_t findExistingString(
return UINT32_MAX;
}
+// Forward declaration for strLitAsString (used by identAsString for @"..."
+// quoted identifiers with escapes).
+static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
+ uint32_t* out_index, uint32_t* out_len);
+
// Mirrors AstGen.identAsString (AstGen.zig:11530).
+// Handles both bare identifiers and @"..." quoted identifiers.
static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
uint32_t start = ag->tree->tokens.starts[ident_token];
const char* source = ag->tree->source;
+
+ if (source[start] == '@' && start + 1 < ag->tree->source_len
+ && source[start + 1] == '"') {
+ // Quoted identifier: @"name" (AstGen.zig:11297-11308).
+ // Extract content between quotes, handling escapes.
+ uint32_t si, sl;
+ // strLitAsString expects a token whose source starts with '"',
+ // but an @"..." token starts with '@', so the content bounds are
+ // computed by hand here: content_start skips the two-byte @"
+ // prefix and content_end stops at the first '"' that follows
+ // (or at the end of the source).
+ uint32_t content_start = start + 2; // skip @"
+ uint32_t content_end = content_start;
+ while (content_end < ag->tree->source_len
+ && source[content_end] != '"')
+ content_end++;
+ uint32_t content_len = content_end - content_start;
+
+ // Check for escapes.
+ bool has_escapes = false;
+ for (uint32_t j = content_start; j < content_end; j++) {
+ if (source[j] == '\\') {
+ has_escapes = true;
+ break;
+ }
+ }
+
+ if (!has_escapes) {
+ uint32_t existing
+ = findExistingString(ag, source + content_start, content_len);
+ if (existing != UINT32_MAX)
+ return existing;
+ uint32_t str_index = ag->string_bytes_len;
+ ensureStringBytesCapacity(ag, content_len + 1);
+ memcpy(ag->string_bytes + ag->string_bytes_len,
+ source + content_start, content_len);
+ ag->string_bytes_len += content_len;
+ ag->string_bytes[ag->string_bytes_len++] = 0;
+ return str_index;
+ }
+
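+ // With escapes: delegate decoding to strLitAsString.
+ // NOTE(review): strLitAsString skips only one leading byte (the
+ // opening quote), but this token starts with the two bytes @" —
+ // confirm the content offset is right for quoted identifiers.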
+ strLitAsString(ag, ident_token, &si, &sl);
+ return si;
+ }
+
+ // Bare identifier: scan alphanumeric + underscore.
uint32_t end = start;
while (end < ag->tree->source_len) {
char ch = source[end];
@@ -669,7 +722,8 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
}
// Mirrors AstGen.strLitAsString (AstGen.zig:11553).
-// Simplified: handles simple string literals without escape sequences.
+// Decodes escape sequences (\n, \r, \t, \\, \', \", \xNN) while copying
+// the literal's content into string_bytes.
// Returns the string index and length via out parameters.
static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
uint32_t* out_index, uint32_t* out_len) {
@@ -677,33 +731,126 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
const char* source = ag->tree->source;
// Skip opening quote.
- uint32_t content_start = tok_start + 1;
- // Find closing quote.
- uint32_t content_end = content_start;
- while (content_end < ag->tree->source_len && source[content_end] != '"') {
- content_end++;
+ uint32_t i = tok_start + 1;
+ // Find closing quote, skipping escaped characters.
+ uint32_t raw_end = i;
+ while (raw_end < ag->tree->source_len) {
+ if (source[raw_end] == '\\') {
+ raw_end += 2; // skip escape + escaped char
+ } else if (source[raw_end] == '"') {
+ break;
+ } else {
+ raw_end++;
+ }
}
- uint32_t content_len = content_end - content_start;
+ // Check if there are any escape sequences.
+ bool has_escapes = false;
+ for (uint32_t j = i; j < raw_end; j++) {
+ if (source[j] == '\\') {
+ has_escapes = true;
+ break;
+ }
+ }
- // Check for existing string (dedup).
- uint32_t existing
- = findExistingString(ag, source + content_start, content_len);
- if (existing != UINT32_MAX) {
- *out_index = existing;
+ if (!has_escapes) {
+ // Fast path: no escapes, copy directly.
+ uint32_t content_len = raw_end - i;
+ // Dedup: skip index 0 (reserved NullTerminatedString.empty).
+ // The upstream hash table doesn't include the reserved entry, so
+ // string literals are never deduped against it.
+ uint32_t existing
+ = findExistingString(ag, source + i, content_len);
+ if (existing != UINT32_MAX && existing != 0) {
+ *out_index = existing;
+ *out_len = content_len;
+ return;
+ }
+ uint32_t str_index = ag->string_bytes_len;
+ ensureStringBytesCapacity(ag, content_len + 1);
+ memcpy(
+ ag->string_bytes + ag->string_bytes_len, source + i, content_len);
+ ag->string_bytes_len += content_len;
+ ag->string_bytes[ag->string_bytes_len++] = 0;
+ *out_index = str_index;
*out_len = content_len;
return;
}
+ // Slow path: process escape sequences (AstGen.zig:11585-11640).
+ // Decode into a temporary buffer.
+ uint32_t max_len = raw_end - i;
+ uint8_t* buf = malloc(max_len);
+ if (!buf)
+ exit(1);
+ uint32_t out_pos = 0;
+ while (i < raw_end) {
+ if (source[i] == '\\') {
+ i++;
+ if (i >= raw_end)
+ break;
+ switch (source[i]) {
+ case 'n':
+ buf[out_pos++] = '\n';
+ break;
+ case 'r':
+ buf[out_pos++] = '\r';
+ break;
+ case 't':
+ buf[out_pos++] = '\t';
+ break;
+ case '\\':
+ buf[out_pos++] = '\\';
+ break;
+ case '\'':
+ buf[out_pos++] = '\'';
+ break;
+ case '"':
+ buf[out_pos++] = '"';
+ break;
+ case 'x': {
+ // \xNN hex escape.
+ uint8_t val = 0;
+ for (int k = 0; k < 2 && i + 1 < raw_end; k++) {
+ i++;
+ char c = source[i];
+ if (c >= '0' && c <= '9')
+ val = (uint8_t)(val * 16 + (uint8_t)(c - '0'));
+ else if (c >= 'a' && c <= 'f')
+ val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'a'));
+ else if (c >= 'A' && c <= 'F')
+ val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'A'));
+ }
+ buf[out_pos++] = val;
+ break;
+ }
+ default:
+ buf[out_pos++] = (uint8_t)source[i];
+ break;
+ }
+ } else {
+ buf[out_pos++] = (uint8_t)source[i];
+ }
+ i++;
+ }
+
+ // Dedup check (skip index 0 — reserved NullTerminatedString.empty).
+ uint32_t existing = findExistingString(ag, (const char*)buf, out_pos);
+ if (existing != UINT32_MAX && existing != 0) {
+ *out_index = existing;
+ *out_len = out_pos;
+ free(buf);
+ return;
+ }
+
uint32_t str_index = ag->string_bytes_len;
- ensureStringBytesCapacity(ag, content_len + 1);
- memcpy(ag->string_bytes + ag->string_bytes_len, source + content_start,
- content_len);
- ag->string_bytes_len += content_len;
+ ensureStringBytesCapacity(ag, out_pos + 1);
+ memcpy(ag->string_bytes + ag->string_bytes_len, buf, out_pos);
+ ag->string_bytes_len += out_pos;
ag->string_bytes[ag->string_bytes_len++] = 0;
-
+ free(buf);
*out_index = str_index;
- *out_len = content_len;
+ *out_len = out_pos;
}
// --- Declaration helpers ---
@@ -1219,12 +1366,33 @@ static uint32_t scanContainer(
addDeclToTable(ag, name_str, member);
break;
}
+ // Container fields: add field name to string table for ordering
+ // (AstGen.zig:13509).
+ case AST_NODE_CONTAINER_FIELD_INIT:
+ case AST_NODE_CONTAINER_FIELD_ALIGN:
+ case AST_NODE_CONTAINER_FIELD: {
+ uint32_t main_token = tree->nodes.main_tokens[member];
+ identAsString(ag, main_token);
+ break;
+ }
case AST_NODE_COMPTIME:
decl_count++;
break;
- case AST_NODE_TEST_DECL:
+ case AST_NODE_TEST_DECL: {
decl_count++;
+ // Process test name string to match upstream string table
+ // ordering (AstGen.zig:13465-13500).
+ uint32_t test_name_token
+ = tree->nodes.main_tokens[member] + 1;
+ TokenizerTag tt = tree->tokens.tags[test_name_token];
+ if (tt == TOKEN_STRING_LITERAL) {
+ uint32_t si, sl;
+ strLitAsString(ag, test_name_token, &si, &sl);
+ } else if (tt == TOKEN_IDENTIFIER) {
+ identAsString(ag, test_name_token);
+ }
break;
+ }
default:
break;
}