astgen: fix string literal escape handling and string table ordering

- Add escape sequence handling to strLitAsString (\n, \r, \t, \\, \',
  \", \xNN). Previously copied string content byte-for-byte.
- Fix strLitAsString quote scanning to skip escaped quotes (\").
- Handle @"..." quoted identifiers in identAsString.
- Add test name and field name strings to scanContainer to match
  upstream string table insertion order.
- Skip dedup against reserved index 0 in strLitAsString to match
  upstream hash table behavior.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 05:52:32 +00:00
parent 2717f8ca91
commit 1228d8d70f

208
astgen.c
View File

@@ -639,10 +639,63 @@ static uint32_t findExistingString(
return UINT32_MAX; return UINT32_MAX;
} }
// Forward declaration for strLitAsString (used by identAsString for @"..."
// quoted identifiers with escapes).
static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
uint32_t* out_index, uint32_t* out_len);
// Mirrors AstGen.identAsString (AstGen.zig:11530). // Mirrors AstGen.identAsString (AstGen.zig:11530).
// Handles both bare identifiers and @"..." quoted identifiers.
static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) { static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
uint32_t start = ag->tree->tokens.starts[ident_token]; uint32_t start = ag->tree->tokens.starts[ident_token];
const char* source = ag->tree->source; const char* source = ag->tree->source;
if (source[start] == '@' && start + 1 < ag->tree->source_len
&& source[start + 1] == '"') {
// Quoted identifier: @"name" (AstGen.zig:11297-11308).
// Extract content between quotes, handling escapes.
uint32_t si, sl;
// str_lit_token refers to the same token, content starts after @"
// We reuse strLitAsString but offset by 1 to skip '@'.
// Actually, strLitAsString expects a token whose source starts
// with '"'. The @"..." token starts with '@'. We need to handle
// the offset manually.
uint32_t content_start = start + 2; // skip @"
uint32_t content_end = content_start;
while (content_end < ag->tree->source_len
&& source[content_end] != '"')
content_end++;
uint32_t content_len = content_end - content_start;
// Check for escapes.
bool has_escapes = false;
for (uint32_t j = content_start; j < content_end; j++) {
if (source[j] == '\\') {
has_escapes = true;
break;
}
}
if (!has_escapes) {
uint32_t existing
= findExistingString(ag, source + content_start, content_len);
if (existing != UINT32_MAX)
return existing;
uint32_t str_index = ag->string_bytes_len;
ensureStringBytesCapacity(ag, content_len + 1);
memcpy(ag->string_bytes + ag->string_bytes_len,
source + content_start, content_len);
ag->string_bytes_len += content_len;
ag->string_bytes[ag->string_bytes_len++] = 0;
return str_index;
}
// With escapes: use strLitAsString-like decoding.
strLitAsString(ag, ident_token, &si, &sl);
return si;
}
// Bare identifier: scan alphanumeric + underscore.
uint32_t end = start; uint32_t end = start;
while (end < ag->tree->source_len) { while (end < ag->tree->source_len) {
char ch = source[end]; char ch = source[end];
@@ -669,7 +722,8 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
} }
// Mirrors AstGen.strLitAsString (AstGen.zig:11553). // Mirrors AstGen.strLitAsString (AstGen.zig:11553).
// Simplified: handles simple string literals without escape sequences. // Mirrors AstGen.strLitAsString (AstGen.zig:11553).
// Handles string literals with escape sequences.
// Returns the string index and length via out parameters. // Returns the string index and length via out parameters.
static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token, static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
uint32_t* out_index, uint32_t* out_len) { uint32_t* out_index, uint32_t* out_len) {
@@ -677,33 +731,126 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
const char* source = ag->tree->source; const char* source = ag->tree->source;
// Skip opening quote. // Skip opening quote.
uint32_t content_start = tok_start + 1; uint32_t i = tok_start + 1;
// Find closing quote. // Find closing quote, skipping escaped characters.
uint32_t content_end = content_start; uint32_t raw_end = i;
while (content_end < ag->tree->source_len && source[content_end] != '"') { while (raw_end < ag->tree->source_len) {
content_end++; if (source[raw_end] == '\\') {
raw_end += 2; // skip escape + escaped char
} else if (source[raw_end] == '"') {
break;
} else {
raw_end++;
}
} }
uint32_t content_len = content_end - content_start; // Check if there are any escape sequences.
bool has_escapes = false;
for (uint32_t j = i; j < raw_end; j++) {
if (source[j] == '\\') {
has_escapes = true;
break;
}
}
// Check for existing string (dedup). if (!has_escapes) {
uint32_t existing // Fast path: no escapes, copy directly.
= findExistingString(ag, source + content_start, content_len); uint32_t content_len = raw_end - i;
if (existing != UINT32_MAX) { // Dedup: skip index 0 (reserved NullTerminatedString.empty).
*out_index = existing; // The upstream hash table doesn't include the reserved entry, so
// string literals are never deduped against it.
uint32_t existing
= findExistingString(ag, source + i, content_len);
if (existing != UINT32_MAX && existing != 0) {
*out_index = existing;
*out_len = content_len;
return;
}
uint32_t str_index = ag->string_bytes_len;
ensureStringBytesCapacity(ag, content_len + 1);
memcpy(
ag->string_bytes + ag->string_bytes_len, source + i, content_len);
ag->string_bytes_len += content_len;
ag->string_bytes[ag->string_bytes_len++] = 0;
*out_index = str_index;
*out_len = content_len; *out_len = content_len;
return; return;
} }
uint32_t str_index = ag->string_bytes_len; // Slow path: process escape sequences (AstGen.zig:11585-11640).
ensureStringBytesCapacity(ag, content_len + 1); // Decode into a temporary buffer.
memcpy(ag->string_bytes + ag->string_bytes_len, source + content_start, uint32_t max_len = raw_end - i;
content_len); uint8_t* buf = malloc(max_len);
ag->string_bytes_len += content_len; if (!buf)
ag->string_bytes[ag->string_bytes_len++] = 0; exit(1);
uint32_t out_pos = 0;
while (i < raw_end) {
if (source[i] == '\\') {
i++;
if (i >= raw_end)
break;
switch (source[i]) {
case 'n':
buf[out_pos++] = '\n';
break;
case 'r':
buf[out_pos++] = '\r';
break;
case 't':
buf[out_pos++] = '\t';
break;
case '\\':
buf[out_pos++] = '\\';
break;
case '\'':
buf[out_pos++] = '\'';
break;
case '"':
buf[out_pos++] = '"';
break;
case 'x': {
// \xNN hex escape.
uint8_t val = 0;
for (int k = 0; k < 2 && i + 1 < raw_end; k++) {
i++;
char c = source[i];
if (c >= '0' && c <= '9')
val = (uint8_t)(val * 16 + (uint8_t)(c - '0'));
else if (c >= 'a' && c <= 'f')
val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'a'));
else if (c >= 'A' && c <= 'F')
val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'A'));
}
buf[out_pos++] = val;
break;
}
default:
buf[out_pos++] = (uint8_t)source[i];
break;
}
} else {
buf[out_pos++] = (uint8_t)source[i];
}
i++;
}
// Dedup check (skip index 0 — reserved NullTerminatedString.empty).
uint32_t existing = findExistingString(ag, (const char*)buf, out_pos);
if (existing != UINT32_MAX && existing != 0) {
*out_index = existing;
*out_len = out_pos;
free(buf);
return;
}
uint32_t str_index = ag->string_bytes_len;
ensureStringBytesCapacity(ag, out_pos + 1);
memcpy(ag->string_bytes + ag->string_bytes_len, buf, out_pos);
ag->string_bytes_len += out_pos;
ag->string_bytes[ag->string_bytes_len++] = 0;
free(buf);
*out_index = str_index; *out_index = str_index;
*out_len = content_len; *out_len = out_pos;
} }
// --- Declaration helpers --- // --- Declaration helpers ---
@@ -1219,12 +1366,33 @@ static uint32_t scanContainer(
addDeclToTable(ag, name_str, member); addDeclToTable(ag, name_str, member);
break; break;
} }
// Container fields: add field name to string table for ordering
// (AstGen.zig:13509).
case AST_NODE_CONTAINER_FIELD_INIT:
case AST_NODE_CONTAINER_FIELD_ALIGN:
case AST_NODE_CONTAINER_FIELD: {
uint32_t main_token = tree->nodes.main_tokens[member];
identAsString(ag, main_token);
break;
}
case AST_NODE_COMPTIME: case AST_NODE_COMPTIME:
decl_count++; decl_count++;
break; break;
case AST_NODE_TEST_DECL: case AST_NODE_TEST_DECL: {
decl_count++; decl_count++;
// Process test name string to match upstream string table
// ordering (AstGen.zig:13465-13500).
uint32_t test_name_token
= tree->nodes.main_tokens[member] + 1;
TokenizerTag tt = tree->tokens.tags[test_name_token];
if (tt == TOKEN_STRING_LITERAL) {
uint32_t si, sl;
strLitAsString(ag, test_name_token, &si, &sl);
} else if (tt == TOKEN_IDENTIFIER) {
identAsString(ag, test_name_token);
}
break; break;
}
default: default:
break; break;
} }