astgen: fix string literal escape handling and string table ordering
- Add escape sequence handling to strLitAsString (\n, \r, \t, \\, \', \", \xNN). Previously copied string content byte-for-byte. - Fix strLitAsString quote scanning to skip escaped quotes (\\"). - Handle @"..." quoted identifiers in identAsString. - Add test name and field name strings to scanContainer to match upstream string table insertion order. - Skip dedup against reserved index 0 in strLitAsString to match upstream hash table behavior. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
208
astgen.c
208
astgen.c
@@ -639,10 +639,63 @@ static uint32_t findExistingString(
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
// Forward declaration for strLitAsString (used by identAsString for @"..."
|
||||
// quoted identifiers with escapes).
|
||||
static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
|
||||
uint32_t* out_index, uint32_t* out_len);
|
||||
|
||||
// Mirrors AstGen.identAsString (AstGen.zig:11530).
|
||||
// Handles both bare identifiers and @"..." quoted identifiers.
|
||||
static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
|
||||
uint32_t start = ag->tree->tokens.starts[ident_token];
|
||||
const char* source = ag->tree->source;
|
||||
|
||||
if (source[start] == '@' && start + 1 < ag->tree->source_len
|
||||
&& source[start + 1] == '"') {
|
||||
// Quoted identifier: @"name" (AstGen.zig:11297-11308).
|
||||
// Extract content between quotes, handling escapes.
|
||||
uint32_t si, sl;
|
||||
// str_lit_token refers to the same token, content starts after @"
|
||||
// We reuse strLitAsString but offset by 1 to skip '@'.
|
||||
// Actually, strLitAsString expects a token whose source starts
|
||||
// with '"'. The @"..." token starts with '@'. We need to handle
|
||||
// the offset manually.
|
||||
uint32_t content_start = start + 2; // skip @"
|
||||
uint32_t content_end = content_start;
|
||||
while (content_end < ag->tree->source_len
|
||||
&& source[content_end] != '"')
|
||||
content_end++;
|
||||
uint32_t content_len = content_end - content_start;
|
||||
|
||||
// Check for escapes.
|
||||
bool has_escapes = false;
|
||||
for (uint32_t j = content_start; j < content_end; j++) {
|
||||
if (source[j] == '\\') {
|
||||
has_escapes = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!has_escapes) {
|
||||
uint32_t existing
|
||||
= findExistingString(ag, source + content_start, content_len);
|
||||
if (existing != UINT32_MAX)
|
||||
return existing;
|
||||
uint32_t str_index = ag->string_bytes_len;
|
||||
ensureStringBytesCapacity(ag, content_len + 1);
|
||||
memcpy(ag->string_bytes + ag->string_bytes_len,
|
||||
source + content_start, content_len);
|
||||
ag->string_bytes_len += content_len;
|
||||
ag->string_bytes[ag->string_bytes_len++] = 0;
|
||||
return str_index;
|
||||
}
|
||||
|
||||
// With escapes: use strLitAsString-like decoding.
|
||||
strLitAsString(ag, ident_token, &si, &sl);
|
||||
return si;
|
||||
}
|
||||
|
||||
// Bare identifier: scan alphanumeric + underscore.
|
||||
uint32_t end = start;
|
||||
while (end < ag->tree->source_len) {
|
||||
char ch = source[end];
|
||||
@@ -669,7 +722,8 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
|
||||
}
|
||||
|
||||
// Mirrors AstGen.strLitAsString (AstGen.zig:11553).
|
||||
// Simplified: handles simple string literals without escape sequences.
|
||||
// Mirrors AstGen.strLitAsString (AstGen.zig:11553).
|
||||
// Handles string literals with escape sequences.
|
||||
// Returns the string index and length via out parameters.
|
||||
static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
|
||||
uint32_t* out_index, uint32_t* out_len) {
|
||||
@@ -677,33 +731,126 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
|
||||
const char* source = ag->tree->source;
|
||||
|
||||
// Skip opening quote.
|
||||
uint32_t content_start = tok_start + 1;
|
||||
// Find closing quote.
|
||||
uint32_t content_end = content_start;
|
||||
while (content_end < ag->tree->source_len && source[content_end] != '"') {
|
||||
content_end++;
|
||||
uint32_t i = tok_start + 1;
|
||||
// Find closing quote, skipping escaped characters.
|
||||
uint32_t raw_end = i;
|
||||
while (raw_end < ag->tree->source_len) {
|
||||
if (source[raw_end] == '\\') {
|
||||
raw_end += 2; // skip escape + escaped char
|
||||
} else if (source[raw_end] == '"') {
|
||||
break;
|
||||
} else {
|
||||
raw_end++;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t content_len = content_end - content_start;
|
||||
// Check if there are any escape sequences.
|
||||
bool has_escapes = false;
|
||||
for (uint32_t j = i; j < raw_end; j++) {
|
||||
if (source[j] == '\\') {
|
||||
has_escapes = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for existing string (dedup).
|
||||
uint32_t existing
|
||||
= findExistingString(ag, source + content_start, content_len);
|
||||
if (existing != UINT32_MAX) {
|
||||
*out_index = existing;
|
||||
if (!has_escapes) {
|
||||
// Fast path: no escapes, copy directly.
|
||||
uint32_t content_len = raw_end - i;
|
||||
// Dedup: skip index 0 (reserved NullTerminatedString.empty).
|
||||
// The upstream hash table doesn't include the reserved entry, so
|
||||
// string literals are never deduped against it.
|
||||
uint32_t existing
|
||||
= findExistingString(ag, source + i, content_len);
|
||||
if (existing != UINT32_MAX && existing != 0) {
|
||||
*out_index = existing;
|
||||
*out_len = content_len;
|
||||
return;
|
||||
}
|
||||
uint32_t str_index = ag->string_bytes_len;
|
||||
ensureStringBytesCapacity(ag, content_len + 1);
|
||||
memcpy(
|
||||
ag->string_bytes + ag->string_bytes_len, source + i, content_len);
|
||||
ag->string_bytes_len += content_len;
|
||||
ag->string_bytes[ag->string_bytes_len++] = 0;
|
||||
*out_index = str_index;
|
||||
*out_len = content_len;
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t str_index = ag->string_bytes_len;
|
||||
ensureStringBytesCapacity(ag, content_len + 1);
|
||||
memcpy(ag->string_bytes + ag->string_bytes_len, source + content_start,
|
||||
content_len);
|
||||
ag->string_bytes_len += content_len;
|
||||
ag->string_bytes[ag->string_bytes_len++] = 0;
|
||||
// Slow path: process escape sequences (AstGen.zig:11585-11640).
|
||||
// Decode into a temporary buffer.
|
||||
uint32_t max_len = raw_end - i;
|
||||
uint8_t* buf = malloc(max_len);
|
||||
if (!buf)
|
||||
exit(1);
|
||||
uint32_t out_pos = 0;
|
||||
while (i < raw_end) {
|
||||
if (source[i] == '\\') {
|
||||
i++;
|
||||
if (i >= raw_end)
|
||||
break;
|
||||
switch (source[i]) {
|
||||
case 'n':
|
||||
buf[out_pos++] = '\n';
|
||||
break;
|
||||
case 'r':
|
||||
buf[out_pos++] = '\r';
|
||||
break;
|
||||
case 't':
|
||||
buf[out_pos++] = '\t';
|
||||
break;
|
||||
case '\\':
|
||||
buf[out_pos++] = '\\';
|
||||
break;
|
||||
case '\'':
|
||||
buf[out_pos++] = '\'';
|
||||
break;
|
||||
case '"':
|
||||
buf[out_pos++] = '"';
|
||||
break;
|
||||
case 'x': {
|
||||
// \xNN hex escape.
|
||||
uint8_t val = 0;
|
||||
for (int k = 0; k < 2 && i + 1 < raw_end; k++) {
|
||||
i++;
|
||||
char c = source[i];
|
||||
if (c >= '0' && c <= '9')
|
||||
val = (uint8_t)(val * 16 + (uint8_t)(c - '0'));
|
||||
else if (c >= 'a' && c <= 'f')
|
||||
val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'a'));
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'A'));
|
||||
}
|
||||
buf[out_pos++] = val;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
buf[out_pos++] = (uint8_t)source[i];
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
buf[out_pos++] = (uint8_t)source[i];
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
// Dedup check (skip index 0 — reserved NullTerminatedString.empty).
|
||||
uint32_t existing = findExistingString(ag, (const char*)buf, out_pos);
|
||||
if (existing != UINT32_MAX && existing != 0) {
|
||||
*out_index = existing;
|
||||
*out_len = out_pos;
|
||||
free(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t str_index = ag->string_bytes_len;
|
||||
ensureStringBytesCapacity(ag, out_pos + 1);
|
||||
memcpy(ag->string_bytes + ag->string_bytes_len, buf, out_pos);
|
||||
ag->string_bytes_len += out_pos;
|
||||
ag->string_bytes[ag->string_bytes_len++] = 0;
|
||||
free(buf);
|
||||
*out_index = str_index;
|
||||
*out_len = content_len;
|
||||
*out_len = out_pos;
|
||||
}
|
||||
|
||||
// --- Declaration helpers ---
|
||||
@@ -1219,12 +1366,33 @@ static uint32_t scanContainer(
|
||||
addDeclToTable(ag, name_str, member);
|
||||
break;
|
||||
}
|
||||
// Container fields: add field name to string table for ordering
|
||||
// (AstGen.zig:13509).
|
||||
case AST_NODE_CONTAINER_FIELD_INIT:
|
||||
case AST_NODE_CONTAINER_FIELD_ALIGN:
|
||||
case AST_NODE_CONTAINER_FIELD: {
|
||||
uint32_t main_token = tree->nodes.main_tokens[member];
|
||||
identAsString(ag, main_token);
|
||||
break;
|
||||
}
|
||||
case AST_NODE_COMPTIME:
|
||||
decl_count++;
|
||||
break;
|
||||
case AST_NODE_TEST_DECL:
|
||||
case AST_NODE_TEST_DECL: {
|
||||
decl_count++;
|
||||
// Process test name string to match upstream string table
|
||||
// ordering (AstGen.zig:13465-13500).
|
||||
uint32_t test_name_token
|
||||
= tree->nodes.main_tokens[member] + 1;
|
||||
TokenizerTag tt = tree->tokens.tags[test_name_token];
|
||||
if (tt == TOKEN_STRING_LITERAL) {
|
||||
uint32_t si, sl;
|
||||
strLitAsString(ag, test_name_token, &si, &sl);
|
||||
} else if (tt == TOKEN_IDENTIFIER) {
|
||||
identAsString(ag, test_name_token);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user