astgen: fix string dedup with string_table, handle embedded nulls

Replace linear scan of all string_bytes with a string_table that
only contains explicitly registered strings (via identAsString and
strLitAsString). This prevents false deduplication against multiline
string content that upstream's hash table would never match.

Also handle embedded null bytes in strLitAsString: when the decoded string
contains \x00, skip dedup and don't add a trailing null, matching upstream
AstGen.zig:11560. Additionally, set the `small` field of the c_include
extended instruction to 0xAAAA (undefined), matching upstream
addExtendedPayload.

Passes corpus tests for test_all.zig, build.zig, tokenizer_test.zig.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 14:08:11 +00:00
parent 68d0917ec3
commit a5b0e07d50

132
astgen.c
View File

@@ -66,6 +66,13 @@ typedef struct {
uint8_t* string_bytes;
uint32_t string_bytes_len;
uint32_t string_bytes_cap;
// String dedup table: stores positions in string_bytes that are
// registered for deduplication (mirrors AstGen.string_table).
// Only strings added via identAsString/strLitAsString (non-embedded-null)
// are registered. Multiline strings are NOT registered.
uint32_t* string_table;
uint32_t string_table_len;
uint32_t string_table_cap;
uint32_t source_offset;
uint32_t source_line;
uint32_t source_column;
@@ -838,26 +845,37 @@ static int32_t tokenIndexToRelative(const GenZir* gz, uint32_t token) {
// Search for an existing null-terminated string in string_bytes.
// Returns the index if found, or UINT32_MAX if not found.
// Mirrors string_table dedup (AstGen.zig:11564).
// Find a string in string_table (registered strings only).
// Mirrors AstGen.string_table hash table lookup.
static uint32_t findExistingString(
const AstGenCtx* ag, const char* str, uint32_t len) {
// Linear scan through null-terminated strings in string_bytes.
uint32_t i = 0;
while (i < ag->string_bytes_len) {
// Find the end of the current null-terminated string.
uint32_t j = i;
while (j < ag->string_bytes_len && ag->string_bytes[j] != 0)
j++;
uint32_t existing_len = j - i;
if (existing_len == len
&& memcmp(ag->string_bytes + i, str, len) == 0) {
return i;
for (uint32_t k = 0; k < ag->string_table_len; k++) {
uint32_t pos = ag->string_table[k];
// Compare: string at pos is null-terminated in string_bytes.
const char* existing = (const char*)ag->string_bytes + pos;
uint32_t existing_len = (uint32_t)strlen(existing);
if (existing_len == len && memcmp(existing, str, len) == 0) {
return pos;
}
// Skip past the null terminator.
i = j + 1;
}
return UINT32_MAX;
}
// Register a string position in the string table for deduplication.
//
// Appends `pos` (an offset into ag->string_bytes) to ag->string_table,
// growing the table by doubling (minimum 64 entries) when it is full.
// Exits the process with status 1 if the table cannot grow, either because
// the new capacity is not representable or because realloc fails.
static void registerString(AstGenCtx* ag, uint32_t pos) {
    if (ag->string_table_len >= ag->string_table_cap) {
        // The doubling below is done in uint32_t; refuse to grow rather than
        // let the capacity wrap to a small value, which would shrink the
        // allocation and make the store below write out of bounds.
        if (ag->string_table_cap > UINT32_MAX / 2)
            exit(1);
        uint32_t new_cap = ag->string_table_cap * 2;
        if (new_cap < 64)
            new_cap = 64;
        // Guard the byte count too, for targets where size_t is 32 bits.
        if (new_cap > SIZE_MAX / sizeof(uint32_t))
            exit(1);
        uint32_t* p = realloc(ag->string_table, new_cap * sizeof(uint32_t));
        if (!p)
            exit(1);
        ag->string_table = p;
        ag->string_table_cap = new_cap;
    }
    ag->string_table[ag->string_table_len++] = pos;
}
// Mirrors AstGen.tokenIdentEql (AstGen.zig:6148-6152).
// Compares two identifier tokens by source text without touching string_bytes.
static bool tokenIdentEql(const Ast* tree, uint32_t tok1, uint32_t tok2) {
@@ -935,6 +953,7 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
source + content_start, content_len);
ag->string_bytes_len += content_len;
ag->string_bytes[ag->string_bytes_len++] = 0;
registerString(ag, str_index);
return str_index;
}
@@ -966,13 +985,14 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
memcpy(ag->string_bytes + ag->string_bytes_len, source + start, ident_len);
ag->string_bytes_len += ident_len;
ag->string_bytes[ag->string_bytes_len++] = 0;
registerString(ag, str_index);
return str_index;
}
// Mirrors AstGen.strLitAsString (AstGen.zig:11553).
// Mirrors AstGen.strLitAsString (AstGen.zig:11553).
// Handles string literals with escape sequences.
// Returns the string index and length via out parameters.
// Decodes string literal, checks for embedded nulls.
// If embedded null found: store raw bytes without trailing null, no dedup.
// Otherwise: dedup via string_table, add trailing null.
static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
uint32_t* out_index, uint32_t* out_len) {
uint32_t tok_start = ag->tree->tokens.starts[str_lit_token];
@@ -1002,13 +1022,10 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
}
if (!has_escapes) {
// Fast path: no escapes, copy directly.
// Fast path: no escapes, no embedded nulls possible.
uint32_t content_len = raw_end - i;
// Dedup: skip index 0 (reserved NullTerminatedString.empty).
// The upstream hash table doesn't include the reserved entry, so
// string literals are never deduped against it.
uint32_t existing = findExistingString(ag, source + i, content_len);
if (existing != UINT32_MAX && existing != 0) {
if (existing != UINT32_MAX) {
*out_index = existing;
*out_len = content_len;
return;
@@ -1019,18 +1036,17 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
ag->string_bytes + ag->string_bytes_len, source + i, content_len);
ag->string_bytes_len += content_len;
ag->string_bytes[ag->string_bytes_len++] = 0;
registerString(ag, str_index);
*out_index = str_index;
*out_len = content_len;
return;
}
// Slow path: process escape sequences (AstGen.zig:11585-11640).
// Decode into a temporary buffer.
// Slow path: process escape sequences (AstGen.zig:11558).
// Decode directly into string_bytes (like upstream).
uint32_t str_index = ag->string_bytes_len;
uint32_t max_len = raw_end - i;
uint8_t* buf = malloc(max_len);
if (!buf)
exit(1);
uint32_t out_pos = 0;
ensureStringBytesCapacity(ag, max_len + 1);
while (i < raw_end) {
if (source[i] == '\\') {
i++;
@@ -1038,22 +1054,22 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
break;
switch (source[i]) {
case 'n':
buf[out_pos++] = '\n';
ag->string_bytes[ag->string_bytes_len++] = '\n';
break;
case 'r':
buf[out_pos++] = '\r';
ag->string_bytes[ag->string_bytes_len++] = '\r';
break;
case 't':
buf[out_pos++] = '\t';
ag->string_bytes[ag->string_bytes_len++] = '\t';
break;
case '\\':
buf[out_pos++] = '\\';
ag->string_bytes[ag->string_bytes_len++] = '\\';
break;
case '\'':
buf[out_pos++] = '\'';
ag->string_bytes[ag->string_bytes_len++] = '\'';
break;
case '"':
buf[out_pos++] = '"';
ag->string_bytes[ag->string_bytes_len++] = '"';
break;
case 'x': {
// \xNN hex escape.
@@ -1068,36 +1084,52 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
else if (c >= 'A' && c <= 'F')
val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'A'));
}
buf[out_pos++] = val;
ag->string_bytes[ag->string_bytes_len++] = val;
break;
}
default:
buf[out_pos++] = (uint8_t)source[i];
ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i];
break;
}
} else {
buf[out_pos++] = (uint8_t)source[i];
ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i];
}
i++;
}
uint32_t decoded_len = ag->string_bytes_len - str_index;
uint8_t* key = ag->string_bytes + str_index;
// Dedup check (skip index 0 — reserved NullTerminatedString.empty).
uint32_t existing = findExistingString(ag, (const char*)buf, out_pos);
if (existing != UINT32_MAX && existing != 0) {
*out_index = existing;
*out_len = out_pos;
free(buf);
// Check for embedded null bytes (AstGen.zig:11560).
// If found, skip dedup and don't add trailing null.
bool has_embedded_null = false;
for (uint32_t j = 0; j < decoded_len; j++) {
if (key[j] == 0) {
has_embedded_null = true;
break;
}
}
if (has_embedded_null) {
*out_index = str_index;
*out_len = decoded_len;
return;
}
uint32_t str_index = ag->string_bytes_len;
ensureStringBytesCapacity(ag, out_pos + 1);
memcpy(ag->string_bytes + ag->string_bytes_len, buf, out_pos);
ag->string_bytes_len += out_pos;
// Dedup against string_table (AstGen.zig:11564-11585).
uint32_t existing = findExistingString(ag, (const char*)key, decoded_len);
if (existing != UINT32_MAX) {
// Shrink back (AstGen.zig:11570).
ag->string_bytes_len = str_index;
*out_index = existing;
*out_len = decoded_len;
return;
}
// New entry: add trailing null and register.
ensureStringBytesCapacity(ag, 1);
ag->string_bytes[ag->string_bytes_len++] = 0;
free(buf);
registerString(ag, str_index);
*out_index = str_index;
*out_len = out_pos;
*out_len = decoded_len;
}
// --- Declaration helpers ---
@@ -2358,7 +2390,8 @@ static uint32_t simpleCBuiltin(GenZir* gz, Scope* scope, uint32_t node,
ZirInstData data;
data.extended.opcode = ext_tag;
data.extended.small = 0;
data.extended.small = 0xAAAAu; // undefined (addExtendedPayload passes
// undefined for small)
data.extended.operand = payload_index;
addInstruction(gz, ZIR_INST_EXTENDED, data);
@@ -10366,6 +10399,7 @@ Zir astGen(const Ast* ast) {
free(ag.ref_table_keys);
free(ag.ref_table_vals);
free(ag.nodes_need_rl);
free(ag.string_table);
return zir;
}