diff --git a/astgen.c b/astgen.c
index c11de99d82..0aa1eb472e 100644
--- a/astgen.c
+++ b/astgen.c
@@ -66,6 +66,13 @@ typedef struct {
     uint8_t* string_bytes;
     uint32_t string_bytes_len;
     uint32_t string_bytes_cap;
+    // String dedup table: stores positions in string_bytes that are
+    // registered for deduplication (mirrors AstGen.string_table).
+    // Only strings added via identAsString/strLitAsString (non-embedded-null)
+    // are registered. Multiline strings are NOT registered.
+    uint32_t* string_table;
+    uint32_t string_table_len;
+    uint32_t string_table_cap;
     uint32_t source_offset;
     uint32_t source_line;
     uint32_t source_column;
@@ -838,26 +845,37 @@ static int32_t tokenIndexToRelative(const GenZir* gz, uint32_t token) {
 // Search for an existing null-terminated string in string_bytes.
 // Returns the index if found, or UINT32_MAX if not found.
 // Mirrors string_table dedup (AstGen.zig:11564).
+// Looks up str (len bytes) among the strings registered in string_table.
+// Linear scan; mirrors the AstGen.string_table hash table lookup.
 static uint32_t findExistingString(
     const AstGenCtx* ag, const char* str, uint32_t len) {
-    // Linear scan through null-terminated strings in string_bytes.
-    uint32_t i = 0;
-    while (i < ag->string_bytes_len) {
-        // Find the end of the current null-terminated string.
-        uint32_t j = i;
-        while (j < ag->string_bytes_len && ag->string_bytes[j] != 0)
-            j++;
-        uint32_t existing_len = j - i;
-        if (existing_len == len
-            && memcmp(ag->string_bytes + i, str, len) == 0) {
-            return i;
+    for (uint32_t k = 0; k < ag->string_table_len; k++) {
+        uint32_t pos = ag->string_table[k];
+        // Registered strings are null-terminated, so strlen stops in bounds.
+        const char* existing = (const char*)ag->string_bytes + pos;
+        uint32_t existing_len = (uint32_t)strlen(existing);
+        if (existing_len == len && memcmp(existing, str, len) == 0) {
+            return pos;
         }
-        // Skip past the null terminator.
-        i = j + 1;
     }
     return UINT32_MAX;
 }
 
+// Appends pos to string_table, growing it (min 64 slots); exits on OOM.
+static void registerString(AstGenCtx* ag, uint32_t pos) {
+    if (ag->string_table_len >= ag->string_table_cap) {
+        uint32_t new_cap = ag->string_table_cap * 2;
+        if (new_cap < 64)
+            new_cap = 64;
+        uint32_t* p = realloc(ag->string_table, new_cap * sizeof(uint32_t));
+        if (!p)
+            exit(1);
+        ag->string_table = p;
+        ag->string_table_cap = new_cap;
+    }
+    ag->string_table[ag->string_table_len++] = pos;
+}
+
 // Mirrors AstGen.tokenIdentEql (AstGen.zig:6148-6152).
 // Compares two identifier tokens by source text without touching string_bytes.
 static bool tokenIdentEql(const Ast* tree, uint32_t tok1, uint32_t tok2) {
@@ -935,6 +953,7 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
             source + content_start, content_len);
         ag->string_bytes_len += content_len;
         ag->string_bytes[ag->string_bytes_len++] = 0;
+        registerString(ag, str_index);
         return str_index;
     }
 
@@ -966,13 +985,14 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
     memcpy(ag->string_bytes + ag->string_bytes_len, source + start, ident_len);
     ag->string_bytes_len += ident_len;
     ag->string_bytes[ag->string_bytes_len++] = 0;
+    registerString(ag, str_index);
     return str_index;
 }
 
 // Mirrors AstGen.strLitAsString (AstGen.zig:11553).
-// Mirrors AstGen.strLitAsString (AstGen.zig:11553).
-// Handles string literals with escape sequences.
-// Returns the string index and length via out parameters.
+// Decodes string literal, checks for embedded nulls.
+// If embedded null found: store raw bytes without trailing null, no dedup.
+// Otherwise: dedup via string_table, add trailing null.
 static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
     uint32_t* out_index, uint32_t* out_len) {
     uint32_t tok_start = ag->tree->tokens.starts[str_lit_token];
@@ -1002,13 +1022,10 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
     }
 
     if (!has_escapes) {
-        // Fast path: no escapes, copy directly.
+        // Fast path: no escapes, no embedded nulls possible.
         uint32_t content_len = raw_end - i;
-        // Dedup: skip index 0 (reserved NullTerminatedString.empty).
-        // The upstream hash table doesn't include the reserved entry, so
-        // string literals are never deduped against it.
         uint32_t existing = findExistingString(ag, source + i, content_len);
-        if (existing != UINT32_MAX && existing != 0) {
+        if (existing != UINT32_MAX) {
             *out_index = existing;
             *out_len = content_len;
             return;
@@ -1019,18 +1036,17 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
             ag->string_bytes + ag->string_bytes_len, source + i, content_len);
         ag->string_bytes_len += content_len;
         ag->string_bytes[ag->string_bytes_len++] = 0;
+        registerString(ag, str_index);
         *out_index = str_index;
         *out_len = content_len;
         return;
     }
 
-    // Slow path: process escape sequences (AstGen.zig:11585-11640).
-    // Decode into a temporary buffer.
+    // Slow path: process escape sequences (AstGen.zig:11558).
+    // Decode directly into string_bytes (like upstream).
+    uint32_t str_index = ag->string_bytes_len;
     uint32_t max_len = raw_end - i;
-    uint8_t* buf = malloc(max_len);
-    if (!buf)
-        exit(1);
-    uint32_t out_pos = 0;
+    ensureStringBytesCapacity(ag, max_len + 1);
     while (i < raw_end) {
         if (source[i] == '\\') {
             i++;
@@ -1038,22 +1054,22 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
                 break;
             switch (source[i]) {
             case 'n':
-                buf[out_pos++] = '\n';
+                ag->string_bytes[ag->string_bytes_len++] = '\n';
                 break;
             case 'r':
-                buf[out_pos++] = '\r';
+                ag->string_bytes[ag->string_bytes_len++] = '\r';
                 break;
             case 't':
-                buf[out_pos++] = '\t';
+                ag->string_bytes[ag->string_bytes_len++] = '\t';
                 break;
             case '\\':
-                buf[out_pos++] = '\\';
+                ag->string_bytes[ag->string_bytes_len++] = '\\';
                 break;
             case '\'':
-                buf[out_pos++] = '\'';
+                ag->string_bytes[ag->string_bytes_len++] = '\'';
                 break;
             case '"':
-                buf[out_pos++] = '"';
+                ag->string_bytes[ag->string_bytes_len++] = '"';
                 break;
             case 'x': {
                 // \xNN hex escape.
@@ -1068,36 +1084,52 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
                     else if (c >= 'A' && c <= 'F')
                         val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'A'));
                 }
-                buf[out_pos++] = val;
+                ag->string_bytes[ag->string_bytes_len++] = val;
                 break;
             }
             default:
-                buf[out_pos++] = (uint8_t)source[i];
+                ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i];
                 break;
             }
         } else {
-            buf[out_pos++] = (uint8_t)source[i];
+            ag->string_bytes[ag->string_bytes_len++] = (uint8_t)source[i];
         }
         i++;
     }
+    uint32_t decoded_len = ag->string_bytes_len - str_index;
+    uint8_t* key = ag->string_bytes + str_index;
 
-    // Dedup check (skip index 0 — reserved NullTerminatedString.empty).
-    uint32_t existing = findExistingString(ag, (const char*)buf, out_pos);
-    if (existing != UINT32_MAX && existing != 0) {
-        *out_index = existing;
-        *out_len = out_pos;
-        free(buf);
+    // Check for embedded null bytes (AstGen.zig:11560).
+    // If found, skip dedup and don't add trailing null.
+    bool has_embedded_null = false;
+    for (uint32_t j = 0; j < decoded_len; j++) {
+        if (key[j] == 0) {
+            has_embedded_null = true;
+            break;
+        }
+    }
+    if (has_embedded_null) {
+        *out_index = str_index;
+        *out_len = decoded_len;
         return;
     }
 
-    uint32_t str_index = ag->string_bytes_len;
-    ensureStringBytesCapacity(ag, out_pos + 1);
-    memcpy(ag->string_bytes + ag->string_bytes_len, buf, out_pos);
-    ag->string_bytes_len += out_pos;
+    // Dedup against string_table (AstGen.zig:11564-11585).
+    uint32_t existing = findExistingString(ag, (const char*)key, decoded_len);
+    if (existing != UINT32_MAX) {
+        // Duplicate: drop the bytes just decoded (AstGen.zig:11570).
+        ag->string_bytes_len = str_index;
+        *out_index = existing;
+        *out_len = decoded_len;
+        return;
+    }
+
+    // New entry: add trailing null and register.
+    ensureStringBytesCapacity(ag, 1);
     ag->string_bytes[ag->string_bytes_len++] = 0;
-    free(buf);
+    registerString(ag, str_index);
     *out_index = str_index;
-    *out_len = out_pos;
+    *out_len = decoded_len;
 }
 
 // --- Declaration helpers ---
@@ -2358,7 +2390,8 @@ static uint32_t simpleCBuiltin(GenZir* gz, Scope* scope, uint32_t node,
 
     ZirInstData data;
     data.extended.opcode = ext_tag;
-    data.extended.small = 0;
+    data.extended.small = 0xAAAAu; // undefined (addExtendedPayload passes
+                                   // undefined for small)
     data.extended.operand = payload_index;
     addInstruction(gz, ZIR_INST_EXTENDED, data);
 
@@ -10366,6 +10399,7 @@ Zir astGen(const Ast* ast) {
     free(ag.ref_table_keys);
     free(ag.ref_table_vals);
     free(ag.nodes_need_rl);
+    free(ag.string_table);
 
     return zir;
 }