astgen: fix string literal escape handling and string table ordering

- Add escape sequence handling to strLitAsString (\n, \r, \t, \\, \', \", \xNN). Previously, string content was copied byte-for-byte.
- Fix strLitAsString quote scanning to skip escaped quotes (\").
- Handle @"..." quoted identifiers in identAsString.
- Add test name and field name strings to scanContainer to match upstream string table insertion order.
- Skip dedup against reserved index 0 in strLitAsString to match upstream hash table behavior.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 05:52:32 +00:00
parent 2717f8ca91
commit 1228d8d70f
1 changed file with 188 additions and 20 deletions
--- a/astgen.c
+++ b/astgen.c
@@ -639,10 +639,63 @@ static uint32_t findExistingString(
    return UINT32_MAX;
 }

+// Forward declaration for strLitAsString (used by identAsString for @"..."
+// quoted identifiers with escapes).
+static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
+    uint32_t* out_index, uint32_t* out_len);
+
 // Mirrors AstGen.identAsString (AstGen.zig:11530).
+// Handles both bare identifiers and @"..." quoted identifiers.
 static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
    uint32_t start = ag->tree->tokens.starts[ident_token];
    const char* source = ag->tree->source;
+
+    if (source[start] == '@' && start + 1 < ag->tree->source_len
+        && source[start + 1] == '"') {
+        // Quoted identifier: @"name" (AstGen.zig:11297-11308).
+        // Extract content between quotes, handling escapes.
+        uint32_t si, sl;
+        // str_lit_token refers to the same token, content starts after @"
+        // We reuse strLitAsString but offset by 1 to skip '@'.
+        // Actually, strLitAsString expects a token whose source starts
+        // with '"'. The @"..." token starts with '@'. We need to handle
+        // the offset manually.
+        uint32_t content_start = start + 2; // skip @"
+        uint32_t content_end = content_start;
+        while (content_end < ag->tree->source_len
+            && source[content_end] != '"')
+            content_end++;
+        uint32_t content_len = content_end - content_start;
+
+        // Check for escapes.
+        bool has_escapes = false;
+        for (uint32_t j = content_start; j < content_end; j++) {
+            if (source[j] == '\\') {
+                has_escapes = true;
+                break;
+            }
+        }
+
+        if (!has_escapes) {
+            uint32_t existing
+                = findExistingString(ag, source + content_start, content_len);
+            if (existing != UINT32_MAX)
+                return existing;
+            uint32_t str_index = ag->string_bytes_len;
+            ensureStringBytesCapacity(ag, content_len + 1);
+            memcpy(ag->string_bytes + ag->string_bytes_len,
+                source + content_start, content_len);
+            ag->string_bytes_len += content_len;
+            ag->string_bytes[ag->string_bytes_len++] = 0;
+            return str_index;
+        }
+
+        // With escapes: use strLitAsString-like decoding.
+        strLitAsString(ag, ident_token, &si, &sl);
+        return si;
+    }
+
+    // Bare identifier: scan alphanumeric + underscore.
    uint32_t end = start;
    while (end < ag->tree->source_len) {
        char ch = source[end];
@@ -669,7 +722,8 @@ static uint32_t identAsString(AstGenCtx* ag, uint32_t ident_token) {
 }

 // Mirrors AstGen.strLitAsString (AstGen.zig:11553).
-// Simplified: handles simple string literals without escape sequences.
+// Handles string literals with escape sequences
+// (\n, \r, \t, \\, \', \", \xNN).
 // Returns the string index and length via out parameters.
 static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
    uint32_t* out_index, uint32_t* out_len) {
@@ -677,33 +731,126 @@ static void strLitAsString(AstGenCtx* ag, uint32_t str_lit_token,
    const char* source = ag->tree->source;

    // Skip opening quote.
-    uint32_t content_start = tok_start + 1;
-    // Find closing quote.
-    uint32_t content_end = content_start;
-    while (content_end < ag->tree->source_len && source[content_end] != '"') {
-        content_end++;
+    uint32_t i = tok_start + 1;
+    // Find closing quote, skipping escaped characters.
+    uint32_t raw_end = i;
+    while (raw_end < ag->tree->source_len) {
+        if (source[raw_end] == '\\') {
+            raw_end += 2; // skip escape + escaped char
+        } else if (source[raw_end] == '"') {
+            break;
+        } else {
+            raw_end++;
+        }
    }

-    uint32_t content_len = content_end - content_start;
+    // Check if there are any escape sequences.
+    bool has_escapes = false;
+    for (uint32_t j = i; j < raw_end; j++) {
+        if (source[j] == '\\') {
+            has_escapes = true;
+            break;
+        }
+    }

-    // Check for existing string (dedup).
-    uint32_t existing
-        = findExistingString(ag, source + content_start, content_len);
-    if (existing != UINT32_MAX) {
-        *out_index = existing;
+    if (!has_escapes) {
+        // Fast path: no escapes, copy directly.
+        uint32_t content_len = raw_end - i;
+        // Dedup: skip index 0 (reserved NullTerminatedString.empty).
+        // The upstream hash table doesn't include the reserved entry, so
+        // string literals are never deduped against it.
+        uint32_t existing
+            = findExistingString(ag, source + i, content_len);
+        if (existing != UINT32_MAX && existing != 0) {
+            *out_index = existing;
+            *out_len = content_len;
+            return;
+        }
+        uint32_t str_index = ag->string_bytes_len;
+        ensureStringBytesCapacity(ag, content_len + 1);
+        memcpy(
+            ag->string_bytes + ag->string_bytes_len, source + i, content_len);
+        ag->string_bytes_len += content_len;
+        ag->string_bytes[ag->string_bytes_len++] = 0;
+        *out_index = str_index;
        *out_len = content_len;
        return;
    }

-    uint32_t str_index = ag->string_bytes_len;
-    ensureStringBytesCapacity(ag, content_len + 1);
-    memcpy(ag->string_bytes + ag->string_bytes_len, source + content_start,
-        content_len);
-    ag->string_bytes_len += content_len;
-    ag->string_bytes[ag->string_bytes_len++] = 0;
+    // Slow path: process escape sequences (AstGen.zig:11585-11640).
+    // Decode into a temporary buffer.
+    uint32_t max_len = raw_end - i;
+    uint8_t* buf = malloc(max_len);
+    if (!buf)
+        exit(1);
+    uint32_t out_pos = 0;
+    while (i < raw_end) {
+        if (source[i] == '\\') {
+            i++;
+            if (i >= raw_end)
+                break;
+            switch (source[i]) {
+            case 'n':
+                buf[out_pos++] = '\n';
+                break;
+            case 'r':
+                buf[out_pos++] = '\r';
+                break;
+            case 't':
+                buf[out_pos++] = '\t';
+                break;
+            case '\\':
+                buf[out_pos++] = '\\';
+                break;
+            case '\'':
+                buf[out_pos++] = '\'';
+                break;
+            case '"':
+                buf[out_pos++] = '"';
+                break;
+            case 'x': {
+                // \xNN hex escape.
+                uint8_t val = 0;
+                for (int k = 0; k < 2 && i + 1 < raw_end; k++) {
+                    i++;
+                    char c = source[i];
+                    if (c >= '0' && c <= '9')
+                        val = (uint8_t)(val * 16 + (uint8_t)(c - '0'));
+                    else if (c >= 'a' && c <= 'f')
+                        val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'a'));
+                    else if (c >= 'A' && c <= 'F')
+                        val = (uint8_t)(val * 16 + 10 + (uint8_t)(c - 'A'));
+                }
+                buf[out_pos++] = val;
+                break;
+            }
+            default:
+                buf[out_pos++] = (uint8_t)source[i];
+                break;
+            }
+        } else {
+            buf[out_pos++] = (uint8_t)source[i];
+        }
+        i++;
+    }

+    // Dedup check (skip index 0 — reserved NullTerminatedString.empty).
+    uint32_t existing = findExistingString(ag, (const char*)buf, out_pos);
+    if (existing != UINT32_MAX && existing != 0) {
+        *out_index = existing;
+        *out_len = out_pos;
+        free(buf);
+        return;
+    }
+
+    uint32_t str_index = ag->string_bytes_len;
+    ensureStringBytesCapacity(ag, out_pos + 1);
+    memcpy(ag->string_bytes + ag->string_bytes_len, buf, out_pos);
+    ag->string_bytes_len += out_pos;
+    ag->string_bytes[ag->string_bytes_len++] = 0;
+    free(buf);
    *out_index = str_index;
-    *out_len = content_len;
+    *out_len = out_pos;
 }

 // --- Declaration helpers ---
@@ -1219,12 +1366,33 @@ static uint32_t scanContainer(
            addDeclToTable(ag, name_str, member);
            break;
        }
+        // Container fields: add field name to string table for ordering
+        // (AstGen.zig:13509).
+        case AST_NODE_CONTAINER_FIELD_INIT:
+        case AST_NODE_CONTAINER_FIELD_ALIGN:
+        case AST_NODE_CONTAINER_FIELD: {
+            uint32_t main_token = tree->nodes.main_tokens[member];
+            identAsString(ag, main_token);
+            break;
+        }
        case AST_NODE_COMPTIME:
            decl_count++;
            break;
-        case AST_NODE_TEST_DECL:
+        case AST_NODE_TEST_DECL: {
            decl_count++;
+            // Process test name string to match upstream string table
+            // ordering (AstGen.zig:13465-13500).
+            uint32_t test_name_token
+                = tree->nodes.main_tokens[member] + 1;
+            TokenizerTag tt = tree->tokens.tags[test_name_token];
+            if (tt == TOKEN_STRING_LITERAL) {
+                uint32_t si, sl;
+                strLitAsString(ag, test_name_token, &si, &sl);
+            } else if (tt == TOKEN_IDENTIFIER) {
+                identAsString(ag, test_name_token);
+            }
            break;
+        }
        default:
            break;
        }