zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

disjoint_code_page.zig (4693B) - Raw


      1 const std = @import("std");
      2 const lex = @import("lex.zig");
      3 const SourceMappings = @import("source_mapping.zig").SourceMappings;
      4 const SupportedCodePage = @import("code_pages.zig").SupportedCodePage;
      5 
      6 pub fn hasDisjointCodePage(source: []const u8, source_mappings: ?*const SourceMappings, default_code_page: SupportedCodePage) bool {
      7     var line_handler = lex.LineHandler{ .buffer = source };
      8     var i: usize = 0;
      9     while (i < source.len) {
     10         const codepoint = default_code_page.codepointAt(i, source) orelse break;
     11         const c = codepoint.value;
     12         switch (c) {
     13             '\r', '\n' => {
     14                 _ = line_handler.incrementLineNumber(i);
     15                 // Any lines that are not from the root file interrupt the disjoint code page
     16                 if (source_mappings != null and !source_mappings.?.isRootFile(line_handler.line_number)) return false;
     17             },
     18             // whitespace is ignored
     19             ' ',
     20             '\t',
     21             // NBSP, this should technically be in the TODO below, but it is treated as whitespace
     22             // due to a (misguided) special casing in the lexer, see the TODO in lex.zig
     23             '\u{A0}',
     24             => {},
     25 
     26             // TODO: All of the below are treated as whitespace by the Win32 RC preprocessor, which also
     27             //       means they are trimmed from the file during preprocessing. This means that these characters
     28             //       should be treated like ' ', '\t' above, but since the resinator preprocessor does not treat
     29             //       them as whitespace *or* trim whitespace, files with these characters are likely going to
     30             //       error. So, in the future some sort of emulation of/rejection of the Win32 behavior might
     31             //       make handling these codepoints specially make sense, but for now it doesn't really matter
     32             //       so they are not handled specially for simplicity's sake.
     33             //'\u{1680}',
     34             //'\u{180E}',
     35             //'\u{2001}',
     36             //'\u{2002}',
     37             //'\u{2003}',
     38             //'\u{2004}',
     39             //'\u{2005}',
     40             //'\u{2006}',
     41             //'\u{2007}',
     42             //'\u{2008}',
     43             //'\u{2009}',
     44             //'\u{200A}',
     45             //'\u{2028}',
     46             //'\u{2029}',
     47             //'\u{202F}',
     48             //'\u{205F}',
     49             //'\u{3000}',
     50 
     51             '#' => {
     52                 if (source_mappings != null and !source_mappings.?.isRootFile(line_handler.line_number)) {
     53                     return false;
     54                 }
     55                 const start_i = i;
     56                 while (i < source.len and source[i] != '\r' and source[i] != '\n') : (i += 1) {}
     57                 const line = source[start_i..i];
     58                 _ = (lex.parsePragmaCodePage(line) catch |err| switch (err) {
     59                     error.NotPragma => return false,
     60                     error.NotCodePagePragma => continue,
     61                     error.CodePagePragmaUnsupportedCodePage => continue,
     62                     else => continue,
     63                 }) orelse return false; // DEFAULT interrupts disjoint code page
     64 
     65                 // If we got a code page, then it is a disjoint code page pragma
     66                 return true;
     67             },
     68             else => {
     69                 // Any other character interrupts the disjoint code page
     70                 return false;
     71             },
     72         }
     73 
     74         i += codepoint.byte_len;
     75     }
     76     return false;
     77 }
     78 
     79 test hasDisjointCodePage {
     80     try std.testing.expect(hasDisjointCodePage("#pragma code_page(65001)\n", null, .windows1252));
     81     // NBSP is a special case
     82     try std.testing.expect(hasDisjointCodePage("\xA0\n#pragma code_page(65001)\n", null, .windows1252));
     83     try std.testing.expect(hasDisjointCodePage("\u{A0}\n#pragma code_page(1252)\n", null, .utf8));
     84     // other preprocessor commands don't interrupt
     85     try std.testing.expect(hasDisjointCodePage("#pragma foo\n#pragma code_page(65001)\n", null, .windows1252));
     86     // invalid code page doesn't interrupt
     87     try std.testing.expect(hasDisjointCodePage("#pragma code_page(1234567)\n#pragma code_page(65001)\n", null, .windows1252));
     88 
     89     try std.testing.expect(!hasDisjointCodePage("#if 1\n#endif\n#pragma code_page(65001)", null, .windows1252));
     90     try std.testing.expect(!hasDisjointCodePage("// comment\n#pragma code_page(65001)", null, .windows1252));
     91     try std.testing.expect(!hasDisjointCodePage("/* comment */\n#pragma code_page(65001)", null, .windows1252));
     92 }
     93 
     94 test "multiline comment edge case" {
     95     // TODO
     96     if (true) return error.SkipZigTest;
     97 
     98     try std.testing.expect(hasDisjointCodePage("/* comment */#pragma code_page(65001)", null, .windows1252));
     99 }