disjoint_code_page.zig (4693B) - Raw
const std = @import("std");
const lex = @import("lex.zig");
const SourceMappings = @import("source_mapping.zig").SourceMappings;
const SupportedCodePage = @import("code_pages.zig").SupportedCodePage;

/// Reports whether `source` begins with a "disjoint" code page pragma, i.e. a
/// `#pragma code_page(...)` naming a concrete code page that is preceded only by
/// whitespace, line endings, and preprocessor directives that are skipped over.
///
/// Scanning decodes `source` one codepoint at a time using `default_code_page`
/// and stops with `false` as soon as any of the following is seen:
///  - a character that is not whitespace, a line ending, or `#`
///  - a line (per `source_mappings`, when provided) that is not from the root file
///  - a `#` line that `lex.parsePragmaCodePage` reports as `error.NotPragma`
///  - a code page pragma that yields no code page (the DEFAULT case)
///
/// `#` lines that fail to parse for any other reason (a pragma that is not a
/// code page pragma, an unsupported code page, ...) are skipped and scanning
/// resumes at the end of that line.
///
/// Also returns `false` when the end of `source` is reached, or decoding stops,
/// without finding such a pragma.
pub fn hasDisjointCodePage(source: []const u8, source_mappings: ?*const SourceMappings, default_code_page: SupportedCodePage) bool {
    var lines = lex.LineHandler{ .buffer = source };
    var pos: usize = 0;
    while (pos < source.len) {
        const decoded = default_code_page.codepointAt(pos, source) orelse break;
        switch (decoded.value) {
            '\r', '\n' => {
                _ = lines.incrementLineNumber(pos);
                // A line that does not come from the root file interrupts the disjoint code page.
                if (source_mappings) |mappings| {
                    if (!mappings.isRootFile(lines.line_number)) return false;
                }
            },

            // Whitespace is skipped.
            ' ',
            '\t',
            // NBSP technically belongs with the codepoints in the TODO below, but it is
            // treated as whitespace here because of a (misguided) special case in the
            // lexer; see the TODO in lex.zig.
            '\u{A0}',
            => {},

            // TODO: The Win32 RC preprocessor treats all of the codepoints below as whitespace,
            // which also means it trims them from the file during preprocessing. They should
            // therefore be handled like ' ' and '\t' above. However, the resinator preprocessor
            // neither treats them as whitespace nor trims whitespace, so files containing them
            // are likely to error anyway. Emulating (or rejecting) the Win32 behavior may make
            // special handling worthwhile in the future; for now they are deliberately left
            // out for simplicity's sake.
            //'\u{1680}',
            //'\u{180E}',
            //'\u{2001}',
            //'\u{2002}',
            //'\u{2003}',
            //'\u{2004}',
            //'\u{2005}',
            //'\u{2006}',
            //'\u{2007}',
            //'\u{2008}',
            //'\u{2009}',
            //'\u{200A}',
            //'\u{2028}',
            //'\u{2029}',
            //'\u{202F}',
            //'\u{205F}',
            //'\u{3000}',

            '#' => {
                if (source_mappings) |mappings| {
                    if (!mappings.isRootFile(lines.line_number)) return false;
                }

                // Find the end of the directive line: the first '\r' or '\n' byte,
                // or the end of the source if there is none.
                var line_end = pos;
                while (line_end < source.len) : (line_end += 1) {
                    switch (source[line_end]) {
                        '\r', '\n' => break,
                        else => {},
                    }
                }
                const directive = source[pos..line_end];
                // Any `continue` below resumes scanning at the line ending (or end of source).
                pos = line_end;

                const code_page = lex.parsePragmaCodePage(directive) catch |err| switch (err) {
                    error.NotPragma => return false,
                    error.NotCodePagePragma,
                    error.CodePagePragmaUnsupportedCodePage,
                    => continue,
                    else => continue,
                };

                // A concrete code page makes this a disjoint code page pragma;
                // no code page (DEFAULT) interrupts the disjoint code page.
                return code_page != null;
            },

            // Any other character interrupts the disjoint code page.
            else => return false,
        }

        pos += decoded.byte_len;
    }
    return false;
}

test hasDisjointCodePage {
    const Case = struct {
        source: []const u8,
        code_page: SupportedCodePage = .windows1252,
        expected: bool,
    };
    const cases = [_]Case{
        .{ .source = "#pragma code_page(65001)\n", .expected = true },
        // NBSP is a special case
        .{ .source = "\xA0\n#pragma code_page(65001)\n", .expected = true },
        .{ .source = "\u{A0}\n#pragma code_page(1252)\n", .code_page = .utf8, .expected = true },
        // other preprocessor commands don't interrupt
        .{ .source = "#pragma foo\n#pragma code_page(65001)\n", .expected = true },
        // invalid code page doesn't interrupt
        .{ .source = "#pragma code_page(1234567)\n#pragma code_page(65001)\n", .expected = true },

        // non-pragma directives and comments do interrupt
        .{ .source = "#if 1\n#endif\n#pragma code_page(65001)", .expected = false },
        .{ .source = "// comment\n#pragma code_page(65001)", .expected = false },
        .{ .source = "/* comment */\n#pragma code_page(65001)", .expected = false },
    };

    for (cases) |case| {
        const actual = hasDisjointCodePage(case.source, null, case.code_page);
        try std.testing.expect(actual == case.expected);
    }
}

test "multiline comment edge case" {
    // TODO: not handled yet. A block comment immediately followed by the pragma on the
    // same line is expected to count as disjoint, but '/' currently interrupts the scan,
    // so the test is skipped for now.
    if (true) return error.SkipZigTest;

    try std.testing.expect(hasDisjointCodePage("/* comment */#pragma code_page(65001)", null, .windows1252));
}