zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

code_pages.zig (21825B) - Raw


      1 const std = @import("std");
      2 const windows1252 = @import("windows1252.zig");
      3 
      4 /// https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
      5 pub const SupportedCodePage = enum(u16) {
      6     windows1252 = 1252, // windows-1252    ANSI Latin 1; Western European (Windows)
      7     utf8 = 65001, // utf-8    Unicode (UTF-8)
      8 
      9     pub fn codepointAt(code_page: SupportedCodePage, index: usize, bytes: []const u8) ?Codepoint {
     10         if (index >= bytes.len) return null;
     11         switch (code_page) {
     12             .windows1252 => {
     13                 // All byte values have a representation, so just convert the byte
     14                 return Codepoint{
     15                     .value = windows1252.toCodepoint(bytes[index]),
     16                     .byte_len = 1,
     17                 };
     18             },
     19             .utf8 => {
     20                 return Utf8.WellFormedDecoder.decode(bytes[index..]);
     21             },
     22         }
     23     }
     24 };
     25 
        /// Code page identifiers that can be looked up with `getByIdentifier` but that
        /// `isSupported` reports as unsupported (so `getByIdentifierEnsureSupported`
        /// returns `error.UnsupportedCodePage` for them).
        ///
        /// The integer value of each field is the Windows code page identifier. The
        /// trailing comment on each field appears to mirror the name and description
        /// columns of the table at:
     26 /// https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
     27 pub const UnsupportedCodePage = enum(u16) {
     28     ibm037 = 37, // IBM037    IBM EBCDIC US-Canada
     29     ibm437 = 437, // IBM437    OEM United States
     30     ibm500 = 500, // IBM500    IBM EBCDIC International
     31     asmo708 = 708, // ASMO-708    Arabic (ASMO 708)
     32     asmo449plus = 709, // Arabic (ASMO-449+, BCON V4)
     33     transparent_arabic = 710, // Arabic - Transparent Arabic
     34     dos720 = 720, // DOS-720    Arabic (Transparent ASMO); Arabic (DOS)
     35     ibm737 = 737, // ibm737    OEM Greek (formerly 437G); Greek (DOS)
     36     ibm775 = 775, // ibm775    OEM Baltic; Baltic (DOS)
     37     ibm850 = 850, // ibm850    OEM Multilingual Latin 1; Western European (DOS)
     38     ibm852 = 852, // ibm852    OEM Latin 2; Central European (DOS)
     39     ibm855 = 855, // IBM855    OEM Cyrillic (primarily Russian)
     40     ibm857 = 857, // ibm857    OEM Turkish; Turkish (DOS)
     41     ibm00858 = 858, // IBM00858    OEM Multilingual Latin 1 + Euro symbol
     42     ibm860 = 860, // IBM860    OEM Portuguese; Portuguese (DOS)
     43     ibm861 = 861, // ibm861    OEM Icelandic; Icelandic (DOS)
     44     dos862 = 862, // DOS-862    OEM Hebrew; Hebrew (DOS)
     45     ibm863 = 863, // IBM863    OEM French Canadian; French Canadian (DOS)
     46     ibm864 = 864, // IBM864    OEM Arabic; Arabic (864)
     47     ibm865 = 865, // IBM865    OEM Nordic; Nordic (DOS)
     48     cp866 = 866, // cp866    OEM Russian; Cyrillic (DOS)
     49     ibm869 = 869, // ibm869    OEM Modern Greek; Greek, Modern (DOS)
     50     ibm870 = 870, // IBM870    IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
     51     windows874 = 874, // windows-874    Thai (Windows)
     52     cp875 = 875, // cp875    IBM EBCDIC Greek Modern
     53     shift_jis = 932, // shift_jis    ANSI/OEM Japanese; Japanese (Shift-JIS)
     54     gb2312 = 936, // gb2312    ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
     55     ks_c_5601_1987 = 949, // ks_c_5601-1987    ANSI/OEM Korean (Unified Hangul Code)
     56     big5 = 950, // big5    ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
     57     ibm1026 = 1026, // IBM1026    IBM EBCDIC Turkish (Latin 5)
     58     ibm01047 = 1047, // IBM01047    IBM EBCDIC Latin 1/Open System
     59     ibm01140 = 1140, // IBM01140    IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
     60     ibm01141 = 1141, // IBM01141    IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
     61     ibm01142 = 1142, // IBM01142    IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
     62     ibm01143 = 1143, // IBM01143    IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
     63     ibm01144 = 1144, // IBM01144    IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
     64     ibm01145 = 1145, // IBM01145    IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
     65     ibm01146 = 1146, // IBM01146    IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
     66     ibm01147 = 1147, // IBM01147    IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
     67     ibm01148 = 1148, // IBM01148    IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
     68     ibm01149 = 1149, // IBM01149    IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
     69     utf16 = 1200, // utf-16    Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
     70     utf16_fffe = 1201, // unicodeFFFE    Unicode UTF-16, big endian byte order; available only to managed applications
     71     windows1250 = 1250, // windows-1250    ANSI Central European; Central European (Windows)
     72     windows1251 = 1251, // windows-1251    ANSI Cyrillic; Cyrillic (Windows)
     73     windows1253 = 1253, // windows-1253    ANSI Greek; Greek (Windows)
     74     windows1254 = 1254, // windows-1254    ANSI Turkish; Turkish (Windows)
     75     windows1255 = 1255, // windows-1255    ANSI Hebrew; Hebrew (Windows)
     76     windows1256 = 1256, // windows-1256    ANSI Arabic; Arabic (Windows)
     77     windows1257 = 1257, // windows-1257    ANSI Baltic; Baltic (Windows)
     78     windows1258 = 1258, // windows-1258    ANSI/OEM Vietnamese; Vietnamese (Windows)
     79     johab = 1361, // Johab    Korean (Johab)
     80     macintosh = 10000, // macintosh    MAC Roman; Western European (Mac)
     81     x_mac_japanese = 10001, // x-mac-japanese    Japanese (Mac)
     82     x_mac_chinesetrad = 10002, // x-mac-chinesetrad    MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
     83     x_mac_korean = 10003, // x-mac-korean    Korean (Mac)
     84     x_mac_arabic = 10004, // x-mac-arabic    Arabic (Mac)
     85     x_mac_hebrew = 10005, // x-mac-hebrew    Hebrew (Mac)
     86     x_mac_greek = 10006, // x-mac-greek    Greek (Mac)
     87     x_mac_cyrillic = 10007, // x-mac-cyrillic    Cyrillic (Mac)
     88     x_mac_chinesesimp = 10008, // x-mac-chinesesimp    MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
     89     x_mac_romanian = 10010, // x-mac-romanian    Romanian (Mac)
            // NOTE(review): the field is spelled `ukranian` while the code page name in the
            // trailing comment is `x-mac-ukrainian`. Renaming it would change the public
            // field name (and the matching `CodePage` field), so it is left as-is.
     90     x_mac_ukranian = 10017, // x-mac-ukrainian    Ukrainian (Mac)
     91     x_mac_thai = 10021, // x-mac-thai    Thai (Mac)
     92     x_mac_ce = 10029, // x-mac-ce    MAC Latin 2; Central European (Mac)
     93     x_mac_icelandic = 10079, // x-mac-icelandic    Icelandic (Mac)
     94     x_mac_turkish = 10081, // x-mac-turkish    Turkish (Mac)
     95     x_mac_croatian = 10082, // x-mac-croatian    Croatian (Mac)
     96     utf32 = 12000, // utf-32    Unicode UTF-32, little endian byte order; available only to managed applications
     97     utf32_be = 12001, // utf-32BE    Unicode UTF-32, big endian byte order; available only to managed applications
     98     x_chinese_cns = 20000, // x-Chinese_CNS    CNS Taiwan; Chinese Traditional (CNS)
     99     x_cp20001 = 20001, // x-cp20001    TCA Taiwan
    100     x_chinese_eten = 20002, // x_Chinese-Eten    Eten Taiwan; Chinese Traditional (Eten)
    101     x_cp20003 = 20003, // x-cp20003    IBM5550 Taiwan
    102     x_cp20004 = 20004, // x-cp20004    TeleText Taiwan
    103     x_cp20005 = 20005, // x-cp20005    Wang Taiwan
    104     x_ia5 = 20105, // x-IA5    IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
    105     x_ia5_german = 20106, // x-IA5-German    IA5 German (7-bit)
    106     x_ia5_swedish = 20107, // x-IA5-Swedish    IA5 Swedish (7-bit)
    107     x_ia5_norwegian = 20108, // x-IA5-Norwegian    IA5 Norwegian (7-bit)
    108     us_ascii = 20127, // us-ascii    US-ASCII (7-bit)
    109     x_cp20261 = 20261, // x-cp20261    T.61
    110     x_cp20269 = 20269, // x-cp20269    ISO 6937 Non-Spacing Accent
    111     ibm273 = 20273, // IBM273    IBM EBCDIC Germany
    112     ibm277 = 20277, // IBM277    IBM EBCDIC Denmark-Norway
    113     ibm278 = 20278, // IBM278    IBM EBCDIC Finland-Sweden
    114     ibm280 = 20280, // IBM280    IBM EBCDIC Italy
    115     ibm284 = 20284, // IBM284    IBM EBCDIC Latin America-Spain
    116     ibm285 = 20285, // IBM285    IBM EBCDIC United Kingdom
    117     ibm290 = 20290, // IBM290    IBM EBCDIC Japanese Katakana Extended
    118     ibm297 = 20297, // IBM297    IBM EBCDIC France
    119     ibm420 = 20420, // IBM420    IBM EBCDIC Arabic
    120     ibm423 = 20423, // IBM423    IBM EBCDIC Greek
    121     ibm424 = 20424, // IBM424    IBM EBCDIC Hebrew
    122     x_ebcdic_korean_extended = 20833, // x-EBCDIC-KoreanExtended    IBM EBCDIC Korean Extended
    123     ibm_thai = 20838, // IBM-Thai    IBM EBCDIC Thai
    124     koi8_r = 20866, // koi8-r    Russian (KOI8-R); Cyrillic (KOI8-R)
    125     ibm871 = 20871, // IBM871    IBM EBCDIC Icelandic
    126     ibm880 = 20880, // IBM880    IBM EBCDIC Cyrillic Russian
    127     ibm905 = 20905, // IBM905    IBM EBCDIC Turkish
    128     ibm00924 = 20924, // IBM00924    IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
    129     euc_jp_jis = 20932, // EUC-JP    Japanese (JIS 0208-1990 and 0212-1990)
    130     x_cp20936 = 20936, // x-cp20936    Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
    131     x_cp20949 = 20949, // x-cp20949    Korean Wansung
    132     cp1025 = 21025, // cp1025    IBM EBCDIC Cyrillic Serbian-Bulgarian
    133     // = 21027, // (deprecated)
    134     koi8_u = 21866, // koi8-u    Ukrainian (KOI8-U); Cyrillic (KOI8-U)
    135     iso8859_1 = 28591, // iso-8859-1    ISO 8859-1 Latin 1; Western European (ISO)
    136     iso8859_2 = 28592, // iso-8859-2    ISO 8859-2 Central European; Central European (ISO)
    137     iso8859_3 = 28593, // iso-8859-3    ISO 8859-3 Latin 3
    138     iso8859_4 = 28594, // iso-8859-4    ISO 8859-4 Baltic
    139     iso8859_5 = 28595, // iso-8859-5    ISO 8859-5 Cyrillic
    140     iso8859_6 = 28596, // iso-8859-6    ISO 8859-6 Arabic
    141     iso8859_7 = 28597, // iso-8859-7    ISO 8859-7 Greek
    142     iso8859_8 = 28598, // iso-8859-8    ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
    143     iso8859_9 = 28599, // iso-8859-9    ISO 8859-9 Turkish
    144     iso8859_13 = 28603, // iso-8859-13    ISO 8859-13 Estonian
    145     iso8859_15 = 28605, // iso-8859-15    ISO 8859-15 Latin 9
    146     x_europa = 29001, // x-Europa    Europa 3
            // NOTE(review): the field is spelled `is8859_8_i` (missing the `o`) unlike the
            // `iso8859_*` fields above. Renaming it would change the public field name, so
            // it is left as-is.
    147     is8859_8_i = 38598, // iso-8859-8-i    ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
    148     iso2022_jp = 50220, // iso-2022-jp    ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
    149     cs_iso2022_jp = 50221, // csISO2022JP    ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
    150     iso2022_jp_jis_x = 50222, // iso-2022-jp    ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
    151     iso2022_kr = 50225, // iso-2022-kr    ISO 2022 Korean
    152     x_cp50227 = 50227, // x-cp50227    ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
    153     iso2022_chinesetrad = 50229, // ISO 2022 Traditional Chinese
    154     ebcdic_jp_katakana_extended = 50930, // EBCDIC Japanese (Katakana) Extended
    155     ebcdic_us_ca_jp = 50931, // EBCDIC US-Canada and Japanese
    156     ebcdic_kr_extended = 50933, // EBCDIC Korean Extended and Korean
    157     ebcdic_chinesesimp_extended = 50935, // EBCDIC Simplified Chinese Extended and Simplified Chinese
    158     ebcdic_chinesesimp = 50936, // EBCDIC Simplified Chinese
    159     ebcdic_us_ca_chinesetrad = 50937, // EBCDIC US-Canada and Traditional Chinese
    160     ebcdic_jp_latin_extended = 50939, // EBCDIC Japanese (Latin) Extended and Japanese
    161     euc_jp = 51932, // euc-jp    EUC Japanese
    162     euc_cn = 51936, // EUC-CN    EUC Simplified Chinese; Chinese Simplified (EUC)
    163     euc_kr = 51949, // euc-kr    EUC Korean
    164     euc_chinesetrad = 51950, // EUC Traditional Chinese
    165     hz_gb2312 = 52936, // hz-gb-2312    HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
    166     gb18030 = 54936, // GB18030    Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
    167     x_iscii_de = 57002, // x-iscii-de    ISCII Devanagari
    168     x_iscii_be = 57003, // x-iscii-be    ISCII Bangla
    169     x_iscii_ta = 57004, // x-iscii-ta    ISCII Tamil
    170     x_iscii_te = 57005, // x-iscii-te    ISCII Telugu
    171     x_iscii_as = 57006, // x-iscii-as    ISCII Assamese
    172     x_iscii_or = 57007, // x-iscii-or    ISCII Odia
    173     x_iscii_ka = 57008, // x-iscii-ka    ISCII Kannada
    174     x_iscii_ma = 57009, // x-iscii-ma    ISCII Malayalam
    175     x_iscii_gu = 57010, // x-iscii-gu    ISCII Gujarati
    176     x_iscii_pa = 57011, // x-iscii-pa    ISCII Punjabi
    177     utf7 = 65000, // utf-7    Unicode (UTF-7)
    178 };
    179 
    180 pub const CodePage = blk: {
    181     const fields = @typeInfo(SupportedCodePage).@"enum".fields ++ @typeInfo(UnsupportedCodePage).@"enum".fields;
    182     break :blk @Type(.{ .@"enum" = .{
    183         .tag_type = u16,
    184         .decls = &.{},
    185         .fields = fields,
    186         .is_exhaustive = true,
    187     } });
    188 };
    189 
    190 pub fn isSupported(code_page: CodePage) bool {
    191     inline for (@typeInfo(SupportedCodePage).@"enum".fields) |enumField| {
    192         if (@intFromEnum(code_page) == @intFromEnum(@field(SupportedCodePage, enumField.name))) {
    193             return true;
    194         }
    195     }
    196     return false;
    197 }
    198 
    199 pub fn getByIdentifier(identifier: u16) !CodePage {
    200     // There's probably a more efficient way to do this (e.g. ComptimeHashMap?) but
    201     // this should be fine, especially since this function likely won't be called much.
    202     inline for (@typeInfo(CodePage).@"enum".fields) |enumField| {
    203         if (identifier == enumField.value) {
    204             return @field(CodePage, enumField.name);
    205         }
    206     }
    207     return error.InvalidCodePage;
    208 }
    209 
    210 pub fn getByIdentifierEnsureSupported(identifier: u16) !SupportedCodePage {
    211     const code_page = try getByIdentifier(identifier);
    212     return if (isSupported(code_page))
    213         @enumFromInt(@intFromEnum(code_page))
    214     else
    215         error.UnsupportedCodePage;
    216 }
    217 
    218 pub const Utf8 = struct {
    219     /// Implements decoding with rejection of ill-formed UTF-8 sequences based on section
    220     /// D92 of Chapter 3 of the Unicode standard (Table 3-7 specifically).
    221     ///
    222     /// Note: This does not match "U+FFFD Substitution of Maximal Subparts", but instead
    223     ///       matches the behavior of the Windows RC compiler.
    224     pub const WellFormedDecoder = struct {
    225         /// Like std.unicode.utf8ByteSequenceLength, but:
    226         /// - Rejects non-well-formed first bytes, i.e. C0-C1, F5-FF
    227         /// - Returns an optional value instead of an error union
    228         pub fn sequenceLength(first_byte: u8) ?u3 {
    229             return switch (first_byte) {
    230                 0x00...0x7F => 1,
    231                 0xC2...0xDF => 2,
    232                 0xE0...0xEF => 3,
    233                 0xF0...0xF4 => 4,
    234                 else => null,
    235             };
    236         }
    237 
    238         fn isContinuationByte(byte: u8) bool {
    239             return switch (byte) {
    240                 0x80...0xBF => true,
    241                 else => false,
    242             };
    243         }
    244 
    245         pub fn decode(bytes: []const u8) Codepoint {
    246             std.debug.assert(bytes.len > 0);
    247             const first_byte = bytes[0];
    248             const expected_len = sequenceLength(first_byte) orelse {
    249                 return .{ .value = Codepoint.invalid, .byte_len = 1 };
    250             };
    251             if (expected_len == 1) return .{ .value = first_byte, .byte_len = 1 };
    252 
    253             var value: u21 = first_byte & 0b00011111;
    254             var byte_index: u8 = 1;
    255             while (byte_index < @min(bytes.len, expected_len)) : (byte_index += 1) {
    256                 const byte = bytes[byte_index];
    257                 // See Table 3-7 of D92 in Chapter 3 of the Unicode Standard
    258                 const valid: bool = switch (byte_index) {
    259                     1 => switch (first_byte) {
    260                         0xE0 => switch (byte) {
    261                             0xA0...0xBF => true,
    262                             else => false,
    263                         },
    264                         0xED => switch (byte) {
    265                             0x80...0x9F => true,
    266                             else => false,
    267                         },
    268                         0xF0 => switch (byte) {
    269                             0x90...0xBF => true,
    270                             else => false,
    271                         },
    272                         0xF4 => switch (byte) {
    273                             0x80...0x8F => true,
    274                             else => false,
    275                         },
    276                         else => switch (byte) {
    277                             0x80...0xBF => true,
    278                             else => false,
    279                         },
    280                     },
    281                     else => switch (byte) {
    282                         0x80...0xBF => true,
    283                         else => false,
    284                     },
    285                 };
    286 
    287                 if (!valid) {
    288                     var len = byte_index;
    289                     // Only include the byte in the invalid sequence if it's in the range
    290                     // of a continuation byte. All other values should not be included in the
    291                     // invalid sequence.
    292                     if (isContinuationByte(byte)) len += 1;
    293                     return .{ .value = Codepoint.invalid, .byte_len = len };
    294                 }
    295 
    296                 value <<= 6;
    297                 value |= byte & 0b00111111;
    298             }
    299             if (byte_index != expected_len) {
    300                 return .{ .value = Codepoint.invalid, .byte_len = byte_index };
    301             }
    302             return .{ .value = value, .byte_len = expected_len };
    303         }
    304     };
    305 };
    306 
    307 test "Utf8.WellFormedDecoder" {
    308     const invalid_utf8 = "\xF0\x80";
    309     const decoded = Utf8.WellFormedDecoder.decode(invalid_utf8);
    310     try std.testing.expectEqual(Codepoint.invalid, decoded.value);
    311     try std.testing.expectEqual(@as(usize, 2), decoded.byte_len);
    312 }
    313 
    314 test "codepointAt invalid utf8" {
    315     {
    316         const invalid_utf8 = "\xf0\xf0\x80\x80\x80";
    317         try std.testing.expectEqual(Codepoint{
    318             .value = Codepoint.invalid,
    319             .byte_len = 1,
    320         }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?);
    321         try std.testing.expectEqual(Codepoint{
    322             .value = Codepoint.invalid,
    323             .byte_len = 2,
    324         }, SupportedCodePage.utf8.codepointAt(1, invalid_utf8).?);
    325         try std.testing.expectEqual(Codepoint{
    326             .value = Codepoint.invalid,
    327             .byte_len = 1,
    328         }, SupportedCodePage.utf8.codepointAt(3, invalid_utf8).?);
    329         try std.testing.expectEqual(Codepoint{
    330             .value = Codepoint.invalid,
    331             .byte_len = 1,
    332         }, SupportedCodePage.utf8.codepointAt(4, invalid_utf8).?);
    333         try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(5, invalid_utf8));
    334     }
    335 
    336     {
    337         const invalid_utf8 = "\xE1\xA0\xC0";
    338         try std.testing.expectEqual(Codepoint{
    339             .value = Codepoint.invalid,
    340             .byte_len = 2,
    341         }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?);
    342         try std.testing.expectEqual(Codepoint{
    343             .value = Codepoint.invalid,
    344             .byte_len = 1,
    345         }, SupportedCodePage.utf8.codepointAt(2, invalid_utf8).?);
    346         try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(3, invalid_utf8));
    347     }
    348 
    349     {
    350         const invalid_utf8 = "\xD2";
    351         try std.testing.expectEqual(Codepoint{
    352             .value = Codepoint.invalid,
    353             .byte_len = 1,
    354         }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?);
    355         try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(1, invalid_utf8));
    356     }
    357 
    358     {
    359         const invalid_utf8 = "\xE1\xA0";
    360         try std.testing.expectEqual(Codepoint{
    361             .value = Codepoint.invalid,
    362             .byte_len = 2,
    363         }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?);
    364         try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, invalid_utf8));
    365     }
    366 
    367     {
    368         const invalid_utf8 = "\xC5\xFF";
    369         try std.testing.expectEqual(Codepoint{
    370             .value = Codepoint.invalid,
    371             .byte_len = 1,
    372         }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?);
    373         try std.testing.expectEqual(Codepoint{
    374             .value = Codepoint.invalid,
    375             .byte_len = 1,
    376         }, SupportedCodePage.utf8.codepointAt(1, invalid_utf8).?);
    377         try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, invalid_utf8));
    378     }
    379 
    380     {
    381         // encoded high surrogate
    382         const invalid_utf8 = "\xED\xA0\xBD";
    383         try std.testing.expectEqual(Codepoint{
    384             .value = Codepoint.invalid,
    385             .byte_len = 2,
    386         }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?);
    387         try std.testing.expectEqual(Codepoint{
    388             .value = Codepoint.invalid,
    389             .byte_len = 1,
    390         }, SupportedCodePage.utf8.codepointAt(2, invalid_utf8).?);
    391     }
    392 }
    393 
    394 test "codepointAt utf8 encoded" {
    395     const utf8_encoded = "²";
    396 
    397     // with code page utf8
    398     try std.testing.expectEqual(Codepoint{
    399         .value = '²',
    400         .byte_len = 2,
    401     }, SupportedCodePage.utf8.codepointAt(0, utf8_encoded).?);
    402     try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, utf8_encoded));
    403 
    404     // with code page windows1252
    405     try std.testing.expectEqual(Codepoint{
    406         .value = '\xC2',
    407         .byte_len = 1,
    408     }, SupportedCodePage.windows1252.codepointAt(0, utf8_encoded).?);
    409     try std.testing.expectEqual(Codepoint{
    410         .value = '\xB2',
    411         .byte_len = 1,
    412     }, SupportedCodePage.windows1252.codepointAt(1, utf8_encoded).?);
    413     try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.windows1252.codepointAt(2, utf8_encoded));
    414 }
    415 
    416 test "codepointAt windows1252 encoded" {
    417     const windows1252_encoded = "\xB2";
    418 
    419     // with code page utf8
    420     try std.testing.expectEqual(Codepoint{
    421         .value = Codepoint.invalid,
    422         .byte_len = 1,
    423     }, SupportedCodePage.utf8.codepointAt(0, windows1252_encoded).?);
    424     try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, windows1252_encoded));
    425 
    426     // with code page windows1252
    427     try std.testing.expectEqual(Codepoint{
    428         .value = '\xB2',
    429         .byte_len = 1,
    430     }, SupportedCodePage.windows1252.codepointAt(0, windows1252_encoded).?);
    431     try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.windows1252.codepointAt(1, windows1252_encoded));
    432 }
    433 
    434 pub const Codepoint = struct {
    435     value: u21,
    436     byte_len: usize,
    437 
    438     pub const invalid: u21 = std.math.maxInt(u21);
    439 };