zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

lang.zig (36104B) - Raw


      1 const std = @import("std");
      2 
      3 /// This function is specific to how the Win32 RC command line interprets
      4 /// language IDs specified as integers.
      5 /// - Always interpreted as hexadecimal, but explicit 0x prefix is also allowed
      6 /// - Wraps on overflow of u16
      7 /// - Stops parsing on any invalid hexadecimal digits
      8 /// - Errors if a digit is not the first char
      9 /// - `-` (negative) prefix is allowed
     10 pub fn parseInt(str: []const u8) error{InvalidLanguageId}!u16 {
     11     var result: u16 = 0;
     12     const radix: u8 = 16;
     13     var buf = str;
     14 
     15     const Prefix = enum { none, minus };
     16     var prefix: Prefix = .none;
     17     switch (buf[0]) {
     18         '-' => {
     19             prefix = .minus;
     20             buf = buf[1..];
     21         },
     22         else => {},
     23     }
     24 
     25     if (buf.len > 2 and buf[0] == '0' and buf[1] == 'x') {
     26         buf = buf[2..];
     27     }
     28 
     29     for (buf, 0..) |c, i| {
     30         const digit = switch (c) {
     31             // On invalid digit for the radix, just stop parsing but don't fail
     32             'a'...'f', 'A'...'F', '0'...'9' => std.fmt.charToDigit(c, radix) catch break,
     33             else => {
     34                 // First digit must be valid
     35                 if (i == 0) {
     36                     return error.InvalidLanguageId;
     37                 }
     38                 break;
     39             },
     40         };
     41 
     42         if (result != 0) {
     43             result *%= radix;
     44         }
     45         result +%= digit;
     46     }
     47 
     48     switch (prefix) {
     49         .none => {},
     50         .minus => result = 0 -% result,
     51     }
     52 
     53     return result;
     54 }
     55 
     56 test parseInt {
     57     try std.testing.expectEqual(@as(u16, 0x16), try parseInt("16"));
     58     try std.testing.expectEqual(@as(u16, 0x1a), try parseInt("0x1A"));
     59     try std.testing.expectEqual(@as(u16, 0x1a), try parseInt("0x1Azzzz"));
     60     try std.testing.expectEqual(@as(u16, 0xffff), try parseInt("-1"));
     61     try std.testing.expectEqual(@as(u16, 0xffea), try parseInt("-0x16"));
     62     try std.testing.expectEqual(@as(u16, 0x0), try parseInt("0o100"));
     63     try std.testing.expectEqual(@as(u16, 0x1), try parseInt("10001"));
     64     try std.testing.expectError(error.InvalidLanguageId, parseInt("--1"));
     65     try std.testing.expectError(error.InvalidLanguageId, parseInt("0xha"));
     66     try std.testing.expectError(error.InvalidLanguageId, parseInt("¹"));
     67     try std.testing.expectError(error.InvalidLanguageId, parseInt("~1"));
     68 }
     69 
     70 /// This function is specific to how the Win32 RC command line interprets
     71 /// language tags: invalid tags are rejected, but tags that don't have
     72 /// a specific assigned ID but are otherwise valid enough will get
     73 /// converted to an ID of LOCALE_CUSTOM_UNSPECIFIED.
     74 pub fn tagToInt(tag: []const u8) error{InvalidLanguageTag}!u16 {
     75     const maybe_id = try tagToId(tag);
     76     if (maybe_id) |id| {
     77         return @intFromEnum(id);
     78     } else {
     79         return LOCALE_CUSTOM_UNSPECIFIED;
     80     }
     81 }
     82 
     83 pub fn tagToId(tag: []const u8) error{InvalidLanguageTag}!?LanguageId {
     84     const parsed = try parse(tag);
     85     // There are currently no language tags with assigned IDs that have
     86     // multiple suffixes, so we can skip the lookup.
     87     if (parsed.multiple_suffixes) return null;
     88     const longest_known_tag = comptime blk: {
     89         var len = 0;
     90         for (@typeInfo(LanguageId).@"enum".fields) |field| {
     91             if (field.name.len > len) len = field.name.len;
     92         }
     93         break :blk len;
     94     };
     95     // If the tag is longer than the longest tag that has an assigned ID,
     96     // then we can skip the lookup.
     97     if (tag.len > longest_known_tag) return null;
     98     var normalized_buf: [longest_known_tag]u8 = undefined;
     99     // To allow e.g. `de-de_phoneb` to get looked up as `de-de`, we need to
    100     // omit the suffix, but only if the tag contains a valid alternate sort order.
    101     const tag_to_normalize = if (parsed.isSuffixValidSortOrder()) tag[0 .. tag.len - (parsed.suffix.?.len + 1)] else tag;
    102     const normalized_tag = normalizeTag(tag_to_normalize, &normalized_buf);
    103     return std.meta.stringToEnum(LanguageId, normalized_tag) orelse {
    104         // special case for a tag that has been mapped to the same ID
    105         // twice.
    106         if (std.mem.eql(u8, "ff_latn_ng", normalized_tag)) {
    107             return LanguageId.ff_ng;
    108         }
    109         return null;
    110     };
    111 }
    112 
    113 test tagToId {
    114     try std.testing.expectEqual(LanguageId.ar_ae, (try tagToId("ar-ae")).?);
    115     try std.testing.expectEqual(LanguageId.ar_ae, (try tagToId("AR_AE")).?);
    116     try std.testing.expectEqual(LanguageId.ff_ng, (try tagToId("ff-ng")).?);
    117     // Special case
    118     try std.testing.expectEqual(LanguageId.ff_ng, (try tagToId("ff-Latn-NG")).?);
    119 }
    120 
    121 test "exhaustive tagToId" {
    122     inline for (@typeInfo(LanguageId).@"enum".fields) |field| {
    123         const id = tagToId(field.name) catch |err| {
    124             std.debug.print("tag: {s}\n", .{field.name});
    125             return err;
    126         };
    127         try std.testing.expectEqual(@field(LanguageId, field.name), id orelse {
    128             std.debug.print("tag: {s}, got null\n", .{field.name});
    129             return error.TestExpectedEqual;
    130         });
    131     }
    132     var buf: [32]u8 = undefined;
    133     inline for (valid_alternate_sorts) |parsed_sort| {
    134         var fbs = std.io.fixedBufferStream(&buf);
    135         const writer = fbs.writer();
    136         writer.writeAll(parsed_sort.language_code) catch unreachable;
    137         writer.writeAll("-") catch unreachable;
    138         writer.writeAll(parsed_sort.country_code.?) catch unreachable;
    139         writer.writeAll("-") catch unreachable;
    140         writer.writeAll(parsed_sort.suffix.?) catch unreachable;
    141         const expected_field_name = comptime field: {
    142             var name_buf: [5]u8 = undefined;
    143             @memcpy(name_buf[0..parsed_sort.language_code.len], parsed_sort.language_code);
    144             name_buf[2] = '_';
    145             @memcpy(name_buf[3..], parsed_sort.country_code.?);
    146             break :field name_buf;
    147         };
    148         const expected = @field(LanguageId, &expected_field_name);
    149         const id = tagToId(fbs.getWritten()) catch |err| {
    150             std.debug.print("tag: {s}\n", .{fbs.getWritten()});
    151             return err;
    152         };
    153         try std.testing.expectEqual(expected, id orelse {
    154             std.debug.print("tag: {s}, expected: {}, got null\n", .{ fbs.getWritten(), expected });
    155             return error.TestExpectedEqual;
    156         });
    157     }
    158 }
    159 
    160 fn normalizeTag(tag: []const u8, buf: []u8) []u8 {
    161     std.debug.assert(buf.len >= tag.len);
    162     for (tag, 0..) |c, i| {
    163         if (c == '-')
    164             buf[i] = '_'
    165         else
    166             buf[i] = std.ascii.toLower(c);
    167     }
    168     return buf[0..tag.len];
    169 }
    170 
/// Language ID returned by `tagToInt` for tags that are valid but have no
/// specific assigned ID.
///
/// https://winprotocoldoc.blob.core.windows.net/productionwindowsarchives/MS-LCID/%5bMS-LCID%5d.pdf#%5B%7B%22num%22%3A72%2C%22gen%22%3A0%7D%2C%7B%22name%22%3A%22XYZ%22%7D%2C69%2C574%2C0%5D
/// "When an LCID is requested for a locale without a
/// permanent LCID assignment, nor a temporary
/// assignment as above, the protocol will respond
/// with LOCALE_CUSTOM_UNSPECIFIED for all such
/// locales. Because this single value is used for
/// numerous possible locale names, it is impossible to
/// round trip this locale, even temporarily.
/// Applications should discard this value as soon as
/// possible and never persist it. If the system is
/// forced to respond to a request for
/// LCID_CUSTOM_UNSPECIFIED, it will fall back to
/// the current user locale. This is often incorrect but
/// may prevent an application or component from
/// failing. As the meaning of this temporary LCID is
/// unstable, it should never be used for interchange
/// or persisted data. This is a 1-to-many relationship
/// that is very unstable."
pub const LOCALE_CUSTOM_UNSPECIFIED = 0x1000;
    190 
    191 pub const LANG_ENGLISH = 0x09;
    192 pub const SUBLANG_ENGLISH_US = 0x01;
    193 
    194 /// https://learn.microsoft.com/en-us/windows/win32/intl/language-identifiers
    195 pub fn MAKELANGID(primary: u10, sublang: u6) u16 {
    196     return (@as(u16, primary) << 10) | sublang;
    197 }
    198 
    199 /// Language tag format expressed as a regular expression (rough approximation):
    200 ///
    201 /// [a-zA-Z]{1,3}([-_][a-zA-Z]{4})?([-_][a-zA-Z]{2})?([-_][a-zA-Z0-9]{1,8})?
    202 ///     lang    |     script      |      country    |       suffix
    203 ///
    204 /// Notes:
    205 /// - If lang code is 1 char, it seems to mean that everything afterwards uses suffix
    206 ///   parsing rules (e.g. `a-0` and `a-00000000` are allowed).
    207 /// - There can also be any number of trailing suffix parts as long as they each
    208 ///   would be a valid suffix part, e.g. `en-us-blah-blah1-blah2-blah3` is allowed.
    209 /// - When doing lookups, trailing suffix parts are taken into account, e.g.
    210 ///   `ca-es-valencia` is not considered equivalent to `ca-es-valencia-blah`.
    211 /// - A suffix is only allowed if:
    212 ///   + Lang code is 1 char long, or
    213 ///   + A country code is present, or
    214 ///   + A script tag is not present and:
    215 ///      - the suffix is numeric-only and has a length of 3, or
    216 ///      - the lang is `qps` and the suffix is `ploca` or `plocm`
    217 pub fn parse(lang_tag: []const u8) error{InvalidLanguageTag}!Parsed {
    218     var it = std.mem.splitAny(u8, lang_tag, "-_");
    219     const lang_code = it.first();
    220     const is_valid_lang_code = lang_code.len >= 1 and lang_code.len <= 3 and isAllAlphabetic(lang_code);
    221     if (!is_valid_lang_code) return error.InvalidLanguageTag;
    222     var parsed = Parsed{
    223         .language_code = lang_code,
    224     };
    225     // The second part could be a script tag, a country code, or a suffix
    226     if (it.next()) |part_str| {
    227         // The lang code being length 1 behaves strangely, so fully special case it.
    228         if (lang_code.len == 1) {
    229             // This is almost certainly not the 'right' way to do this, but I don't have a method
    230             // to determine how exactly these language tags are parsed, and it seems like
    231             // suffix parsing rules apply generally (digits allowed, length of 1 to 8).
    232             //
    233             // However, because we want to be able to lookup `x-iv-mathan` normally without
    234             // `multiple_suffixes` being set to true, we need to make sure to treat two-length
    235             // alphabetic parts as a country code.
    236             if (part_str.len == 2 and isAllAlphabetic(part_str)) {
    237                 parsed.country_code = part_str;
    238             }
    239             // Everything else, though, we can just throw into the suffix as long as the normal
    240             // rules apply.
    241             else if (part_str.len > 0 and part_str.len <= 8 and isAllAlphanumeric(part_str)) {
    242                 parsed.suffix = part_str;
    243             } else {
    244                 return error.InvalidLanguageTag;
    245             }
    246         } else if (part_str.len == 4 and isAllAlphabetic(part_str)) {
    247             parsed.script_tag = part_str;
    248         } else if (part_str.len == 2 and isAllAlphabetic(part_str)) {
    249             parsed.country_code = part_str;
    250         }
    251         // Only a 3-len numeric suffix is allowed as the second part of a tag
    252         else if (part_str.len == 3 and isAllNumeric(part_str)) {
    253             parsed.suffix = part_str;
    254         }
    255         // Special case for qps-ploca and qps-plocm
    256         else if (std.ascii.eqlIgnoreCase(lang_code, "qps") and
    257             (std.ascii.eqlIgnoreCase(part_str, "ploca") or
    258                 std.ascii.eqlIgnoreCase(part_str, "plocm")))
    259         {
    260             parsed.suffix = part_str;
    261         } else {
    262             return error.InvalidLanguageTag;
    263         }
    264     } else {
    265         // If there's no part besides a 1-len lang code, then it is malformed
    266         if (lang_code.len == 1) return error.InvalidLanguageTag;
    267         return parsed;
    268     }
    269     if (parsed.script_tag != null) {
    270         if (it.next()) |part_str| {
    271             if (part_str.len == 2 and isAllAlphabetic(part_str)) {
    272                 parsed.country_code = part_str;
    273             } else {
    274                 // Suffix is not allowed when a country code is not present.
    275                 return error.InvalidLanguageTag;
    276             }
    277         } else {
    278             return parsed;
    279         }
    280     }
    281     // We've now parsed any potential script tag/country codes, so anything remaining
    282     // is a suffix
    283     while (it.next()) |part_str| {
    284         if (part_str.len == 0 or part_str.len > 8 or !isAllAlphanumeric(part_str)) {
    285             return error.InvalidLanguageTag;
    286         }
    287         if (parsed.suffix == null) {
    288             parsed.suffix = part_str;
    289         } else {
    290             // In theory we could return early here but we still want to validate
    291             // that each part is a valid suffix all the way to the end, e.g.
    292             // we should reject `en-us-suffix-a-b-c-!!!` because of the invalid `!!!`
    293             // suffix part.
    294             parsed.multiple_suffixes = true;
    295         }
    296     }
    297     return parsed;
    298 }
    299 
    300 pub const Parsed = struct {
    301     language_code: []const u8,
    302     script_tag: ?[]const u8 = null,
    303     country_code: ?[]const u8 = null,
    304     /// Can be a sort order (e.g. phoneb) or something like valencia, 001, etc
    305     suffix: ?[]const u8 = null,
    306     /// There can be any number of suffixes, but we don't need to care what their
    307     /// values are, we just need to know if any exist so that e.g. `ca-es-valencia-blah`
    308     /// can be seen as different from `ca-es-valencia`. Storing this as a bool
    309     /// allows us to avoid needing either (a) dynamic allocation or (b) a limit to
    310     /// the number of suffixes allowed when parsing.
    311     multiple_suffixes: bool = false,
    312 
    313     pub fn isSuffixValidSortOrder(self: Parsed) bool {
    314         if (self.country_code == null) return false;
    315         if (self.suffix == null) return false;
    316         if (self.script_tag != null) return false;
    317         if (self.multiple_suffixes) return false;
    318         for (valid_alternate_sorts) |valid_sort| {
    319             if (std.ascii.eqlIgnoreCase(valid_sort.language_code, self.language_code) and
    320                 std.ascii.eqlIgnoreCase(valid_sort.country_code.?, self.country_code.?) and
    321                 std.ascii.eqlIgnoreCase(valid_sort.suffix.?, self.suffix.?))
    322             {
    323                 return true;
    324             }
    325         }
    326         return false;
    327     }
    328 };
    329 
/// Language/country/suffix triples whose suffix is a valid alternate sort order.
/// Every entry sets both `country_code` and `suffix`; code that iterates this
/// table unwraps them with `.?`.
///
/// https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
/// See the table following this text: "Alternate sorts can be selected by using one of the identifiers from the following table."
const valid_alternate_sorts = [_]Parsed{
    // Note: x-IV-mathan is omitted due to how lookups are implemented.
    //       This table is used to make e.g. `de-de_phoneb` get looked up
    //       as `de-de` (the suffix is omitted for the lookup), but x-iv-mathan
    //       instead needs to be looked up with the suffix included because
    //       `x-iv` is not a tag with an assigned ID.
    .{ .language_code = "de", .country_code = "de", .suffix = "phoneb" },
    .{ .language_code = "hu", .country_code = "hu", .suffix = "tchncl" },
    .{ .language_code = "ka", .country_code = "ge", .suffix = "modern" },
    .{ .language_code = "zh", .country_code = "cn", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "sg", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "mo", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "tw", .suffix = "pronun" },
    .{ .language_code = "zh", .country_code = "tw", .suffix = "radstr" },
    .{ .language_code = "ja", .country_code = "jp", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "hk", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "mo", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "cn", .suffix = "phoneb" },
    .{ .language_code = "zh", .country_code = "sg", .suffix = "phoneb" },
};
    352 
    353 test "parse" {
    354     try std.testing.expectEqualDeep(Parsed{
    355         .language_code = "en",
    356     }, try parse("en"));
    357     try std.testing.expectEqualDeep(Parsed{
    358         .language_code = "en",
    359         .country_code = "us",
    360     }, try parse("en-us"));
    361     try std.testing.expectEqualDeep(Parsed{
    362         .language_code = "en",
    363         .suffix = "123",
    364     }, try parse("en-123"));
    365     try std.testing.expectEqualDeep(Parsed{
    366         .language_code = "en",
    367         .suffix = "123",
    368         .multiple_suffixes = true,
    369     }, try parse("en-123-blah"));
    370     try std.testing.expectEqualDeep(Parsed{
    371         .language_code = "en",
    372         .country_code = "us",
    373         .suffix = "123",
    374         .multiple_suffixes = true,
    375     }, try parse("en-us_123-blah"));
    376     try std.testing.expectEqualDeep(Parsed{
    377         .language_code = "eng",
    378         .script_tag = "Latn",
    379     }, try parse("eng-Latn"));
    380     try std.testing.expectEqualDeep(Parsed{
    381         .language_code = "eng",
    382         .script_tag = "Latn",
    383     }, try parse("eng-Latn"));
    384     try std.testing.expectEqualDeep(Parsed{
    385         .language_code = "ff",
    386         .script_tag = "Latn",
    387         .country_code = "NG",
    388     }, try parse("ff-Latn-NG"));
    389     try std.testing.expectEqualDeep(Parsed{
    390         .language_code = "qps",
    391         .suffix = "Plocm",
    392     }, try parse("qps-Plocm"));
    393     try std.testing.expectEqualDeep(Parsed{
    394         .language_code = "qps",
    395         .suffix = "ploca",
    396     }, try parse("qps-ploca"));
    397     try std.testing.expectEqualDeep(Parsed{
    398         .language_code = "x",
    399         .country_code = "IV",
    400         .suffix = "mathan",
    401     }, try parse("x-IV-mathan"));
    402     try std.testing.expectEqualDeep(Parsed{
    403         .language_code = "a",
    404         .suffix = "a",
    405     }, try parse("a-a"));
    406     try std.testing.expectEqualDeep(Parsed{
    407         .language_code = "a",
    408         .suffix = "000",
    409     }, try parse("a-000"));
    410     try std.testing.expectEqualDeep(Parsed{
    411         .language_code = "a",
    412         .suffix = "00000000",
    413     }, try parse("a-00000000"));
    414     // suffix not allowed if script tag is present without country code
    415     try std.testing.expectError(error.InvalidLanguageTag, parse("eng-Latn-suffix"));
    416     // suffix must be 3 numeric digits if neither script tag nor country code is present
    417     try std.testing.expectError(error.InvalidLanguageTag, parse("eng-suffix"));
    418     try std.testing.expectError(error.InvalidLanguageTag, parse("en-plocm"));
    419     // 1-len lang code is not allowed if it's the only part
    420     try std.testing.expectError(error.InvalidLanguageTag, parse("e"));
    421 }
    422 
    423 fn isAllAlphabetic(str: []const u8) bool {
    424     for (str) |c| {
    425         if (!std.ascii.isAlphabetic(c)) return false;
    426     }
    427     return true;
    428 }
    429 
    430 fn isAllAlphanumeric(str: []const u8) bool {
    431     for (str) |c| {
    432         if (!std.ascii.isAlphanumeric(c)) return false;
    433     }
    434     return true;
    435 }
    436 
    437 fn isAllNumeric(str: []const u8) bool {
    438     for (str) |c| {
    439         if (!std.ascii.isDigit(c)) return false;
    440     }
    441     return true;
    442 }
    443 
    444 /// Derived from https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
    445 /// - Protocol Revision: 15.0
    446 /// - Language / Language ID / Language Tag table in Appendix A
    447 /// - Removed all rows that have Language ID 0x1000 (LOCALE_CUSTOM_UNSPECIFIED)
    448 /// - Normalized each language tag (lowercased, replaced all `-` with `_`)
    449 /// - There is one special case where two tags are mapped to the same ID, the following
    450 ///   has been omitted and must be special cased during lookup to map to the ID ff_ng / 0x0467.
    451 ///     ff_latn_ng = 0x0467, // Fulah (Latin), Nigeria
    452 /// - x_iv_mathan has been added which is not in the table but does appear in the Alternate sorts
    453 ///   table as 0x007F (LANG_INVARIANT).
    454 pub const LanguageId = enum(u16) {
    455     // Language tag = Language ID, // Language, Location (or type)
    456     af = 0x0036, // Afrikaans
    457     af_za = 0x0436, // Afrikaans, South Africa
    458     sq = 0x001C, // Albanian
    459     sq_al = 0x041C, // Albanian, Albania
    460     gsw = 0x0084, // Alsatian
    461     gsw_fr = 0x0484, // Alsatian, France
    462     am = 0x005E, // Amharic
    463     am_et = 0x045E, // Amharic, Ethiopia
    464     ar = 0x0001, // Arabic
    465     ar_dz = 0x1401, // Arabic, Algeria
    466     ar_bh = 0x3C01, // Arabic, Bahrain
    467     ar_eg = 0x0c01, // Arabic, Egypt
    468     ar_iq = 0x0801, // Arabic, Iraq
    469     ar_jo = 0x2C01, // Arabic, Jordan
    470     ar_kw = 0x3401, // Arabic, Kuwait
    471     ar_lb = 0x3001, // Arabic, Lebanon
    472     ar_ly = 0x1001, // Arabic, Libya
    473     ar_ma = 0x1801, // Arabic, Morocco
    474     ar_om = 0x2001, // Arabic, Oman
    475     ar_qa = 0x4001, // Arabic, Qatar
    476     ar_sa = 0x0401, // Arabic, Saudi Arabia
    477     ar_sy = 0x2801, // Arabic, Syria
    478     ar_tn = 0x1C01, // Arabic, Tunisia
    479     ar_ae = 0x3801, // Arabic, U.A.E.
    480     ar_ye = 0x2401, // Arabic, Yemen
    481     hy = 0x002B, // Armenian
    482     hy_am = 0x042B, // Armenian, Armenia
    483     as = 0x004D, // Assamese
    484     as_in = 0x044D, // Assamese, India
    485     az_cyrl = 0x742C, // Azerbaijani (Cyrillic)
    486     az_cyrl_az = 0x082C, // Azerbaijani (Cyrillic), Azerbaijan
    487     az = 0x002C, // Azerbaijani (Latin)
    488     az_latn = 0x782C, // Azerbaijani (Latin)
    489     az_latn_az = 0x042C, // Azerbaijani (Latin), Azerbaijan
    490     bn = 0x0045, // Bangla
    491     bn_bd = 0x0845, // Bangla, Bangladesh
    492     bn_in = 0x0445, // Bangla, India
    493     ba = 0x006D, // Bashkir
    494     ba_ru = 0x046D, // Bashkir, Russia
    495     eu = 0x002D, // Basque
    496     eu_es = 0x042D, // Basque, Spain
    497     be = 0x0023, // Belarusian
    498     be_by = 0x0423, // Belarusian, Belarus
    499     bs_cyrl = 0x641A, // Bosnian (Cyrillic)
    500     bs_cyrl_ba = 0x201A, // Bosnian (Cyrillic), Bosnia and Herzegovina
    501     bs_latn = 0x681A, // Bosnian (Latin)
    502     bs = 0x781A, // Bosnian (Latin)
    503     bs_latn_ba = 0x141A, // Bosnian (Latin), Bosnia and Herzegovina
    504     br = 0x007E, // Breton
    505     br_fr = 0x047E, // Breton, France
    506     bg = 0x0002, // Bulgarian
    507     bg_bg = 0x0402, // Bulgarian, Bulgaria
    508     my = 0x0055, // Burmese
    509     my_mm = 0x0455, // Burmese, Myanmar
    510     ca = 0x0003, // Catalan
    511     ca_es = 0x0403, // Catalan, Spain
    512     tzm_arab_ma = 0x045F, // Central Atlas Tamazight (Arabic), Morocco
    513     ku = 0x0092, // Central Kurdish
    514     ku_arab = 0x7c92, // Central Kurdish
    515     ku_arab_iq = 0x0492, // Central Kurdish, Iraq
    516     chr = 0x005C, // Cherokee
    517     chr_cher = 0x7c5C, // Cherokee
    518     chr_cher_us = 0x045C, // Cherokee, United States
    519     zh_hans = 0x0004, // Chinese (Simplified)
    520     zh = 0x7804, // Chinese (Simplified)
    521     zh_cn = 0x0804, // Chinese (Simplified), People's Republic of China
    522     zh_sg = 0x1004, // Chinese (Simplified), Singapore
    523     zh_hant = 0x7C04, // Chinese (Traditional)
    524     zh_hk = 0x0C04, // Chinese (Traditional), Hong Kong S.A.R.
    525     zh_mo = 0x1404, // Chinese (Traditional), Macao S.A.R.
    526     zh_tw = 0x0404, // Chinese (Traditional), Taiwan
    527     co = 0x0083, // Corsican
    528     co_fr = 0x0483, // Corsican, France
    529     hr = 0x001A, // Croatian
    530     hr_hr = 0x041A, // Croatian, Croatia
    531     hr_ba = 0x101A, // Croatian (Latin), Bosnia and Herzegovina
    532     cs = 0x0005, // Czech
    533     cs_cz = 0x0405, // Czech, Czech Republic
    534     da = 0x0006, // Danish
    535     da_dk = 0x0406, // Danish, Denmark
    536     prs = 0x008C, // Dari
    537     prs_af = 0x048C, // Dari, Afghanistan
    538     dv = 0x0065, // Divehi
    539     dv_mv = 0x0465, // Divehi, Maldives
    540     nl = 0x0013, // Dutch
    541     nl_be = 0x0813, // Dutch, Belgium
    542     nl_nl = 0x0413, // Dutch, Netherlands
    543     dz_bt = 0x0C51, // Dzongkha, Bhutan
    544     en = 0x0009, // English
    545     en_au = 0x0C09, // English, Australia
    546     en_bz = 0x2809, // English, Belize
    547     en_ca = 0x1009, // English, Canada
    548     en_029 = 0x2409, // English, Caribbean
    549     en_hk = 0x3C09, // English, Hong Kong
    550     en_in = 0x4009, // English, India
    551     en_ie = 0x1809, // English, Ireland
    552     en_jm = 0x2009, // English, Jamaica
    553     en_my = 0x4409, // English, Malaysia
    554     en_nz = 0x1409, // English, New Zealand
    555     en_ph = 0x3409, // English, Republic of the Philippines
    556     en_sg = 0x4809, // English, Singapore
    557     en_za = 0x1C09, // English, South Africa
    558     en_tt = 0x2c09, // English, Trinidad and Tobago
    559     en_ae = 0x4C09, // English, United Arab Emirates
    560     en_gb = 0x0809, // English, United Kingdom
    561     en_us = 0x0409, // English, United States
    562     en_zw = 0x3009, // English, Zimbabwe
    563     et = 0x0025, // Estonian
    564     et_ee = 0x0425, // Estonian, Estonia
    565     fo = 0x0038, // Faroese
    566     fo_fo = 0x0438, // Faroese, Faroe Islands
    567     fil = 0x0064, // Filipino
    568     fil_ph = 0x0464, // Filipino, Philippines
    569     fi = 0x000B, // Finnish
    570     fi_fi = 0x040B, // Finnish, Finland
    571     fr = 0x000C, // French
    572     fr_be = 0x080C, // French, Belgium
    573     fr_cm = 0x2c0C, // French, Cameroon
    574     fr_ca = 0x0c0C, // French, Canada
    575     fr_029 = 0x1C0C, // French, Caribbean
    576     fr_cd = 0x240C, // French, Congo, DRC
    577     fr_ci = 0x300C, // French, Côte d'Ivoire
    578     fr_fr = 0x040C, // French, France
    579     fr_ht = 0x3c0C, // French, Haiti
    580     fr_lu = 0x140C, // French, Luxembourg
    581     fr_ml = 0x340C, // French, Mali
    582     fr_ma = 0x380C, // French, Morocco
    583     fr_mc = 0x180C, // French, Principality of Monaco
    584     fr_re = 0x200C, // French, Reunion
    585     fr_sn = 0x280C, // French, Senegal
    586     fr_ch = 0x100C, // French, Switzerland
    587     fy = 0x0062, // Frisian
    588     fy_nl = 0x0462, // Frisian, Netherlands
    589     ff = 0x0067, // Fulah
    590     ff_latn = 0x7C67, // Fulah (Latin)
    591     ff_ng = 0x0467, // Fulah, Nigeria
    592     ff_latn_sn = 0x0867, // Fulah, Senegal
    593     gl = 0x0056, // Galician
    594     gl_es = 0x0456, // Galician, Spain
    595     ka = 0x0037, // Georgian
    596     ka_ge = 0x0437, // Georgian, Georgia
    597     de = 0x0007, // German
    598     de_at = 0x0C07, // German, Austria
    599     de_de = 0x0407, // German, Germany
    600     de_li = 0x1407, // German, Liechtenstein
    601     de_lu = 0x1007, // German, Luxembourg
    602     de_ch = 0x0807, // German, Switzerland
    603     el = 0x0008, // Greek
    604     el_gr = 0x0408, // Greek, Greece
    605     kl = 0x006F, // Greenlandic
    606     kl_gl = 0x046F, // Greenlandic, Greenland
    607     gn = 0x0074, // Guarani
    608     gn_py = 0x0474, // Guarani, Paraguay
    609     gu = 0x0047, // Gujarati
    610     gu_in = 0x0447, // Gujarati, India
    611     ha = 0x0068, // Hausa (Latin)
    612     ha_latn = 0x7C68, // Hausa (Latin)
    613     ha_latn_ng = 0x0468, // Hausa (Latin), Nigeria
    614     haw = 0x0075, // Hawaiian
    615     haw_us = 0x0475, // Hawaiian, United States
    616     he = 0x000D, // Hebrew
    617     he_il = 0x040D, // Hebrew, Israel
    618     hi = 0x0039, // Hindi
    619     hi_in = 0x0439, // Hindi, India
    620     hu = 0x000E, // Hungarian
    621     hu_hu = 0x040E, // Hungarian, Hungary
    622     is = 0x000F, // Icelandic
    623     is_is = 0x040F, // Icelandic, Iceland
    624     ig = 0x0070, // Igbo
    625     ig_ng = 0x0470, // Igbo, Nigeria
    626     id = 0x0021, // Indonesian
    627     id_id = 0x0421, // Indonesian, Indonesia
    628     iu = 0x005D, // Inuktitut (Latin)
    629     iu_latn = 0x7C5D, // Inuktitut (Latin)
    630     iu_latn_ca = 0x085D, // Inuktitut (Latin), Canada
    631     iu_cans = 0x785D, // Inuktitut (Syllabics)
    632     iu_cans_ca = 0x045d, // Inuktitut (Syllabics), Canada
    633     ga = 0x003C, // Irish
    634     ga_ie = 0x083C, // Irish, Ireland
    635     it = 0x0010, // Italian
    636     it_it = 0x0410, // Italian, Italy
    637     it_ch = 0x0810, // Italian, Switzerland
    638     ja = 0x0011, // Japanese
    639     ja_jp = 0x0411, // Japanese, Japan
    640     kn = 0x004B, // Kannada
    641     kn_in = 0x044B, // Kannada, India
    642     kr_latn_ng = 0x0471, // Kanuri (Latin), Nigeria
    643     ks = 0x0060, // Kashmiri
    644     ks_arab = 0x0460, // Kashmiri, Perso-Arabic
    645     ks_deva_in = 0x0860, // Kashmiri (Devanagari), India
    646     kk = 0x003F, // Kazakh
    647     kk_kz = 0x043F, // Kazakh, Kazakhstan
    648     km = 0x0053, // Khmer
    649     km_kh = 0x0453, // Khmer, Cambodia
    650     quc = 0x0086, // K'iche
    651     quc_latn_gt = 0x0486, // K'iche, Guatemala
    652     rw = 0x0087, // Kinyarwanda
    653     rw_rw = 0x0487, // Kinyarwanda, Rwanda
    654     sw = 0x0041, // Kiswahili
    655     sw_ke = 0x0441, // Kiswahili, Kenya
    656     kok = 0x0057, // Konkani
    657     kok_in = 0x0457, // Konkani, India
    658     ko = 0x0012, // Korean
    659     ko_kr = 0x0412, // Korean, Korea
    660     ky = 0x0040, // Kyrgyz
    661     ky_kg = 0x0440, // Kyrgyz, Kyrgyzstan
    662     lo = 0x0054, // Lao
    663     lo_la = 0x0454, // Lao, Lao P.D.R.
    664     la_va = 0x0476, // Latin, Vatican City
    665     lv = 0x0026, // Latvian
    666     lv_lv = 0x0426, // Latvian, Latvia
    667     lt = 0x0027, // Lithuanian
    668     lt_lt = 0x0427, // Lithuanian, Lithuania
    669     dsb = 0x7C2E, // Lower Sorbian
    670     dsb_de = 0x082E, // Lower Sorbian, Germany
    671     lb = 0x006E, // Luxembourgish
    672     lb_lu = 0x046E, // Luxembourgish, Luxembourg
    673     mk = 0x002F, // Macedonian
    674     mk_mk = 0x042F, // Macedonian, North Macedonia
    675     ms = 0x003E, // Malay
    676     ms_bn = 0x083E, // Malay, Brunei Darussalam
    677     ms_my = 0x043E, // Malay, Malaysia
    678     ml = 0x004C, // Malayalam
    679     ml_in = 0x044C, // Malayalam, India
    680     mt = 0x003A, // Maltese
    681     mt_mt = 0x043A, // Maltese, Malta
    682     mi = 0x0081, // Maori
    683     mi_nz = 0x0481, // Maori, New Zealand
    684     arn = 0x007A, // Mapudungun
    685     arn_cl = 0x047A, // Mapudungun, Chile
    686     mr = 0x004E, // Marathi
    687     mr_in = 0x044E, // Marathi, India
    688     moh = 0x007C, // Mohawk
    689     moh_ca = 0x047C, // Mohawk, Canada
    690     mn = 0x0050, // Mongolian (Cyrillic)
    691     mn_cyrl = 0x7850, // Mongolian (Cyrillic)
    692     mn_mn = 0x0450, // Mongolian (Cyrillic), Mongolia
    693     mn_mong = 0x7C50, // Mongolian (Traditional Mongolian)
    694     mn_mong_cn = 0x0850, // Mongolian (Traditional Mongolian), People's Republic of China
    695     mn_mong_mn = 0x0C50, // Mongolian (Traditional Mongolian), Mongolia
    696     ne = 0x0061, // Nepali
    697     ne_in = 0x0861, // Nepali, India
    698     ne_np = 0x0461, // Nepali, Nepal
    699     no = 0x0014, // Norwegian (Bokmal)
    700     nb = 0x7C14, // Norwegian (Bokmal)
    701     nb_no = 0x0414, // Norwegian (Bokmal), Norway
    702     nn = 0x7814, // Norwegian (Nynorsk)
    703     nn_no = 0x0814, // Norwegian (Nynorsk), Norway
    704     oc = 0x0082, // Occitan
    705     oc_fr = 0x0482, // Occitan, France
    706     @"or" = 0x0048, // Odia
    707     or_in = 0x0448, // Odia, India
    708     om = 0x0072, // Oromo
    709     om_et = 0x0472, // Oromo, Ethiopia
    710     ps = 0x0063, // Pashto
    711     ps_af = 0x0463, // Pashto, Afghanistan
    712     fa = 0x0029, // Persian
    713     fa_ir = 0x0429, // Persian, Iran
    714     pl = 0x0015, // Polish
    715     pl_pl = 0x0415, // Polish, Poland
    716     pt = 0x0016, // Portuguese
    717     pt_br = 0x0416, // Portuguese, Brazil
    718     pt_pt = 0x0816, // Portuguese, Portugal
    719     qps_ploca = 0x05FE, // Pseudo Language, Pseudo locale for east Asian/complex script localization testing
    720     qps_ploc = 0x0501, // Pseudo Language, Pseudo locale used for localization testing
    721     qps_plocm = 0x09FF, // Pseudo Language, Pseudo locale used for localization testing of mirrored locales
    722     pa = 0x0046, // Punjabi
    723     pa_arab = 0x7C46, // Punjabi
    724     pa_in = 0x0446, // Punjabi, India
    725     pa_arab_pk = 0x0846, // Punjabi, Islamic Republic of Pakistan
    726     quz = 0x006B, // Quechua
    727     quz_bo = 0x046B, // Quechua, Bolivia
    728     quz_ec = 0x086B, // Quechua, Ecuador
    729     quz_pe = 0x0C6B, // Quechua, Peru
    730     ro = 0x0018, // Romanian
    731     ro_md = 0x0818, // Romanian, Moldova
    732     ro_ro = 0x0418, // Romanian, Romania
    733     rm = 0x0017, // Romansh
    734     rm_ch = 0x0417, // Romansh, Switzerland
    735     ru = 0x0019, // Russian
    736     ru_md = 0x0819, // Russian, Moldova
    737     ru_ru = 0x0419, // Russian, Russia
    738     sah = 0x0085, // Sakha
    739     sah_ru = 0x0485, // Sakha, Russia
    740     smn = 0x703B, // Sami (Inari)
    741     smn_fi = 0x243B, // Sami (Inari), Finland
    742     smj = 0x7C3B, // Sami (Lule)
    743     smj_no = 0x103B, // Sami (Lule), Norway
    744     smj_se = 0x143B, // Sami (Lule), Sweden
    745     se = 0x003B, // Sami (Northern)
    746     se_fi = 0x0C3B, // Sami (Northern), Finland
    747     se_no = 0x043B, // Sami (Northern), Norway
    748     se_se = 0x083B, // Sami (Northern), Sweden
    749     sms = 0x743B, // Sami (Skolt)
    750     sms_fi = 0x203B, // Sami (Skolt), Finland
    751     sma = 0x783B, // Sami (Southern)
    752     sma_no = 0x183B, // Sami (Southern), Norway
    753     sma_se = 0x1C3B, // Sami (Southern), Sweden
    754     sa = 0x004F, // Sanskrit
    755     sa_in = 0x044F, // Sanskrit, India
    756     gd = 0x0091, // Scottish Gaelic
    757     gd_gb = 0x0491, // Scottish Gaelic, United Kingdom
    758     sr_cyrl = 0x6C1A, // Serbian (Cyrillic)
    759     sr_cyrl_ba = 0x1C1A, // Serbian (Cyrillic), Bosnia and Herzegovina
    760     sr_cyrl_me = 0x301A, // Serbian (Cyrillic), Montenegro
    761     sr_cyrl_rs = 0x281A, // Serbian (Cyrillic), Serbia
    762     sr_cyrl_cs = 0x0C1A, // Serbian (Cyrillic), Serbia and Montenegro (Former)
    763     sr_latn = 0x701A, // Serbian (Latin)
    764     sr = 0x7C1A, // Serbian (Latin)
    765     sr_latn_ba = 0x181A, // Serbian (Latin), Bosnia and Herzegovina
    766     sr_latn_me = 0x2c1A, // Serbian (Latin), Montenegro
    767     sr_latn_rs = 0x241A, // Serbian (Latin), Serbia
    768     sr_latn_cs = 0x081A, // Serbian (Latin), Serbia and Montenegro (Former)
    769     nso = 0x006C, // Sesotho sa Leboa
    770     nso_za = 0x046C, // Sesotho sa Leboa, South Africa
    771     tn = 0x0032, // Setswana
    772     tn_bw = 0x0832, // Setswana, Botswana
    773     tn_za = 0x0432, // Setswana, South Africa
    774     sd = 0x0059, // Sindhi
    775     sd_arab = 0x7C59, // Sindhi
    776     sd_arab_pk = 0x0859, // Sindhi, Islamic Republic of Pakistan
    777     si = 0x005B, // Sinhala
    778     si_lk = 0x045B, // Sinhala, Sri Lanka
    779     sk = 0x001B, // Slovak
    780     sk_sk = 0x041B, // Slovak, Slovakia
    781     sl = 0x0024, // Slovenian
    782     sl_si = 0x0424, // Slovenian, Slovenia
    783     so = 0x0077, // Somali
    784     so_so = 0x0477, // Somali, Somalia
    785     st = 0x0030, // Sotho
    786     st_za = 0x0430, // Sotho, South Africa
    787     es = 0x000A, // Spanish
    788     es_ar = 0x2C0A, // Spanish, Argentina
    789     es_ve = 0x200A, // Spanish, Bolivarian Republic of Venezuela
    790     es_bo = 0x400A, // Spanish, Bolivia
    791     es_cl = 0x340A, // Spanish, Chile
    792     es_co = 0x240A, // Spanish, Colombia
    793     es_cr = 0x140A, // Spanish, Costa Rica
    794     es_cu = 0x5c0A, // Spanish, Cuba
    795     es_do = 0x1c0A, // Spanish, Dominican Republic
    796     es_ec = 0x300A, // Spanish, Ecuador
    797     es_sv = 0x440A, // Spanish, El Salvador
    798     es_gt = 0x100A, // Spanish, Guatemala
    799     es_hn = 0x480A, // Spanish, Honduras
    800     es_419 = 0x580A, // Spanish, Latin America
    801     es_mx = 0x080A, // Spanish, Mexico
    802     es_ni = 0x4C0A, // Spanish, Nicaragua
    803     es_pa = 0x180A, // Spanish, Panama
    804     es_py = 0x3C0A, // Spanish, Paraguay
    805     es_pe = 0x280A, // Spanish, Peru
    806     es_pr = 0x500A, // Spanish, Puerto Rico
    807     es_es_tradnl = 0x040A, // Spanish, Spain
    808     es_es = 0x0c0A, // Spanish, Spain
    809     es_us = 0x540A, // Spanish, United States
    810     es_uy = 0x380A, // Spanish, Uruguay
    811     sv = 0x001D, // Swedish
    812     sv_fi = 0x081D, // Swedish, Finland
    813     sv_se = 0x041D, // Swedish, Sweden
    814     syr = 0x005A, // Syriac
    815     syr_sy = 0x045A, // Syriac, Syria
    816     tg = 0x0028, // Tajik (Cyrillic)
    817     tg_cyrl = 0x7C28, // Tajik (Cyrillic)
    818     tg_cyrl_tj = 0x0428, // Tajik (Cyrillic), Tajikistan
    819     tzm = 0x005F, // Tamazight (Latin)
    820     tzm_latn = 0x7C5F, // Tamazight (Latin)
    821     tzm_latn_dz = 0x085F, // Tamazight (Latin), Algeria
    822     ta = 0x0049, // Tamil
    823     ta_in = 0x0449, // Tamil, India
    824     ta_lk = 0x0849, // Tamil, Sri Lanka
    825     tt = 0x0044, // Tatar
    826     tt_ru = 0x0444, // Tatar, Russia
    827     te = 0x004A, // Telugu
    828     te_in = 0x044A, // Telugu, India
    829     th = 0x001E, // Thai
    830     th_th = 0x041E, // Thai, Thailand
    831     bo = 0x0051, // Tibetan
    832     bo_cn = 0x0451, // Tibetan, People's Republic of China
    833     ti = 0x0073, // Tigrinya
    834     ti_er = 0x0873, // Tigrinya, Eritrea
    835     ti_et = 0x0473, // Tigrinya, Ethiopia
    836     ts = 0x0031, // Tsonga
    837     ts_za = 0x0431, // Tsonga, South Africa
    838     tr = 0x001F, // Turkish
    839     tr_tr = 0x041F, // Turkish, Turkey
    840     tk = 0x0042, // Turkmen
    841     tk_tm = 0x0442, // Turkmen, Turkmenistan
    842     uk = 0x0022, // Ukrainian
    843     uk_ua = 0x0422, // Ukrainian, Ukraine
    844     hsb = 0x002E, // Upper Sorbian
    845     hsb_de = 0x042E, // Upper Sorbian, Germany
    846     ur = 0x0020, // Urdu
    847     ur_in = 0x0820, // Urdu, India
    848     ur_pk = 0x0420, // Urdu, Islamic Republic of Pakistan
    849     ug = 0x0080, // Uyghur
    850     ug_cn = 0x0480, // Uyghur, People's Republic of China
    851     uz_cyrl = 0x7843, // Uzbek (Cyrillic)
    852     uz_cyrl_uz = 0x0843, // Uzbek (Cyrillic), Uzbekistan
    853     uz = 0x0043, // Uzbek (Latin)
    854     uz_latn = 0x7C43, // Uzbek (Latin)
    855     uz_latn_uz = 0x0443, // Uzbek (Latin), Uzbekistan
    856     ca_es_valencia = 0x0803, // Valencian, Spain
    857     ve = 0x0033, // Venda
    858     ve_za = 0x0433, // Venda, South Africa
    859     vi = 0x002A, // Vietnamese
    860     vi_vn = 0x042A, // Vietnamese, Vietnam
    861     cy = 0x0052, // Welsh
    862     cy_gb = 0x0452, // Welsh, United Kingdom
    863     wo = 0x0088, // Wolof
    864     wo_sn = 0x0488, // Wolof, Senegal
    865     xh = 0x0034, // Xhosa
    866     xh_za = 0x0434, // Xhosa, South Africa
    867     ii = 0x0078, // Yi
    868     ii_cn = 0x0478, // Yi, People's Republic of China
    869     yi_001 = 0x043D, // Yiddish, World
    870     yo = 0x006A, // Yoruba
    871     yo_ng = 0x046A, // Yoruba, Nigeria
    872     zu = 0x0035, // Zulu
    873     zu_za = 0x0435, // Zulu, South Africa
    874 
    875     /// Special case
    876     x_iv_mathan = 0x007F, // LANG_INVARIANT, "math alphanumeric sorting"
    877 };