lang.zig (36104B) - Raw
const std = @import("std");

/// Parses a language ID the way the Win32 RC command line interprets
/// language IDs specified as integers.
/// - Always interpreted as hexadecimal, but explicit 0x prefix is also allowed
/// - Wraps on overflow of u16
/// - Stops parsing on any invalid hexadecimal digits
/// - Errors if a digit is not the first char
/// - `-` (negative) prefix is allowed
///
/// Returns `error.InvalidLanguageId` if `str` is empty or if the first
/// character after the optional `-` and `0x` prefixes is not a hexadecimal
/// digit. If nothing remains after the optional `-` prefix, no digits are
/// consumed and the result is 0.
pub fn parseInt(str: []const u8) error{InvalidLanguageId}!u16 {
    // An empty string has no first character to inspect, so reject it
    // instead of indexing out of bounds.
    if (str.len == 0) return error.InvalidLanguageId;

    const radix: u8 = 16;
    var buf = str;

    const negate = buf[0] == '-';
    if (negate) buf = buf[1..];

    // The prefix is only skipped when at least one character follows it.
    if (buf.len > 2 and buf[0] == '0' and buf[1] == 'x') {
        buf = buf[2..];
    }

    var result: u16 = 0;
    for (buf, 0..) |c, i| {
        const digit = std.fmt.charToDigit(c, radix) catch {
            // First digit must be valid
            if (i == 0) return error.InvalidLanguageId;
            // On any later invalid digit, just stop parsing but don't fail
            break;
        };
        // Wrapping arithmetic: overflow of u16 wraps instead of failing.
        result = result *% radix +% digit;
    }

    return if (negate) 0 -% result else result;
}

test parseInt {
    try std.testing.expectEqual(@as(u16, 0x16), try parseInt("16"));
    try std.testing.expectEqual(@as(u16, 0x1a), try parseInt("0x1A"));
    try std.testing.expectEqual(@as(u16, 0x1a), try parseInt("0x1Azzzz"));
    try std.testing.expectEqual(@as(u16, 0xffff), try parseInt("-1"));
    try std.testing.expectEqual(@as(u16, 0xffea), try parseInt("-0x16"));
    try std.testing.expectEqual(@as(u16, 0x0), try parseInt("0o100"));
    try std.testing.expectEqual(@as(u16, 0x1), try parseInt("10001"));
    try std.testing.expectError(error.InvalidLanguageId, parseInt("--1"));
    try std.testing.expectError(error.InvalidLanguageId, parseInt("0xha"));
    try std.testing.expectError(error.InvalidLanguageId, parseInt("¹"));
    try std.testing.expectError(error.InvalidLanguageId, parseInt("~1"));
    // Empty input must be rejected rather than crash
    try std.testing.expectError(error.InvalidLanguageId, parseInt(""));
}

/// This function is specific to how the Win32 RC command line interprets
/// language tags: invalid tags are rejected, but tags that don't have
/// a specific assigned ID but are otherwise valid enough will get
/// converted to an ID of LOCALE_CUSTOM_UNSPECIFIED.
pub fn tagToInt(tag: []const u8) error{InvalidLanguageTag}!u16 {
    return if (try tagToId(tag)) |id| @intFromEnum(id) else LOCALE_CUSTOM_UNSPECIFIED;
}

/// Looks up the `LanguageId` assigned to `tag`.
/// The lookup is case-insensitive and treats `-` and `_` as equivalent.
/// Returns `null` if the tag is well-formed but has no assigned ID, and
/// `error.InvalidLanguageTag` if the tag is rejected by `parse`.
pub fn tagToId(tag: []const u8) error{InvalidLanguageTag}!?LanguageId {
    const parsed = try parse(tag);
    // There are currently no language tags with assigned IDs that have
    // multiple suffixes, so we can skip the lookup.
    if (parsed.multiple_suffixes) return null;
    const longest_known_tag = comptime blk: {
        var len = 0;
        for (@typeInfo(LanguageId).@"enum".fields) |field| {
            if (field.name.len > len) len = field.name.len;
        }
        break :blk len;
    };
    // If the tag is longer than the longest tag that has an assigned ID,
    // then we can skip the lookup.
    if (tag.len > longest_known_tag) return null;
    var normalized_buf: [longest_known_tag]u8 = undefined;
    // To allow e.g. `de-de_phoneb` to get looked up as `de-de`, we need to
    // omit the suffix, but only if the tag contains a valid alternate sort order.
    // The `+ 1` accounts for the separator in front of the suffix.
    const tag_to_normalize = if (parsed.isSuffixValidSortOrder())
        tag[0 .. tag.len - (parsed.suffix.?.len + 1)]
    else
        tag;
    const normalized_tag = normalizeTag(tag_to_normalize, &normalized_buf);
    return std.meta.stringToEnum(LanguageId, normalized_tag) orelse {
        // special case for a tag that has been mapped to the same ID
        // twice.
        if (std.mem.eql(u8, "ff_latn_ng", normalized_tag)) {
            return LanguageId.ff_ng;
        }
        return null;
    };
}

test tagToId {
    try std.testing.expectEqual(LanguageId.ar_ae, (try tagToId("ar-ae")).?);
    try std.testing.expectEqual(LanguageId.ar_ae, (try tagToId("AR_AE")).?);
    try std.testing.expectEqual(LanguageId.ff_ng, (try tagToId("ff-ng")).?);
    // Special case
    try std.testing.expectEqual(LanguageId.ff_ng, (try tagToId("ff-Latn-NG")).?);
}

test "exhaustive tagToId" {
    inline for (@typeInfo(LanguageId).@"enum".fields) |field| {
        const id = tagToId(field.name) catch |err| {
            std.debug.print("tag: {s}\n", .{field.name});
            return err;
        };
        try std.testing.expectEqual(@field(LanguageId, field.name), id orelse {
            std.debug.print("tag: {s}, got null\n", .{field.name});
            return error.TestExpectedEqual;
        });
    }
    var buf: [32]u8 = undefined;
    inline for (valid_alternate_sorts) |parsed_sort| {
        // Every alternate sort tag is far shorter than `buf`, so formatting
        // cannot run out of space.
        const tag = std.fmt.bufPrint(&buf, "{s}-{s}-{s}", .{
            parsed_sort.language_code,
            parsed_sort.country_code.?,
            parsed_sort.suffix.?,
        }) catch unreachable;
        // The expected ID is the one for `<lang>_<country>`, i.e. the tag
        // with the sort order suffix removed.
        const expected_field_name = comptime field: {
            var name_buf: [5]u8 = undefined;
            @memcpy(name_buf[0..parsed_sort.language_code.len], parsed_sort.language_code);
            name_buf[2] = '_';
            @memcpy(name_buf[3..], parsed_sort.country_code.?);
            break :field name_buf;
        };
        const expected = @field(LanguageId, &expected_field_name);
        const id = tagToId(tag) catch |err| {
            std.debug.print("tag: {s}\n", .{tag});
            return err;
        };
        try std.testing.expectEqual(expected, id orelse {
            std.debug.print("tag: {s}, expected: {}, got null\n", .{ tag, expected });
            return error.TestExpectedEqual;
        });
    }
}

/// Writes a normalized copy of `tag` into `buf` and returns the written slice:
/// every `-` becomes `_` and ASCII letters are lowercased.
/// Asserts that `buf` is at least as long as `tag`.
fn normalizeTag(tag: []const u8, buf: []u8) []u8 {
    std.debug.assert(buf.len >= tag.len);
    for (tag, 0..) |c, i| {
        buf[i] = if (c == '-') '_' else std.ascii.toLower(c);
    }
    return buf[0..tag.len];
}

/// https://winprotocoldoc.blob.core.windows.net/productionwindowsarchives/MS-LCID/%5bMS-LCID%5d.pdf#%5B%7B%22num%22%3A72%2C%22gen%22%3A0%7D%2C%7B%22name%22%3A%22XYZ%22%7D%2C69%2C574%2C0%5D
/// "When an LCID is requested for a locale without a
/// permanent LCID assignment, nor a temporary
/// assignment as above, the protocol will respond
/// with LOCALE_CUSTOM_UNSPECIFIED for all such
/// locales. Because this single value is used for
/// numerous possible locale names, it is impossible to
/// round trip this locale, even temporarily.
/// Applications should discard this value as soon as
/// possible and never persist it. If the system is
/// forced to respond to a request for
/// LCID_CUSTOM_UNSPECIFIED, it will fall back to
/// the current user locale. This is often incorrect but
/// may prevent an application or component from
/// failing. As the meaning of this temporary LCID is
/// unstable, it should never be used for interchange
/// or persisted data. This is a 1-to-many relationship
/// that is very unstable."
pub const LOCALE_CUSTOM_UNSPECIFIED = 0x1000;

pub const LANG_ENGLISH = 0x09;
pub const SUBLANG_ENGLISH_US = 0x01;

/// Combines a primary language ID and a sublanguage ID into a language ID.
/// The primary language occupies the low 10 bits and the sublanguage the
/// high 6 bits, e.g. `MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US)` is
/// `0x0409`, which matches `LanguageId.en_us`.
/// https://learn.microsoft.com/en-us/windows/win32/intl/language-identifiers
pub fn MAKELANGID(primary: u10, sublang: u6) u16 {
    return (@as(u16, sublang) << 10) | primary;
}

test MAKELANGID {
    try std.testing.expectEqual(@as(u16, 0x0409), MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US));
    try std.testing.expectEqual(@intFromEnum(LanguageId.en_us), MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US));
    try std.testing.expectEqual(@intFromEnum(LanguageId.en), MAKELANGID(LANG_ENGLISH, 0));
}

/// Language tag format expressed as a regular expression (rough approximation):
///
/// [a-zA-Z]{1,3}([-_][a-zA-Z]{4})?([-_][a-zA-Z]{2})?([-_][a-zA-Z0-9]{1,8})?
///     lang    |     script      |      country    |       suffix
///
/// Notes:
/// - If lang code is 1 char, it seems to mean that everything afterwards uses suffix
///   parsing rules (e.g. `a-0` and `a-00000000` are allowed).
/// - There can also be any number of trailing suffix parts as long as they each
///   would be a valid suffix part, e.g. `en-us-blah-blah1-blah2-blah3` is allowed.
/// - When doing lookups, trailing suffix parts are taken into account, e.g.
///   `ca-es-valencia` is not considered equivalent to `ca-es-valencia-blah`.
/// - A suffix is only allowed if:
///   + Lang code is 1 char long, or
///   + A country code is present, or
///   + A script tag is not present and:
///     - the suffix is numeric-only and has a length of 3, or
///     - the lang is `qps` and the suffix is `ploca` or `plocm`
///
/// The slices in the returned `Parsed` point into `lang_tag`.
pub fn parse(lang_tag: []const u8) error{InvalidLanguageTag}!Parsed {
    var it = std.mem.splitAny(u8, lang_tag, "-_");
    const lang_code = it.first();
    const is_valid_lang_code = lang_code.len >= 1 and lang_code.len <= 3 and isAllAlphabetic(lang_code);
    if (!is_valid_lang_code) return error.InvalidLanguageTag;
    var parsed = Parsed{
        .language_code = lang_code,
    };
    // The second part could be a script tag, a country code, or a suffix
    if (it.next()) |part_str| {
        // The lang code being length 1 behaves strangely, so fully special case it.
        if (lang_code.len == 1) {
            // This is almost certainly not the 'right' way to do this, but I don't have a method
            // to determine how exactly these language tags are parsed, and it seems like
            // suffix parsing rules apply generally (digits allowed, length of 1 to 8).
            //
            // However, because we want to be able to lookup `x-iv-mathan` normally without
            // `multiple_suffixes` being set to true, we need to make sure to treat two-length
            // alphabetic parts as a country code.
            if (part_str.len == 2 and isAllAlphabetic(part_str)) {
                parsed.country_code = part_str;
            }
            // Everything else, though, we can just throw into the suffix as long as the normal
            // rules apply.
            else if (part_str.len > 0 and part_str.len <= 8 and isAllAlphanumeric(part_str)) {
                parsed.suffix = part_str;
            } else {
                return error.InvalidLanguageTag;
            }
        } else if (part_str.len == 4 and isAllAlphabetic(part_str)) {
            parsed.script_tag = part_str;
        } else if (part_str.len == 2 and isAllAlphabetic(part_str)) {
            parsed.country_code = part_str;
        }
        // Only a 3-len numeric suffix is allowed as the second part of a tag
        else if (part_str.len == 3 and isAllNumeric(part_str)) {
            parsed.suffix = part_str;
        }
        // Special case for qps-ploca and qps-plocm
        else if (std.ascii.eqlIgnoreCase(lang_code, "qps") and
            (std.ascii.eqlIgnoreCase(part_str, "ploca") or
                std.ascii.eqlIgnoreCase(part_str, "plocm")))
        {
            parsed.suffix = part_str;
        } else {
            return error.InvalidLanguageTag;
        }
    } else {
        // If there's no part besides a 1-len lang code, then it is malformed
        if (lang_code.len == 1) return error.InvalidLanguageTag;
        return parsed;
    }
    if (parsed.script_tag != null) {
        if (it.next()) |part_str| {
            if (part_str.len == 2 and isAllAlphabetic(part_str)) {
                parsed.country_code = part_str;
            } else {
                // Suffix is not allowed when a country code is not present.
                return error.InvalidLanguageTag;
            }
        } else {
            return parsed;
        }
    }
    // We've now parsed any potential script tag/country codes, so anything remaining
    // is a suffix
    while (it.next()) |part_str| {
        if (part_str.len == 0 or part_str.len > 8 or !isAllAlphanumeric(part_str)) {
            return error.InvalidLanguageTag;
        }
        if (parsed.suffix == null) {
            parsed.suffix = part_str;
        } else {
            // In theory we could return early here but we still want to validate
            // that each part is a valid suffix all the way to the end, e.g.
            // we should reject `en-us-suffix-a-b-c-!!!` because of the invalid `!!!`
            // suffix part.
            parsed.multiple_suffixes = true;
        }
    }
    return parsed;
}

/// The components of a language tag, as produced by `parse`.
pub const Parsed = struct {
    language_code: []const u8,
    script_tag: ?[]const u8 = null,
    country_code: ?[]const u8 = null,
    /// Can be a sort order (e.g. phoneb) or something like valencia, 001, etc
    suffix: ?[]const u8 = null,
    /// There can be any number of suffixes, but we don't need to care what their
    /// values are, we just need to know if any exist so that e.g. `ca-es-valencia-blah`
    /// can be seen as different from `ca-es-valencia`. Storing this as a bool
    /// allows us to avoid needing either (a) dynamic allocation or (b) a limit to
    /// the number of suffixes allowed when parsing.
    multiple_suffixes: bool = false,

    /// Returns true if this tag has the form lang-country-suffix (no script
    /// tag, exactly one suffix) and the combination appears in
    /// `valid_alternate_sorts`, compared case-insensitively.
    pub fn isSuffixValidSortOrder(self: Parsed) bool {
        const country_code = self.country_code orelse return false;
        const suffix = self.suffix orelse return false;
        if (self.script_tag != null) return false;
        if (self.multiple_suffixes) return false;
        for (valid_alternate_sorts) |valid_sort| {
            if (std.ascii.eqlIgnoreCase(valid_sort.language_code, self.language_code) and
                std.ascii.eqlIgnoreCase(valid_sort.country_code.?, country_code) and
                std.ascii.eqlIgnoreCase(valid_sort.suffix.?, suffix))
            {
                return true;
            }
        }
        return false;
    }
};

/// https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
/// See the table following this text: "Alternate sorts can be selected by using one of the identifiers from the following table."
const valid_alternate_sorts = [_]Parsed{
    // Note: x-IV-mathan is omitted due to how lookups are implemented.
    //       This table is used to make e.g. `de-de_phoneb` get looked up
    //       as `de-de` (the suffix is omitted for the lookup), but x-iv-mathan
    //       instead needs to be looked up with the suffix included because
    //       `x-iv` is not a tag with an assigned ID.
    .{ .language_code = "de", .country_code = "de", .suffix = "phoneb" },
    .{ .language_code = "hu", .country_code = "hu", .suffix = "tchncl" },
    .{ .language_code = "ka", .country_code = "ge", .suffix = "modern" },
    .{ .language_code = "zh", .country_code = "cn", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "sg", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "mo", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "tw", .suffix = "pronun" },
    .{ .language_code = "zh", .country_code = "tw", .suffix = "radstr" },
    .{ .language_code = "ja", .country_code = "jp", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "hk", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "mo", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "cn", .suffix = "phoneb" },
    .{ .language_code = "zh", .country_code = "sg", .suffix = "phoneb" },
};

test "parse" {
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
    }, try parse("en"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
        .country_code = "us",
    }, try parse("en-us"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
        .suffix = "123",
    }, try parse("en-123"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
        .suffix = "123",
        .multiple_suffixes = true,
    }, try parse("en-123-blah"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
        .country_code = "us",
        .suffix = "123",
        .multiple_suffixes = true,
    }, try parse("en-us_123-blah"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "eng",
        .script_tag = "Latn",
    }, try parse("eng-Latn"));
    // `_` is accepted as a separator in the same way as `-`
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "eng",
        .script_tag = "Latn",
    }, try parse("eng_Latn"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "ff",
        .script_tag = "Latn",
        .country_code = "NG",
    }, try parse("ff-Latn-NG"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "qps",
        .suffix = "Plocm",
    }, try parse("qps-Plocm"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "qps",
        .suffix = "ploca",
    }, try parse("qps-ploca"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "x",
        .country_code = "IV",
        .suffix = "mathan",
    }, try parse("x-IV-mathan"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "a",
        .suffix = "a",
    }, try parse("a-a"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "a",
        .suffix = "000",
    }, try parse("a-000"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "a",
        .suffix = "00000000",
    }, try parse("a-00000000"));
    // suffix not allowed if script tag is present without country code
    try std.testing.expectError(error.InvalidLanguageTag, parse("eng-Latn-suffix"));
    // suffix must be 3 numeric digits if neither script tag nor country code is present
    try std.testing.expectError(error.InvalidLanguageTag, parse("eng-suffix"));
    try std.testing.expectError(error.InvalidLanguageTag, parse("en-plocm"));
    // 1-len lang code is not allowed if it's the only part
    try std.testing.expectError(error.InvalidLanguageTag, parse("e"));
}

/// Returns true if every byte of `str` is an ASCII letter (vacuously true for
/// an empty string).
fn isAllAlphabetic(str: []const u8) bool {
    for (str) |c| {
        if (!std.ascii.isAlphabetic(c)) return false;
    }
    return true;
}

/// Returns true if every byte of `str` is an ASCII letter or digit (vacuously
/// true for an empty string).
fn isAllAlphanumeric(str: []const u8) bool {
    for (str) |c| {
        if (!std.ascii.isAlphanumeric(c)) return false;
    }
    return true;
}

/// Returns true if every byte of `str` is an ASCII digit (vacuously true for
/// an empty string).
fn isAllNumeric(str: []const u8) bool {
    for (str) |c| {
        if (!std.ascii.isDigit(c)) return false;
    }
    return true;
}

/// Derived from https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
/// - Protocol Revision: 15.0
/// - Language / Language ID / Language Tag table in Appendix A
/// - Removed all rows that have Language ID 0x1000 (LOCALE_CUSTOM_UNSPECIFIED)
/// - Normalized each language tag (lowercased, replaced all `-` with `_`)
/// - There is one special case where two tags are mapped to the same ID, the following
///   has been omitted and must be special cased during lookup to map to the ID ff_ng / 0x0467.
///     ff_latn_ng = 0x0467, // Fulah (Latin), Nigeria
/// - x_iv_mathan has been added which is not in the table but does appear in the Alternate sorts
///   table as 0x007F (LANG_INVARIANT).
pub const LanguageId = enum(u16) {
    // Language tag = Language ID, // Language, Location (or type)
    af = 0x0036, // Afrikaans
    af_za = 0x0436, // Afrikaans, South Africa
    sq = 0x001C, // Albanian
    sq_al = 0x041C, // Albanian, Albania
    gsw = 0x0084, // Alsatian
    gsw_fr = 0x0484, // Alsatian, France
    am = 0x005E, // Amharic
    am_et = 0x045E, // Amharic, Ethiopia
    ar = 0x0001, // Arabic
    ar_dz = 0x1401, // Arabic, Algeria
    ar_bh = 0x3C01, // Arabic, Bahrain
    ar_eg = 0x0c01, // Arabic, Egypt
    ar_iq = 0x0801, // Arabic, Iraq
    ar_jo = 0x2C01, // Arabic, Jordan
    ar_kw = 0x3401, // Arabic, Kuwait
    ar_lb = 0x3001, // Arabic, Lebanon
    ar_ly = 0x1001, // Arabic, Libya
    ar_ma = 0x1801, // Arabic, Morocco
    ar_om = 0x2001, // Arabic, Oman
    ar_qa = 0x4001, // Arabic, Qatar
    ar_sa = 0x0401, // Arabic, Saudi Arabia
    ar_sy = 0x2801, // Arabic, Syria
    ar_tn = 0x1C01, // Arabic, Tunisia
    ar_ae = 0x3801, // Arabic, U.A.E.
    ar_ye = 0x2401, // Arabic, Yemen
    hy = 0x002B, // Armenian
    hy_am = 0x042B, // Armenian, Armenia
    as = 0x004D, // Assamese
    as_in = 0x044D, // Assamese, India
    az_cyrl = 0x742C, // Azerbaijani (Cyrillic)
    az_cyrl_az = 0x082C, // Azerbaijani (Cyrillic), Azerbaijan
    az = 0x002C, // Azerbaijani (Latin)
    az_latn = 0x782C, // Azerbaijani (Latin)
    az_latn_az = 0x042C, // Azerbaijani (Latin), Azerbaijan
    bn = 0x0045, // Bangla
    bn_bd = 0x0845, // Bangla, Bangladesh
    bn_in = 0x0445, // Bangla, India
    ba = 0x006D, // Bashkir
    ba_ru = 0x046D, // Bashkir, Russia
    eu = 0x002D, // Basque
    eu_es = 0x042D, // Basque, Spain
    be = 0x0023, // Belarusian
    be_by = 0x0423, // Belarusian, Belarus
    bs_cyrl = 0x641A, // Bosnian (Cyrillic)
    bs_cyrl_ba = 0x201A, // Bosnian (Cyrillic), Bosnia and Herzegovina
    bs_latn = 0x681A, // Bosnian (Latin)
    bs = 0x781A, // Bosnian (Latin)
    bs_latn_ba = 0x141A, // Bosnian (Latin), Bosnia and Herzegovina
    br = 0x007E, // Breton
    br_fr = 0x047E, // Breton, France
    bg = 0x0002, // Bulgarian
    bg_bg = 0x0402, // Bulgarian, Bulgaria
    my = 0x0055, // Burmese
    my_mm = 0x0455, // Burmese, Myanmar
    ca = 0x0003, // Catalan
    ca_es = 0x0403, // Catalan, Spain
    tzm_arab_ma = 0x045F, // Central Atlas Tamazight (Arabic), Morocco
    ku = 0x0092, // Central Kurdish
    ku_arab = 0x7c92, // Central Kurdish
    ku_arab_iq = 0x0492, // Central Kurdish, Iraq
    chr = 0x005C, // Cherokee
    chr_cher = 0x7c5C, // Cherokee
    chr_cher_us = 0x045C, // Cherokee, United States
    zh_hans = 0x0004, // Chinese (Simplified)
    zh = 0x7804, // Chinese (Simplified)
    zh_cn = 0x0804, // Chinese (Simplified), People's Republic of China
    zh_sg = 0x1004, // Chinese (Simplified), Singapore
    zh_hant = 0x7C04, // Chinese (Traditional)
    zh_hk = 0x0C04, // Chinese (Traditional), Hong Kong S.A.R.
    zh_mo = 0x1404, // Chinese (Traditional), Macao S.A.R.
    zh_tw = 0x0404, // Chinese (Traditional), Taiwan
    co = 0x0083, // Corsican
    co_fr = 0x0483, // Corsican, France
    hr = 0x001A, // Croatian
    hr_hr = 0x041A, // Croatian, Croatia
    hr_ba = 0x101A, // Croatian (Latin), Bosnia and Herzegovina
    cs = 0x0005, // Czech
    cs_cz = 0x0405, // Czech, Czech Republic
    da = 0x0006, // Danish
    da_dk = 0x0406, // Danish, Denmark
    prs = 0x008C, // Dari
    prs_af = 0x048C, // Dari, Afghanistan
    dv = 0x0065, // Divehi
    dv_mv = 0x0465, // Divehi, Maldives
    nl = 0x0013, // Dutch
    nl_be = 0x0813, // Dutch, Belgium
    nl_nl = 0x0413, // Dutch, Netherlands
    dz_bt = 0x0C51, // Dzongkha, Bhutan
    en = 0x0009, // English
    en_au = 0x0C09, // English, Australia
    en_bz = 0x2809, // English, Belize
    en_ca = 0x1009, // English, Canada
    en_029 = 0x2409, // English, Caribbean
    en_hk = 0x3C09, // English, Hong Kong
    en_in = 0x4009, // English, India
    en_ie = 0x1809, // English, Ireland
    en_jm = 0x2009, // English, Jamaica
    en_my = 0x4409, // English, Malaysia
    en_nz = 0x1409, // English, New Zealand
    en_ph = 0x3409, // English, Republic of the Philippines
    en_sg = 0x4809, // English, Singapore
    en_za = 0x1C09, // English, South Africa
    en_tt = 0x2c09, // English, Trinidad and Tobago
    en_ae = 0x4C09, // English, United Arab Emirates
    en_gb = 0x0809, // English, United Kingdom
    en_us = 0x0409, // English, United States
    en_zw = 0x3009, // English, Zimbabwe
    et = 0x0025, // Estonian
    et_ee = 0x0425, // Estonian, Estonia
    fo = 0x0038, // Faroese
    fo_fo = 0x0438, // Faroese, Faroe Islands
    fil = 0x0064, // Filipino
    fil_ph = 0x0464, // Filipino, Philippines
    fi = 0x000B, // Finnish
    fi_fi = 0x040B, // Finnish, Finland
    fr = 0x000C, // French
    fr_be = 0x080C, // French, Belgium
    fr_cm = 0x2c0C, // French, Cameroon
    fr_ca = 0x0c0C, // French, Canada
    fr_029 = 0x1C0C, // French, Caribbean
    fr_cd = 0x240C, // French, Congo, DRC
    fr_ci = 0x300C, // French, Côte d'Ivoire
    fr_fr = 0x040C, // French, France
    fr_ht = 0x3c0C, // French, Haiti
    fr_lu = 0x140C, // French, Luxembourg
    fr_ml = 0x340C, // French, Mali
    fr_ma = 0x380C, // French, Morocco
    fr_mc = 0x180C, // French, Principality of Monaco
    fr_re = 0x200C, // French, Reunion
    fr_sn = 0x280C, // French, Senegal
    fr_ch = 0x100C, // French, Switzerland
    fy = 0x0062, // Frisian
    fy_nl = 0x0462, // Frisian, Netherlands
    ff = 0x0067, // Fulah
    ff_latn = 0x7C67, // Fulah (Latin)
    ff_ng = 0x0467, // Fulah, Nigeria
    ff_latn_sn = 0x0867, // Fulah, Senegal
    gl = 0x0056, // Galician
    gl_es = 0x0456, // Galician, Spain
    ka = 0x0037, // Georgian
    ka_ge = 0x0437, // Georgian, Georgia
    de = 0x0007, // German
    de_at = 0x0C07, // German, Austria
    de_de = 0x0407, // German, Germany
    de_li = 0x1407, // German, Liechtenstein
    de_lu = 0x1007, // German, Luxembourg
    de_ch = 0x0807, // German, Switzerland
    el = 0x0008, // Greek
    el_gr = 0x0408, // Greek, Greece
    kl = 0x006F, // Greenlandic
    kl_gl = 0x046F, // Greenlandic, Greenland
    gn = 0x0074, // Guarani
    gn_py = 0x0474, // Guarani, Paraguay
    gu = 0x0047, // Gujarati
    gu_in = 0x0447, // Gujarati, India
    ha = 0x0068, // Hausa (Latin)
    ha_latn = 0x7C68, // Hausa (Latin)
    ha_latn_ng = 0x0468, // Hausa (Latin), Nigeria
    haw = 0x0075, // Hawaiian
    haw_us = 0x0475, // Hawaiian, United States
    he = 0x000D, // Hebrew
    he_il = 0x040D, // Hebrew, Israel
    hi = 0x0039, // Hindi
    hi_in = 0x0439, // Hindi, India
    hu = 0x000E, // Hungarian
    hu_hu = 0x040E, // Hungarian, Hungary
    is = 0x000F, // Icelandic
    is_is = 0x040F, // Icelandic, Iceland
    ig = 0x0070, // Igbo
    ig_ng = 0x0470, // Igbo, Nigeria
    id = 0x0021, // Indonesian
    id_id = 0x0421, // Indonesian, Indonesia
    iu = 0x005D, // Inuktitut (Latin)
    iu_latn = 0x7C5D, // Inuktitut (Latin)
    iu_latn_ca = 0x085D, // Inuktitut (Latin), Canada
    iu_cans = 0x785D, // Inuktitut (Syllabics)
    iu_cans_ca = 0x045d, // Inuktitut (Syllabics), Canada
    ga = 0x003C, // Irish
    ga_ie = 0x083C, // Irish, Ireland
    it = 0x0010, // Italian
    it_it = 0x0410, // Italian, Italy
    it_ch = 0x0810, // Italian, Switzerland
    ja = 0x0011, // Japanese
    ja_jp = 0x0411, // Japanese, Japan
    kn = 0x004B, // Kannada
    kn_in = 0x044B, // Kannada, India
    kr_latn_ng = 0x0471, // Kanuri (Latin), Nigeria
    ks = 0x0060, // Kashmiri
    ks_arab = 0x0460, // Kashmiri, Perso-Arabic
    ks_deva_in = 0x0860, // Kashmiri (Devanagari), India
    kk = 0x003F, // Kazakh
    kk_kz = 0x043F, // Kazakh, Kazakhstan
    km = 0x0053, // Khmer
    km_kh = 0x0453, // Khmer, Cambodia
    quc = 0x0086, // K'iche
    quc_latn_gt = 0x0486, // K'iche, Guatemala
    rw = 0x0087, // Kinyarwanda
    rw_rw = 0x0487, // Kinyarwanda, Rwanda
    sw = 0x0041, // Kiswahili
    sw_ke = 0x0441, // Kiswahili, Kenya
    kok = 0x0057, // Konkani
    kok_in = 0x0457, // Konkani, India
    ko = 0x0012, // Korean
    ko_kr = 0x0412, // Korean, Korea
    ky = 0x0040, // Kyrgyz
    ky_kg = 0x0440, // Kyrgyz, Kyrgyzstan
    lo = 0x0054, // Lao
    lo_la = 0x0454, // Lao, Lao P.D.R.
    la_va = 0x0476, // Latin, Vatican City
    lv = 0x0026, // Latvian
    lv_lv = 0x0426, // Latvian, Latvia
    lt = 0x0027, // Lithuanian
    lt_lt = 0x0427, // Lithuanian, Lithuania
    dsb = 0x7C2E, // Lower Sorbian
    dsb_de = 0x082E, // Lower Sorbian, Germany
    lb = 0x006E, // Luxembourgish
    lb_lu = 0x046E, // Luxembourgish, Luxembourg
    mk = 0x002F, // Macedonian
    mk_mk = 0x042F, // Macedonian, North Macedonia
    ms = 0x003E, // Malay
    ms_bn = 0x083E, // Malay, Brunei Darussalam
    ms_my = 0x043E, // Malay, Malaysia
    ml = 0x004C, // Malayalam
    ml_in = 0x044C, // Malayalam, India
    mt = 0x003A, // Maltese
    mt_mt = 0x043A, // Maltese, Malta
    mi = 0x0081, // Maori
    mi_nz = 0x0481, // Maori, New Zealand
    arn = 0x007A, // Mapudungun
    arn_cl = 0x047A, // Mapudungun, Chile
    mr = 0x004E, // Marathi
    mr_in = 0x044E, // Marathi, India
    moh = 0x007C, // Mohawk
    moh_ca = 0x047C, // Mohawk, Canada
    mn = 0x0050, // Mongolian (Cyrillic)
    mn_cyrl = 0x7850, // Mongolian (Cyrillic)
    mn_mn = 0x0450, // Mongolian (Cyrillic), Mongolia
    mn_mong = 0x7C50, // Mongolian (Traditional Mongolian)
    mn_mong_cn = 0x0850, // Mongolian (Traditional Mongolian), People's Republic of China
    mn_mong_mn = 0x0C50, // Mongolian (Traditional Mongolian), Mongolia
    ne = 0x0061, // Nepali
    ne_in = 0x0861, // Nepali, India
    ne_np = 0x0461, // Nepali, Nepal
    no = 0x0014, // Norwegian (Bokmal)
    nb = 0x7C14, // Norwegian (Bokmal)
    nb_no = 0x0414, // Norwegian (Bokmal), Norway
    nn = 0x7814, // Norwegian (Nynorsk)
    nn_no = 0x0814, // Norwegian (Nynorsk), Norway
    oc = 0x0082, // Occitan
    oc_fr = 0x0482, // Occitan, France
    @"or" = 0x0048, // Odia
    or_in = 0x0448, // Odia, India
    om = 0x0072, // Oromo
    om_et = 0x0472, // Oromo, Ethiopia
    ps = 0x0063, // Pashto
    ps_af = 0x0463, // Pashto, Afghanistan
    fa = 0x0029, // Persian
    fa_ir = 0x0429, // Persian, Iran
    pl = 0x0015, // Polish
    pl_pl = 0x0415, // Polish, Poland
    pt = 0x0016, // Portuguese
    pt_br = 0x0416, // Portuguese, Brazil
    pt_pt = 0x0816, // Portuguese, Portugal
    qps_ploca = 0x05FE, // Pseudo Language, Pseudo locale for east Asian/complex script localization testing
    qps_ploc = 0x0501, // Pseudo Language, Pseudo locale used for localization testing
    qps_plocm = 0x09FF, // Pseudo Language, Pseudo locale used for localization testing of mirrored locales
    pa = 0x0046, // Punjabi
    pa_arab = 0x7C46, // Punjabi
    pa_in = 0x0446, // Punjabi, India
    pa_arab_pk = 0x0846, // Punjabi, Islamic Republic of Pakistan
    quz = 0x006B, // Quechua
    quz_bo = 0x046B, // Quechua, Bolivia
    quz_ec = 0x086B, // Quechua, Ecuador
    quz_pe = 0x0C6B, // Quechua, Peru
    ro = 0x0018, // Romanian
    ro_md = 0x0818, // Romanian, Moldova
    ro_ro = 0x0418, // Romanian, Romania
    rm = 0x0017, // Romansh
    rm_ch = 0x0417, // Romansh, Switzerland
    ru = 0x0019, // Russian
    ru_md = 0x0819, // Russian, Moldova
    ru_ru = 0x0419, // Russian, Russia
    sah = 0x0085, // Sakha
    sah_ru = 0x0485, // Sakha, Russia
    smn = 0x703B, // Sami (Inari)
    smn_fi = 0x243B, // Sami (Inari), Finland
    smj = 0x7C3B, // Sami (Lule)
    smj_no = 0x103B, // Sami (Lule), Norway
    smj_se = 0x143B, // Sami (Lule), Sweden
    se = 0x003B, // Sami (Northern)
    se_fi = 0x0C3B, // Sami (Northern), Finland
    se_no = 0x043B, // Sami (Northern), Norway
    se_se = 0x083B, // Sami (Northern), Sweden
    sms = 0x743B, // Sami (Skolt)
    sms_fi = 0x203B, // Sami (Skolt), Finland
    sma = 0x783B, // Sami (Southern)
    sma_no = 0x183B, // Sami (Southern), Norway
    sma_se = 0x1C3B, // Sami (Southern), Sweden
    sa = 0x004F, // Sanskrit
    sa_in = 0x044F, // Sanskrit, India
    gd = 0x0091, // Scottish Gaelic
    gd_gb = 0x0491, // Scottish Gaelic, United Kingdom
    sr_cyrl = 0x6C1A, // Serbian (Cyrillic)
    sr_cyrl_ba = 0x1C1A, // Serbian (Cyrillic), Bosnia and Herzegovina
    sr_cyrl_me = 0x301A, // Serbian (Cyrillic), Montenegro
    sr_cyrl_rs = 0x281A, // Serbian (Cyrillic), Serbia
    sr_cyrl_cs = 0x0C1A, // Serbian (Cyrillic), Serbia and Montenegro (Former)
    sr_latn = 0x701A, // Serbian (Latin)
    sr = 0x7C1A, // Serbian (Latin)
    sr_latn_ba = 0x181A, // Serbian (Latin), Bosnia and Herzegovina
    sr_latn_me = 0x2c1A, // Serbian (Latin), Montenegro
    sr_latn_rs = 0x241A, // Serbian (Latin), Serbia
    sr_latn_cs = 0x081A, // Serbian (Latin), Serbia and Montenegro (Former)
    nso = 0x006C, // Sesotho sa Leboa
    nso_za = 0x046C, // Sesotho sa Leboa, South Africa
    tn = 0x0032, // Setswana
    tn_bw = 0x0832, // Setswana, Botswana
    tn_za = 0x0432, // Setswana, South Africa
    sd = 0x0059, // Sindhi
    sd_arab = 0x7C59, // Sindhi
    sd_arab_pk = 0x0859, // Sindhi, Islamic Republic of Pakistan
    si = 0x005B, // Sinhala
    si_lk = 0x045B, // Sinhala, Sri Lanka
    sk = 0x001B, // Slovak
    sk_sk = 0x041B, // Slovak, Slovakia
    sl = 0x0024, // Slovenian
    sl_si = 0x0424, // Slovenian, Slovenia
    so = 0x0077, // Somali
    so_so = 0x0477, // Somali, Somalia
    st = 0x0030, // Sotho
    st_za = 0x0430, // Sotho, South Africa
    es = 0x000A, // Spanish
    es_ar = 0x2C0A, // Spanish, Argentina
    es_ve = 0x200A, // Spanish, Bolivarian Republic of Venezuela
    es_bo = 0x400A, // Spanish, Bolivia
    es_cl = 0x340A, // Spanish, Chile
    es_co = 0x240A, // Spanish, Colombia
    es_cr = 0x140A, // Spanish, Costa Rica
    es_cu = 0x5c0A, // Spanish, Cuba
    es_do = 0x1c0A, // Spanish, Dominican Republic
    es_ec = 0x300A, // Spanish, Ecuador
    es_sv = 0x440A, // Spanish, El Salvador
    es_gt = 0x100A, // Spanish, Guatemala
    es_hn = 0x480A, // Spanish, Honduras
    es_419 = 0x580A, // Spanish, Latin America
    es_mx = 0x080A, // Spanish, Mexico
    es_ni = 0x4C0A, // Spanish, Nicaragua
    es_pa = 0x180A, // Spanish, Panama
    es_py = 0x3C0A, // Spanish, Paraguay
    es_pe = 0x280A, // Spanish, Peru
    es_pr = 0x500A, // Spanish, Puerto Rico
    es_es_tradnl = 0x040A, // Spanish, Spain
    es_es = 0x0c0A, // Spanish, Spain
    es_us = 0x540A, // Spanish, United States
    es_uy = 0x380A, // Spanish, Uruguay
    sv = 0x001D, // Swedish
    sv_fi = 0x081D, // Swedish, Finland
    sv_se = 0x041D, // Swedish, Sweden
    syr = 0x005A, // Syriac
    syr_sy = 0x045A, // Syriac, Syria
    tg = 0x0028, // Tajik (Cyrillic)
    tg_cyrl = 0x7C28, // Tajik (Cyrillic)
    tg_cyrl_tj = 0x0428, // Tajik (Cyrillic), Tajikistan
    tzm = 0x005F, // Tamazight (Latin)
    tzm_latn = 0x7C5F, // Tamazight (Latin)
    tzm_latn_dz = 0x085F, // Tamazight (Latin), Algeria
    ta = 0x0049, // Tamil
    ta_in = 0x0449, // Tamil, India
    ta_lk = 0x0849, // Tamil, Sri Lanka
    tt = 0x0044, // Tatar
    tt_ru = 0x0444, // Tatar, Russia
    te = 0x004A, // Telugu
    te_in = 0x044A, // Telugu, India
    th = 0x001E, // Thai
    th_th = 0x041E, // Thai, Thailand
    bo = 0x0051, // Tibetan
    bo_cn = 0x0451, // Tibetan, People's Republic of China
    ti = 0x0073, // Tigrinya
    ti_er = 0x0873, // Tigrinya, Eritrea
    ti_et = 0x0473, // Tigrinya, Ethiopia
    ts = 0x0031, // Tsonga
    ts_za = 0x0431, // Tsonga, South Africa
    tr = 0x001F, // Turkish
    tr_tr = 0x041F, // Turkish, Turkey
    tk = 0x0042, // Turkmen
    tk_tm = 0x0442, // Turkmen, Turkmenistan
    uk = 0x0022, // Ukrainian
    uk_ua = 0x0422, // Ukrainian, Ukraine
    hsb = 0x002E, // Upper Sorbian
    hsb_de = 0x042E, // Upper Sorbian, Germany
    ur = 0x0020, // Urdu
    ur_in = 0x0820, // Urdu, India
    ur_pk = 0x0420, // Urdu, Islamic Republic of Pakistan
    ug = 0x0080, // Uyghur
    ug_cn = 0x0480, // Uyghur, People's Republic of China
    uz_cyrl = 0x7843, // Uzbek (Cyrillic)
    uz_cyrl_uz = 0x0843, // Uzbek (Cyrillic), Uzbekistan
    uz = 0x0043, // Uzbek (Latin)
    uz_latn = 0x7C43, // Uzbek (Latin)
    uz_latn_uz = 0x0443, // Uzbek (Latin), Uzbekistan
    ca_es_valencia = 0x0803, // Valencian, Spain
    ve = 0x0033, // Venda
    ve_za = 0x0433, // Venda, South Africa
    vi = 0x002A, // Vietnamese
    vi_vn = 0x042A, // Vietnamese, Vietnam
    cy = 0x0052, // Welsh
    cy_gb = 0x0452, // Welsh, United Kingdom
    wo = 0x0088, // Wolof
    wo_sn = 0x0488, // Wolof, Senegal
    xh = 0x0034, // Xhosa
    xh_za = 0x0434, // Xhosa, South Africa
    ii = 0x0078, // Yi
    ii_cn = 0x0478, // Yi, People's Republic of China
    yi_001 = 0x043D, // Yiddish, World
    yo = 0x006A, // Yoruba
    yo_ng = 0x046A, // Yoruba, Nigeria
    zu = 0x0035, // Zulu
    zu_za = 0x0435, // Zulu, South Africa

    /// Special case
    x_iv_mathan = 0x007F, // LANG_INVARIANT, "math alphanumeric sorting"
};