windows1252.zig (31322B) - Raw
const std = @import("std");

/// Converts Windows-1252 bytes read from `reader` to UTF-8 and writes them to `writer`.
/// Reads until `reader.readByte()` reports `error.EndOfStream`; any other reader error
/// and any writer error is returned to the caller.
/// Returns the number of UTF-8 bytes written to the writer.
pub fn windows1252ToUtf8Stream(writer: anytype, reader: anytype) !usize {
    var bytes_written: usize = 0;
    // `toCodepoint` returns a u16, and a codepoint <= 0xFFFF needs at most 3 UTF-8 bytes.
    var utf8_buf: [3]u8 = undefined;
    while (true) {
        const c = reader.readByte() catch |err| switch (err) {
            error.EndOfStream => return bytes_written,
            else => |e| return e,
        };
        const codepoint = toCodepoint(c);
        if (codepoint <= 0x7F) {
            // ASCII bytes are encoded identically in UTF-8.
            try writer.writeByte(c);
            bytes_written += 1;
            continue;
        }
        // None of the values produced by `toCodepoint` is a surrogate or too large,
        // so encoding cannot fail.
        const utf8_len = std.unicode.utf8Encode(codepoint, &utf8_buf) catch unreachable;
        try writer.writeAll(utf8_buf[0..utf8_len]);
        bytes_written += utf8_len;
    }
}

/// Converts a Windows-1252 string to a newly allocated, 0-terminated slice of
/// UTF-16 code units (one `u16` per input byte).
/// Caller owns the returned slice and must free it with `allocator`.
pub fn windows1252ToUtf16AllocZ(allocator: std.mem.Allocator, win1252_str: []const u8) ![:0]u16 {
    // Guaranteed to need exactly the same number of code units as Windows-1252 bytes
    const utf16_slice = try allocator.allocSentinel(u16, win1252_str.len, 0);
    // Nothing below can fail, so no errdefer is needed for the allocation.
    for (utf16_slice, win1252_str) |*code_unit, c| {
        code_unit.* = toCodepoint(c);
    }
    return utf16_slice;
}

/// Maps a Windows-1252 byte to its Unicode codepoint.
/// Bytes without an explicit mapping below (including 0x81, 0x8D, 0x8F, 0x90, 0x9D)
/// are returned unchanged.
/// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt
pub fn toCodepoint(c: u8) u16 {
    return switch (c) {
        0x80 => 0x20ac, // Euro Sign
        0x82 => 0x201a, // Single Low-9 Quotation Mark
        0x83 => 0x0192, // Latin Small Letter F With Hook
        0x84 => 0x201e, // Double Low-9 Quotation Mark
        0x85 => 0x2026, // Horizontal Ellipsis
        0x86 => 0x2020, // Dagger
        0x87 => 0x2021, // Double Dagger
        0x88 => 0x02c6, // Modifier Letter Circumflex Accent
        0x89 => 0x2030, // Per Mille Sign
        0x8a => 0x0160, // Latin Capital Letter S With Caron
        0x8b => 0x2039, // Single Left-Pointing Angle Quotation Mark
        0x8c => 0x0152, // Latin Capital Ligature Oe
        0x8e => 0x017d, // Latin Capital Letter Z With Caron
        0x91 => 0x2018, // Left Single Quotation Mark
        0x92 => 0x2019, // Right Single Quotation Mark
        0x93 => 0x201c, // Left Double Quotation Mark
        0x94 => 0x201d, // Right Double Quotation Mark
        0x95 => 0x2022, // Bullet
        0x96 => 0x2013, // En Dash
        0x97 => 0x2014, // Em Dash
        0x98 => 0x02dc, // Small Tilde
        0x99 => 0x2122, // Trade Mark Sign
        0x9a => 0x0161, // Latin Small Letter S With Caron
        0x9b => 0x203a, // Single Right-Pointing Angle Quotation Mark
        0x9c => 0x0153, // Latin Small Ligature Oe
        0x9e => 0x017e, // Latin Small Letter Z With Caron
        0x9f => 0x0178, // Latin Capital Letter Y With Diaeresis
        else => c,
    };
}

/// Returns the Windows-1252 byte that best fits `codepoint`, or `null` if the
/// codepoint has no mapping.
/// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt
/// Plus some mappings found empirically by iterating all codepoints:
///  0x2007 => 0xA0, // Figure Space
///  0x2008 => ' ', // Punctuation Space
///  0x2009 => ' ', // Thin Space
///  0x200A => ' ', // Hair Space
///  0x2012 => '-', // Figure Dash
///  0x2015 => '-', // Horizontal Bar
///  0x201B => '\'', // Single High-reversed-9 Quotation Mark
///  0x201F => '"', // Double High-reversed-9 Quotation Mark
///  0x202F => 0xA0, // Narrow No-Break Space
///  0x2033 => '"', // Double Prime
///  0x2036 => '"', // Reversed Double Prime
pub fn bestFitFromCodepoint(codepoint: u21) ?u8 {
    return switch (codepoint) {
        // These codepoints map to the byte with the same numeric value.
        0x00...0x7F,
        0x81,
        0x8D,
        0x8F,
        0x90,
        0x9D,
        0xA0...0xFF,
        => @intCast(codepoint),
        0x0100 => 0x41, // Latin Capital Letter A With Macron
        0x0101 => 0x61, // Latin Small Letter A With Macron
        0x0102 => 0x41, // Latin Capital Letter A With Breve
        0x0103 => 0x61, // Latin Small Letter A With Breve
        0x0104 => 0x41, // Latin Capital Letter A With Ogonek
        0x0105 => 0x61, // Latin Small Letter A With Ogonek
        0x0106 => 0x43, // Latin Capital Letter C With Acute
        0x0107 => 0x63, // Latin Small Letter C With Acute
        0x0108 => 0x43, // Latin Capital Letter C With Circumflex
        0x0109 => 0x63, // Latin Small Letter C With Circumflex
        0x010a => 0x43, // Latin Capital Letter C With Dot Above
        0x010b => 0x63, // Latin Small Letter C With Dot Above
        0x010c => 0x43, // Latin Capital Letter C With Caron
        0x010d => 0x63, // Latin Small Letter C With Caron
        0x010e => 0x44, // Latin Capital Letter D With Caron
        0x010f => 0x64, // Latin Small Letter D With Caron
        0x0110 => 0xd0, // Latin Capital Letter D With Stroke
        0x0111 => 0x64, // Latin Small Letter D With Stroke
        0x0112 => 0x45, // Latin Capital Letter E With Macron
        0x0113 => 0x65, // Latin Small Letter E With Macron
        0x0114 => 0x45, // Latin Capital Letter E With Breve
        0x0115 => 0x65, // Latin Small Letter E With Breve
        0x0116 => 0x45, // Latin Capital Letter E With Dot Above
        0x0117 => 0x65, // Latin Small Letter E With Dot Above
        0x0118 => 0x45, // Latin Capital Letter E With Ogonek
        0x0119 => 0x65, // Latin Small Letter E With Ogonek
        0x011a => 0x45, // Latin Capital Letter E With Caron
        0x011b => 0x65, // Latin Small Letter E With Caron
        0x011c => 0x47, // Latin Capital Letter G With Circumflex
        0x011d => 0x67, // Latin Small Letter G With Circumflex
        0x011e => 0x47, // Latin Capital Letter G With Breve
        0x011f => 0x67, // Latin Small Letter G With Breve
        0x0120 => 0x47, // Latin Capital Letter G With Dot Above
        0x0121 => 0x67, // Latin Small Letter G With Dot Above
        0x0122 => 0x47, // Latin Capital Letter G With Cedilla
        0x0123 => 0x67, // Latin Small Letter G With Cedilla
        0x0124 => 0x48, // Latin Capital Letter H With Circumflex
        0x0125 => 0x68, // Latin Small Letter H With Circumflex
        0x0126 => 0x48, // Latin Capital Letter H With Stroke
        0x0127 => 0x68, // Latin Small Letter H With Stroke
        0x0128 => 0x49, // Latin Capital Letter I With Tilde
        0x0129 => 0x69, // Latin Small Letter I With Tilde
        0x012a => 0x49, // Latin Capital Letter I With Macron
        0x012b => 0x69, // Latin Small Letter I With Macron
        0x012c => 0x49, // Latin Capital Letter I With Breve
        0x012d => 0x69, // Latin Small Letter I With Breve
        0x012e => 0x49, // Latin Capital Letter I With Ogonek
        0x012f => 0x69, // Latin Small Letter I With Ogonek
        0x0130 => 0x49, // Latin Capital Letter I With Dot Above
        0x0131 => 0x69, // Latin Small Letter Dotless I
        0x0134 => 0x4a, // Latin Capital Letter J With Circumflex
        0x0135 => 0x6a, // Latin Small Letter J With Circumflex
        0x0136 => 0x4b, // Latin Capital Letter K With Cedilla
        0x0137 => 0x6b, // Latin Small Letter K With Cedilla
        0x0139 => 0x4c, // Latin Capital Letter L With Acute
        0x013a => 0x6c, // Latin Small Letter L With Acute
        0x013b => 0x4c, // Latin Capital Letter L With Cedilla
        0x013c => 0x6c, // Latin Small Letter L With Cedilla
        0x013d => 0x4c, // Latin Capital Letter L With Caron
        0x013e => 0x6c, // Latin Small Letter L With Caron
        0x0141 => 0x4c, // Latin Capital Letter L With Stroke
        0x0142 => 0x6c, // Latin Small Letter L With Stroke
        0x0143 => 0x4e, // Latin Capital Letter N With Acute
        0x0144 => 0x6e, // Latin Small Letter N With Acute
        0x0145 => 0x4e, // Latin Capital Letter N With Cedilla
        0x0146 => 0x6e, // Latin Small Letter N With Cedilla
        0x0147 => 0x4e, // Latin Capital Letter N With Caron
        0x0148 => 0x6e, // Latin Small Letter N With Caron
        0x014c => 0x4f, // Latin Capital Letter O With Macron
        0x014d => 0x6f, // Latin Small Letter O With Macron
        0x014e => 0x4f, // Latin Capital Letter O With Breve
        0x014f => 0x6f, // Latin Small Letter O With Breve
        0x0150 => 0x4f, // Latin Capital Letter O With Double Acute
        0x0151 => 0x6f, // Latin Small Letter O With Double Acute
        0x0152 => 0x8c, // Latin Capital Ligature Oe
        0x0153 => 0x9c, // Latin Small Ligature Oe
        0x0154 => 0x52, // Latin Capital Letter R With Acute
        0x0155 => 0x72, // Latin Small Letter R With Acute
        0x0156 => 0x52, // Latin Capital Letter R With Cedilla
        0x0157 => 0x72, // Latin Small Letter R With Cedilla
        0x0158 => 0x52, // Latin Capital Letter R With Caron
        0x0159 => 0x72, // Latin Small Letter R With Caron
        0x015a => 0x53, // Latin Capital Letter S With Acute
        0x015b => 0x73, // Latin Small Letter S With Acute
        0x015c => 0x53, // Latin Capital Letter S With Circumflex
        0x015d => 0x73, // Latin Small Letter S With Circumflex
        0x015e => 0x53, // Latin Capital Letter S With Cedilla
        0x015f => 0x73, // Latin Small Letter S With Cedilla
        0x0160 => 0x8a, // Latin Capital Letter S With Caron
        0x0161 => 0x9a, // Latin Small Letter S With Caron
        0x0162 => 0x54, // Latin Capital Letter T With Cedilla
        0x0163 => 0x74, // Latin Small Letter T With Cedilla
        0x0164 => 0x54, // Latin Capital Letter T With Caron
        0x0165 => 0x74, // Latin Small Letter T With Caron
        0x0166 => 0x54, // Latin Capital Letter T With Stroke
        0x0167 => 0x74, // Latin Small Letter T With Stroke
        0x0168 => 0x55, // Latin Capital Letter U With Tilde
        0x0169 => 0x75, // Latin Small Letter U With Tilde
        0x016a => 0x55, // Latin Capital Letter U With Macron
        0x016b => 0x75, // Latin Small Letter U With Macron
        0x016c => 0x55, // Latin Capital Letter U With Breve
        0x016d => 0x75, // Latin Small Letter U With Breve
        0x016e => 0x55, // Latin Capital Letter U With Ring Above
        0x016f => 0x75, // Latin Small Letter U With Ring Above
        0x0170 => 0x55, // Latin Capital Letter U With Double Acute
        0x0171 => 0x75, // Latin Small Letter U With Double Acute
        0x0172 => 0x55, // Latin Capital Letter U With Ogonek
        0x0173 => 0x75, // Latin Small Letter U With Ogonek
        0x0174 => 0x57, // Latin Capital Letter W With Circumflex
        0x0175 => 0x77, // Latin Small Letter W With Circumflex
        0x0176 => 0x59, // Latin Capital Letter Y With Circumflex
        0x0177 => 0x79, // Latin Small Letter Y With Circumflex
        0x0178 => 0x9f, // Latin Capital Letter Y With Diaeresis
        0x0179 => 0x5a, // Latin Capital Letter Z With Acute
        0x017a => 0x7a, // Latin Small Letter Z With Acute
        0x017b => 0x5a, // Latin Capital Letter Z With Dot Above
        0x017c => 0x7a, // Latin Small Letter Z With Dot Above
        0x017d => 0x8e, // Latin Capital Letter Z With Caron
        0x017e => 0x9e, // Latin Small Letter Z With Caron
        0x0180 => 0x62, // Latin Small Letter B With Stroke
        0x0189 => 0xd0, // Latin Capital Letter African D
        0x0191 => 0x83, // Latin Capital Letter F With Hook
        0x0192 => 0x83, // Latin Small Letter F With Hook
        0x0197 => 0x49, // Latin Capital Letter I With Stroke
        0x019a => 0x6c, // Latin Small Letter L With Bar
        0x019f => 0x4f, // Latin Capital Letter O With Middle Tilde
        0x01a0 => 0x4f, // Latin Capital Letter O With Horn
        0x01a1 => 0x6f, // Latin Small Letter O With Horn
        0x01ab => 0x74, // Latin Small Letter T With Palatal Hook
        0x01ae => 0x54, // Latin Capital Letter T With Retroflex Hook
        0x01af => 0x55, // Latin Capital Letter U With Horn
        0x01b0 => 0x75, // Latin Small Letter U With Horn
        0x01b6 => 0x7a, // Latin Small Letter Z With Stroke
        0x01c0 => 0x7c, // Latin Letter Dental Click
        0x01c3 => 0x21, // Latin Letter Retroflex Click
        0x01cd => 0x41, // Latin Capital Letter A With Caron
        0x01ce => 0x61, // Latin Small Letter A With Caron
        0x01cf => 0x49, // Latin Capital Letter I With Caron
        0x01d0 => 0x69, // Latin Small Letter I With Caron
        0x01d1 => 0x4f, // Latin Capital Letter O With Caron
        0x01d2 => 0x6f, // Latin Small Letter O With Caron
        0x01d3 => 0x55, // Latin Capital Letter U With Caron
        0x01d4 => 0x75, // Latin Small Letter U With Caron
        0x01d5 => 0x55, // Latin Capital Letter U With Diaeresis And Macron
        0x01d6 => 0x75, // Latin Small Letter U With Diaeresis And Macron
        0x01d7 => 0x55, // Latin Capital Letter U With Diaeresis And Acute
        0x01d8 => 0x75, // Latin Small Letter U With Diaeresis And Acute
        0x01d9 => 0x55, // Latin Capital Letter U With Diaeresis And Caron
        0x01da => 0x75, // Latin Small Letter U With Diaeresis And Caron
        0x01db => 0x55, // Latin Capital Letter U With Diaeresis And Grave
        0x01dc => 0x75, // Latin Small Letter U With Diaeresis And Grave
        0x01de => 0x41, // Latin Capital Letter A With Diaeresis And Macron
        0x01df => 0x61, // Latin Small Letter A With Diaeresis And Macron
        0x01e4 => 0x47, // Latin Capital Letter G With Stroke
        0x01e5 => 0x67, // Latin Small Letter G With Stroke
        0x01e6 => 0x47, // Latin Capital Letter G With Caron
        0x01e7 => 0x67, // Latin Small Letter G With Caron
        0x01e8 => 0x4b, // Latin Capital Letter K With Caron
        0x01e9 => 0x6b, // Latin Small Letter K With Caron
        0x01ea => 0x4f, // Latin Capital Letter O With Ogonek
        0x01eb => 0x6f, // Latin Small Letter O With Ogonek
        0x01ec => 0x4f, // Latin Capital Letter O With Ogonek And Macron
        0x01ed => 0x6f, // Latin Small Letter O With Ogonek And Macron
        0x01f0 => 0x6a, // Latin Small Letter J With Caron
        0x0261 => 0x67, // Latin Small Letter Script G
        0x02b9 => 0x27, // Modifier Letter Prime
        0x02ba => 0x22, // Modifier Letter Double Prime
        0x02bc => 0x27, // Modifier Letter Apostrophe
        0x02c4 => 0x5e, // Modifier Letter Up Arrowhead
        0x02c6 => 0x88, // Modifier Letter Circumflex Accent
        0x02c8 => 0x27, // Modifier Letter Vertical Line
        0x02c9 => 0xaf, // Modifier Letter Macron
        0x02ca => 0xb4, // Modifier Letter Acute Accent
        0x02cb => 0x60, // Modifier Letter Grave Accent
        0x02cd => 0x5f, // Modifier Letter Low Macron
        0x02da => 0xb0, // Ring Above
        0x02dc => 0x98, // Small Tilde
        0x0300 => 0x60, // Combining Grave Accent
        0x0301 => 0xb4, // Combining Acute Accent
        0x0302 => 0x5e, // Combining Circumflex Accent
        0x0303 => 0x7e, // Combining Tilde
        0x0304 => 0xaf, // Combining Macron
        0x0305 => 0xaf, // Combining Overline
        0x0308 => 0xa8, // Combining Diaeresis
        0x030a => 0xb0, // Combining Ring Above
        0x030e => 0x22, // Combining Double Vertical Line Above
        0x0327 => 0xb8, // Combining Cedilla
        0x0331 => 0x5f, // Combining Macron Below
        0x0332 => 0x5f, // Combining Low Line
        0x037e => 0x3b, // Greek Question Mark
        0x0393 => 0x47, // Greek Capital Letter Gamma
        0x0398 => 0x54, // Greek Capital Letter Theta
        0x03a3 => 0x53, // Greek Capital Letter Sigma
        0x03a6 => 0x46, // Greek Capital Letter Phi
        0x03a9 => 0x4f, // Greek Capital Letter Omega
        0x03b1 => 0x61, // Greek Small Letter Alpha
        0x03b2 => 0xdf, // Greek Small Letter Beta
        0x03b4 => 0x64, // Greek Small Letter Delta
        0x03b5 => 0x65, // Greek Small Letter Epsilon
        0x03bc => 0xb5, // Greek Small Letter Mu
        0x03c0 => 0x70, // Greek Small Letter Pi
        0x03c3 => 0x73, // Greek Small Letter Sigma
        0x03c4 => 0x74, // Greek Small Letter Tau
        0x03c6 => 0x66, // Greek Small Letter Phi
        0x04bb => 0x68, // Cyrillic Small Letter Shha
        0x0589 => 0x3a, // Armenian Full Stop
        0x066a => 0x25, // Arabic Percent Sign
        0x2000 => 0x20, // En Quad
        0x2001 => 0x20, // Em Quad
        0x2002 => 0x20, // En Space
        0x2003 => 0x20, // Em Space
        0x2004 => 0x20, // Three-Per-Em Space
        0x2005 => 0x20, // Four-Per-Em Space
        0x2006 => 0x20, // Six-Per-Em Space
        0x2010 => 0x2d, // Hyphen
        0x2011 => 0x2d, // Non-Breaking Hyphen
        0x2013 => 0x96, // En Dash
        0x2014 => 0x97, // Em Dash
        0x2017 => 0x3d, // Double Low Line
        0x2018 => 0x91, // Left Single Quotation Mark
        0x2019 => 0x92, // Right Single Quotation Mark
        0x201a => 0x82, // Single Low-9 Quotation Mark
        0x201c => 0x93, // Left Double Quotation Mark
        0x201d => 0x94, // Right Double Quotation Mark
        0x201e => 0x84, // Double Low-9 Quotation Mark
        0x2020 => 0x86, // Dagger
        0x2021 => 0x87, // Double Dagger
        0x2022 => 0x95, // Bullet
        0x2024 => 0xb7, // One Dot Leader
        0x2026 => 0x85, // Horizontal Ellipsis
        0x2030 => 0x89, // Per Mille Sign
        0x2032 => 0x27, // Prime
        0x2035 => 0x60, // Reversed Prime
        0x2039 => 0x8b, // Single Left-Pointing Angle Quotation Mark
        0x203a => 0x9b, // Single Right-Pointing Angle Quotation Mark
        0x2044 => 0x2f, // Fraction Slash
        0x2070 => 0xb0, // Superscript Zero
        0x2074 => 0x34, // Superscript Four
        0x2075 => 0x35, // Superscript Five
        0x2076 => 0x36, // Superscript Six
        0x2077 => 0x37, // Superscript Seven
        0x2078 => 0x38, // Superscript Eight
        0x207f => 0x6e, // Superscript Latin Small Letter N
        0x2080 => 0x30, // Subscript Zero
        0x2081 => 0x31, // Subscript One
        0x2082 => 0x32, // Subscript Two
        0x2083 => 0x33, // Subscript Three
        0x2084 => 0x34, // Subscript Four
        0x2085 => 0x35, // Subscript Five
        0x2086 => 0x36, // Subscript Six
        0x2087 => 0x37, // Subscript Seven
        0x2088 => 0x38, // Subscript Eight
        0x2089 => 0x39, // Subscript Nine
        0x20ac => 0x80, // Euro Sign
        0x20a1 => 0xa2, // Colon Sign
        0x20a4 => 0xa3, // Lira Sign
        0x20a7 => 0x50, // Peseta Sign
        0x2102 => 0x43, // Double-Struck Capital C
        0x2107 => 0x45, // Euler Constant
        0x210a => 0x67, // Script Small G
        0x210b => 0x48, // Script Capital H
        0x210c => 0x48, // Black-Letter Capital H
        0x210d => 0x48, // Double-Struck Capital H
        0x210e => 0x68, // Planck Constant
        0x2110 => 0x49, // Script Capital I
        0x2111 => 0x49, // Black-Letter Capital I
        0x2112 => 0x4c, // Script Capital L
        0x2113 => 0x6c, // Script Small L
        0x2115 => 0x4e, // Double-Struck Capital N
        0x2118 => 0x50, // Script Capital P
        0x2119 => 0x50, // Double-Struck Capital P
        0x211a => 0x51, // Double-Struck Capital Q
        0x211b => 0x52, // Script Capital R
        0x211c => 0x52, // Black-Letter Capital R
        0x211d => 0x52, // Double-Struck Capital R
        0x2122 => 0x99, // Trade Mark Sign
        0x2124 => 0x5a, // Double-Struck Capital Z
        0x2128 => 0x5a, // Black-Letter Capital Z
        0x212a => 0x4b, // Kelvin Sign
        0x212b => 0xc5, // Angstrom Sign
        0x212c => 0x42, // Script Capital B
        0x212d => 0x43, // Black-Letter Capital C
        0x212e => 0x65, // Estimated Symbol
        0x212f => 0x65, // Script Small E
        0x2130 => 0x45, // Script Capital E
        0x2131 => 0x46, // Script Capital F
        0x2133 => 0x4d, // Script Capital M
        0x2134 => 0x6f, // Script Small O
        0x2205 => 0xd8, // Empty Set
        0x2212 => 0x2d, // Minus Sign
        0x2213 => 0xb1, // Minus-Or-Plus Sign
        0x2215 => 0x2f, // Division Slash
        0x2216 => 0x5c, // Set Minus
        0x2217 => 0x2a, // Asterisk Operator
        0x2218 => 0xb0, // Ring Operator
        0x2219 => 0xb7, // Bullet Operator
        0x221a => 0x76, // Square Root
        0x221e => 0x38, // Infinity
        0x2223 => 0x7c, // Divides
        0x2229 => 0x6e, // Intersection
        0x2236 => 0x3a, // Ratio
        0x223c => 0x7e, // Tilde Operator
        0x2248 => 0x98, // Almost Equal To
        0x2261 => 0x3d, // Identical To
        0x2264 => 0x3d, // Less-Than Or Equal To
        0x2265 => 0x3d, // Greater-Than Or Equal To
        0x226a => 0xab, // Much Less-Than
        0x226b => 0xbb, // Much Greater-Than
        0x22c5 => 0xb7, // Dot Operator
        0x2302 => 0xa6, // House
        0x2303 => 0x5e, // Up Arrowhead
        0x2310 => 0xac, // Reversed Not Sign
        0x2320 => 0x28, // Top Half Integral
        0x2321 => 0x29, // Bottom Half Integral
        0x2329 => 0x3c, // Left-Pointing Angle Bracket
        0x232a => 0x3e, // Right-Pointing Angle Bracket
        0x2500 => 0x2d, // Box Drawings Light Horizontal
        0x2502 => 0xa6, // Box Drawings Light Vertical
        0x250c => 0x2b, // Box Drawings Light Down And Right
        0x2510 => 0x2b, // Box Drawings Light Down And Left
        0x2514 => 0x2b, // Box Drawings Light Up And Right
        0x2518 => 0x2b, // Box Drawings Light Up And Left
        0x251c => 0x2b, // Box Drawings Light Vertical And Right
        0x2524 => 0xa6, // Box Drawings Light Vertical And Left
        0x252c => 0x2d, // Box Drawings Light Down And Horizontal
        0x2534 => 0x2d, // Box Drawings Light Up And Horizontal
        0x253c => 0x2b, // Box Drawings Light Vertical And Horizontal
        0x2550 => 0x2d, // Box Drawings Double Horizontal
        0x2551 => 0xa6, // Box Drawings Double Vertical
        0x2552 => 0x2b, // Box Drawings Down Single And Right Double
        0x2553 => 0x2b, // Box Drawings Down Double And Right Single
        0x2554 => 0x2b, // Box Drawings Double Down And Right
        0x2555 => 0x2b, // Box Drawings Down Single And Left Double
        0x2556 => 0x2b, // Box Drawings Down Double And Left Single
        0x2557 => 0x2b, // Box Drawings Double Down And Left
        0x2558 => 0x2b, // Box Drawings Up Single And Right Double
        0x2559 => 0x2b, // Box Drawings Up Double And Right Single
        0x255a => 0x2b, // Box Drawings Double Up And Right
        0x255b => 0x2b, // Box Drawings Up Single And Left Double
        0x255c => 0x2b, // Box Drawings Up Double And Left Single
        0x255d => 0x2b, // Box Drawings Double Up And Left
        0x255e => 0xa6, // Box Drawings Vertical Single And Right Double
        0x255f => 0xa6, // Box Drawings Vertical Double And Right Single
        0x2560 => 0xa6, // Box Drawings Double Vertical And Right
        0x2561 => 0xa6, // Box Drawings Vertical Single And Left Double
        0x2562 => 0xa6, // Box Drawings Vertical Double And Left Single
        0x2563 => 0xa6, // Box Drawings Double Vertical And Left
        0x2564 => 0x2d, // Box Drawings Down Single And Horizontal Double
        0x2565 => 0x2d, // Box Drawings Down Double And Horizontal Single
        0x2566 => 0x2d, // Box Drawings Double Down And Horizontal
        0x2567 => 0x2d, // Box Drawings Up Single And Horizontal Double
        0x2568 => 0x2d, // Box Drawings Up Double And Horizontal Single
        0x2569 => 0x2d, // Box Drawings Double Up And Horizontal
        0x256a => 0x2b, // Box Drawings Vertical Single And Horizontal Double
        0x256b => 0x2b, // Box Drawings Vertical Double And Horizontal Single
        0x256c => 0x2b, // Box Drawings Double Vertical And Horizontal
        0x2580 => 0xaf, // Upper Half Block
        0x2584 => 0x5f, // Lower Half Block
        0x2588 => 0xa6, // Full Block
        0x258c => 0xa6, // Left Half Block
        0x2590 => 0xa6, // Right Half Block
        0x2591 => 0xa6, // Light Shade
        0x2592 => 0xa6, // Medium Shade
        0x2593 => 0xa6, // Dark Shade
        0x25a0 => 0xa6, // Black Square
        0x263c => 0xa4, // White Sun With Rays
        0x2758 => 0x7c, // Light Vertical Bar
        0x3000 => 0x20, // Ideographic Space
        0x3008 => 0x3c, // Left Angle Bracket
        0x3009 => 0x3e, // Right Angle Bracket
        0x300a => 0xab, // Left Double Angle Bracket
        0x300b => 0xbb, // Right Double Angle Bracket
        0x301a => 0x5b, // Left White Square Bracket
        0x301b => 0x5d, // Right White Square Bracket
        0x30fb => 0xb7, // Katakana Middle Dot
        // Fullwidth forms, Fullwidth Exclamation Mark (0xff01 => 0x21) through
        // Fullwidth Tilde (0xff5e => 0x7e): each maps to the ASCII character
        // exactly 0xfee0 below it.
        0xff01...0xff5e => @intCast(codepoint - 0xfee0),
        // Not in the best fit mapping, but RC uses these mappings too
        0x2007 => 0xA0, // Figure Space
        0x2008 => ' ', // Punctuation Space
        0x2009 => ' ', // Thin Space
        0x200A => ' ', // Hair Space
        0x2012 => '-', // Figure Dash
        0x2015 => '-', // Horizontal Bar
        0x201B => '\'', // Single High-reversed-9 Quotation Mark
        0x201F => '"', // Double High-reversed-9 Quotation Mark
        0x202F => 0xA0, // Narrow No-Break Space
        0x2033 => '"', // Double Prime
        0x2036 => '"', // Reversed Double Prime
        else => null,
    };
}

test "windows-1252 to utf8" {
    var buf = std.array_list.Managed(u8).init(std.testing.allocator);
    defer buf.deinit();

    const input_windows1252 = "\x81pqrstuvwxyz{|}~\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8e\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9e\x9f\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
    const expected_utf8 = "\xc2\x81pqrstuvwxyz{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";

    var fbs = std.io.fixedBufferStream(input_windows1252);
    const bytes_written = try windows1252ToUtf8Stream(buf.writer(), fbs.reader());

    try std.testing.expectEqualStrings(expected_utf8, buf.items);
    try std.testing.expectEqual(expected_utf8.len, bytes_written);
}