commit c0d7f6403665a8b81b0afebf08d1741cca1daef9 (tree)
parent f9192adaba0eb344ed12aad9c675cd73b740d2a2
Author: Andrew Kelley <andrew@ziglang.org>
Date: Sat, 15 Oct 2022 14:28:33 -0400
Merge pull request #12448 from r00ster91/ultimateascii
std.ascii: rename functions and other improvements
Diffstat:
| M | lib/std/ascii.zig | | | 241 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------- |
1 file changed, 210 insertions(+), 31 deletions(-)
diff --git a/lib/std/ascii.zig b/lib/std/ascii.zig
@@ -1,54 +1,164 @@
-// Does NOT look at the locale the way C89's toupper(3), isspace() et cetera does.
-// I could have taken only a u7 to make this clear, but it would be slower
-// It is my opinion that encodings other than UTF-8 should not be supported.
-//
-// (and 128 bytes is not much to pay).
-// Also does not handle Unicode character classes.
-//
-// https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png
+//! The 7-bit [ASCII](https://en.wikipedia.org/wiki/ASCII) character encoding standard.
+//!
+//! This is not to be confused with the 8-bit [extended ASCII](https://en.wikipedia.org/wiki/Extended_ASCII) character encoding.
+//!
+//! Even though this module concerns itself with 7-bit ASCII,
+//! functions use `u8` as the type instead of `u7` for convenience and compatibility.
+//! Characters outside of the 7-bit range are gracefully handled (e.g. by returning `false`).
+//!
+//! See also: https://en.wikipedia.org/wiki/ASCII#Character_set
const std = @import("std");
-/// Contains constants for the C0 control codes of the ASCII encoding.
-/// https://en.wikipedia.org/wiki/C0_and_C1_control_codes
+// TODO: remove all decls marked as DEPRECATED after 0.10.0's release
+
+/// The C0 control codes of the ASCII encoding.
+///
+/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`.
pub const control_code = struct {
+ // DEPRECATED: use the lowercase variant
pub const NUL = 0x00;
+ // DEPRECATED: use the lowercase variant
pub const SOH = 0x01;
+ // DEPRECATED: use the lowercase variant
pub const STX = 0x02;
+ // DEPRECATED: use the lowercase variant
pub const ETX = 0x03;
+ // DEPRECATED: use the lowercase variant
pub const EOT = 0x04;
+ // DEPRECATED: use the lowercase variant
pub const ENQ = 0x05;
+ // DEPRECATED: use the lowercase variant
pub const ACK = 0x06;
+ // DEPRECATED: use the lowercase variant
pub const BEL = 0x07;
+ // DEPRECATED: use the lowercase variant
pub const BS = 0x08;
+ // DEPRECATED: use `ht`
pub const TAB = 0x09;
+ // DEPRECATED: use the lowercase variant
pub const LF = 0x0A;
+ // DEPRECATED: use the lowercase variant
pub const VT = 0x0B;
+ // DEPRECATED: use the lowercase variant
pub const FF = 0x0C;
+ // DEPRECATED: use the lowercase variant
pub const CR = 0x0D;
+ // DEPRECATED: use the lowercase variant
pub const SO = 0x0E;
+ // DEPRECATED: use the lowercase variant
pub const SI = 0x0F;
+ // DEPRECATED: use the lowercase variant
pub const DLE = 0x10;
+ // DEPRECATED: use the lowercase variant
pub const DC1 = 0x11;
+ // DEPRECATED: use the lowercase variant
pub const DC2 = 0x12;
+ // DEPRECATED: use the lowercase variant
pub const DC3 = 0x13;
+ // DEPRECATED: use the lowercase variant
pub const DC4 = 0x14;
+ // DEPRECATED: use the lowercase variant
pub const NAK = 0x15;
+ // DEPRECATED: use the lowercase variant
pub const SYN = 0x16;
+ // DEPRECATED: use the lowercase variant
pub const ETB = 0x17;
+ // DEPRECATED: use the lowercase variant
pub const CAN = 0x18;
+ // DEPRECATED: use the lowercase variant
pub const EM = 0x19;
+ // DEPRECATED: use the lowercase variant
pub const SUB = 0x1A;
+ // DEPRECATED: use the lowercase variant
pub const ESC = 0x1B;
+ // DEPRECATED: use the lowercase variant
pub const FS = 0x1C;
+ // DEPRECATED: use the lowercase variant
pub const GS = 0x1D;
+ // DEPRECATED: use the lowercase variant
pub const RS = 0x1E;
+ // DEPRECATED: use the lowercase variant
pub const US = 0x1F;
-
+ // DEPRECATED: use the lowercase variant
pub const DEL = 0x7F;
-
+ // DEPRECATED: use the lowercase variant
pub const XON = 0x11;
+ // DEPRECATED: use the lowercase variant
pub const XOFF = 0x13;
+
+ /// Null.
+ pub const nul = 0x00;
+ /// Start of Heading.
+ pub const soh = 0x01;
+ /// Start of Text.
+ pub const stx = 0x02;
+ /// End of Text.
+ pub const etx = 0x03;
+ /// End of Transmission.
+ pub const eot = 0x04;
+ /// Enquiry.
+ pub const enq = 0x05;
+ /// Acknowledge.
+ pub const ack = 0x06;
+ /// Bell, Alert.
+ pub const bel = 0x07;
+ /// Backspace.
+ pub const bs = 0x08;
+ /// Horizontal Tab, Tab ('\t').
+ pub const ht = 0x09;
+ /// Line Feed, Newline ('\n').
+ pub const lf = 0x0A;
+ /// Vertical Tab.
+ pub const vt = 0x0B;
+ /// Form Feed.
+ pub const ff = 0x0C;
+ /// Carriage Return ('\r').
+ pub const cr = 0x0D;
+ /// Shift Out.
+ pub const so = 0x0E;
+ /// Shift In.
+ pub const si = 0x0F;
+ /// Data Link Escape.
+ pub const dle = 0x10;
+ /// Device Control One (XON).
+ pub const dc1 = 0x11;
+ /// Device Control Two.
+ pub const dc2 = 0x12;
+ /// Device Control Three (XOFF).
+ pub const dc3 = 0x13;
+ /// Device Control Four.
+ pub const dc4 = 0x14;
+ /// Negative Acknowledge.
+ pub const nak = 0x15;
+ /// Synchronous Idle.
+ pub const syn = 0x16;
+ /// End of Transmission Block.
+ pub const etb = 0x17;
+ /// Cancel.
+ pub const can = 0x18;
+ /// End of Medium.
+ pub const em = 0x19;
+ /// Substitute.
+ pub const sub = 0x1A;
+ /// Escape.
+ pub const esc = 0x1B;
+ /// File Separator.
+ pub const fs = 0x1C;
+ /// Group Separator.
+ pub const gs = 0x1D;
+ /// Record Separator.
+ pub const rs = 0x1E;
+ /// Unit Separator.
+ pub const us = 0x1F;
+
+ /// Delete.
+ pub const del = 0x7F;
+
+ /// An alias to `dc1`.
+ pub const xon = dc1;
+ /// An alias to `dc3`.
+ pub const xoff = dc3;
};
const tIndex = enum(u3) {
@@ -188,73 +298,106 @@ fn inTable(c: u8, t: tIndex) bool {
return (combinedTable[c] & (@as(u8, 1) << @enumToInt(t))) != 0;
}
-pub fn isAlNum(c: u8) bool {
+/// DEPRECATED: use `isAlphanumeric`
+pub const isAlNum = isAlphanumeric;
+/// DEPRECATED: use `isAlphabetic`
+pub const isAlpha = isAlphabetic;
+/// DEPRECATED: use `isControl`
+pub const isCntrl = isControl;
+/// DEPRECATED: use `isWhitespace`.
+pub const isSpace = isWhitespace;
+/// DEPRECATED: use `whitespace`.
+pub const spaces = whitespace;
+/// DEPRECATED: use `isHex`.
+pub const isXDigit = isHex;
+
+/// Returns whether the character is alphanumeric.
+pub fn isAlphanumeric(c: u8) bool {
return (combinedTable[c] & ((@as(u8, 1) << @enumToInt(tIndex.Alpha)) |
@as(u8, 1) << @enumToInt(tIndex.Digit))) != 0;
}
-pub fn isAlpha(c: u8) bool {
+/// Returns whether the character is alphabetic.
+pub fn isAlphabetic(c: u8) bool {
return inTable(c, tIndex.Alpha);
}
-pub fn isCntrl(c: u8) bool {
- return c < 0x20 or c == 127; //DEL
+/// Returns whether the character is a control character.
+/// For 7-bit ASCII characters, this is the same as `!isPrint(c)`.
+///
+/// See also: `control_code`.
+pub fn isControl(c: u8) bool {
+ return c <= control_code.us or c == control_code.del;
}
+/// Returns whether the character is a digit.
pub fn isDigit(c: u8) bool {
return inTable(c, tIndex.Digit);
}
+/// DEPRECATED: use `isPrint(c) and c != ' '` instead
pub fn isGraph(c: u8) bool {
return inTable(c, tIndex.Graph);
}
+/// Returns whether the character is a lowercase letter.
pub fn isLower(c: u8) bool {
return inTable(c, tIndex.Lower);
}
+/// Returns whether the character has some graphical representation and can be printed.
+/// This also returns `true` for the space character.
+/// For 7-bit ASCII characters, this is the same as `!isControl(c)`.
pub fn isPrint(c: u8) bool {
return inTable(c, tIndex.Graph) or c == ' ';
}
+/// DEPRECATED: create your own function based on your needs and what you want to do.
pub fn isPunct(c: u8) bool {
return inTable(c, tIndex.Punct);
}
-pub fn isSpace(c: u8) bool {
+/// Returns whether this character is included in `whitespace`.
+pub fn isWhitespace(c: u8) bool {
return inTable(c, tIndex.Space);
}
-/// All the values for which isSpace() returns true. This may be used with
-/// e.g. std.mem.trim() to trim whiteSpace.
-pub const spaces = [_]u8{ ' ', '\t', '\n', '\r', control_code.VT, control_code.FF };
+/// Whitespace for general use.
+/// This may be used with e.g. `std.mem.trim` to trim whitespace.
+///
+/// See also: `isWhitespace`.
+pub const whitespace = [_]u8{ ' ', '\t', '\n', '\r', control_code.vt, control_code.ff };
-test "spaces" {
- const testing = std.testing;
- for (spaces) |space| try testing.expect(isSpace(space));
+test "whitespace" {
+ for (whitespace) |char| try std.testing.expect(isWhitespace(char));
var i: u8 = 0;
while (isASCII(i)) : (i += 1) {
- if (isSpace(i)) try testing.expect(std.mem.indexOfScalar(u8, &spaces, i) != null);
+ if (isWhitespace(i)) try std.testing.expect(std.mem.indexOfScalar(u8, &whitespace, i) != null);
}
}
+/// Returns whether the character is an uppercase letter.
pub fn isUpper(c: u8) bool {
return inTable(c, tIndex.Upper);
}
-pub fn isXDigit(c: u8) bool {
+/// Returns whether the character is a hexadecimal digit. This is case-insensitive.
+pub fn isHex(c: u8) bool {
return inTable(c, tIndex.Hex);
}
+/// Returns whether the character is a 7-bit ASCII character.
pub fn isASCII(c: u8) bool {
return c < 128;
}
+/// DEPRECATED: use `c == ' ' or c == '\t'` or try `isWhitespace`
pub fn isBlank(c: u8) bool {
return (c == ' ') or (c == '\x09');
}
+/// Uppercases the character and returns it as-is if it's already uppercased or not a letter.
pub fn toUpper(c: u8) u8 {
if (isLower(c)) {
return c & 0b11011111;
@@ -263,6 +406,7 @@ pub fn toUpper(c: u8) u8 {
}
}
+/// Lowercases the character and returns it as-is if it's already lowercased or not a letter.
pub fn toLower(c: u8) u8 {
if (isUpper(c)) {
return c | 0b00100000;
@@ -274,13 +418,50 @@ pub fn toLower(c: u8) u8 {
test "ascii character classes" {
const testing = std.testing;
+ try testing.expect(!isControl('a'));
+ try testing.expect(!isControl('z'));
+ try testing.expect(isControl(control_code.nul));
+ try testing.expect(isControl(control_code.ff));
+ try testing.expect(isControl(control_code.us));
+
try testing.expect('C' == toUpper('c'));
try testing.expect(':' == toUpper(':'));
try testing.expect('\xab' == toUpper('\xab'));
+ try testing.expect(!isUpper('z'));
+
try testing.expect('c' == toLower('C'));
+ try testing.expect(':' == toLower(':'));
+ try testing.expect('\xab' == toLower('\xab'));
+ try testing.expect(!isLower('Z'));
+
+ try testing.expect(isAlphanumeric('Z'));
+ try testing.expect(isAlphanumeric('z'));
+ try testing.expect(isAlphanumeric('5'));
+ try testing.expect(isAlphanumeric('5'));
+ try testing.expect(!isAlphanumeric('!'));
+
+ try testing.expect(!isAlpha('5'));
try testing.expect(isAlpha('c'));
try testing.expect(!isAlpha('5'));
- try testing.expect(isSpace(' '));
+
+ try testing.expect(isWhitespace(' '));
+ try testing.expect(isWhitespace('\t'));
+ try testing.expect(isWhitespace('\r'));
+ try testing.expect(isWhitespace('\n'));
+ try testing.expect(!isWhitespace('.'));
+
+ try testing.expect(!isHex('g'));
+ try testing.expect(isHex('b'));
+ try testing.expect(isHex('9'));
+
+ try testing.expect(!isDigit('~'));
+ try testing.expect(isDigit('0'));
+ try testing.expect(isDigit('9'));
+
+ try testing.expect(isPrint(' '));
+ try testing.expect(isPrint('@'));
+ try testing.expect(isPrint('~'));
+ try testing.expect(!isPrint(control_code.esc));
}
/// Writes a lower case copy of `ascii_string` to `output`.
@@ -341,7 +522,7 @@ test "allocUpperString" {
try std.testing.expectEqualStrings("ABCDEFGHIJKLMNOPQRST0234+💩!", result);
}
-/// Compares strings `a` and `b` case insensitively and returns whether they are equal.
+/// Compares strings `a` and `b` case-insensitively and returns whether they are equal.
pub fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
if (a.len != b.len) return false;
for (a) |a_c, i| {
@@ -397,11 +578,10 @@ test "indexOfIgnoreCase" {
try std.testing.expect(indexOfIgnoreCase("one two three FouR", "gOur") == null);
try std.testing.expect(indexOfIgnoreCase("foO", "Foo").? == 0);
try std.testing.expect(indexOfIgnoreCase("foo", "fool") == null);
-
try std.testing.expect(indexOfIgnoreCase("FOO foo", "fOo").? == 0);
}
-/// Compares two slices of numbers lexicographically. O(n).
+/// Returns the lexicographical order of two slices. O(n).
pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
const n = std.math.min(lhs.len, rhs.len);
var i: usize = 0;
@@ -415,8 +595,7 @@ pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
return std.math.order(lhs.len, rhs.len);
}
-/// Returns true if lhs < rhs, false otherwise
-/// TODO rename "IgnoreCase" to "Insensitive" in this entire file.
+/// Returns whether `lhs` orders before `rhs`, i.e. whether `orderIgnoreCase(lhs, rhs)` is `.lt`.
pub fn lessThanIgnoreCase(lhs: []const u8, rhs: []const u8) bool {
return orderIgnoreCase(lhs, rhs) == .lt;
}