commit 874b6e39db08f66bb0f702d2177b91535b2a8a02 (tree)
parent f26cdb2771a4bb4d5f1d5acc446ec51c3e177f75
Author: David Rubin <david@vortan.dev>
Date: Fri, 27 Jun 2025 15:34:32 -0700
hash: implement fast crc32c
Diffstat:
6 files changed, 301 insertions(+), 34 deletions(-)
diff --git a/lib/std/hash/crc.zig b/lib/std/hash/crc.zig
@@ -1,5 +1,6 @@
//! This file is auto-generated by tools/update_crc_catalog.zig.
+const builtin = @import("builtin");
const impl = @import("crc/impl.zig");
pub const Crc = impl.Crc;
@@ -13,6 +14,17 @@ test {
_ = @import("crc/test.zig");
}
+pub const Crc32Iscsi = switch (builtin.cpu.hasAll(.x86, &.{ .@"64bit", .crc32 }) and builtin.zig_backend == .stage2_llvm) {
+ true => @import("crc/Crc32c.zig"),
+ else => Crc(u32, .{
+ .polynomial = 0x1edc6f41,
+ .initial = 0xffffffff,
+ .reflect_input = true,
+ .reflect_output = true,
+ .xor_output = 0xffffffff,
+ }),
+};
+
pub const Crc3Gsm = Crc(u3, .{
.polynomial = 0x3,
.initial = 0x0,
@@ -797,14 +809,6 @@ pub const Crc32Cksum = Crc(u32, .{
.xor_output = 0xffffffff,
});
-pub const Crc32Iscsi = Crc(u32, .{
- .polynomial = 0x1edc6f41,
- .initial = 0xffffffff,
- .reflect_input = true,
- .reflect_output = true,
- .xor_output = 0xffffffff,
-});
-
pub const Crc32IsoHdlc = Crc(u32, .{
.polynomial = 0x04c11db7,
.initial = 0xffffffff,
diff --git a/lib/std/hash/crc/Crc32c.zig b/lib/std/hash/crc/Crc32c.zig
@@ -0,0 +1,238 @@
+//! Implements CRC-32C (Castagnoli) using the SSE4.2 Intel CRC32 instruction.
+//!
+//! A couple useful links for understanding the approach taken here:
+//! - https://github.com/madler/brotli/blob/1d428d3a9baade233ebc3ac108293256bcb813d1/crc32c.c
+//! - https://github.com/madler/zlib/blob/5a82f71ed1dfc0bec044d9702463dbdf84ea3b71/crc32.c
+//! - http://www.ross.net/crc/download/crc_v3.txt
+
+// Reflected CRC-32C polynomial in binary form.
+const POLY = 0x82f63b78;
+
+const LONG = 8192;
+const SHORT = 256;
+const long_lookup_table = genTable(LONG);
+const short_lookup_table = genTable(SHORT);
+
+const Wrapper = @This();
+
+crc: u32,
+
+pub fn init() Wrapper {
+ return .{ .crc = 0 };
+}
+
+pub fn update(w: *Wrapper, bytes: []const u8) void {
+ w.crc = crc32(w.crc, bytes);
+}
+
+pub fn final(w: Wrapper) u32 {
+ return w.crc;
+}
+
+pub fn hash(bytes: []const u8) u32 {
+ var c = init();
+ c.update(bytes);
+ return c.final();
+}
+
+/// Generates the lookup table for efficiently combining CRCs over a block of a given length `length`.
+/// This works by building an operator that advances the CRC state as if `length` zero-bytes were appended.
+/// We pre-compute 4 tables of 256 entries each (one per byte offset).
+///
+///
+/// The idea behind this table is quite interesting. The CRC state is equivalent to the
+/// remainder of dividing the message polynomial (over GF(2)) by the CRC polynomial.
+///
+/// Advancing the CRC register by `k` zero bits is equivalent to multiplying the current
+/// CRC state by `x^k` modulo the CRC polynomial. This operation can be represented
+/// as a linear transformation in GF(2), i.e, a matrix.
+///
+/// We build up this matrix via repeated squaring:
+/// - odd represents the operator for 1 zero bit (i.e, multiplication by `x^1 mod POLY`)
+/// - even represents the operator for 2 zero bits (`x^2 mod POLY`)
+/// - squaring again gives `x^4 mod POLY`, and so on until we get to the right size.
+///
+/// Squaring once per bit shifted out of `len` yields the operator for `x^(8 * length) mod POLY`, i.e. `length` zero bytes (`length` must be a power of two).
+fn genTable(length: usize) [4][256]u32 {
+ @setEvalBranchQuota(250000);
+
+ var even: [32]u32 = undefined;
+ zeroes: {
+ var odd: [32]u32 = undefined;
+
+ // Initialize our `odd` array with the operator for a single zero bit:
+ // - odd[0] is the polynomial itself (acts on the MSB).
+ // - odd[1..32] represent shifting a single bit through 31 positions.
+ odd[0] = POLY;
+ var row: u32 = 1;
+ for (1..32) |n| {
+ odd[n] = row;
+ row <<= 1;
+ }
+
+ // even = odd squared: even represents `x^2 mod POLY`.
+ square(&even, &odd);
+ // odd = even squared: odd now represents `x^4 mod POLY`.
+ square(&odd, &even);
+
+ // Continue squaring to double the number of zeroes encoded each time:
+ //
+ // At each point in the process:
+ // - square(even, odd): even gets the operator for twice the current length.
+ // - square(odd, even): odd gets the operator for 4 times the original length.
+ var len = length;
+ while (true) {
+ square(&even, &odd);
+ len >>= 1;
+ if (len == 0) break :zeroes;
+ square(&odd, &even);
+ len >>= 1;
+ if (len == 0) break;
+ }
+
+ @memcpy(&even, &odd);
+ }
+
+ var zeroes: [4][256]u32 = undefined;
+ for (0..256) |n| {
+ zeroes[0][n] = times(&even, n);
+ zeroes[1][n] = times(&even, n << 8);
+ zeroes[2][n] = times(&even, n << 16);
+ zeroes[3][n] = times(&even, n << 24);
+ }
+ return zeroes;
+}
+
+/// Computes `mat * vec` over `GF(2)`, where `mat` is a 32x32 binary matrix and `vec`
+/// is a 32-bit vector. This somewhat "simulates" how bits propagate through the CRC register
+/// during shifting.
+///
+/// - In GF(2) (aka a field where the only values are 0 and 1, aka binary), multiplication is
+/// an `AND`, and addition is `XOR`.
+/// - This dot product determines how each bit in the input vector "contributes" to
+/// the final CRC state, by XORing (adding) rows of the matrix where `vec` has 1s.
+fn times(mat: *const [32]u32, vec: u32) u32 {
+ var sum: u32 = 0;
+ var v = vec;
+ var i: u32 = 0;
+ while (v != 0) {
+ if (v & 1 != 0) sum ^= mat[i];
+ v >>= 1;
+ i += 1;
+ }
+ return sum;
+}
+
+/// Computes the square of a matrix in GF(2), i.e. `dst = src x src`.
+///
+/// This produces the operator for doubling the number of zeroes:
+/// if `src` represents advancing the CRC by `k` zeroes, then `dst` will
+/// represent advancing by 2k zeroes.
+///
+/// Since polynomial multiplication mod POLY is linear, `mat(mat(x)) = mat^2(x)`
+/// gives the effect of two sequential applications of the operator.
+fn square(dst: *[32]u32, src: *const [32]u32) void {
+ for (dst, src) |*d, s| {
+ d.* = times(src, s);
+ }
+}
+
+fn shift(table: *const [4][256]u32, crc: u32) u32 {
+ return table[0][crc & 0xFF] ^ table[1][(crc >> 8) & 0xFF] ^ table[2][(crc >> 16) & 0xFF] ^ table[3][crc >> 24];
+}
+
+fn crc32(crc: u32, input: []const u8) u32 {
+ var crc0: u64 = ~crc;
+
+ // Compute the CRC for up to seven leading bytes to bring the
+ // `next` pointer to an eight-byte boundary.
+ var next = input;
+ while (next.len > 0 and @intFromPtr(next.ptr) & 7 != 0) {
+ asm volatile ("crc32b %[out], %[in]"
+ : [in] "+r" (crc0),
+ : [out] "rm" (next[0]),
+ );
+ next = next[1..];
+ }
+
+ // Compute the CRC on sets of LONG * 3 bytes, executing three independent
+ // CRC instructions, each on LONG bytes. This is an optimization for
+ // targets where the CRC instruction has a throughput of one CRC per
+ // cycle, but a latency of three cycles.
+ while (next.len >= LONG * 3) {
+ var crc1: u64 = 0;
+ var crc2: u64 = 0;
+
+ const start = next.len;
+ while (true) {
+ // Safe @alignCast(), since we've aligned the pointer to 8 bytes before this loop.
+ const long: [*]const u64 = @ptrCast(@alignCast(next));
+ asm volatile (
+ \\crc32q %[out0], %[in0]
+ \\crc32q %[out1], %[in1]
+ \\crc32q %[out2], %[in2]
+ : [in0] "+r" (crc0),
+ [in1] "+r" (crc1),
+ [in2] "+r" (crc2),
+ : [out0] "rm" (long[0 * LONG / 8]),
+ [out1] "rm" (long[1 * LONG / 8]),
+ [out2] "rm" (long[2 * LONG / 8]),
+ );
+ next = next[8..];
+ if (next.len <= start - LONG) break;
+ }
+
+ crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc1;
+ crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc2;
+ next = next[LONG * 2 ..];
+ }
+
+ // Same thing as above, but for smaller chunks of SHORT bytes.
+ while (next.len >= SHORT * 3) {
+ var crc1: u64 = 0;
+ var crc2: u64 = 0;
+
+ const start = next.len;
+ while (true) {
+ const long: [*]const u64 = @ptrCast(@alignCast(next));
+ asm volatile (
+ \\crc32q %[out0], %[in0]
+ \\crc32q %[out1], %[in1]
+ \\crc32q %[out2], %[in2]
+ : [in0] "+r" (crc0),
+ [in1] "+r" (crc1),
+ [in2] "+r" (crc2),
+ : [out0] "rm" (long[0 * SHORT / 8]),
+ [out1] "rm" (long[1 * SHORT / 8]),
+ [out2] "rm" (long[2 * SHORT / 8]),
+ );
+ next = next[8..];
+ if (next.len <= start - SHORT) break;
+ }
+
+ crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc1;
+ crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc2;
+ next = next[SHORT * 2 ..];
+ }
+
+ // Compute via 8-byte chunks, until we're left with less than 8 bytes.
+ while (next.len >= 8) {
+ const long: [*]const u64 = @ptrCast(@alignCast(next));
+ asm volatile ("crc32q %[out], %[in]"
+ : [in] "+r" (crc0),
+ : [out] "rm" (long[0]),
+ );
+ next = next[8..];
+ }
+
+ // Finish the last bytes with just single instructions.
+ while (next.len > 0) {
+ asm volatile ("crc32b %[out], %[in]"
+ : [in] "+r" (crc0),
+ : [out] "rm" (next[0]),
+ );
+ next = next[1..];
+ }
+
+ return @truncate(~crc0);
+}
diff --git a/lib/std/hash/crc/impl.zig b/lib/std/hash/crc/impl.zig
@@ -23,12 +23,7 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
const I = if (@bitSizeOf(W) < 8) u8 else W;
const lookup_table = blk: {
@setEvalBranchQuota(2500);
-
- const poly = if (algorithm.reflect_input)
- @bitReverse(@as(I, algorithm.polynomial)) >> (@bitSizeOf(I) - @bitSizeOf(W))
- else
- @as(I, algorithm.polynomial) << (@bitSizeOf(I) - @bitSizeOf(W));
-
+ const poly = reflect(algorithm.polynomial);
var table: [256]I = undefined;
for (&table, 0..) |*e, i| {
var crc: I = i;
@@ -52,15 +47,13 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
crc: I,
pub fn init() Self {
- const initial = if (algorithm.reflect_input)
- @bitReverse(@as(I, algorithm.initial)) >> (@bitSizeOf(I) - @bitSizeOf(W))
- else
- @as(I, algorithm.initial) << (@bitSizeOf(I) - @bitSizeOf(W));
- return Self{ .crc = initial };
+ const initial = reflect(algorithm.initial);
+ return .{ .crc = initial };
}
inline fn tableEntry(index: I) I {
- return lookup_table[@as(u8, @intCast(index & 0xFF))];
+ const short: u8 = @truncate(index);
+ return lookup_table[short];
}
pub fn update(self: *Self, bytes: []const u8) void {
@@ -90,7 +83,7 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
if (!algorithm.reflect_output) {
c >>= @bitSizeOf(I) - @bitSizeOf(W);
}
- return @as(W, @intCast(c ^ algorithm.xor_output));
+ return @intCast(c ^ algorithm.xor_output);
}
pub fn hash(bytes: []const u8) W {
@@ -98,5 +91,13 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
c.update(bytes);
return c.final();
}
+
+ fn reflect(x: I) I {
+ const offset = @bitSizeOf(I) - @bitSizeOf(W);
+ if (algorithm.reflect_input)
+ return @bitReverse(x) >> offset
+ else
+ return x << offset;
+ }
};
}
diff --git a/lib/std/hash/crc/test.zig b/lib/std/hash/crc/test.zig
@@ -26,6 +26,17 @@ test "crc32 koopman regression" {
try testing.expectEqual(crc32.hash("abc"), 0xba2322ac);
}
+test "CRC-32/ISCSI" {
+ const Crc32Iscsi = crc.Crc32Iscsi;
+
+ try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));
+
+ var c = Crc32Iscsi.init();
+ c.update("1234");
+ c.update("56789");
+ try testing.expectEqual(@as(u32, 0xe3069283), c.final());
+}
+
test "CRC-3/GSM" {
const Crc3Gsm = crc.Crc3Gsm;
@@ -1104,17 +1115,6 @@ test "CRC-32/CKSUM" {
try testing.expectEqual(@as(u32, 0x765e7680), c.final());
}
-test "CRC-32/ISCSI" {
- const Crc32Iscsi = crc.Crc32Iscsi;
-
- try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));
-
- var c = Crc32Iscsi.init();
- c.update("1234");
- c.update("56789");
- try testing.expectEqual(@as(u32, 0xe3069283), c.final());
-}
-
test "CRC-32/ISO-HDLC" {
const Crc32IsoHdlc = crc.Crc32IsoHdlc;
diff --git a/tools/crc/catalog.txt b/tools/crc/catalog.txt
@@ -97,7 +97,8 @@ width=32 poly=0xa833982b init=0xffffffff refin=true refout=true xorout=0xff
width=32 poly=0x04c11db7 init=0xffffffff refin=false refout=false xorout=0xffffffff check=0xfc891918 residue=0xc704dd7b name="CRC-32/BZIP2"
width=32 poly=0x8001801b init=0x00000000 refin=true refout=true xorout=0x00000000 check=0x6ec2edc4 residue=0x00000000 name="CRC-32/CD-ROM-EDC"
width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 residue=0xc704dd7b name="CRC-32/CKSUM"
-width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI"
+# CRC-32C implementation is defined manually, since it has an accelerated variant.
+# width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI"
width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xcbf43926 residue=0xdebb20e3 name="CRC-32/ISO-HDLC"
width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0x00000000 check=0x340bc6d9 residue=0x00000000 name="CRC-32/JAMCRC"
width=32 poly=0x741b8cd7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0x2d3dd0ae residue=0x00000000 name="CRC-32/KOOPMAN"
diff --git a/tools/update_crc_catalog.zig b/tools/update_crc_catalog.zig
@@ -39,6 +39,7 @@ fn @"i like cheese"(arena: std.mem.Allocator, io: Io, args: []const []const u8)
try code_writer.writeAll(
\\//! This file is auto-generated by tools/update_crc_catalog.zig.
\\
+ \\const builtin = @import("builtin");
\\const impl = @import("crc/impl.zig");
\\
\\pub const Crc = impl.Crc;
@@ -52,6 +53,17 @@ fn @"i like cheese"(arena: std.mem.Allocator, io: Io, args: []const []const u8)
\\ _ = @import("crc/test.zig");
\\}
\\
+ \\pub const Crc32Iscsi = switch (builtin.cpu.hasAll(.x86, &.{ .@"64bit", .crc32 }) and builtin.zig_backend == .stage2_llvm) {
+ \\ true => @import("crc/Crc32c.zig"),
+ \\ else => Crc(u32, .{
+ \\ .polynomial = 0x1edc6f41,
+ \\ .initial = 0xffffffff,
+ \\ .reflect_input = true,
+ \\ .reflect_output = true,
+ \\ .xor_output = 0xffffffff,
+ \\ }),
+ \\};
+ \\
);
var zig_test_file = try crc_target_dir.createFile(io, "test.zig", .{});
@@ -83,12 +95,23 @@ fn @"i like cheese"(arena: std.mem.Allocator, io: Io, args: []const []const u8)
\\}
\\
\\test "crc32 koopman regression" {
- \\ const crc32 = crc.Koopman;
+ \\ const crc32 = crc.Crc32Koopman;
\\ try testing.expectEqual(crc32.hash(""), 0x00000000);
\\ try testing.expectEqual(crc32.hash("a"), 0x0da2aa8a);
\\ try testing.expectEqual(crc32.hash("abc"), 0xba2322ac);
\\}
\\
+ \\test "CRC-32/ISCSI" {
+ \\ const Crc32Iscsi = crc.Crc32Iscsi;
+ \\
+ \\ try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));
+ \\
+ \\ var c = Crc32Iscsi.init();
+ \\ c.update("1234");
+ \\ c.update("56789");
+ \\ try testing.expectEqual(@as(u32, 0xe3069283), c.final());
+ \\}
+ \\
);
var reader: std.Io.Reader = .fixed(catalog_txt);