From cd766513febecd79fc62a0f46ee18867fabbbf7d Mon Sep 17 00:00:00 2001
From: Marc Tiehuis <marc@tiehu.is>
Date: Thu, 14 Sep 2023 19:54:36 +1200
Subject: [PATCH] std.mem: add vectorized indexOfScalarPos and indexOfSentinel

These are an order of magnitude quicker than the previous
implementations:

A relative comparison of each, measuring scanning a 1G file.

    Reading 1G (1.0000000009313226GiB)

             std.mem.sliceTo: 281.232ms
          vectorized.sliceTo: 24.769ms
                      strlen: 24.291ms

           std.indexOfScalar: 229.016ms
    vectorized.indexOfScalar: 24.685ms
                      memchr: 24.958ms
---
 lib/std/mem.zig | 106 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 102 insertions(+), 4 deletions(-)

diff --git a/lib/std/mem.zig b/lib/std/mem.zig
index b24cfe227a..6e7302331e 100644
--- a/lib/std/mem.zig
+++ b/lib/std/mem.zig
@@ -953,9 +953,57 @@ test "len" {
     try testing.expect(len(c_ptr) == 2);
 }
 
-pub fn indexOfSentinel(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
+pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]const T) usize {
     var i: usize = 0;
-    while (ptr[i] != sentinel) {
+
+    if (!@inComptime() and (@typeInfo(T) == .Int or @typeInfo(T) == .Float) and std.math.isPowerOfTwo(@bitSizeOf(T))) {
+        switch (@import("builtin").cpu.arch) {
+            // The below branch assumes that reading past the end of the buffer is valid, as long
+            // as we don't read into a new page. This should be the case for most architectures
+            // which use paged memory, however should be confirmed before adding a new arch below.
+            .aarch64, .x86, .x86_64 => if (comptime std.simd.suggestVectorSize(T)) |block_len| {
+                comptime std.debug.assert(std.mem.page_size % block_len == 0);
+                const Block = @Vector(block_len, T);
+                const mask: Block = @splat(sentinel);
+
+                // First block may be unaligned
+                const start_addr = @intFromPtr(&p[i]);
+                const offset_in_page = start_addr & (std.mem.page_size - 1);
+                if (offset_in_page < std.mem.page_size - block_len) {
+                    // Will not read past the end of a page, full block.
+                    const block: Block = p[i..][0..block_len].*;
+                    const matches = block == mask;
+                    if (@reduce(.Or, matches)) {
+                        return i + std.simd.firstTrue(matches).?;
+                    }
+
+                    i += std.mem.alignForward(usize, start_addr, block_len) - start_addr;
+                } else {
+                    // Would read over a page boundary. Per-byte at a time until aligned or found.
+                    // 0.39% chance this branch is taken for 4K pages at 16b block length.
+                    //
+                    // An alternate strategy is to do read a full block (the last in the page) and
+                    // mask the entries before the pointer.
+                    while ((@intFromPtr(&p[i]) & (block_len - 1)) != 0) : (i += 1) {
+                        if (p[i] == sentinel) return i;
+                    }
+                }
+
+                std.debug.assert(std.mem.isAligned(@intFromPtr(&p[i]), block_len));
+                while (true) {
+                    const block: *const Block = @ptrCast(@alignCast(p[i..][0..block_len]));
+                    const matches = block.* == mask;
+                    if (@reduce(.Or, matches)) {
+                        return i + std.simd.firstTrue(matches).?;
+                    }
+                    i += block_len;
+                }
+            },
+            else => {},
+        }
+    }
+
+    while (p[i] != sentinel) {
         i += 1;
     }
     return i;
@@ -1016,8 +1064,58 @@ pub fn lastIndexOfScalar(comptime T: type, slice: []const T, value: T) ?usize {
 
 pub fn indexOfScalarPos(comptime T: type, slice: []const T, start_index: usize, value: T) ?usize {
     if (start_index >= slice.len) return null;
-    for (slice[start_index..], start_index..) |c, i| {
-        if (c == value) return i;
+
+    var i: usize = start_index;
+    if (!@inComptime() and (@typeInfo(T) == .Int or @typeInfo(T) == .Float) and std.math.isPowerOfTwo(@bitSizeOf(T))) {
+        if (comptime std.simd.suggestVectorSize(T)) |block_len| {
+            // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result
+            // in the same execution as aligned loads. We ignore older arch's here and don't bother pre-aligning.
+            //
+            // Use `comptime std.simd.suggestVectorSize(T)` to get the same alignment as used in this function
+            // however this usually isn't necessary unless your arch has a performance penalty due to this.
+            //
+            // This may differ for other arch's. Arm for example costs a cycle when loading across a cache
+            // line so explicit alignment prologues may be worth exploration.
+
+            // Unrolling here is ~10% improvement. We can then do one bounds check every 2 blocks
+            // instead of one which adds up.
+            const Block = @Vector(block_len, T);
+            if (i + 2 * block_len < slice.len) {
+                const mask: Block = @splat(value);
+                while (true) {
+                    inline for (0..2) |_| {
+                        const block: Block = slice[i..][0..block_len].*;
+                        const matches = block == mask;
+                        if (@reduce(.Or, matches)) {
+                            return i + std.simd.firstTrue(matches).?;
+                        }
+                        i += block_len;
+                    }
+                    if (i + 2 * block_len >= slice.len) break;
+                }
+            }
+
+            // {block_len, block_len / 2} check
+            inline for (0..2) |j| {
+                const block_x_len = block_len / (1 << j);
+                comptime if (block_x_len < 4) break;
+
+                const BlockX = @Vector(block_x_len, T);
+                if (i + block_x_len < slice.len) {
+                    const mask: BlockX = @splat(value);
+                    const block: BlockX = slice[i..][0..block_x_len].*;
+                    const matches = block == mask;
+                    if (@reduce(.Or, matches)) {
+                        return i + std.simd.firstTrue(matches).?;
+                    }
+                    i += block_x_len;
+                }
+            }
+        }
+    }
+
+    for (slice[i..], i..) |c, j| {
+        if (c == value) return j;
     }
     return null;
 }