diff --git a/lib/compiler_rt/memcpy.zig b/lib/compiler_rt/memcpy.zig index 97188ac65f..d67a6e766a 100644 --- a/lib/compiler_rt/memcpy.zig +++ b/lib/compiler_rt/memcpy.zig @@ -1,4 +1,5 @@ const std = @import("std"); +const assert = std.debug.assert; const common = @import("./common.zig"); const builtin = @import("builtin"); @@ -17,6 +18,19 @@ comptime { } } +const Element = if (std.simd.suggestVectorLength(u8)) |vec_size| + @Type(.{ .vector = .{ + .child = u8, + .len = vec_size, + } }) +else + usize; + +comptime { + assert(@sizeOf(Element) >= @alignOf(Element)); + assert(std.math.isPowerOfTwo(@sizeOf(Element))); +} + fn memcpySmall(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { @setRuntimeSafety(builtin.is_test); @@ -27,6 +41,173 @@ fn memcpySmall(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) call return dest; } -fn memcpyFast(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { - return @call(.always_inline, @import("memmove.zig").memmove, .{ opt_dest, opt_src, len }); +fn memcpyFast(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { + @setRuntimeSafety(builtin.is_test); + + const small_limit = 2 * @sizeOf(Element); + + if (copySmallLength(small_limit, dest.?, src.?, len)) return dest; + + copyForwards(dest.?, src.?, len); + + return dest; +} + +inline fn copySmallLength( + comptime small_limit: comptime_int, + dest: [*]u8, + src: [*]const u8, + len: usize, +) bool { + if (len < 16) { + copyLessThan16(dest, src, len); + return true; + } + + if (comptime 2 < (std.math.log2(small_limit) + 1) / 2) { + if (copy16ToSmallLimit(small_limit, dest, src, len)) return true; + } + + return false; +} + +inline fn copyLessThan16( + dest: [*]u8, + src: [*]const u8, + len: usize, +) void { + @setRuntimeSafety(builtin.is_test); + if (len < 4) { + if (len == 0) return; + dest[0] = src[0]; + dest[len / 2] = src[len / 2]; + dest[len - 1] = src[len - 1]; + return; + } + copyRange4(4, dest, src, len); +} + +inline fn copy16ToSmallLimit( + comptime small_limit: comptime_int, + dest: [*]u8, + src: [*]const u8, + len: usize, +) bool { + @setRuntimeSafety(builtin.is_test); + inline for (2..(std.math.log2(small_limit) + 1) / 2 + 1) |p| { + const limit = 1 << (2 * p); + if (len < limit) { + copyRange4(limit / 4, dest, src, len); + return true; + } + } + return false; +} + +inline fn copyForwards( + noalias dest: [*]u8, + noalias src: [*]const u8, + len: usize, +) void { + @setRuntimeSafety(builtin.is_test); + assert(len >= 2 * @sizeOf(Element)); + + dest[0..@sizeOf(Element)].* = src[0..@sizeOf(Element)].*; + const alignment_offset = @alignOf(Element) - @intFromPtr(src) % @alignOf(Element); + const n = len - alignment_offset; + const d = dest + alignment_offset; + const s = src + alignment_offset; + + copyBlocksAlignedSource(@ptrCast(d), @alignCast(@ptrCast(s)), n); + + // copy last `@sizeOf(Element)` bytes unconditionally, since block copy + // methods only copy a multiple of `@sizeOf(Element)` bytes. + const offset = len - @sizeOf(Element); + dest[offset..][0..@sizeOf(Element)].* = src[offset..][0..@sizeOf(Element)].*; +} + +inline fn copyBlocksAlignedSource( + noalias dest: [*]align(1) Element, + noalias src: [*]const Element, + max_bytes: usize, +) void { + copyBlocks(dest, src, max_bytes); +} + +/// Copies the largest multiple of `@sizeOf(T)` bytes from `src` to `dest`, +/// that is less than `max_bytes` where `T` is the child type of `src` and +/// `dest`. +inline fn copyBlocks( + noalias dest: anytype, + noalias src: anytype, + max_bytes: usize, +) void { + @setRuntimeSafety(builtin.is_test); + + const T = @typeInfo(@TypeOf(dest)).pointer.child; + comptime assert(T == @typeInfo(@TypeOf(src)).pointer.child); + + const loop_count = max_bytes / @sizeOf(T); + + for (dest[0..loop_count], src[0..loop_count]) |*d, s| { + d.* = s; + } +} + +/// copy `len` bytes from `src` to `dest`; `len` must be in the range +/// `[copy_len, 4 * copy_len)`. +inline fn copyRange4( + comptime copy_len: comptime_int, + noalias dest: [*]u8, + noalias src: [*]const u8, + len: usize, +) void { + @setRuntimeSafety(builtin.is_test); + comptime assert(std.math.isPowerOfTwo(copy_len)); + assert(len >= copy_len); + assert(len < 4 * copy_len); + + const a = len & (copy_len * 2); + const b = a / 2; + + const last = len - copy_len; + const pen = last - b; + + dest[0..copy_len].* = src[0..copy_len].*; + dest[b..][0..copy_len].* = src[b..][0..copy_len].*; + dest[pen..][0..copy_len].* = src[pen..][0..copy_len].*; + dest[last..][0..copy_len].* = src[last..][0..copy_len].*; +} + +test { + const S = struct { + fn testFunc(comptime copy_func: anytype) !void { + const max_len = 1024; + var buffer: [max_len + @alignOf(Element) - 1]u8 align(@alignOf(Element)) = undefined; + for (&buffer, 0..) |*b, i| { + b.* = @intCast(i % 97); + } + var dest: [max_len + @alignOf(Element) - 1]u8 align(@alignOf(Element)) = undefined; + + for (0..max_len) |copy_len| { + for (0..@alignOf(Element)) |s_offset| { + for (0..@alignOf(Element)) |d_offset| { + @memset(&dest, 0xff); + const s = buffer[s_offset..][0..copy_len]; + const d = dest[d_offset..][0..copy_len]; + _ = copy_func(@ptrCast(d.ptr), @ptrCast(s.ptr), s.len); + std.testing.expectEqualSlices(u8, s, d) catch |e| { + std.debug.print("error encountered for length={d}, s_offset={d}, d_offset={d}\n", .{ + copy_len, s_offset, d_offset, + }); + return e; + }; + } + } + } + } + }; + + try S.testFunc(memcpySmall); + try S.testFunc(memcpyFast); }