diff --git a/lib/std/math/big.zig b/lib/std/math/big.zig
index e7f8a7fb34..c7fc0b17f5 100644
--- a/lib/std/math/big.zig
+++ b/lib/std/math/big.zig
@@ -7,6 +7,7 @@ pub const Limb = usize;
 const limb_info = @typeInfo(Limb).Int;
 pub const SignedLimb = std.meta.Int(.signed, limb_info.bits);
 pub const DoubleLimb = std.meta.Int(.unsigned, 2 * limb_info.bits);
+pub const HalfLimb = std.meta.Int(.unsigned, limb_info.bits / 2);
 pub const SignedDoubleLimb = std.meta.Int(.signed, 2 * limb_info.bits);
 pub const Log2Limb = std.math.Log2Int(Limb);
 
diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig
index ec0143a3d7..87a62bf66c 100644
--- a/lib/std/math/big/int.zig
+++ b/lib/std/math/big/int.zig
@@ -2,6 +2,8 @@ const std = @import("../../std.zig");
 const math = std.math;
 const Limb = std.math.big.Limb;
 const limb_bits = @typeInfo(Limb).Int.bits;
+const HalfLimb = std.math.big.HalfLimb;
+const half_limb_bits = @typeInfo(HalfLimb).Int.bits;
 const DoubleLimb = std.math.big.DoubleLimb;
 const SignedDoubleLimb = std.math.big.SignedDoubleLimb;
 const Log2Limb = std.math.big.Log2Limb;
@@ -1335,7 +1337,16 @@ pub const Mutable = struct {
         const xy_trailing = math.min(x_trailing, y_trailing);
 
         if (y.len - xy_trailing == 1) {
-            lldiv1(q.limbs, &r.limbs[0], x.limbs[xy_trailing..x.len], y.limbs[y.len - 1]);
+            const divisor = y.limbs[y.len - 1];
+
+            // Optimization for small divisor. By using a half limb we can avoid requiring DoubleLimb
+            // divisions in the hot code path. This may often require compiler_rt software-emulation.
+            if (divisor < maxInt(HalfLimb)) {
+                lldiv0p5(q.limbs, &r.limbs[0], x.limbs[xy_trailing..x.len], @intCast(HalfLimb, divisor));
+            } else {
+                lldiv1(q.limbs, &r.limbs[0], x.limbs[xy_trailing..x.len], divisor);
+            }
+
             q.normalize(x.len - xy_trailing);
             q.positive = q_positive;
 
@@ -1939,7 +1950,8 @@ pub const Const = struct {
             }
         } else {
             // Non power-of-two: batch divisions per word size.
-            const digits_per_limb = math.log(Limb, base, maxInt(Limb));
+            // We use a HalfLimb here so the division uses the faster lldiv0p5 over lldiv1 codepath.
+            const digits_per_limb = math.log(HalfLimb, base, maxInt(HalfLimb));
             var limb_base: Limb = 1;
             var j: usize = 0;
             while (j < digits_per_limb) : (j += 1) {
@@ -3208,6 +3220,30 @@ fn lldiv1(quo: []Limb, rem: *Limb, a: []const Limb, b: Limb) void {
     }
 }
 
+fn lldiv0p5(quo: []Limb, rem: *Limb, a: []const Limb, b: HalfLimb) void {
+    @setRuntimeSafety(debug_safety);
+    assert(a.len > 1 or a[0] >= b);
+    assert(quo.len >= a.len);
+
+    rem.* = 0;
+    for (a) |_, ri| {
+        const i = a.len - ri - 1;
+        const ai_high = a[i] >> half_limb_bits;
+        const ai_low = a[i] & ((1 << half_limb_bits) - 1);
+
+        // Split the division into two divisions acting on half a limb each. Carry remainder.
+        const ai_high_with_carry = (rem.* << half_limb_bits) | ai_high;
+        const ai_high_quo = ai_high_with_carry / b;
+        rem.* = ai_high_with_carry % b;
+
+        const ai_low_with_carry = (rem.* << half_limb_bits) | ai_low;
+        const ai_low_quo = ai_low_with_carry / b;
+        rem.* = ai_low_with_carry % b;
+
+        quo[i] = (ai_high_quo << half_limb_bits) | ai_low_quo;
+    }
+}
+
 fn llshl(r: []Limb, a: []const Limb, shift: usize) void {
     @setRuntimeSafety(debug_safety);
     assert(a.len >= 1);
diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig
index 4c1d12116e..70a9b97a38 100644
--- a/lib/std/math/big/int_test.zig
+++ b/lib/std/math/big/int_test.zig
@@ -1064,7 +1064,7 @@ test "big.int mulWrap large" {
     try testing.expect(b.eq(c));
 }
 
-test "big.int div single-single no rem" {
+test "big.int div single-half no rem" {
     var a = try Managed.initSet(testing.allocator, 50);
     defer a.deinit();
     var b = try Managed.initSet(testing.allocator, 5);
@@ -1080,7 +1080,7 @@ test "big.int div single-single no rem" {
     try testing.expect((try r.to(u32)) == 0);
 }
 
-test "big.int div single-single with rem" {
+test "big.int div single-half with rem" {
     var a = try Managed.initSet(testing.allocator, 49);
     defer a.deinit();
     var b = try Managed.initSet(testing.allocator, 5);
@@ -1096,6 +1096,39 @@ test "big.int div single-single with rem" {
     try testing.expect((try r.to(u32)) == 4);
 }
 
+test "big.int div single-single no rem" {
+    // assumes usize is <= 64 bits.
+    var a = try Managed.initSet(testing.allocator, 1 << 52);
+    defer a.deinit();
+    var b = try Managed.initSet(testing.allocator, 1 << 35);
+    defer b.deinit();
+
+    var q = try Managed.init(testing.allocator);
+    defer q.deinit();
+    var r = try Managed.init(testing.allocator);
+    defer r.deinit();
+    try Managed.divTrunc(&q, &r, a.toConst(), b.toConst());
+
+    try testing.expect((try q.to(u32)) == 131072);
+    try testing.expect((try r.to(u32)) == 0);
+}
+
+test "big.int div single-single with rem" {
+    var a = try Managed.initSet(testing.allocator, (1 << 52) | (1 << 33));
+    defer a.deinit();
+    var b = try Managed.initSet(testing.allocator, (1 << 35));
+    defer b.deinit();
+
+    var q = try Managed.init(testing.allocator);
+    defer q.deinit();
+    var r = try Managed.init(testing.allocator);
+    defer r.deinit();
+    try Managed.divTrunc(&q, &r, a.toConst(), b.toConst());
+
+    try testing.expect((try q.to(u64)) == 131072);
+    try testing.expect((try r.to(u64)) == 8589934592);
+}
+
 test "big.int div multi-single no rem" {
     const op1 = 0xffffeeeeddddcccc;
     const op2 = 34;