From 509c51ca5354bd6475a3df0cf7a56191eb845ad5 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Mon, 27 Jan 2025 17:46:58 +0100
Subject: [PATCH 01/21] Make `Uint::as_limbs_mut` const

---
 src/uint.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/uint.rs b/src/uint.rs
index fe2208d2..d13e1b58 100644
--- a/src/uint.rs
+++ b/src/uint.rs
@@ -165,7 +165,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     }
 
     /// Borrow the limbs of this [`Uint`] mutably.
-    pub fn as_limbs_mut(&mut self) -> &mut [Limb; LIMBS] {
+    pub const fn as_limbs_mut(&mut self) -> &mut [Limb; LIMBS] {
         &mut self.limbs
     }
 

From 3b78bd5af1c00a5dbe56a57792f21aad10eea845 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Mon, 27 Jan 2025 18:14:54 +0100
Subject: [PATCH 02/21] Implement `Limb::carrying_shr`

---
 src/limb/shr.rs | 19 ++++++++++++++++++-
 tests/limb.rs   | 23 +++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 tests/limb.rs

diff --git a/src/limb/shr.rs b/src/limb/shr.rs
index 08549732..bfd67499 100644
--- a/src/limb/shr.rs
+++ b/src/limb/shr.rs
@@ -1,6 +1,6 @@
 //! Limb right bitshift
 
-use crate::{Limb, WrappingShr};
+use crate::{ConstChoice, Limb, WrappingShr};
 use core::ops::{Shr, ShrAssign};
 
 impl Limb {
@@ -16,6 +16,23 @@ impl Limb {
     pub(crate) const fn shr1(self) -> (Self, Self) {
         (Self(self.0 >> 1), Self(self.0 << Self::HI_BIT))
     }
+
+    /// Computes `self >> shift` and returns the result as well as the carry: the `shift` _most_
+    /// significant bits of the `carry` are equal to the `shift` _least_ significant bits of `self`.
+    ///
+    /// Panics if `shift` overflows `Limb::BITS`.
+    #[inline(always)]
+    pub const fn carrying_shr(self, shift: u32) -> (Self, Self) {
+        // Note that we can compute carry = self << (Self::BITS - shift) whenever shift > 0.
+        // However, we need to account for the case that shift = 0:
+        // - the carry should be 0, and
+        // - the value by which carry is left shifted should be made to be < Self::BITS.
+        let shift_is_zero = ConstChoice::from_u32_eq(shift, 0);
+        let carry = Self::select(self, Self::ZERO, shift_is_zero);
+        let left_shift = shift_is_zero.select_u32(Self::BITS - shift, 0);
+
+        (self.shr(shift), carry.shl(left_shift))
+    }
 }
 
 macro_rules! impl_shr {
diff --git a/tests/limb.rs b/tests/limb.rs
new file mode 100644
index 00000000..c10bcf65
--- /dev/null
+++ b/tests/limb.rs
@@ -0,0 +1,23 @@
+use crypto_bigint::{Limb, Word};
+use proptest::prelude::*;
+
+prop_compose! {
+    fn limb()(x in any::<Word>()) -> Limb {
+        Limb::from(x)
+    }
+}
+proptest! {
+    #[test]
+    fn carrying_shr_doesnt_panic(limb in limb(), shift in 0..32u32) {
+        limb.carrying_shr(shift);
+    }
+
+    #[test]
+    fn carrying_shr(limb in limb(), shift in 0..32u32) {
+        if shift == 0 {
+            assert_eq!(limb.carrying_shr(shift), (limb, Limb::ZERO));
+        } else {
+            assert_eq!(limb.carrying_shr(shift), (limb.shr(shift), limb.shl(Limb::BITS - shift)));
+        }
+    }
+}

From 16f59fc353272fb6b538202eaa46126ba9d76ec4 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Mon, 27 Jan 2025 18:20:07 +0100
Subject: [PATCH 03/21] Implement `Uint::split_overflowing_shr`

---
 benches/uint.rs | 41 ++++++++++++++++++++++++++++---
 src/uint/shr.rs | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/benches/uint.rs b/benches/uint.rs
index 3bb2a961..3961f62e 100644
--- a/benches/uint.rs
+++ b/benches/uint.rs
@@ -1,7 +1,10 @@
-use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion};
+use criterion::measurement::WallTime;
+use criterion::{
+    black_box, criterion_group, criterion_main, BatchSize, BenchmarkGroup, BenchmarkId, Criterion,
+};
 use crypto_bigint::{
-    Limb, NonZero, Odd, Random, RandomBits, RandomMod, Reciprocal, Uint, U1024, U128, U2048, U256,
-    U4096, U512,
+    Limb, NonZero, Odd, Random, RandomBits, RandomMod, Reciprocal, Uint, U1024, U128, U16384,
+    U2048, U256, U4096, U512, U8192,
 };
 use rand_chacha::ChaCha8Rng;
 use rand_core::{OsRng, RngCore, SeedableRng};
@@ -370,6 +373,30 @@ fn bench_shl(c: &mut Criterion) {
     group.finish();
 }
 
+fn shr_benchmark<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
+    group.bench_function(BenchmarkId::new("overflowing_shr_vartime", LIMBS), |b| {
+        b.iter_batched(
+            || Uint::<LIMBS>::ONE,
+            |x| x.overflowing_shr_vartime(Uint::<LIMBS>::BITS / 2 + 10),
+            BatchSize::SmallInput,
+        )
+    });
+    group.bench_function(BenchmarkId::new("overflowing_shr", LIMBS), |b| {
+        b.iter_batched(
+            || Uint::<LIMBS>::ONE,
+            |x| x.overflowing_shr(Uint::<LIMBS>::BITS / 2 + 10),
+            BatchSize::SmallInput,
+        )
+    });
+    group.bench_function(BenchmarkId::new("split_overflowing_shr", LIMBS), |b| {
+        b.iter_batched(
+            || Uint::<LIMBS>::ONE,
+            |x| x.split_overflowing_shr(Uint::<LIMBS>::BITS / 2 + 10),
+            BatchSize::SmallInput,
+        )
+    });
+}
+
 fn bench_shr(c: &mut Criterion) {
     let mut group = c.benchmark_group("right shift");
 
@@ -405,6 +432,14 @@ fn bench_shr(c: &mut Criterion) {
         )
     });
 
+    shr_benchmark::<{ U256::LIMBS }>(&mut group);
+    shr_benchmark::<{ U512::LIMBS }>(&mut group);
+    shr_benchmark::<{ U1024::LIMBS }>(&mut group);
+    shr_benchmark::<{ U2048::LIMBS }>(&mut group);
+    shr_benchmark::<{ U4096::LIMBS }>(&mut group);
+    shr_benchmark::<{ U8192::LIMBS }>(&mut group);
+    shr_benchmark::<{ U16384::LIMBS }>(&mut group);
+
     group.finish();
 }
 
diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index 0212b570..319a70e9 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -47,6 +47,71 @@ impl<const LIMBS: usize> Uint<LIMBS> {
         ConstCtOption::new(Uint::select(&result, &Self::ZERO, overflow), overflow.not())
     }
 
+    /// Computes `self >> shift`.
+    ///
+    /// Returns `None` if `shift >= Self::BITS`.
+    pub const fn split_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
+        let (intra_limb_shift, limb_shift) = (shift % Limb::BITS, shift / Limb::BITS);
+        self.intra_limb_carrying_shr_internal(intra_limb_shift)
+            .full_limb_shr(limb_shift)
+    }
+
+    /// Computes `self >> shift`, for `shift < Limb::BITS`.
+    ///
+    /// Returns `None` if `shift >= Limb::BITS`.
+    pub const fn intra_limb_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
+        let overflow = ConstChoice::from_u32_lt(shift, Limb::BITS).not();
+        let result = self.intra_limb_carrying_shr_internal(shift % Limb::BITS);
+        ConstCtOption::new(Uint::select(&result, &Self::ZERO, overflow), overflow.not())
+    }
+
+    /// Computes `self >> shift`, for `shift < Limb::BITS`.
+    ///
+    /// Panics if `shift >= Limb::BITS`.
+    #[inline(always)]
+    const fn intra_limb_carrying_shr_internal(&self, shift: u32) -> Self {
+        debug_assert!(shift < Limb::BITS);
+
+        let (mut result, mut carry) = (*self, Limb::ZERO);
+
+        let limbs = result.as_limbs_mut();
+        let mut i = limbs.len();
+        while i > 0 {
+            i -= 1;
+            let (shifted, new_carry) = limbs[i].carrying_shr(shift);
+            limbs[i] = shifted.bitxor(carry);
+            carry = new_carry;
+        }
+
+        result
+    }
+
+    /// Compute `self >> (Limb::BITS * limb_shift)`, for `limb_shift < Self::LIMBS`.
+    ///
+    /// Returns `None` if `limb_shift >= Self::LIMBS`.
+    #[inline(always)]
+    pub const fn full_limb_shr(&self, limb_shift: u32) -> ConstCtOption<Self> {
+        let shift_bits = u32::BITS - (LIMBS as u32 - 1).leading_zeros();
+        let overflow = ConstChoice::from_u32_lt(limb_shift, LIMBS as u32).not();
+        let limb_shift = limb_shift % LIMBS as u32;
+
+        let mut result = *self;
+        let mut i = 0;
+        while i < shift_bits {
+            let bit = ConstChoice::from_u32_lsb((limb_shift >> i) & 1);
+            result = Uint::select(
+                &result,
+                &result
+                    .overflowing_shr_vartime(Limb::BITS << i)
+                    .expect("shift within range"),
+                bit,
+            );
+            i += 1;
+        }
+
+        ConstCtOption::new(Uint::select(&result, &Self::ZERO, overflow), overflow.not())
+    }
+
     /// Computes `self >> shift`.
     ///
     /// Returns `None` if `shift >= Self::BITS`.

From 068b054f8812317ba15ef17a4e4a121aaa335bce Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Mon, 27 Jan 2025 18:21:22 +0100
Subject: [PATCH 04/21] Introduce `Uint::fast_split_overflowing_shr`

---
 benches/uint.rs | 21 ++++++++++++++-------
 src/uint/shr.rs | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/benches/uint.rs b/benches/uint.rs
index 3961f62e..216de786 100644
--- a/benches/uint.rs
+++ b/benches/uint.rs
@@ -395,6 +395,13 @@ fn shr_benchmark<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
             BatchSize::SmallInput,
         )
     });
+    group.bench_function(BenchmarkId::new("fast_split_overflowing_shr", LIMBS), |b| {
+        b.iter_batched(
+            || Uint::<LIMBS>::ONE,
+            |x| x.fast_split_overflowing_shr(Uint::<LIMBS>::BITS / 2 + 10),
+            BatchSize::SmallInput,
+        )
+    });
 }
 
 fn bench_shr(c: &mut Criterion) {
@@ -522,14 +529,14 @@ fn bench_sqrt(c: &mut Criterion) {
 
 criterion_group!(
     benches,
-    bench_random,
-    bench_mul,
-    bench_division,
-    bench_gcd,
-    bench_shl,
+    // bench_random,
+    // bench_mul,
+    // bench_division,
+    // bench_gcd,
+    // bench_shl,
     bench_shr,
-    bench_inv_mod,
-    bench_sqrt
+    // bench_inv_mod,
+    // bench_sqrt
 );
 
 criterion_main!(benches);
diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index 319a70e9..1b64704b 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -112,6 +112,47 @@ impl<const LIMBS: usize> Uint<LIMBS> {
         ConstCtOption::new(Uint::select(&result, &Self::ZERO, overflow), overflow.not())
     }
 
+    /// Computes `self >> shift`.
+    ///
+    /// Returns `None` if `shift >= Self::BITS`.
+    pub const fn fast_split_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
+        let (intra_limb_shift, limb_shift) = (shift % Limb::BITS, shift / Limb::BITS);
+        self.intra_limb_carrying_shr_internal(intra_limb_shift)
+            .fast_full_limb_shr(limb_shift)
+    }
+
+    /// Compute `self >> (Limb::BITS * limb_shift)`, for `limb_shift < Self::LIMBS`.
+    ///
+    /// Returns `None` if `limb_shift >= Self::LIMBS`.
+    #[inline(always)]
+    pub const fn fast_full_limb_shr(&self, limb_shift: u32) -> ConstCtOption<Self> {
+        let shift_bits = u32::BITS - (LIMBS as u32 - 1).leading_zeros();
+        let overflow = ConstChoice::from_u32_lt(limb_shift, LIMBS as u32).not();
+        let limb_shift = limb_shift % LIMBS as u32;
+
+        let mut result = *self;
+        let mut i = 0;
+        while i < shift_bits {
+            let bit = ConstChoice::from_u32_lsb((limb_shift >> i) & 1);
+
+            let mut j = 0;
+            let limbs = result.as_limbs_mut();
+            let offset = 1 << i;
+            while j < Self::LIMBS.saturating_sub(offset) {
+                limbs[j] = Limb::select(limbs[j], limbs[j + offset], bit);
+                j += 1;
+            }
+            while j < Self::LIMBS {
+                limbs[j] = Limb::select(limbs[j], Limb::ZERO, bit);
+                j += 1;
+            }
+
+            i += 1;
+        }
+
+        ConstCtOption::new(Uint::select(&result, &Self::ZERO, overflow), overflow.not())
+    }
+
     /// Computes `self >> shift`.
     ///
     /// Returns `None` if `shift >= Self::BITS`.

From dd28ce4988563d6f1948eae52a508f6a0b0ed827 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Mon, 27 Jan 2025 18:46:21 +0100
Subject: [PATCH 05/21] Fix benches

---
 benches/uint.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/benches/uint.rs b/benches/uint.rs
index 216de786..2a6f0db8 100644
--- a/benches/uint.rs
+++ b/benches/uint.rs
@@ -529,14 +529,14 @@ fn bench_sqrt(c: &mut Criterion) {
 
 criterion_group!(
     benches,
-    // bench_random,
-    // bench_mul,
-    // bench_division,
-    // bench_gcd,
-    // bench_shl,
+    bench_random,
+    bench_mul,
+    bench_division,
+    bench_gcd,
+    bench_shl,
     bench_shr,
-    // bench_inv_mod,
-    // bench_sqrt
+    bench_inv_mod,
+    bench_sqrt
 );
 
 criterion_main!(benches);

From a5dbc7454ba0770a404952a946b6104d1baec52b Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Wed, 29 Jan 2025 16:47:20 +0100
Subject: [PATCH 06/21] Remove integer division/remainder from
 `split_overflowing_shr`

---
 src/uint/shr.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index 1b64704b..7de3a2f8 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -51,7 +51,9 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     ///
     /// Returns `None` if `shift >= Self::BITS`.
     pub const fn split_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
-        let (intra_limb_shift, limb_shift) = (shift % Limb::BITS, shift / Limb::BITS);
+        let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
+        let intra_limb_shift = shift & (Limb::BITS - 1);
+        let limb_shift = shift >> limb_bits_bits;
         self.intra_limb_carrying_shr_internal(intra_limb_shift)
             .full_limb_shr(limb_shift)
     }
@@ -116,7 +118,9 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     ///
     /// Returns `None` if `shift >= Self::BITS`.
     pub const fn fast_split_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
-        let (intra_limb_shift, limb_shift) = (shift % Limb::BITS, shift / Limb::BITS);
+        let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
+        let intra_limb_shift = shift & (Limb::BITS - 1);
+        let limb_shift = shift >> limb_bits_bits;
         self.intra_limb_carrying_shr_internal(intra_limb_shift)
             .fast_full_limb_shr(limb_shift)
     }

From 6858e2298d3ca25066fc946409dbdf2d866a986f Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Wed, 29 Jan 2025 16:51:00 +0100
Subject: [PATCH 07/21] Add annotations

---
 src/uint/shr.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index 7de3a2f8..9eb95fae 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -51,6 +51,8 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     ///
     /// Returns `None` if `shift >= Self::BITS`.
     pub const fn split_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
+        // Split shift into (shift % Limb::BITS, shift / Limb::BITS)
+        // Since Limb::BITS is known to be a power of two, this can also be computed as follows:
         let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
         let intra_limb_shift = shift & (Limb::BITS - 1);
         let limb_shift = shift >> limb_bits_bits;
@@ -118,6 +120,8 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     ///
     /// Returns `None` if `shift >= Self::BITS`.
     pub const fn fast_split_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
+        // Split shift into (shift % Limb::BITS, shift / Limb::BITS)
+        // Since Limb::BITS is known to be a power of two, this can also be computed as follows:
         let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
         let intra_limb_shift = shift & (Limb::BITS - 1);
         let limb_shift = shift >> limb_bits_bits;

From fb15f24ebd46c266f5b492749e20124fb6222124 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 10:38:30 +0100
Subject: [PATCH 08/21] Replace `Uint::shr` with the new implementation.

---
 benches/uint.rs | 14 -----------
 src/uint/shr.rs | 66 ++-----------------------------------------------
 2 files changed, 2 insertions(+), 78 deletions(-)

diff --git a/benches/uint.rs b/benches/uint.rs
index 2a6f0db8..f4df3d2d 100644
--- a/benches/uint.rs
+++ b/benches/uint.rs
@@ -388,20 +388,6 @@ fn shr_benchmark<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
             BatchSize::SmallInput,
         )
     });
-    group.bench_function(BenchmarkId::new("split_overflowing_shr", LIMBS), |b| {
-        b.iter_batched(
-            || Uint::<LIMBS>::ONE,
-            |x| x.split_overflowing_shr(Uint::<LIMBS>::BITS / 2 + 10),
-            BatchSize::SmallInput,
-        )
-    });
-    group.bench_function(BenchmarkId::new("fast_split_overflowing_shr", LIMBS), |b| {
-        b.iter_batched(
-            || Uint::<LIMBS>::ONE,
-            |x| x.fast_split_overflowing_shr(Uint::<LIMBS>::BITS / 2 + 10),
-            BatchSize::SmallInput,
-        )
-    });
 }
 
 fn bench_shr(c: &mut Criterion) {
diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index 9eb95fae..01ae2fd9 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -27,30 +27,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     pub const fn overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
         // `floor(log2(BITS - 1))` is the number of bits in the representation of `shift`
         // (which lies in range `0 <= shift < BITS`).
-        let shift_bits = u32::BITS - (Self::BITS - 1).leading_zeros();
-        let overflow = ConstChoice::from_u32_lt(shift, Self::BITS).not();
-        let shift = shift % Self::BITS;
-        let mut result = *self;
-        let mut i = 0;
-        while i < shift_bits {
-            let bit = ConstChoice::from_u32_lsb((shift >> i) & 1);
-            result = Uint::select(
-                &result,
-                &result
-                    .overflowing_shr_vartime(1 << i)
-                    .expect("shift within range"),
-                bit,
-            );
-            i += 1;
-        }
-
-        ConstCtOption::new(Uint::select(&result, &Self::ZERO, overflow), overflow.not())
-    }
-
-    /// Computes `self >> shift`.
-    ///
-    /// Returns `None` if `shift >= Self::BITS`.
-    pub const fn split_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
+        //
         // Split shift into (shift % Limb::BITS, shift / Limb::BITS)
         // Since Limb::BITS is known to be a power of two, this can also be computed as follows:
         let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
@@ -93,51 +70,12 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     /// Compute `self >> (Limb::BITS * limb_shift)`, for `limb_shift < Self::LIMBS`.
     ///
     /// Returns `None` if `limb_shift >= Self::LIMBS`.
-    #[inline(always)]
+    #[inline]
     pub const fn full_limb_shr(&self, limb_shift: u32) -> ConstCtOption<Self> {
         let shift_bits = u32::BITS - (LIMBS as u32 - 1).leading_zeros();
         let overflow = ConstChoice::from_u32_lt(limb_shift, LIMBS as u32).not();
         let limb_shift = limb_shift % LIMBS as u32;
 
-        let mut result = *self;
-        let mut i = 0;
-        while i < shift_bits {
-            let bit = ConstChoice::from_u32_lsb((limb_shift >> i) & 1);
-            result = Uint::select(
-                &result,
-                &result
-                    .overflowing_shr_vartime(Limb::BITS << i)
-                    .expect("shift within range"),
-                bit,
-            );
-            i += 1;
-        }
-
-        ConstCtOption::new(Uint::select(&result, &Self::ZERO, overflow), overflow.not())
-    }
-
-    /// Computes `self >> shift`.
-    ///
-    /// Returns `None` if `shift >= Self::BITS`.
-    pub const fn fast_split_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
-        // Split shift into (shift % Limb::BITS, shift / Limb::BITS)
-        // Since Limb::BITS is known to be a power of two, this can also be computed as follows:
-        let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
-        let intra_limb_shift = shift & (Limb::BITS - 1);
-        let limb_shift = shift >> limb_bits_bits;
-        self.intra_limb_carrying_shr_internal(intra_limb_shift)
-            .fast_full_limb_shr(limb_shift)
-    }
-
-    /// Compute `self >> (Limb::BITS * limb_shift)`, for `limb_shift < Self::LIMBS`.
-    ///
-    /// Returns `None` if `limb_shift >= Self::LIMBS`.
-    #[inline(always)]
-    pub const fn fast_full_limb_shr(&self, limb_shift: u32) -> ConstCtOption<Self> {
-        let shift_bits = u32::BITS - (LIMBS as u32 - 1).leading_zeros();
-        let overflow = ConstChoice::from_u32_lt(limb_shift, LIMBS as u32).not();
-        let limb_shift = limb_shift % LIMBS as u32;
-
         let mut result = *self;
         let mut i = 0;
         while i < shift_bits {

From 88f19720c277eadd691e5d2c81876b8484cb69eb Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 10:43:26 +0100
Subject: [PATCH 09/21] Fix `Uint::intra_limb_shr` naming

---
 src/uint/shr.rs | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index 01ae2fd9..e0989bb5 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -33,24 +33,15 @@ impl<const LIMBS: usize> Uint<LIMBS> {
         let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
         let intra_limb_shift = shift & (Limb::BITS - 1);
         let limb_shift = shift >> limb_bits_bits;
-        self.intra_limb_carrying_shr_internal(intra_limb_shift)
+        self.intra_limb_shr(intra_limb_shift)
             .full_limb_shr(limb_shift)
     }
 
-    /// Computes `self >> shift`, for `shift < Limb::BITS`.
-    ///
-    /// Returns `None` if `shift >= Limb::BITS`.
-    pub const fn intra_limb_overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
-        let overflow = ConstChoice::from_u32_lt(shift, Limb::BITS).not();
-        let result = self.intra_limb_carrying_shr_internal(shift % Limb::BITS);
-        ConstCtOption::new(Uint::select(&result, &Self::ZERO, overflow), overflow.not())
-    }
-
     /// Computes `self >> shift`, for `shift < Limb::BITS`.
     ///
     /// Panics if `shift >= Limb::BITS`.
     #[inline(always)]
-    const fn intra_limb_carrying_shr_internal(&self, shift: u32) -> Self {
+    const fn intra_limb_shr(&self, shift: u32) -> Self {
         debug_assert!(shift < Limb::BITS);
 
         let (mut result, mut carry) = (*self, Limb::ZERO);
@@ -68,6 +59,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     }
 
     /// Compute `self >> (Limb::BITS * limb_shift)`, for `limb_shift < Self::LIMBS`.
+    /// In other words, shift `self` right by `limb_shift` full limbs.
     ///
     /// Returns `None` if `limb_shift >= Self::LIMBS`.
     #[inline]

From d24b2994e0414e6adc44e7084121e381f6ece876 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 10:44:17 +0100
Subject: [PATCH 10/21] Fix `Uint::full_limb_shr` naming

---
 src/uint/shr.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index e0989bb5..f56b7a09 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -34,7 +34,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
         let intra_limb_shift = shift & (Limb::BITS - 1);
         let limb_shift = shift >> limb_bits_bits;
         self.intra_limb_shr(intra_limb_shift)
-            .full_limb_shr(limb_shift)
+            .full_limb_overflowing_shr(limb_shift)
     }
 
     /// Computes `self >> shift`, for `shift < Limb::BITS`.
@@ -63,7 +63,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     ///
     /// Returns `None` if `limb_shift >= Self::LIMBS`.
     #[inline]
-    pub const fn full_limb_shr(&self, limb_shift: u32) -> ConstCtOption<Self> {
+    pub const fn full_limb_overflowing_shr(&self, limb_shift: u32) -> ConstCtOption<Self> {
         let shift_bits = u32::BITS - (LIMBS as u32 - 1).leading_zeros();
         let overflow = ConstChoice::from_u32_lt(limb_shift, LIMBS as u32).not();
         let limb_shift = limb_shift % LIMBS as u32;

From a49f889847b920a8d0eeb44a1f4a2a0604d39f33 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 10:50:15 +0100
Subject: [PATCH 11/21] Update `Uint::shr` benchmarking

---
 benches/uint.rs | 43 ++++++++++++++++---------------------------
 1 file changed, 16 insertions(+), 27 deletions(-)

diff --git a/benches/uint.rs b/benches/uint.rs
index f4df3d2d..b5643f52 100644
--- a/benches/uint.rs
+++ b/benches/uint.rs
@@ -374,56 +374,45 @@ fn bench_shl(c: &mut Criterion) {
 }
 
 fn shr_benchmark<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
-    group.bench_function(BenchmarkId::new("overflowing_shr_vartime", LIMBS), |b| {
+    group.bench_function(BenchmarkId::new("shr_vartime, small", LIMBS), |b| {
         b.iter_batched(
             || Uint::<LIMBS>::ONE,
-            |x| x.overflowing_shr_vartime(Uint::<LIMBS>::BITS / 2 + 10),
+            |x| x.overflowing_shr_vartime(10),
             BatchSize::SmallInput,
         )
     });
-    group.bench_function(BenchmarkId::new("overflowing_shr", LIMBS), |b| {
+    group.bench_function(BenchmarkId::new("shr_vartime, large", LIMBS), |b| {
         b.iter_batched(
             || Uint::<LIMBS>::ONE,
-            |x| x.overflowing_shr(Uint::<LIMBS>::BITS / 2 + 10),
+            |x| x.overflowing_shr_vartime(Uint::<LIMBS>::BITS / 2 + 10),
             BatchSize::SmallInput,
         )
     });
-}
-
-fn bench_shr(c: &mut Criterion) {
-    let mut group = c.benchmark_group("right shift");
-
-    group.bench_function("shr_vartime, small, U2048", |b| {
+    group.bench_function(BenchmarkId::new("shr_vartime_wide, large", LIMBS), |b| {
         b.iter_batched(
-            || U2048::ONE,
-            |x| x.overflowing_shr_vartime(10),
+            || ( Uint::<LIMBS>::ONE,  Uint::<LIMBS>::ONE),
+            |x| Uint::overflowing_shr_vartime_wide(x, Uint::<LIMBS>::BITS / 2 + 10),
             BatchSize::SmallInput,
         )
     });
-
-    group.bench_function("shr_vartime, large, U2048", |b| {
+    group.bench_function(BenchmarkId::new("shr, small", LIMBS), |b| {
         b.iter_batched(
-            || U2048::ONE,
-            |x| x.overflowing_shr_vartime(1024 + 10),
+            || Uint::<LIMBS>::ONE,
+            |x| x.overflowing_shr(10),
             BatchSize::SmallInput,
         )
     });
-
-    group.bench_function("shr_vartime_wide, large, U2048", |b| {
+    group.bench_function(BenchmarkId::new("shr, large", LIMBS), |b| {
         b.iter_batched(
-            || (U2048::ONE, U2048::ONE),
-            |x| Uint::overflowing_shr_vartime_wide(x, 1024 + 10),
+            || Uint::<LIMBS>::ONE,
+            |x| x.overflowing_shr(Uint::<LIMBS>::BITS / 2 + 10),
             BatchSize::SmallInput,
         )
     });
+}
 
-    group.bench_function("shr, U2048", |b| {
-        b.iter_batched(
-            || U2048::ONE,
-            |x| x.overflowing_shr(1024 + 10),
-            BatchSize::SmallInput,
-        )
-    });
+fn bench_shr(c: &mut Criterion) {
+    let mut group = c.benchmark_group("right shift");
 
     shr_benchmark::<{ U256::LIMBS }>(&mut group);
     shr_benchmark::<{ U512::LIMBS }>(&mut group);

From 271eadb45c8587d9a3b2c941925ede37ea20f9f6 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 10:56:33 +0100
Subject: [PATCH 12/21] Implement `Limb::carrying_shl`

---
 src/limb/shl.rs | 19 ++++++++++++++++++-
 tests/limb.rs   | 14 ++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/limb/shl.rs b/src/limb/shl.rs
index ebcc6dcc..8b5ce583 100644
--- a/src/limb/shl.rs
+++ b/src/limb/shl.rs
@@ -1,6 +1,6 @@
 //! Limb left bitshift
 
-use crate::Limb;
+use crate::{ConstChoice, Limb};
 use core::ops::{Shl, ShlAssign};
 use num_traits::WrappingShl;
 
@@ -17,6 +17,23 @@ impl Limb {
     pub(crate) const fn shl1(self) -> (Self, Self) {
         (Self(self.0 << 1), Self(self.0 >> Self::HI_BIT))
     }
+
+    /// Computes `self << shift` and returns the result as well as the carry: the `shift` _least_
+    /// significant bits of the `carry` are equal to the `shift` _most_ significant bits of `self`.
+    ///
+    /// Panics if `shift` overflows `Limb::BITS`.
+    #[inline(always)]
+    pub const fn carrying_shl(self, shift: u32) -> (Self, Self) {
+        // Note that we can compute carry = self >> (Self::BITS - shift) whenever shift > 0.
+        // However, we need to account for the case that shift = 0:
+        // - the carry should be 0, and
+        // - the value by which carry is right shifted should be made to be < Self::BITS.
+        let shift_is_zero = ConstChoice::from_u32_eq(shift, 0);
+        let carry = Self::select(self, Self::ZERO, shift_is_zero);
+        let left_shift = shift_is_zero.select_u32(Self::BITS - shift, 0);
+
+        (self.shl(shift), carry.shr(left_shift))
+    }
 }
 
 macro_rules! impl_shl {
diff --git a/tests/limb.rs b/tests/limb.rs
index c10bcf65..66e37028 100644
--- a/tests/limb.rs
+++ b/tests/limb.rs
@@ -20,4 +20,18 @@ proptest! {
             assert_eq!(limb.carrying_shr(shift), (limb.shr(shift), limb.shl(Limb::BITS - shift)));
         }
     }
+
+    #[test]
+    fn carrying_shl_doesnt_panic(limb in limb(), shift in 0..32u32) {
+        limb.carrying_shl(shift);
+    }
+
+    #[test]
+    fn carrying_shl(limb in limb(), shift in 0..32u32) {
+        if shift == 0 {
+            assert_eq!(limb.carrying_shl(shift), (limb, Limb::ZERO));
+        } else {
+            assert_eq!(limb.carrying_shl(shift), (limb.shl(shift), limb.shr(Limb::BITS - shift)));
+        }
+    }
 }

From 3d7c87e92f38eea50e87c10e1cf6dd9c5904eba0 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 11:12:03 +0100
Subject: [PATCH 13/21] Speed up `Uint::shl`

---
 src/uint/shl.rs | 67 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 11 deletions(-)

diff --git a/src/uint/shl.rs b/src/uint/shl.rs
index 5ad17a8e..d87e9667 100644
--- a/src/uint/shl.rs
+++ b/src/uint/shl.rs
@@ -27,20 +27,65 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     pub const fn overflowing_shl(&self, shift: u32) -> ConstCtOption<Self> {
         // `floor(log2(BITS - 1))` is the number of bits in the representation of `shift`
         // (which lies in range `0 <= shift < BITS`).
-        let shift_bits = u32::BITS - (Self::BITS - 1).leading_zeros();
-        let overflow = ConstChoice::from_u32_lt(shift, Self::BITS).not();
-        let shift = shift % Self::BITS;
+        //
+        // Split shift into (shift % Limb::BITS, shift / Limb::BITS)
+        // Since Limb::BITS is known to be a power of two, this can also be computed as follows:
+        let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
+        let intra_limb_shift = shift & (Limb::BITS - 1);
+        let limb_shift = shift >> limb_bits_bits;
+        self.intra_limb_shl(intra_limb_shift)
+            .full_limb_overflowing_shl(limb_shift)
+    }
+
+    /// Computes `self << shift`, for `shift < Limb::BITS`.
+    ///
+    /// Panics if `shift >= Limb::BITS`.
+    #[inline(always)]
+    const fn intra_limb_shl(&self, shift: u32) -> Self {
+        debug_assert!(shift < Limb::BITS);
+
+        let (mut result, mut carry) = (*self, Limb::ZERO);
+
+        let limbs = result.as_limbs_mut();
+        let mut i = 0;
+        while i < limbs.len() {
+            let (shifted, new_carry) = limbs[i].carrying_shl(shift);
+            limbs[i] = shifted.bitxor(carry);
+            carry = new_carry;
+
+            i += 1;
+        }
+
+        result
+    }
+
+    /// Compute `self << (Limb::BITS * limb_shift)`, for `limb_shift < Self::LIMBS`.
+    /// In other words, shift `self` left by `limb_shift` full limbs.
+    ///
+    /// Returns `None` if `limb_shift >= Self::LIMBS`.
+    #[inline]
+    pub const fn full_limb_overflowing_shl(&self, limb_shift: u32) -> ConstCtOption<Self> {
+        let shift_bits = u32::BITS - (LIMBS as u32 - 1).leading_zeros();
+        let overflow = ConstChoice::from_u32_lt(limb_shift, LIMBS as u32).not();
+        let limb_shift = limb_shift % LIMBS as u32;
+
         let mut result = *self;
         let mut i = 0;
         while i < shift_bits {
-            let bit = ConstChoice::from_u32_lsb((shift >> i) & 1);
-            result = Uint::select(
-                &result,
-                &result
-                    .overflowing_shl_vartime(1 << i)
-                    .expect("shift within range"),
-                bit,
-            );
+            let bit = ConstChoice::from_u32_lsb((limb_shift >> i) & 1);
+
+            let mut j = Self::LIMBS;
+            let limbs = result.as_limbs_mut();
+            let offset = 1 << i;
+            while j > offset {
+                j -= 1;
+                limbs[j] = Limb::select(limbs[j], limbs[j - offset], bit);
+            }
+            while j > 0 {
+                j -= 1;
+                limbs[j] = Limb::select(limbs[j], Limb::ZERO, bit);
+            }
+
             i += 1;
         }
 

From 6949c330a5ef65d8bf5698ed45be745152f3984e Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 11:26:04 +0100
Subject: [PATCH 14/21] Remove duplicate shr/shl code

---
 src/uint/shl.rs | 46 ++++++++++++++++++++++------------------------
 src/uint/shr.rs | 32 +++++++-------------------------
 2 files changed, 29 insertions(+), 49 deletions(-)

diff --git a/src/uint/shl.rs b/src/uint/shl.rs
index d87e9667..4dc2faa4 100644
--- a/src/uint/shl.rs
+++ b/src/uint/shl.rs
@@ -25,14 +25,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     ///
     /// Returns `None` if `shift >= Self::BITS`.
     pub const fn overflowing_shl(&self, shift: u32) -> ConstCtOption<Self> {
-        // `floor(log2(BITS - 1))` is the number of bits in the representation of `shift`
-        // (which lies in range `0 <= shift < BITS`).
-        //
-        // Split shift into (shift % Limb::BITS, shift / Limb::BITS)
-        // Since Limb::BITS is known to be a power of two, this can also be computed as follows:
-        let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
-        let intra_limb_shift = shift & (Limb::BITS - 1);
-        let limb_shift = shift >> limb_bits_bits;
+        let (intra_limb_shift, limb_shift) = Self::decompose_shift(shift);
         self.intra_limb_shl(intra_limb_shift)
             .full_limb_overflowing_shl(limb_shift)
     }
@@ -108,31 +101,36 @@ impl<const LIMBS: usize> Uint<LIMBS> {
             return ConstCtOption::none(Self::ZERO);
         }
 
-        let shift_num = (shift / Limb::BITS) as usize;
-        let rem = shift % Limb::BITS;
-
+        let (rem, shift_num) = Self::decompose_shift(shift);
+        let shift_num = shift_num as usize;
         let mut i = shift_num;
         while i < LIMBS {
             limbs[i] = self.limbs[i - shift_num];
             i += 1;
         }
 
-        if rem == 0 {
-            return ConstCtOption::some(Self { limbs });
+        let mut shifted = Self { limbs };
+        if rem != 0 {
+            shifted = shifted.intra_limb_shl(rem);
         }
+        ConstCtOption::some(shifted)
+    }
 
-        let mut carry = Limb::ZERO;
-
-        let mut i = shift_num;
-        while i < LIMBS {
-            let shifted = limbs[i].shl(rem);
-            let new_carry = limbs[i].shr(Limb::BITS - rem);
-            limbs[i] = shifted.bitor(carry);
-            carry = new_carry;
-            i += 1;
-        }
+    /// Split `shift` into `shift % Limb::BITS` (its intra-limb-shift component), and
+    /// `shift / Limb::BITS` (its limb-shift component).
+    ///
+    /// This function achieves this without using a division/remainder operation.
+    pub(crate) const fn decompose_shift(shift: u32) -> (u32, u32) {
+        // `limb_bits_bits = log2(Limb::BITS)` is the number of bits in the representation of an
+        // intra-limb shift (which lies in range `0 <= intra_limb_shift < Limb::BITS`).
+        //
+        // Split shift into (shift % Limb::BITS, shift / Limb::BITS)
+        // Since Limb::BITS is known to be a power of two, this can also be computed as follows:
+        let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
+        let intra_limb_shift = shift & (Limb::BITS - 1);
+        let limb_shift = shift >> limb_bits_bits;
 
-        ConstCtOption::some(Self { limbs })
+        (intra_limb_shift, limb_shift)
     }
 
     /// Computes a left shift on a wide input as `(lo, hi)`.
diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index f56b7a09..bcef3e0a 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -25,14 +25,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     ///
     /// Returns `None` if `shift >= Self::BITS`.
     pub const fn overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
-        // `floor(log2(BITS - 1))` is the number of bits in the representation of `shift`
-        // (which lies in range `0 <= shift < BITS`).
-        //
-        // Split shift into (shift % Limb::BITS, shift / Limb::BITS)
-        // Since Limb::BITS is known to be a power of two, this can also be computed as follows:
-        let limb_bits_bits = u32::BITS - (Limb::BITS - 1).leading_zeros();
-        let intra_limb_shift = shift & (Limb::BITS - 1);
-        let limb_shift = shift >> limb_bits_bits;
+        let (intra_limb_shift, limb_shift) = Self::decompose_shift(shift);
         self.intra_limb_shr(intra_limb_shift)
             .full_limb_overflowing_shr(limb_shift)
     }
@@ -107,30 +100,19 @@ impl<const LIMBS: usize> Uint<LIMBS> {
             return ConstCtOption::none(Self::ZERO);
         }
 
-        let shift_num = (shift / Limb::BITS) as usize;
-        let rem = shift % Limb::BITS;
-
+        let (rem, shift_num) = Self::decompose_shift(shift);
+        let shift_num = shift_num as usize;
         let mut i = 0;
         while i < LIMBS - shift_num {
             limbs[i] = self.limbs[i + shift_num];
             i += 1;
         }
 
-        if rem == 0 {
-            return ConstCtOption::some(Self { limbs });
+        let mut shifted = Self { limbs };
+        if rem != 0 {
+            shifted = shifted.intra_limb_shr(rem);
         }
-
-        let mut carry = Limb::ZERO;
-
-        while i > 0 {
-            i -= 1;
-            let shifted = limbs[i].shr(rem);
-            let new_carry = limbs[i].shl(Limb::BITS - rem);
-            limbs[i] = shifted.bitor(carry);
-            carry = new_carry;
-        }
-
-        ConstCtOption::some(Self { limbs })
+        ConstCtOption::some(shifted)
     }
 
     /// Computes a right shift on a wide input as `(lo, hi)`.

From 81de22d6313559c11bcacf62998106effd163138 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 11:39:42 +0100
Subject: [PATCH 15/21] Remove duplicate `Uint::shl` code

---
 src/uint/add_mod.rs |  2 +-
 src/uint/shl.rs     | 32 ++++++++++++--------------------
 2 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/src/uint/add_mod.rs b/src/uint/add_mod.rs
index 506f57c2..a3af579f 100644
--- a/src/uint/add_mod.rs
+++ b/src/uint/add_mod.rs
@@ -38,7 +38,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     ///
     /// Assumes `self` as unbounded integer is `< p`.
     pub const fn double_mod(&self, p: &Self) -> Self {
-        let (w, carry) = self.overflowing_shl1();
+        let (w, carry) = self.carrying_shl1();
 
         // Attempt to subtract the modulus, to ensure the result is in the field.
         let (w, borrow) = w.sbb(p, Limb::ZERO);
diff --git a/src/uint/shl.rs b/src/uint/shl.rs
index 4dc2faa4..5206c294 100644
--- a/src/uint/shl.rs
+++ b/src/uint/shl.rs
@@ -26,15 +26,17 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     /// Returns `None` if `shift >= Self::BITS`.
     pub const fn overflowing_shl(&self, shift: u32) -> ConstCtOption<Self> {
         let (intra_limb_shift, limb_shift) = Self::decompose_shift(shift);
-        self.intra_limb_shl(intra_limb_shift)
+        self.intra_limb_carrying_shl(intra_limb_shift)
+            .0
             .full_limb_overflowing_shl(limb_shift)
     }
 
-    /// Computes `self << shift`, for `shift < Limb::BITS`.
+    /// Computes `self << shift`, for `shift < Limb::BITS`. Also returns a [Limb] containing the
+    /// `carry`.
     ///
     /// Panics if `shift >= Limb::BITS`.
     #[inline(always)]
-    const fn intra_limb_shl(&self, shift: u32) -> Self {
+    const fn intra_limb_carrying_shl(&self, shift: u32) -> (Self, Limb) {
         debug_assert!(shift < Limb::BITS);
 
         let (mut result, mut carry) = (*self, Limb::ZERO);
@@ -49,7 +51,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
             i += 1;
         }
 
-        result
+        (result, carry)
     }
 
     /// Compute `self << (Limb::BITS * limb_shift)`, for `limb_shift < Self::LIMBS`.
@@ -111,7 +113,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
 
         let mut shifted = Self { limbs };
         if rem != 0 {
-            shifted = shifted.intra_limb_shl(rem);
+            shifted = shifted.intra_limb_carrying_shl(rem).0;
         }
         ConstCtOption::some(shifted)
     }
@@ -204,21 +206,11 @@ impl<const LIMBS: usize> Uint<LIMBS> {
         (Uint::<LIMBS>::new(limbs), Limb(carry))
     }
 
-    /// Computes `self << 1` in constant-time, returning [`ConstChoice::TRUE`]
-    /// if the most significant bit was set, and [`ConstChoice::FALSE`] otherwise.
+    /// Computes `self << 1` in constant-time, furthermore returning a [Limb] containing the
+    /// `carry`.
     #[inline(always)]
-    pub(crate) const fn overflowing_shl1(&self) -> (Self, Limb) {
-        let mut ret = Self::ZERO;
-        let mut i = 0;
-        let mut carry = Limb::ZERO;
-        while i < LIMBS {
-            let (shifted, new_carry) = self.limbs[i].shl1();
-            ret.limbs[i] = shifted.bitor(carry);
-            carry = new_carry;
-            i += 1;
-        }
-
-        (ret, carry)
+    pub(crate) const fn carrying_shl1(&self) -> (Self, Limb) {
+        self.intra_limb_carrying_shl(1)
     }
 }
 
@@ -302,7 +294,7 @@ mod tests {
     #[test]
     fn shl1() {
         assert_eq!(N << 1, TWO_N);
-        assert_eq!(N.overflowing_shl1(), (TWO_N, Limb::ONE));
+        assert_eq!(N.carrying_shl1(), (TWO_N, Limb::ONE));
     }
 
     #[test]

From d4385081c10850c62ae8fcedc245e10fb754b486 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 11:39:51 +0100
Subject: [PATCH 16/21] Remove `Limb::shl1`

---
 src/limb/shl.rs | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/limb/shl.rs b/src/limb/shl.rs
index 8b5ce583..850bf64d 100644
--- a/src/limb/shl.rs
+++ b/src/limb/shl.rs
@@ -12,12 +12,6 @@ impl Limb {
         Limb(self.0 << shift)
     }
 
-    /// Computes `self << 1` and return the result and the carry (0 or 1).
-    #[inline(always)]
-    pub(crate) const fn shl1(self) -> (Self, Self) {
-        (Self(self.0 << 1), Self(self.0 >> Self::HI_BIT))
-    }
-
     /// Computes `self << shift` and returns the result as well as the carry: the `shift` _least_
     /// significant bits of the `carry` are equal to the `shift` _most_ significant bits of `self`.
     ///

From aeb0f73f9977821ec6792176279a93b02c5f3f5b Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 11:40:52 +0100
Subject: [PATCH 17/21] Remove duplicate `Uint::shr` code

---
 src/uint/shr.rs | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/uint/shr.rs b/src/uint/shr.rs
index bcef3e0a..1cfd1a5e 100644
--- a/src/uint/shr.rs
+++ b/src/uint/shr.rs
@@ -26,15 +26,17 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     /// Returns `None` if `shift >= Self::BITS`.
     pub const fn overflowing_shr(&self, shift: u32) -> ConstCtOption<Self> {
         let (intra_limb_shift, limb_shift) = Self::decompose_shift(shift);
-        self.intra_limb_shr(intra_limb_shift)
+        self.intra_limb_carrying_shr(intra_limb_shift)
+            .0
             .full_limb_overflowing_shr(limb_shift)
     }
 
-    /// Computes `self >> shift`, for `shift < Limb::BITS`.
+    /// Computes `self >> shift` for `shift < Limb::BITS`. Also returns a [Limb] containing the
+    /// `carry`.
     ///
     /// Panics if `shift >= Limb::BITS`.
     #[inline(always)]
-    const fn intra_limb_shr(&self, shift: u32) -> Self {
+    const fn intra_limb_carrying_shr(&self, shift: u32) -> (Self, Limb) {
         debug_assert!(shift < Limb::BITS);
 
         let (mut result, mut carry) = (*self, Limb::ZERO);
@@ -48,7 +50,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
             carry = new_carry;
         }
 
-        result
+        (result, carry)
     }
 
     /// Compute `self >> (Limb::BITS * limb_shift)`, for `limb_shift < Self::LIMBS`.
@@ -110,7 +112,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
 
         let mut shifted = Self { limbs };
         if rem != 0 {
-            shifted = shifted.intra_limb_shr(rem);
+            shifted = shifted.intra_limb_carrying_shr(rem).0;
         }
         ConstCtOption::some(shifted)
     }
@@ -171,16 +173,7 @@ impl<const LIMBS: usize> Uint<LIMBS> {
     /// if the least significant bit was set, and [`ConstChoice::FALSE`] otherwise.
     #[inline(always)]
     pub(crate) const fn shr1_with_carry(&self) -> (Self, ConstChoice) {
-        let mut ret = Self::ZERO;
-        let mut i = LIMBS;
-        let mut carry = Limb::ZERO;
-        while i > 0 {
-            i -= 1;
-            let (shifted, new_carry) = self.limbs[i].shr1();
-            ret.limbs[i] = shifted.bitor(carry);
-            carry = new_carry;
-        }
-
+        let (ret, carry) = self.intra_limb_carrying_shr(1);
         (ret, ConstChoice::from_word_lsb(carry.0 >> Limb::HI_BIT))
     }
 }

From b138396e7de8e8e99a006574324e8721de47e166 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 11:41:04 +0100
Subject: [PATCH 18/21] Remove `Limb::shr1`

---
 src/limb/shr.rs | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/limb/shr.rs b/src/limb/shr.rs
index bfd67499..6d3d0370 100644
--- a/src/limb/shr.rs
+++ b/src/limb/shr.rs
@@ -11,12 +11,6 @@ impl Limb {
         Limb(self.0 >> shift)
     }
 
-    /// Computes `self >> 1` and return the result and the carry (0 or `1 << HI_BIT`).
-    #[inline(always)]
-    pub(crate) const fn shr1(self) -> (Self, Self) {
-        (Self(self.0 >> 1), Self(self.0 << Self::HI_BIT))
-    }
-
     /// Computes `self >> shift` and returns the result as well as the carry: the `shift` _most_
     /// significant bits of the `carry` are equal to the `shift` _least_ significant bits of `self`.
     ///

From fb01e75ae184a8dc980d86edd7cd6aa5b5d732ca Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 11:43:11 +0100
Subject: [PATCH 19/21] Fix fmt

---
 benches/uint.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benches/uint.rs b/benches/uint.rs
index b5643f52..26c4b61e 100644
--- a/benches/uint.rs
+++ b/benches/uint.rs
@@ -390,7 +390,7 @@ fn shr_benchmark<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
     });
     group.bench_function(BenchmarkId::new("shr_vartime_wide, large", LIMBS), |b| {
         b.iter_batched(
-            || ( Uint::<LIMBS>::ONE,  Uint::<LIMBS>::ONE),
+            || (Uint::<LIMBS>::ONE, Uint::<LIMBS>::ONE),
             |x| Uint::overflowing_shr_vartime_wide(x, Uint::<LIMBS>::BITS / 2 + 10),
             BatchSize::SmallInput,
         )

From ada14bccf100dd1f2755389db3cbd38533855e01 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 11:49:24 +0100
Subject: [PATCH 20/21] Expand `Uint::shl` benchmarking

---
 benches/uint.rs | 64 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/benches/uint.rs b/benches/uint.rs
index 26c4b61e..862291b6 100644
--- a/benches/uint.rs
+++ b/benches/uint.rs
@@ -335,45 +335,59 @@ fn bench_gcd(c: &mut Criterion) {
     group.finish();
 }
 
-fn bench_shl(c: &mut Criterion) {
-    let mut group = c.benchmark_group("left shift");
-
-    group.bench_function("shl_vartime, small, U2048", |b| {
+fn shl_benchmarks<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
+    group.bench_function(BenchmarkId::new("shl_vartime, small", LIMBS), |b| {
         b.iter_batched(
-            || U2048::ONE,
+            || Uint::<LIMBS>::ONE,
             |x| x.overflowing_shl_vartime(10),
             BatchSize::SmallInput,
         )
     });
-
-    group.bench_function("shl_vartime, large, U2048", |b| {
+    group.bench_function(BenchmarkId::new("shl_vartime, large", LIMBS), |b| {
         b.iter_batched(
-            || U2048::ONE,
-            |x| black_box(x.overflowing_shl_vartime(1024 + 10)),
+            || Uint::<LIMBS>::ONE,
+            |x| black_box(x.overflowing_shl_vartime(Uint::<LIMBS>::BITS/2 + 10)),
             BatchSize::SmallInput,
         )
     });
-
-    group.bench_function("shl_vartime_wide, large, U2048", |b| {
+    group.bench_function(BenchmarkId::new("shl_vartime_wide, large", LIMBS), |b| {
         b.iter_batched(
-            || (U2048::ONE, U2048::ONE),
-            |x| Uint::overflowing_shl_vartime_wide(x, 1024 + 10),
+            || (Uint::<LIMBS>::ONE, Uint::<LIMBS>::ONE),
+            |x| Uint::overflowing_shl_vartime_wide(x, Uint::<LIMBS>::BITS/2 + 10),
             BatchSize::SmallInput,
         )
     });
-
-    group.bench_function("shl, U2048", |b| {
+    group.bench_function(BenchmarkId::new("shl, small", LIMBS), |b| {
+        b.iter_batched(
+            || Uint::<LIMBS>::ONE,
+            |x| x.overflowing_shl( 10),
+            BatchSize::SmallInput,
+        )
+    });
+    group.bench_function(BenchmarkId::new("shl, large", LIMBS), |b| {
         b.iter_batched(
-            || U2048::ONE,
-            |x| x.overflowing_shl(1024 + 10),
+            || Uint::<LIMBS>::ONE,
+            |x| x.overflowing_shl( Uint::<LIMBS>::BITS/2 + 10),
             BatchSize::SmallInput,
         )
     });
+}
+
+fn bench_shl(c: &mut Criterion) {
+    let mut group = c.benchmark_group("left shift");
+
+    shl_benchmarks::<{ U256::LIMBS }>(&mut group);
+    shl_benchmarks::<{ U512::LIMBS }>(&mut group);
+    shl_benchmarks::<{ U1024::LIMBS }>(&mut group);
+    shl_benchmarks::<{ U2048::LIMBS }>(&mut group);
+    shl_benchmarks::<{ U4096::LIMBS }>(&mut group);
+    shl_benchmarks::<{ U8192::LIMBS }>(&mut group);
+    shl_benchmarks::<{ U16384::LIMBS }>(&mut group);
 
     group.finish();
 }
 
-fn shr_benchmark<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
+fn shr_benchmarks<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
     group.bench_function(BenchmarkId::new("shr_vartime, small", LIMBS), |b| {
         b.iter_batched(
             || Uint::<LIMBS>::ONE,
@@ -414,13 +428,13 @@ fn shr_benchmark<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
 fn bench_shr(c: &mut Criterion) {
     let mut group = c.benchmark_group("right shift");
 
-    shr_benchmark::<{ U256::LIMBS }>(&mut group);
-    shr_benchmark::<{ U512::LIMBS }>(&mut group);
-    shr_benchmark::<{ U1024::LIMBS }>(&mut group);
-    shr_benchmark::<{ U2048::LIMBS }>(&mut group);
-    shr_benchmark::<{ U4096::LIMBS }>(&mut group);
-    shr_benchmark::<{ U8192::LIMBS }>(&mut group);
-    shr_benchmark::<{ U16384::LIMBS }>(&mut group);
+    shr_benchmarks::<{ U256::LIMBS }>(&mut group);
+    shr_benchmarks::<{ U512::LIMBS }>(&mut group);
+    shr_benchmarks::<{ U1024::LIMBS }>(&mut group);
+    shr_benchmarks::<{ U2048::LIMBS }>(&mut group);
+    shr_benchmarks::<{ U4096::LIMBS }>(&mut group);
+    shr_benchmarks::<{ U8192::LIMBS }>(&mut group);
+    shr_benchmarks::<{ U16384::LIMBS }>(&mut group);
 
     group.finish();
 }

From 3a1eb7a9a4b34df21bcea76fe8d27e8406fb1a05 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Thu, 30 Jan 2025 12:16:44 +0100
Subject: [PATCH 21/21] Fix fmt

---
 benches/uint.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benches/uint.rs b/benches/uint.rs
index 862291b6..7d98b470 100644
--- a/benches/uint.rs
+++ b/benches/uint.rs
@@ -346,28 +346,28 @@ fn shl_benchmarks<const LIMBS: usize>(group: &mut BenchmarkGroup<WallTime>) {
     group.bench_function(BenchmarkId::new("shl_vartime, large", LIMBS), |b| {
         b.iter_batched(
             || Uint::<LIMBS>::ONE,
-            |x| black_box(x.overflowing_shl_vartime(Uint::<LIMBS>::BITS/2 + 10)),
+            |x| black_box(x.overflowing_shl_vartime(Uint::<LIMBS>::BITS / 2 + 10)),
             BatchSize::SmallInput,
         )
     });
     group.bench_function(BenchmarkId::new("shl_vartime_wide, large", LIMBS), |b| {
         b.iter_batched(
             || (Uint::<LIMBS>::ONE, Uint::<LIMBS>::ONE),
-            |x| Uint::overflowing_shl_vartime_wide(x, Uint::<LIMBS>::BITS/2 + 10),
+            |x| Uint::overflowing_shl_vartime_wide(x, Uint::<LIMBS>::BITS / 2 + 10),
             BatchSize::SmallInput,
         )
     });
     group.bench_function(BenchmarkId::new("shl, small", LIMBS), |b| {
         b.iter_batched(
             || Uint::<LIMBS>::ONE,
-            |x| x.overflowing_shl( 10),
+            |x| x.overflowing_shl(10),
             BatchSize::SmallInput,
         )
     });
     group.bench_function(BenchmarkId::new("shl, large", LIMBS), |b| {
         b.iter_batched(
             || Uint::<LIMBS>::ONE,
-            |x| x.overflowing_shl( Uint::<LIMBS>::BITS/2 + 10),
+            |x| x.overflowing_shl(Uint::<LIMBS>::BITS / 2 + 10),
             BatchSize::SmallInput,
         )
     });