From 018dd8f499f4f971638a612c9cc2edff02e11e3a Mon Sep 17 00:00:00 2001
From: hkalbasi <hamidrezakalbasi@protonmail.com>
Date: Thu, 2 Jan 2025 14:38:27 +0330
Subject: [PATCH] Add Burnikel Ziegler algorithm for division

---
 benches/bigint.rs       |   5 ++
 src/bigint.rs           |  10 ++++
 src/biguint/division.rs | 130 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 137 insertions(+), 8 deletions(-)
diff --git a/benches/bigint.rs b/benches/bigint.rs
index 9b1ae2ff..d916a7de 100644
--- a/benches/bigint.rs
+++ b/benches/bigint.rs
@@ -112,6 +112,11 @@ fn divide_2(b: &mut Bencher) {
     divide_bench(b, 1 << 16, 1 << 12);
 }
 
+#[bench]
+fn divide_3(b: &mut Bencher) {
+    divide_bench(b, 1 << 20, 1 << 16);
+}
+
 #[bench]
 fn divide_big_little(b: &mut Bencher) {
     divide_bench(b, 1 << 16, 1 << 4);
diff --git a/src/bigint.rs b/src/bigint.rs
index b4f84b9e..f6fb983a 100644
--- a/src/bigint.rs
+++ b/src/bigint.rs
@@ -975,6 +975,16 @@ impl BigInt {
         self.data.bits()
     }
 
+    /// Converts this [`BigInt`] into a [`BigUint`], if it's not negative.
+    #[inline]
+    pub fn into_biguint(self) -> Option<BigUint> {
+        match self.sign {
+            Plus => Some(self.data),
+            NoSign => Some(BigUint::ZERO),
+            Minus => None,
+        }
+    }
+    
     /// Converts this [`BigInt`] into a [`BigUint`], if it's not negative.
     #[inline]
     pub fn to_biguint(&self) -> Option<BigUint> {
diff --git a/src/biguint/division.rs b/src/biguint/division.rs
index 3dfb0bbb..cf6f484b 100644
--- a/src/biguint/division.rs
+++ b/src/biguint/division.rs
@@ -1,14 +1,15 @@
 use super::addition::__add2;
 use super::{cmp_slice, BigUint};
 
-use crate::big_digit::{self, BigDigit, DoubleBigDigit};
-use crate::UsizePromotion;
+use crate::big_digit::{self, BigDigit, DoubleBigDigit, BITS};
+use crate::biguint::IntDigits;
+use crate::{BigInt, UsizePromotion};
 
 use core::cmp::Ordering::{Equal, Greater, Less};
 use core::mem;
 use core::ops::{Div, DivAssign, Rem, RemAssign};
 use num_integer::Integer;
-use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero};
+use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, Signed, ToPrimitive, Zero};
 
 pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
 
@@ -197,9 +198,9 @@ fn div_rem(mut u: BigUint, mut d: BigUint) -> (BigUint, BigUint) {
 
     if shift == 0 {
         // no need to clone d
-        div_rem_core(u, &d.data)
+        div_rem_core(u, &d)
     } else {
-        let (q, r) = div_rem_core(u << shift, &(d << shift).data);
+        let (q, r) = div_rem_core(u << shift, &(d << shift));
         // renormalize the remainder
         (q, r >> shift)
     }
@@ -239,17 +240,130 @@ pub(super) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
 
     if shift == 0 {
         // no need to clone d
-        div_rem_core(u.clone(), &d.data)
+        div_rem_core(u.clone(), &d)
     } else {
-        let (q, r) = div_rem_core(u << shift, &(d << shift).data);
+        let (q, r) = div_rem_core(u << shift, &(d << shift));
         // renormalize the remainder
         (q, r >> shift)
     }
 }
 
+const BURNIKEL_ZIEGLER_THRESHOLD: usize = 64;
+
+/// This algorithm is from Burnikel and Ziegler, "Fast Recursive Division", Algorithm 1.
+/// It is a recursive algorithm that divides the dividend and divisor into blocks of digits
+/// and uses a divide-and-conquer approach to find the quotient.
+///
+/// The algorithm is more complex than the base algorithm, but it is faster for large operands.
+///
+/// Time complexity of this algorithm is the same as the algorithm used for the multiplication.
+/// 
+/// link: https://pure.mpg.de/rest/items/item_1819444_4/component/file_2599480/content 
+fn div_rem_burnikel_ziegler(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
+    fn divide_biguint(mut b: BigUint, level: usize) -> (BigUint, BigUint) {
+        if b.len() <= level {
+            return (BigUint::ZERO, b);
+        }
+        let b1_data = b.data[level..].to_vec();
+        b.data.truncate(level);
+        (
+            BigUint { data: b1_data }.normalized(),
+            b.normalized(),
+        )
+    }
+
+    fn normalizing_shift_amount(b: &BigUint, level: usize) -> usize {
+        (level - b.len() + 1) * BITS as usize - b.data.last().unwrap().ilog2() as usize - 1
+    }
+
+    fn concat_biguint(b1: &BigUint, b2: BigUint, level: usize) -> BigUint {
+        let mut data = b2.data;
+        data.reserve(level + b1.len() - data.len());
+        data.extend(std::iter::repeat(0).take(level - data.len()));
+        data.extend_from_slice(&b1.data);
+        BigUint { data }.normalized()
+    }
+
+    fn div_two_digit_by_one(
+        ah: BigUint,
+        al: BigUint,
+        b: BigUint,
+        level: usize,
+    ) -> (BigUint, BigUint) {
+        // A precondition of this function is that q fits into a single digit.
+        debug_assert!(ah < b);
+        if level <= BURNIKEL_ZIEGLER_THRESHOLD {
+            return div_rem(concat_biguint(&ah, al.clone(), level), b);
+        }
+        let shift = normalizing_shift_amount(&b, level);
+        if shift != 0 {
+            let b = b << shift;
+            let (ah, al) = divide_biguint(concat_biguint(&ah, al.clone(), level) << shift, level);
+            let (q, r) = div_two_digit_by_one_normalized(ah, al, b, level);
+            (q, r >> shift)
+        } else {
+            div_two_digit_by_one_normalized(ah, al, b, level)
+        }
+    }
+
+    fn div_two_digit_by_one_normalized(
+        ah: BigUint,
+        al: BigUint,
+        b: BigUint,
+        level: usize,
+    ) -> (BigUint, BigUint) {
+        let level = level / 2;
+        let (a1, a2) = divide_biguint(ah, level);
+        let (a3, a4) = divide_biguint(al, level);
+        let (b1, b2) = divide_biguint(b, level);
+        let (q1, r) = div_three_halves_by_two(a1, a2, a3, b1.clone(), b2.clone(), level);
+        let (r1, r2) = divide_biguint(r, level);
+        let (q2, s) = div_three_halves_by_two(r1, r2, a4, b1, b2, level);
+        (concat_biguint(&q1, q2, level), s)
+    }
+
+    fn div_three_halves_by_two(
+        a1: BigUint,
+        a2: BigUint,
+        a3: BigUint,
+        b1: BigUint,
+        b2: BigUint,
+        level: usize,
+    ) -> (BigUint, BigUint) {
+        let (mut q, c) = div_two_digit_by_one(a1, a2, b1.clone(), level);
+        let mut r = BigInt::from(concat_biguint(&c, a3, level)) - BigInt::from(&q * &b2);
+        let b = concat_biguint(&b1, b2, level);
+        if r.is_negative() {
+            q -= 1u32;
+            r += BigInt::from(b.clone());
+            if r.is_negative() {
+                q -= 1u32;
+                r += BigInt::from(b.clone());
+            }
+        }
+        (q, r.into_biguint().unwrap())
+    }
+
+    let mut level = 1 << (u.data.len().ilog2());
+    if d.len() > level {
+        level *= 2;
+    }
+    let (u1, u2) = divide_biguint(u.clone(), level);
+    if &u1 > d {
+        div_two_digit_by_one(BigUint::ZERO, u.clone(), d.clone(), level * 2)    
+    } else {
+        div_two_digit_by_one(u1, u2, d.clone(), level)
+    }
+}
+
 /// An implementation of the base division algorithm.
 /// Knuth, TAOCP vol 2 section 4.3.1, algorithm D, with an improvement from exercises 19-21.
-fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
+fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
+    if a.len() > BURNIKEL_ZIEGLER_THRESHOLD * 2 && b.len() > BURNIKEL_ZIEGLER_THRESHOLD {
+        return div_rem_burnikel_ziegler(&a, b);
+    }
+
+    let b = &*b.data;
     debug_assert!(a.data.len() >= b.len() && b.len() > 1);
     debug_assert!(b.last().unwrap().leading_zeros() == 0);