specialized_div_rem/
binary_long.rs

1/// Creates unsigned and signed division functions that use binary long division, designed for
2/// computer architectures without division instructions. These functions have good performance for
3/// microarchitectures with large branch miss penalties and architectures without the ability to
4/// predicate instructions. For architectures with predicated instructions, one of the algorithms
5/// described in the documentation of these functions probably has higher performance, and a custom
6/// assembly routine should be used instead.
7#[macro_export]
8macro_rules! impl_binary_long {
9    (
10        $unsigned_name:ident, // name of the unsigned division function
11        $signed_name:ident, // name of the signed division function
12        $zero_div_fn:ident, // function called when division by zero is attempted
13        $normalization_shift:ident, // function for finding the normalization shift
14        $n:tt, // the number of bits in a $iX or $uX
15        $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
16        $iX:ident, // signed integer type for the inputs and outputs of `$signed_name`
17        $($unsigned_attr:meta),*; // attributes for the unsigned function
18        $($signed_attr:meta),* // attributes for the signed function
19    ) => {
20        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
21        /// tuple.
22        $(
23            #[$unsigned_attr]
24        )*
25        pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX, $uX) {
26            let mut duo = duo;
27            // handle edge cases before calling `$normalization_shift`
28            if div == 0 {
29                $zero_div_fn()
30            }
31            if duo < div {
32                return (0, duo)
33            }
34
35            // There are many variations of binary division algorithm that could be used. This
36            // documentation gives a tour of different methods so that future readers wanting to
37            // optimize further do not have to painstakingly derive them. The SWAR variation is
38            // especially hard to understand without reading the less convoluted methods first.
39
40            // You may notice that a `duo < div_original` check is included in many these
41            // algorithms. A critical optimization that many algorithms miss is handling of
42            // quotients that will turn out to have many trailing zeros or many leading zeros. This
43            // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
44            // in cases where the quotient is small. The `duo < div_original` check handles these
45            // cases of early returns and ends up replacing other kinds of mundane checks that
46            // normally terminate a binary division algorithm.
47            //
48            // Something you may see in other algorithms that is not special-cased here is checks
49            // for division by powers of two. The `duo < div_original` check handles this case and
50            // more, however it can be checked up front before the bisection using the
51            // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
52            // compilers should handle most cases where divisions by power of two occur, and we do
53            // not want to add on a few cycles for every division operation just to save a few
54            // cycles rarely.
55
56            // The following example is the most straightforward translation from the way binary
57            // long division is typically visualized:
58            // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
59            // the result from `$normalization_shift(duo, div, false)`.
60            //
61            // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
62            // and `duo` is kept unchanged.
63            // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
64            //
65            // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
66            // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
67            //
68            // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
69            // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
70            // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
71            // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
72            // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
73            // The `duo < div_original` check terminates the algorithm with the correct quotient of
74            // 29u8 and remainder of 4u8
75            /*
76            let div_original = div;
77            let mut shl = $normalization_shift(duo, div, false);
78            let mut quo = 0;
79            loop {
80                let div_shifted = div << shl;
81                let sub = duo.wrapping_sub(div_shifted);
82                // it is recommended to use `println!`s like this if functionality is unclear
83                /*
84                println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
85                    duo,
86                    div_shifted,
87                    sub,
88                    quo,
89                    shl
90                );
91                */
92                if 0 <= (sub as $iX) {
93                    duo = sub;
94                    quo += 1 << shl;
95                    if duo < div_original {
96                        // this branch is optional
97                        return (quo, duo)
98                    }
99                }
100                if shl == 0 {
101                    return (quo, duo)
102                }
103                shl -= 1;
104            }
105            */
106
107            // This restoring binary long division algorithm reduces the number of operations
108            // overall via:
109            // - `pow` can be shifted right instead of recalculating from `shl`
110            // - starting `div` shifted left and shifting it right for each step instead of
111            //   recalculating from `shl`
112            // - The `duo < div_original` branch is used to terminate the algorithm instead of the
113            //   `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
114            //   `div` from being shifted off the end. This check also only occurs on half of steps
115            //   on average, since it is behind the `(sub as $iX) >= 0` branch.
116            // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are
117            //   being updated between steps
118            //
119            // There are many variations of this algorithm, but this encompases the largest number
120            // of architectures and does not rely on carry flags, add-with-carry, or SWAR
121            // complications to be decently fast.
122            /*
123            let div_original = div;
124            let shl = $normalization_shift(duo, div, false);
125            let mut div: $uX = div << shl;
126            let mut pow: $uX = 1 << shl;
127            let mut quo: $uX = 0;
128            loop {
129                let sub = duo.wrapping_sub(div);
130                if 0 <= (sub as $iX) {
131                    duo = sub;
132                    quo |= pow;
133                    if duo < div_original {
134                        return (quo, duo)
135                    }
136                }
137                div >>= 1;
138                pow >>= 1;
139            }
140            */
141
142            // If the architecture has flags and predicated arithmetic instructions, it is possible
143            // to do binary long division without branching and in only 3 or 4 instructions. This is
144            // a variation of a 3 instruction central loop from
145            // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
146            //
147            // What allows doing division in only 3 instructions is realizing that instead of
148            // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
149            // place and `duo` can be shifted left. This means `div` does not have to be updated,
150            // but causes edge case problems and makes `duo < div_original` tests harder. Some
151            // architectures have an option to shift an argument in an arithmetic operation, which
152            // means `duo` can be shifted left and subtracted from in one instruction. The other two
153            // instructions are updating `quo` and undoing the subtraction if it turns out things
154            // were not normalized.
155
156            /*
157            // Perform one binary long division step on the already normalized arguments, because
158            // the main. Note that this does a full normalization since the central loop needs
159            // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
160            // variation only did normalization to the nearest 4 steps, but this makes handling edge
161            // cases much harder. We do a full normalization and perform a binary long division
162            // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
163            // of `duo`, then the edge case handler shifts `div` right and does another long
164            // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
165            let div_original = div;
166            let mut shl = $normalization_shift(duo, div, true);
167            let mut div: $uX = (div << shl);
168            let mut quo: $uX = 1;
169            duo = duo.wrapping_sub(div);
170            if duo < div_original {
171                return (1 << shl, duo);
172            }
173            let div_neg: $uX;
174            if (div as $iX) < 0 {
175                // A very ugly edge case where the most significant bit of `div` is set (after
176                // shifting to match `duo` when its most significant bit is at the sign bit), which
177                // leads to the sign bit of `div_neg` being cut off and carries not happening when
178                // they should. This branch performs a long division step that keeps `duo` in place
179                // and shifts `div` down.
180                div >>= 1;
181                div_neg = div.wrapping_neg();
182                let (sub, carry) = duo.overflowing_add(div_neg);
183                duo = sub;
184                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
185                if !carry {
186                    duo = duo.wrapping_add(div);
187                }
188                shl -= 1;
189            } else {
190                div_neg = div.wrapping_neg();
191            }
192            // The add-with-carry that updates `quo` needs to have the carry set when a normalized
193            // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
194            // subtraction generates a carry when an unnormalized subtract happens, which is the
195            // opposite of what we want. Instead, we use
196            // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
197            let mut i = shl;
198            loop {
199                if i == 0 {
200                    break;
201                }
202                i -= 1;
203                // `ADDS duo, div, duo, LSL #1`
204                // (add `div` to `duo << 1` and set flags)
205                let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
206                duo = sub;
207                // `ADC quo, quo, quo`
208                // (add with carry). Effectively shifts `quo` left by 1 and sets the least
209                // significant bit to the carry.
210                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
211                // `ADDCC duo, duo, div`
212                // (add if carry clear). Undoes the subtraction if no carry was generated.
213                if !carry {
214                    duo = duo.wrapping_add(div);
215                }
216            }
217            return (quo, duo >> shl);
218            */
219
220            // This is the SWAR (SIMD within in a register) restoring division algorithm.
221            // This combines several ideas of the above algorithms:
222            //  - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
223            //    restoring division algorithm, some architectures can do the shifting and
224            //    subtraction step in one instruction.
225            //  - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
226            //    and adding one.
227            //  - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
228            //    LSB, so what if we use those bits to store `quo`?
229            // Through a complex setup, it is possible to manage `duo` and `quo` in the same
230            // register, and perform one step with 2 or 3 instructions. The only major downsides are
231            // that there is significant setup (it is only saves instructions if `shl` is
232            // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
233            // initiated, and the number of division steps taken has to be exact (we cannot do more
234            // division steps than `shl`, because it introduces edge cases where quotient bits in
235            // `duo` start to collide with the real part of `div`.
236            /*
237            // first step. The quotient bit is stored in `quo` for now
238            let div_original = div;
239            let mut shl = $normalization_shift(duo, div, true);
240            let mut div: $uX = (div << shl);
241            duo = duo.wrapping_sub(div);
242            let mut quo: $uX = 1 << shl;
243            if duo < div_original {
244                return (quo, duo);
245            }
246
247            let mask: $uX;
248            if (div as $iX) < 0 {
249                // deal with same edge case as the 3 instruction restoring division algorithm, but
250                // the quotient bit from this step also has to be stored in `quo`
251                div >>= 1;
252                shl -= 1;
253                let tmp = 1 << shl;
254                mask = tmp - 1;
255                let sub = duo.wrapping_sub(div);
256                if (sub as $iX) >= 0 {
257                    // restore
258                    duo = sub;
259                    quo |= tmp;
260                }
261                if duo < div_original {
262                    return (quo, duo);
263                }
264            } else {
265                mask = quo - 1;
266            }
267            // There is now room for quotient bits in `duo`.
268
269            // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
270            // `div` and end up with the subset of `shl` bits being all being set. This subset acts
271            // just like a two's complement negative one. The subset of `div` containing the divisor
272            // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
273            // as long as the quotient stays positive.
274            //
275            // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
276            // adds a quotient bit to the least significant bit.
277            // For example, 89 (0b01011001) divided by 3 (0b11):
278            //
279            // shl:4, div:0b00110000
280            // first step:
281            //       duo:0b01011001
282            // + div_neg:0b11010000
283            // ____________________
284            //           0b00101001
285            // quo is set to 0b00010000 and mask is set to 0b00001111 for later
286            //
287            // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
288            // quotient part of `duo` with `^`s.
289            // chars.
290            //     div:0b00110000
291            //               ^^^^
292            //   +     0b11111111
293            //   ________________
294            //         0b00101111
295            //               ^^^^
296            // div_neg:0b11010001
297            //
298            // first SWAR step:
299            //  duo_shl1:0b01010010
300            //                    ^
301            // + div_neg:0b11010001
302            // ____________________
303            //           0b00100011
304            //                    ^
305            // second:
306            //  duo_shl1:0b01000110
307            //                   ^^
308            // + div_neg:0b11010001
309            // ____________________
310            //           0b00010111
311            //                   ^^
312            // third:
313            //  duo_shl1:0b00101110
314            //                  ^^^
315            // + div_neg:0b11010001
316            // ____________________
317            //           0b11111111
318            //                  ^^^
319            // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
320            // part of `duo` is negative and the third step was an unnormalized step. The restore
321            // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
322            //
323            //   duo:0b11111111
324            //              ^^^
325            // + div:0b00101111
326            //             ^^^^
327            // ________________
328            //       0b00101110
329            //              ^^^
330            // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an
331            // unset quotient bit.
332            //
333            // final step (`shl` was 4, so exactly 4 steps must be taken)
334            //  duo_shl1:0b01011100
335            //                 ^^^^
336            // + div_neg:0b11010001
337            // ____________________
338            //           0b00101101
339            //                 ^^^^
340            // The quotient includes the `^` bits added with the `quo` bits from the beginning that
341            // contained the first step and potential edge case step,
342            // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
343            // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
344            // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
345            let div: $uX = div.wrapping_sub(1);
346            let mut i = shl;
347            loop {
348                if i == 0 {
349                    break;
350                }
351                i -= 1;
352                duo = duo.wrapping_shl(1).wrapping_sub(div);
353                if (duo as $iX) < 0 {
354                    // restore
355                    duo = duo.wrapping_add(div);
356                }
357            }
358            // unpack the results of SWAR
359            return ((duo & mask) | quo, duo >> shl);
360            */
361
362            // The problem with the conditional restoring SWAR algorithm above is that, in practice,
363            // it requires assembly code to bring out its full unrolled potential (It seems that
364            // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
365            // that my algorithm intends. On architectures without predicated instructions, the code
366            // gen is especially bad. We need a default software division algorithm that is
367            // guaranteed to get decent code gen for the central loop.
368
369            // For non-SWAR algorithms, there is a way to do binary long division without
370            // predication or even branching. This involves creating a mask from the sign bit and
371            // performing different kinds of steps using that.
372            /*
373            let shl = $normalization_shift(duo, div, true);
374            let mut div: $uX = div << shl;
375            let mut pow: $uX = 1 << shl;
376            let mut quo: $uX = 0;
377            loop {
378                let sub = duo.wrapping_sub(div);
379                let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
380                duo -= div & sign_mask;
381                quo |= pow & sign_mask;
382                div >>= 1;
383                pow >>= 1;
384                if pow == 0 {
385                    break;
386                }
387            }
388            return (quo, duo);
389            */
390            // However, it requires about 4 extra operations (smearing the sign bit, negating the
391            // mask, and applying the mask twice) on top of the operations done by the actual
392            // algorithm. With SWAR however, just 2 extra operations are needed, making it
393            // practical and even the most optimal algorithm for some architectures.
394
395            // What we do is use custom assembly for predicated architectures that need software
396            // division, and for the default algorithm use a mask based restoring SWAR algorithm
397            // without conditionals or branches. On almost all architectures, this Rust code is
398            // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
399            // will unroll it in a decent way.
400
401            // standard opening for SWAR algorithm with first step and edge case handling
402            let div_original = div;
403            let mut shl = $normalization_shift(duo, div, true);
404            let mut div: $uX = (div << shl);
405            duo = duo.wrapping_sub(div);
406            let mut quo: $uX = 1 << shl;
407            if duo < div_original {
408                return (quo, duo);
409            }
410            let mask: $uX;
411            if (div as $iX) < 0 {
412                div >>= 1;
413                shl -= 1;
414                let tmp = 1 << shl;
415                mask = tmp - 1;
416                let sub = duo.wrapping_sub(div);
417                if (sub as $iX) >= 0 {
418                    duo = sub;
419                    quo |= tmp;
420                }
421                if duo < div_original {
422                    return (quo, duo);
423                }
424            } else {
425                mask = quo - 1;
426            }
427
428            // central loop
429            div = div.wrapping_sub(1);
430            let mut i = shl;
431            loop {
432                if i == 0 {
433                    break
434                }
435                i -= 1;
436                // shift left 1 and subtract
437                duo = duo.wrapping_shl(1).wrapping_sub(div);
438                // create mask
439                let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
440                // restore
441                duo = duo.wrapping_add(div & mask);
442            }
443            // unpack
444            return ((duo & mask) | quo, duo >> shl);
445
446            // miscellanious binary long division algorithms that might be better for specific
447            // architectures
448
449            // Another kind of long division uses an interesting fact that `div` and `pow` can be
450            // negated when `duo` is negative to perform a "negated" division step that works in
451            // place of any normalization mechanism. This is a non-restoring division algorithm that
452            // is very similar to the non-restoring division algorithms that can be found on the
453            // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
454            // be viewed as shifting the least significant set bit right (e.x. if we enter a series
455            // of negated binary long division steps starting with `quo == 0b1011_0000` and
456            // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
457            // 0b1010_0010, 0b1010_0001).
458            /*
459            let div_original = div;
460            let shl = $normalization_shift(duo, div, true);
461            let mut div: $uX = (div << shl);
462            let mut pow: $uX = 1 << shl;
463            let mut quo: $uX = pow;
464            duo = duo.wrapping_sub(div);
465            if duo < div_original {
466                return (quo, duo);
467            }
468            div >>= 1;
469            pow >>= 1;
470            loop {
471                if (duo as $iX) < 0 {
472                    // Negated binary long division step.
473                    duo = duo.wrapping_add(div);
474                    quo = quo.wrapping_sub(pow);
475                } else {
476                    // Normal long division step.
477                    if duo < div_original {
478                        return (quo, duo)
479                    }
480                    duo = duo.wrapping_sub(div);
481                    quo = quo.wrapping_add(pow);
482                }
483                pow >>= 1;
484                div >>= 1;
485            }
486            */
487
488            // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
489            // SWAR techniques that makes the only difference between steps be negation of `div`.
490            // If there was an architecture with an instruction that negated inputs to an adder
491            // based on conditionals, and in place shifting (or a three input addition operation
492            // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
493            // single instruction central loop is possible. Microarchitectures often have inputs to
494            // their ALU that can invert the arguments and carry in of adders, but the architectures
495            // unfortunately do not have an instruction to dynamically invert this input based on
496            // conditionals.
497            /*
498            // SWAR opening
499            let div_original = div;
500            let mut shl = $normalization_shift(duo, div, true);
501            let mut div: $uX = (div << shl);
502            duo = duo.wrapping_sub(div);
503            let mut quo: $uX = 1 << shl;
504            if duo < div_original {
505                return (quo, duo);
506            }
507            let mask: $uX;
508            if (div as $iX) < 0 {
509                div >>= 1;
510                shl -= 1;
511                let tmp = 1 << shl;
512                let sub = duo.wrapping_sub(div);
513                if (sub as $iX) >= 0 {
514                    // restore
515                    duo = sub;
516                    quo |= tmp;
517                }
518                if duo < div_original {
519                    return (quo, duo);
520                }
521                mask = tmp - 1;
522            } else {
523                mask = quo - 1;
524            }
525
526            // central loop
527            let div: $uX = div.wrapping_sub(1);
528            let mut i = shl;
529            loop {
530                if i == 0 {
531                    break;
532                }
533                i -= 1;
534                // note: the `wrapping_shl(1)` can be factored out, but would require another
535                // restoring division step to prevent `(duo as $iX)` from overflowing
536                if (duo as $iX) < 0 {
537                    // Negated binary long division step.
538                    duo = duo.wrapping_shl(1).wrapping_add(div);
539                } else {
540                    // Normal long division step.
541                    duo = duo.wrapping_shl(1).wrapping_sub(div);
542                }
543            }
544            if (duo as $iX) < 0 {
545                // Restore. This was not needed in the original nonrestoring algorithm because of
546                // the `duo < div_original` checks.
547                duo = duo.wrapping_add(div);
548            }
549            // unpack
550            return ((duo & mask) | quo, duo >> shl);
551            */
552        }
553
554        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
555        /// tuple.
556        $(
557            #[$signed_attr]
558        )*
559        pub fn $signed_name(duo: $iX, div: $iX) -> ($iX, $iX) {
560            // There is a way of doing this without any branches, but requires too many extra
561            // operations to be faster.
562            /*
563            let duo_s = duo >> ($n - 1);
564            let div_s = div >> ($n - 1);
565            let duo = (duo ^ duo_s).wrapping_sub(duo_s);
566            let div = (div ^ div_s).wrapping_sub(div_s);
567            let quo_s = duo_s ^ div_s;
568            let rem_s = duo_s;
569            let tmp = $unsigned_name(duo as $uX, div as $uX);
570            (
571                ((tmp.0 as $iX) ^ quo_s).wrapping_sub(quo_s),
572                ((tmp.1 as $iX) ^ rem_s).wrapping_sub(rem_s),
573            )
574            */
575            // this is problematic because LLVM likes to inline 4 times over
576            /*
577            match (duo < 0, div < 0) {
578                (false, false) => {
579                    let t = $unsigned_name(duo as $uX, div as $uX);
580                    (t.0 as $iX, t.1 as $iX)
581                },
582                (true, false) => {
583                    let t = $unsigned_name(duo.wrapping_neg() as $uX, div as $uX);
584                    ((t.0 as $iX).wrapping_neg(), (t.1 as $iX).wrapping_neg())
585                },
586                (false, true) => {
587                    let t = $unsigned_name(duo as $uX, div.wrapping_neg() as $uX);
588                    ((t.0 as $iX).wrapping_neg(), t.1 as $iX)
589                },
590                (true, true) => {
591                    let t = $unsigned_name(duo.wrapping_neg() as $uX, div.wrapping_neg() as $uX);
592                    (t.0 as $iX, (t.1 as $iX).wrapping_neg())
593                },
594            }
595            */
596            // this retains the ability of LLVM to eliminate branches
597            let duo_neg = duo < 0;
598            let div_neg = div < 0;
599            let mut duo = duo;
600            let mut div = div;
601            if duo_neg {
602                duo = duo.wrapping_neg();
603            }
604            if div_neg {
605                div = div.wrapping_neg();
606            }
607            let t = $unsigned_name(duo as $uX, div as $uX);
608            let mut quo = t.0 as $iX;
609            let mut rem = t.1 as $iX;
610            if duo_neg {
611                rem = rem.wrapping_neg();
612            }
613            if duo_neg != div_neg {
614                quo = quo.wrapping_neg();
615            }
616            (quo, rem)
617        }
618    }
619}