specialized_div_rem/binary_long.rs
1/// Creates unsigned and signed division functions that use binary long division, designed for
2/// computer architectures without division instructions. These functions have good performance for
3/// microarchitectures with large branch miss penalties and architectures without the ability to
4/// predicate instructions. For architectures with predicated instructions, one of the algorithms
5/// described in the documentation of these functions probably has higher performance, and a custom
6/// assembly routine should be used instead.
7#[macro_export]
8macro_rules! impl_binary_long {
9 (
10 $unsigned_name:ident, // name of the unsigned division function
11 $signed_name:ident, // name of the signed division function
12 $zero_div_fn:ident, // function called when division by zero is attempted
13 $normalization_shift:ident, // function for finding the normalization shift
14 $n:tt, // the number of bits in a $iX or $uX
15 $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
16 $iX:ident, // signed integer type for the inputs and outputs of `$signed_name`
17 $($unsigned_attr:meta),*; // attributes for the unsigned function
18 $($signed_attr:meta),* // attributes for the signed function
19 ) => {
20 /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
21 /// tuple.
22 $(
23 #[$unsigned_attr]
24 )*
25 pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX, $uX) {
26 let mut duo = duo;
27 // handle edge cases before calling `$normalization_shift`
28 if div == 0 {
29 $zero_div_fn()
30 }
31 if duo < div {
32 return (0, duo)
33 }
34
35 // There are many variations of binary division algorithm that could be used. This
36 // documentation gives a tour of different methods so that future readers wanting to
37 // optimize further do not have to painstakingly derive them. The SWAR variation is
38 // especially hard to understand without reading the less convoluted methods first.
39
40 // You may notice that a `duo < div_original` check is included in many these
41 // algorithms. A critical optimization that many algorithms miss is handling of
42 // quotients that will turn out to have many trailing zeros or many leading zeros. This
43 // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
44 // in cases where the quotient is small. The `duo < div_original` check handles these
45 // cases of early returns and ends up replacing other kinds of mundane checks that
46 // normally terminate a binary division algorithm.
47 //
48 // Something you may see in other algorithms that is not special-cased here is checks
49 // for division by powers of two. The `duo < div_original` check handles this case and
50 // more, however it can be checked up front before the bisection using the
51 // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
52 // compilers should handle most cases where divisions by power of two occur, and we do
53 // not want to add on a few cycles for every division operation just to save a few
54 // cycles rarely.
55
56 // The following example is the most straightforward translation from the way binary
57 // long division is typically visualized:
58 // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
59 // the result from `$normalization_shift(duo, div, false)`.
60 //
61 // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
62 // and `duo` is kept unchanged.
63 // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
64 //
65 // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
66 // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
67 //
68 // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
69 // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
70 // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
71 // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
72 // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
73 // The `duo < div_original` check terminates the algorithm with the correct quotient of
74 // 29u8 and remainder of 4u8
75 /*
76 let div_original = div;
77 let mut shl = $normalization_shift(duo, div, false);
78 let mut quo = 0;
79 loop {
80 let div_shifted = div << shl;
81 let sub = duo.wrapping_sub(div_shifted);
82 // it is recommended to use `println!`s like this if functionality is unclear
83 /*
84 println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
85 duo,
86 div_shifted,
87 sub,
88 quo,
89 shl
90 );
91 */
92 if 0 <= (sub as $iX) {
93 duo = sub;
94 quo += 1 << shl;
95 if duo < div_original {
96 // this branch is optional
97 return (quo, duo)
98 }
99 }
100 if shl == 0 {
101 return (quo, duo)
102 }
103 shl -= 1;
104 }
105 */
106
107 // This restoring binary long division algorithm reduces the number of operations
108 // overall via:
109 // - `pow` can be shifted right instead of recalculating from `shl`
110 // - starting `div` shifted left and shifting it right for each step instead of
111 // recalculating from `shl`
112 // - The `duo < div_original` branch is used to terminate the algorithm instead of the
113 // `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
114 // `div` from being shifted off the end. This check also only occurs on half of steps
115 // on average, since it is behind the `(sub as $iX) >= 0` branch.
116 // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are
117 // being updated between steps
118 //
119 // There are many variations of this algorithm, but this encompases the largest number
120 // of architectures and does not rely on carry flags, add-with-carry, or SWAR
121 // complications to be decently fast.
122 /*
123 let div_original = div;
124 let shl = $normalization_shift(duo, div, false);
125 let mut div: $uX = div << shl;
126 let mut pow: $uX = 1 << shl;
127 let mut quo: $uX = 0;
128 loop {
129 let sub = duo.wrapping_sub(div);
130 if 0 <= (sub as $iX) {
131 duo = sub;
132 quo |= pow;
133 if duo < div_original {
134 return (quo, duo)
135 }
136 }
137 div >>= 1;
138 pow >>= 1;
139 }
140 */
141
142 // If the architecture has flags and predicated arithmetic instructions, it is possible
143 // to do binary long division without branching and in only 3 or 4 instructions. This is
144 // a variation of a 3 instruction central loop from
145 // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
146 //
147 // What allows doing division in only 3 instructions is realizing that instead of
148 // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
149 // place and `duo` can be shifted left. This means `div` does not have to be updated,
150 // but causes edge case problems and makes `duo < div_original` tests harder. Some
151 // architectures have an option to shift an argument in an arithmetic operation, which
152 // means `duo` can be shifted left and subtracted from in one instruction. The other two
153 // instructions are updating `quo` and undoing the subtraction if it turns out things
154 // were not normalized.
155
156 /*
157 // Perform one binary long division step on the already normalized arguments, because
158 // the main. Note that this does a full normalization since the central loop needs
159 // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
160 // variation only did normalization to the nearest 4 steps, but this makes handling edge
161 // cases much harder. We do a full normalization and perform a binary long division
162 // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
163 // of `duo`, then the edge case handler shifts `div` right and does another long
164 // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
165 let div_original = div;
166 let mut shl = $normalization_shift(duo, div, true);
167 let mut div: $uX = (div << shl);
168 let mut quo: $uX = 1;
169 duo = duo.wrapping_sub(div);
170 if duo < div_original {
171 return (1 << shl, duo);
172 }
173 let div_neg: $uX;
174 if (div as $iX) < 0 {
175 // A very ugly edge case where the most significant bit of `div` is set (after
176 // shifting to match `duo` when its most significant bit is at the sign bit), which
177 // leads to the sign bit of `div_neg` being cut off and carries not happening when
178 // they should. This branch performs a long division step that keeps `duo` in place
179 // and shifts `div` down.
180 div >>= 1;
181 div_neg = div.wrapping_neg();
182 let (sub, carry) = duo.overflowing_add(div_neg);
183 duo = sub;
184 quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
185 if !carry {
186 duo = duo.wrapping_add(div);
187 }
188 shl -= 1;
189 } else {
190 div_neg = div.wrapping_neg();
191 }
192 // The add-with-carry that updates `quo` needs to have the carry set when a normalized
193 // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
194 // subtraction generates a carry when an unnormalized subtract happens, which is the
195 // opposite of what we want. Instead, we use
196 // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
197 let mut i = shl;
198 loop {
199 if i == 0 {
200 break;
201 }
202 i -= 1;
203 // `ADDS duo, div, duo, LSL #1`
204 // (add `div` to `duo << 1` and set flags)
205 let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
206 duo = sub;
207 // `ADC quo, quo, quo`
208 // (add with carry). Effectively shifts `quo` left by 1 and sets the least
209 // significant bit to the carry.
210 quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
211 // `ADDCC duo, duo, div`
212 // (add if carry clear). Undoes the subtraction if no carry was generated.
213 if !carry {
214 duo = duo.wrapping_add(div);
215 }
216 }
217 return (quo, duo >> shl);
218 */
219
220 // This is the SWAR (SIMD within in a register) restoring division algorithm.
221 // This combines several ideas of the above algorithms:
222 // - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
223 // restoring division algorithm, some architectures can do the shifting and
224 // subtraction step in one instruction.
225 // - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
226 // and adding one.
227 // - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
228 // LSB, so what if we use those bits to store `quo`?
229 // Through a complex setup, it is possible to manage `duo` and `quo` in the same
230 // register, and perform one step with 2 or 3 instructions. The only major downsides are
231 // that there is significant setup (it is only saves instructions if `shl` is
232 // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
233 // initiated, and the number of division steps taken has to be exact (we cannot do more
234 // division steps than `shl`, because it introduces edge cases where quotient bits in
235 // `duo` start to collide with the real part of `div`.
236 /*
237 // first step. The quotient bit is stored in `quo` for now
238 let div_original = div;
239 let mut shl = $normalization_shift(duo, div, true);
240 let mut div: $uX = (div << shl);
241 duo = duo.wrapping_sub(div);
242 let mut quo: $uX = 1 << shl;
243 if duo < div_original {
244 return (quo, duo);
245 }
246
247 let mask: $uX;
248 if (div as $iX) < 0 {
249 // deal with same edge case as the 3 instruction restoring division algorithm, but
250 // the quotient bit from this step also has to be stored in `quo`
251 div >>= 1;
252 shl -= 1;
253 let tmp = 1 << shl;
254 mask = tmp - 1;
255 let sub = duo.wrapping_sub(div);
256 if (sub as $iX) >= 0 {
257 // restore
258 duo = sub;
259 quo |= tmp;
260 }
261 if duo < div_original {
262 return (quo, duo);
263 }
264 } else {
265 mask = quo - 1;
266 }
267 // There is now room for quotient bits in `duo`.
268
269 // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
270 // `div` and end up with the subset of `shl` bits being all being set. This subset acts
271 // just like a two's complement negative one. The subset of `div` containing the divisor
272 // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
273 // as long as the quotient stays positive.
274 //
275 // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
276 // adds a quotient bit to the least significant bit.
277 // For example, 89 (0b01011001) divided by 3 (0b11):
278 //
279 // shl:4, div:0b00110000
280 // first step:
281 // duo:0b01011001
282 // + div_neg:0b11010000
283 // ____________________
284 // 0b00101001
285 // quo is set to 0b00010000 and mask is set to 0b00001111 for later
286 //
287 // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
288 // quotient part of `duo` with `^`s.
289 // chars.
290 // div:0b00110000
291 // ^^^^
292 // + 0b11111111
293 // ________________
294 // 0b00101111
295 // ^^^^
296 // div_neg:0b11010001
297 //
298 // first SWAR step:
299 // duo_shl1:0b01010010
300 // ^
301 // + div_neg:0b11010001
302 // ____________________
303 // 0b00100011
304 // ^
305 // second:
306 // duo_shl1:0b01000110
307 // ^^
308 // + div_neg:0b11010001
309 // ____________________
310 // 0b00010111
311 // ^^
312 // third:
313 // duo_shl1:0b00101110
314 // ^^^
315 // + div_neg:0b11010001
316 // ____________________
317 // 0b11111111
318 // ^^^
319 // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
320 // part of `duo` is negative and the third step was an unnormalized step. The restore
321 // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
322 //
323 // duo:0b11111111
324 // ^^^
325 // + div:0b00101111
326 // ^^^^
327 // ________________
328 // 0b00101110
329 // ^^^
330 // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an
331 // unset quotient bit.
332 //
333 // final step (`shl` was 4, so exactly 4 steps must be taken)
334 // duo_shl1:0b01011100
335 // ^^^^
336 // + div_neg:0b11010001
337 // ____________________
338 // 0b00101101
339 // ^^^^
340 // The quotient includes the `^` bits added with the `quo` bits from the beginning that
341 // contained the first step and potential edge case step,
342 // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
343 // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
344 // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
345 let div: $uX = div.wrapping_sub(1);
346 let mut i = shl;
347 loop {
348 if i == 0 {
349 break;
350 }
351 i -= 1;
352 duo = duo.wrapping_shl(1).wrapping_sub(div);
353 if (duo as $iX) < 0 {
354 // restore
355 duo = duo.wrapping_add(div);
356 }
357 }
358 // unpack the results of SWAR
359 return ((duo & mask) | quo, duo >> shl);
360 */
361
362 // The problem with the conditional restoring SWAR algorithm above is that, in practice,
363 // it requires assembly code to bring out its full unrolled potential (It seems that
364 // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
365 // that my algorithm intends. On architectures without predicated instructions, the code
366 // gen is especially bad. We need a default software division algorithm that is
367 // guaranteed to get decent code gen for the central loop.
368
369 // For non-SWAR algorithms, there is a way to do binary long division without
370 // predication or even branching. This involves creating a mask from the sign bit and
371 // performing different kinds of steps using that.
372 /*
373 let shl = $normalization_shift(duo, div, true);
374 let mut div: $uX = div << shl;
375 let mut pow: $uX = 1 << shl;
376 let mut quo: $uX = 0;
377 loop {
378 let sub = duo.wrapping_sub(div);
379 let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
380 duo -= div & sign_mask;
381 quo |= pow & sign_mask;
382 div >>= 1;
383 pow >>= 1;
384 if pow == 0 {
385 break;
386 }
387 }
388 return (quo, duo);
389 */
390 // However, it requires about 4 extra operations (smearing the sign bit, negating the
391 // mask, and applying the mask twice) on top of the operations done by the actual
392 // algorithm. With SWAR however, just 2 extra operations are needed, making it
393 // practical and even the most optimal algorithm for some architectures.
394
395 // What we do is use custom assembly for predicated architectures that need software
396 // division, and for the default algorithm use a mask based restoring SWAR algorithm
397 // without conditionals or branches. On almost all architectures, this Rust code is
398 // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
399 // will unroll it in a decent way.
400
401 // standard opening for SWAR algorithm with first step and edge case handling
402 let div_original = div;
403 let mut shl = $normalization_shift(duo, div, true);
404 let mut div: $uX = (div << shl);
405 duo = duo.wrapping_sub(div);
406 let mut quo: $uX = 1 << shl;
407 if duo < div_original {
408 return (quo, duo);
409 }
410 let mask: $uX;
411 if (div as $iX) < 0 {
412 div >>= 1;
413 shl -= 1;
414 let tmp = 1 << shl;
415 mask = tmp - 1;
416 let sub = duo.wrapping_sub(div);
417 if (sub as $iX) >= 0 {
418 duo = sub;
419 quo |= tmp;
420 }
421 if duo < div_original {
422 return (quo, duo);
423 }
424 } else {
425 mask = quo - 1;
426 }
427
428 // central loop
429 div = div.wrapping_sub(1);
430 let mut i = shl;
431 loop {
432 if i == 0 {
433 break
434 }
435 i -= 1;
436 // shift left 1 and subtract
437 duo = duo.wrapping_shl(1).wrapping_sub(div);
438 // create mask
439 let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
440 // restore
441 duo = duo.wrapping_add(div & mask);
442 }
443 // unpack
444 return ((duo & mask) | quo, duo >> shl);
445
446 // miscellanious binary long division algorithms that might be better for specific
447 // architectures
448
449 // Another kind of long division uses an interesting fact that `div` and `pow` can be
450 // negated when `duo` is negative to perform a "negated" division step that works in
451 // place of any normalization mechanism. This is a non-restoring division algorithm that
452 // is very similar to the non-restoring division algorithms that can be found on the
453 // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
454 // be viewed as shifting the least significant set bit right (e.x. if we enter a series
455 // of negated binary long division steps starting with `quo == 0b1011_0000` and
456 // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
457 // 0b1010_0010, 0b1010_0001).
458 /*
459 let div_original = div;
460 let shl = $normalization_shift(duo, div, true);
461 let mut div: $uX = (div << shl);
462 let mut pow: $uX = 1 << shl;
463 let mut quo: $uX = pow;
464 duo = duo.wrapping_sub(div);
465 if duo < div_original {
466 return (quo, duo);
467 }
468 div >>= 1;
469 pow >>= 1;
470 loop {
471 if (duo as $iX) < 0 {
472 // Negated binary long division step.
473 duo = duo.wrapping_add(div);
474 quo = quo.wrapping_sub(pow);
475 } else {
476 // Normal long division step.
477 if duo < div_original {
478 return (quo, duo)
479 }
480 duo = duo.wrapping_sub(div);
481 quo = quo.wrapping_add(pow);
482 }
483 pow >>= 1;
484 div >>= 1;
485 }
486 */
487
488 // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
489 // SWAR techniques that makes the only difference between steps be negation of `div`.
490 // If there was an architecture with an instruction that negated inputs to an adder
491 // based on conditionals, and in place shifting (or a three input addition operation
492 // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
493 // single instruction central loop is possible. Microarchitectures often have inputs to
494 // their ALU that can invert the arguments and carry in of adders, but the architectures
495 // unfortunately do not have an instruction to dynamically invert this input based on
496 // conditionals.
497 /*
498 // SWAR opening
499 let div_original = div;
500 let mut shl = $normalization_shift(duo, div, true);
501 let mut div: $uX = (div << shl);
502 duo = duo.wrapping_sub(div);
503 let mut quo: $uX = 1 << shl;
504 if duo < div_original {
505 return (quo, duo);
506 }
507 let mask: $uX;
508 if (div as $iX) < 0 {
509 div >>= 1;
510 shl -= 1;
511 let tmp = 1 << shl;
512 let sub = duo.wrapping_sub(div);
513 if (sub as $iX) >= 0 {
514 // restore
515 duo = sub;
516 quo |= tmp;
517 }
518 if duo < div_original {
519 return (quo, duo);
520 }
521 mask = tmp - 1;
522 } else {
523 mask = quo - 1;
524 }
525
526 // central loop
527 let div: $uX = div.wrapping_sub(1);
528 let mut i = shl;
529 loop {
530 if i == 0 {
531 break;
532 }
533 i -= 1;
534 // note: the `wrapping_shl(1)` can be factored out, but would require another
535 // restoring division step to prevent `(duo as $iX)` from overflowing
536 if (duo as $iX) < 0 {
537 // Negated binary long division step.
538 duo = duo.wrapping_shl(1).wrapping_add(div);
539 } else {
540 // Normal long division step.
541 duo = duo.wrapping_shl(1).wrapping_sub(div);
542 }
543 }
544 if (duo as $iX) < 0 {
545 // Restore. This was not needed in the original nonrestoring algorithm because of
546 // the `duo < div_original` checks.
547 duo = duo.wrapping_add(div);
548 }
549 // unpack
550 return ((duo & mask) | quo, duo >> shl);
551 */
552 }
553
554 /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
555 /// tuple.
556 $(
557 #[$signed_attr]
558 )*
559 pub fn $signed_name(duo: $iX, div: $iX) -> ($iX, $iX) {
560 // There is a way of doing this without any branches, but requires too many extra
561 // operations to be faster.
562 /*
563 let duo_s = duo >> ($n - 1);
564 let div_s = div >> ($n - 1);
565 let duo = (duo ^ duo_s).wrapping_sub(duo_s);
566 let div = (div ^ div_s).wrapping_sub(div_s);
567 let quo_s = duo_s ^ div_s;
568 let rem_s = duo_s;
569 let tmp = $unsigned_name(duo as $uX, div as $uX);
570 (
571 ((tmp.0 as $iX) ^ quo_s).wrapping_sub(quo_s),
572 ((tmp.1 as $iX) ^ rem_s).wrapping_sub(rem_s),
573 )
574 */
575 // this is problematic because LLVM likes to inline 4 times over
576 /*
577 match (duo < 0, div < 0) {
578 (false, false) => {
579 let t = $unsigned_name(duo as $uX, div as $uX);
580 (t.0 as $iX, t.1 as $iX)
581 },
582 (true, false) => {
583 let t = $unsigned_name(duo.wrapping_neg() as $uX, div as $uX);
584 ((t.0 as $iX).wrapping_neg(), (t.1 as $iX).wrapping_neg())
585 },
586 (false, true) => {
587 let t = $unsigned_name(duo as $uX, div.wrapping_neg() as $uX);
588 ((t.0 as $iX).wrapping_neg(), t.1 as $iX)
589 },
590 (true, true) => {
591 let t = $unsigned_name(duo.wrapping_neg() as $uX, div.wrapping_neg() as $uX);
592 (t.0 as $iX, (t.1 as $iX).wrapping_neg())
593 },
594 }
595 */
596 // this retains the ability of LLVM to eliminate branches
597 let duo_neg = duo < 0;
598 let div_neg = div < 0;
599 let mut duo = duo;
600 let mut div = div;
601 if duo_neg {
602 duo = duo.wrapping_neg();
603 }
604 if div_neg {
605 div = div.wrapping_neg();
606 }
607 let t = $unsigned_name(duo as $uX, div as $uX);
608 let mut quo = t.0 as $iX;
609 let mut rem = t.1 as $iX;
610 if duo_neg {
611 rem = rem.wrapping_neg();
612 }
613 if duo_neg != div_neg {
614 quo = quo.wrapping_neg();
615 }
616 (quo, rem)
617 }
618 }
619}