flt2dec2flt/core_num/dec2flt/
algorithm.rs

1//! The various algorithms from the paper.
2
3use core::cmp::min;
4use core::cmp::Ordering::{Equal, Greater, Less};
5use crate::core_num::dec2flt::num::{self, Big};
6use crate::core_num::dec2flt::rawfp::{self, fp_to_float, next_float, prev_float, RawFloat, Unpacked};
7use crate::core_num::dec2flt::table;
8use crate::core_num::diy_float::Fp;
9
10/// Number of significand bits in Fp
11const P: u32 = 64;
12
13// We simply store the best approximation for *all* exponents, so the variable "h" and the
14// associated conditions can be omitted. This trades performance for a couple kilobytes of space.
15
16fn power_of_ten(e: i16) -> Fp {
17    assert!(e >= table::MIN_E);
18    let i = e - table::MIN_E;
19    let sig = table::POWERS.0[i as usize];
20    let exp = table::POWERS.1[i as usize];
21    Fp { f: sig, e: exp }
22}
23
24// Disabled because `asm!` cannot be used.
25/*
26// In most architectures, floating point operations have an explicit bit size, therefore the
27// precision of the computation is determined on a per-operation basis.
28#[cfg(any(not(target_arch = "x86"), target_feature = "sse2"))]
29mod fpu_precision {
30    pub fn set_precision<T>() {}
31}
32
33// On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available.
34// The x87 FPU operates with 80 bits of precision by default, which means that operations will
35// round to 80 bits causing double rounding to happen when values are eventually represented as
36// 32/64 bit float values. To overcome this, the FPU control word can be set so that the
37// computations are performed in the desired precision.
38#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
39mod fpu_precision {
40    use crate::mem::size_of;
41
42    /// A structure used to preserve the original value of the FPU control word, so that it can be
43    /// restored when the structure is dropped.
44    ///
45    /// The x87 FPU is a 16-bits register whose fields are as follows:
46    ///
47    /// | 12-15 | 10-11 | 8-9 | 6-7 |  5 |  4 |  3 |  2 |  1 |  0 |
48    /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:|
49    /// |       | RC    | PC  |     | PM | UM | OM | ZM | DM | IM |
50    ///
51    /// The documentation for all of the fields is available in the IA-32 Architectures Software
52    /// Developer's Manual (Volume 1).
53    ///
54    /// The only field which is relevant for the following code is PC, Precision Control. This
55    /// field determines the precision of the operations performed by the  FPU. It can be set to:
56    ///  - 0b00, single precision i.e., 32-bits
57    ///  - 0b10, double precision i.e., 64-bits
58    ///  - 0b11, double extended precision i.e., 80-bits (default state)
59    /// The 0b01 value is reserved and should not be used.
60    pub struct FPUControlWord(u16);
61
62    fn set_cw(cw: u16) {
63        // SAFETY: the `fldcw` instruction has been audited to be able to work correctly with
64        // any `u16`
65        unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") }
66    }
67
68    /// Sets the precision field of the FPU to `T` and returns a `FPUControlWord`.
69    pub fn set_precision<T>() -> FPUControlWord {
70        let cw = 0u16;
71
72        // Compute the value for the Precision Control field that is appropriate for `T`.
73        let cw_precision = match size_of::<T>() {
74            4 => 0x0000, // 32 bits
75            8 => 0x0200, // 64 bits
76            _ => 0x0300, // default, 80 bits
77        };
78
79        // Get the original value of the control word to restore it later, when the
80        // `FPUControlWord` structure is dropped
81        // SAFETY: the `fnstcw` instruction has been audited to be able to work correctly with
82        // any `u16`
83        unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") }
84
85        // Set the control word to the desired precision. This is achieved by masking away the old
86        // precision (bits 8 and 9, 0x300) and replacing it with the precision flag computed above.
87        set_cw((cw & 0xFCFF) | cw_precision);
88
89        FPUControlWord(cw)
90    }
91
92    impl Drop for FPUControlWord {
93        fn drop(&mut self) {
94            set_cw(self.0)
95        }
96    }
97}
98
99/// The fast path of Bellerophon using machine-sized integers and floats.
100///
101/// This is extracted into a separate function so that it can be attempted before constructing
102/// a bignum.
103pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
104    let num_digits = integral.len() + fractional.len();
105    // log_10(f64::MAX_SIG) ~ 15.95. We compare the exact value to MAX_SIG near the end,
106    // this is just a quick, cheap rejection (and also frees the rest of the code from
107    // worrying about underflow).
108    if num_digits > 16 {
109        return None;
110    }
111    if e.abs() >= T::CEIL_LOG5_OF_MAX_SIG as i64 {
112        return None;
113    }
114    let f = num::from_str_unchecked(integral.iter().chain(fractional.iter()));
115    if f > T::MAX_SIG {
116        return None;
117    }
118
119    // The fast path crucially depends on arithmetic being rounded to the correct number of bits
120    // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision
121    // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit.
122    // The `set_precision` function takes care of setting the precision on architectures which
123    // require setting it by changing the global state (like the control word of the x87 FPU).
124    let _cw = fpu_precision::set_precision::<T>();
125
126    // The case e < 0 cannot be folded into the other branch. Negative powers result in
127    // a repeating fractional part in binary, which are rounded, which causes real
128    // (and occasionally quite significant!) errors in the final result.
129    if e >= 0 {
130        Some(T::from_int(f) * T::short_fast_pow10(e as usize))
131    } else {
132        Some(T::from_int(f) / T::short_fast_pow10(e.abs() as usize))
133    }
134}
135*/
136
137/// Algorithm Bellerophon is trivial code justified by non-trivial numeric analysis.
138///
139/// It rounds ``f`` to a float with 64 bit significand and multiplies it by the best approximation
140/// of `10^e` (in the same floating point format). This is often enough to get the correct result.
141/// However, when the result is close to halfway between two adjacent (ordinary) floats, the
142/// compound rounding error from multiplying two approximation means the result may be off by a
143/// few bits. When this happens, the iterative Algorithm R fixes things up.
144///
145/// The hand-wavy "close to halfway" is made precise by the numeric analysis in the paper.
146/// In the words of Clinger:
147///
148/// > Slop, expressed in units of the least significant bit, is an inclusive bound for the error
149/// > accumulated during the floating point calculation of the approximation to f * 10^e. (Slop is
150/// > not a bound for the true error, but bounds the difference between the approximation z and
151/// > the best possible approximation that uses p bits of significand.)
152pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T {
153    let slop = if f <= &Big::from_u64(T::MAX_SIG) {
154        // The cases abs(e) < log5(2^N) are in fast_path()
155        if e >= 0 { 0 } else { 3 }
156    } else {
157        if e >= 0 { 1 } else { 4 }
158    };
159    let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize();
160    let exp_p_n = 1 << (P - T::SIG_BITS as u32);
161    let lowbits: i64 = (z.f % exp_p_n) as i64;
162    // Is the slop large enough to make a difference when
163    // rounding to n bits?
164    if (lowbits - exp_p_n as i64 / 2).abs() <= slop {
165        algorithm_r(f, e, fp_to_float(z))
166    } else {
167        fp_to_float(z)
168    }
169}
170
171/// An iterative algorithm that improves a floating point approximation of `f * 10^e`.
172///
173/// Each iteration gets one unit in the last place closer, which of course takes terribly long to
174/// converge if `z0` is even mildly off. Luckily, when used as fallback for Bellerophon, the
175/// starting approximation is off by at most one ULP.
176fn algorithm_r<T: RawFloat>(f: &Big, e: i16, z0: T) -> T {
177    let mut z = z0;
178    loop {
179        let raw = z.unpack();
180        let (m, k) = (raw.sig, raw.k);
181        let mut x = f.clone();
182        let mut y = Big::from_u64(m);
183
184        // Find positive integers `x`, `y` such that `x / y` is exactly `(f * 10^e) / (m * 2^k)`.
185        // This not only avoids dealing with the signs of `e` and `k`, we also eliminate the
186        // power of two common to `10^e` and `2^k` to make the numbers smaller.
187        make_ratio(&mut x, &mut y, e, k);
188
189        let m_digits = [(m & 0xFF_FF_FF_FF) as u32, (m >> 32) as u32];
190        // This is written a bit awkwardly because our bignums don't support
191        // negative numbers, so we use the absolute value + sign information.
192        // The multiplication with m_digits can't overflow. If `x` or `y` are large enough that
193        // we need to worry about overflow, then they are also large enough that `make_ratio` has
194        // reduced the fraction by a factor of 2^64 or more.
195        let (d2, d_negative) = if x >= y {
196            // Don't need x any more, save a clone().
197            x.sub(&y).mul_pow2(1).mul_digits(&m_digits);
198            (x, false)
199        } else {
200            // Still need y - make a copy.
201            let mut y = y.clone();
202            y.sub(&x).mul_pow2(1).mul_digits(&m_digits);
203            (y, true)
204        };
205
206        if d2 < y {
207            let mut d2_double = d2;
208            d2_double.mul_pow2(1);
209            if m == T::MIN_SIG && d_negative && d2_double > y {
210                z = prev_float(z);
211            } else {
212                return z;
213            }
214        } else if d2 == y {
215            if m % 2 == 0 {
216                if m == T::MIN_SIG && d_negative {
217                    z = prev_float(z);
218                } else {
219                    return z;
220                }
221            } else if d_negative {
222                z = prev_float(z);
223            } else {
224                z = next_float(z);
225            }
226        } else if d_negative {
227            z = prev_float(z);
228        } else {
229            z = next_float(z);
230        }
231    }
232}
233
234/// Given `x = f` and `y = m` where `f` represent input decimal digits as usual and `m` is the
235/// significand of a floating point approximation, make the ratio `x / y` equal to
236/// `(f * 10^e) / (m * 2^k)`, possibly reduced by a power of two both have in common.
237fn make_ratio(x: &mut Big, y: &mut Big, e: i16, k: i16) {
238    let (e_abs, k_abs) = (e.abs() as usize, k.abs() as usize);
239    if e >= 0 {
240        if k >= 0 {
241            // x = f * 10^e, y = m * 2^k, except that we reduce the fraction by some power of two.
242            let common = min(e_abs, k_abs);
243            x.mul_pow5(e_abs).mul_pow2(e_abs - common);
244            y.mul_pow2(k_abs - common);
245        } else {
246            // x = f * 10^e * 2^abs(k), y = m
247            // This can't overflow because it requires positive `e` and negative `k`, which can
248            // only happen for values extremely close to 1, which means that `e` and `k` will be
249            // comparatively tiny.
250            x.mul_pow5(e_abs).mul_pow2(e_abs + k_abs);
251        }
252    } else {
253        if k >= 0 {
254            // x = f, y = m * 10^abs(e) * 2^k
255            // This can't overflow either, see above.
256            y.mul_pow5(e_abs).mul_pow2(k_abs + e_abs);
257        } else {
258            // x = f * 2^abs(k), y = m * 10^abs(e), again reducing by a common power of two.
259            let common = min(e_abs, k_abs);
260            x.mul_pow2(k_abs - common);
261            y.mul_pow5(e_abs).mul_pow2(e_abs - common);
262        }
263    }
264}
265
266/// Conceptually, Algorithm M is the simplest way to convert a decimal to a float.
267///
268/// We form a ratio that is equal to `f * 10^e`, then throwing in powers of two until it gives
269/// a valid float significand. The binary exponent `k` is the number of times we multiplied
270/// numerator or denominator by two, i.e., at all times `f * 10^e` equals `(u / v) * 2^k`.
271/// When we have found out significand, we only need to round by inspecting the remainder of the
272/// division, which is done in helper functions further below.
273///
274/// This algorithm is super slow, even with the optimization described in `quick_start()`.
275/// However, it's the simplest of the algorithms to adapt for overflow, underflow, and subnormal
276/// results. This implementation takes over when Bellerophon and Algorithm R are overwhelmed.
277/// Detecting underflow and overflow is easy: The ratio still isn't an in-range significand,
278/// yet the minimum/maximum exponent has been reached. In the case of overflow, we simply return
279/// infinity.
280///
281/// Handling underflow and subnormals is trickier. One big problem is that, with the minimum
282/// exponent, the ratio might still be too large for a significand. See underflow() for details.
283pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T {
284    let mut u;
285    let mut v;
286    let e_abs = e.abs() as usize;
287    let mut k = 0;
288    if e < 0 {
289        u = f.clone();
290        v = Big::from_small(1);
291        v.mul_pow5(e_abs).mul_pow2(e_abs);
292    } else {
293        // FIXME possible optimization: generalize big_to_fp so that we can do the equivalent of
294        // fp_to_float(big_to_fp(u)) here, only without the double rounding.
295        u = f.clone();
296        u.mul_pow5(e_abs).mul_pow2(e_abs);
297        v = Big::from_small(1);
298    }
299    quick_start::<T>(&mut u, &mut v, &mut k);
300    let mut rem = Big::from_small(0);
301    let mut x = Big::from_small(0);
302    let min_sig = Big::from_u64(T::MIN_SIG);
303    let max_sig = Big::from_u64(T::MAX_SIG);
304    loop {
305        u.div_rem(&v, &mut x, &mut rem);
306        if k == T::MIN_EXP_INT {
307            // We have to stop at the minimum exponent, if we wait until `k < T::MIN_EXP_INT`,
308            // then we'd be off by a factor of two. Unfortunately this means we have to special-
309            // case normal numbers with the minimum exponent.
310            // FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure
311            // that it's actually correct!
312            if x >= min_sig && x <= max_sig {
313                break;
314            }
315            return underflow(x, v, rem);
316        }
317        if k > T::MAX_EXP_INT {
318            return T::INFINITY;
319        }
320        if x < min_sig {
321            u.mul_pow2(1);
322            k -= 1;
323        } else if x > max_sig {
324            v.mul_pow2(1);
325            k += 1;
326        } else {
327            break;
328        }
329    }
330    let q = num::to_u64(&x);
331    let z = rawfp::encode_normal(Unpacked::new(q, k));
332    round_by_remainder(v, rem, q, z)
333}
334
335/// Skips over most Algorithm M iterations by checking the bit length.
336fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) {
337    // The bit length is an estimate of the base two logarithm, and log(u / v) = log(u) - log(v).
338    // The estimate is off by at most 1, but always an under-estimate, so the error on log(u)
339    // and log(v) are of the same sign and cancel out (if both are large). Therefore the error
340    // for log(u / v) is at most one as well.
341    // The target ratio is one where u/v is in an in-range significand. Thus our termination
342    // condition is log2(u / v) being the significand bits, plus/minus one.
343    // FIXME Looking at the second bit could improve the estimate and avoid some more divisions.
344    let target_ratio = T::SIG_BITS as i16;
345    let log2_u = u.bit_length() as i16;
346    let log2_v = v.bit_length() as i16;
347    let mut u_shift: i16 = 0;
348    let mut v_shift: i16 = 0;
349    assert!(*k == 0);
350    loop {
351        if *k == T::MIN_EXP_INT {
352            // Underflow or subnormal. Leave it to the main function.
353            break;
354        }
355        if *k == T::MAX_EXP_INT {
356            // Overflow. Leave it to the main function.
357            break;
358        }
359        let log2_ratio = (log2_u + u_shift) - (log2_v + v_shift);
360        if log2_ratio < target_ratio - 1 {
361            u_shift += 1;
362            *k -= 1;
363        } else if log2_ratio > target_ratio + 1 {
364            v_shift += 1;
365            *k += 1;
366        } else {
367            break;
368        }
369    }
370    u.mul_pow2(u_shift as usize);
371    v.mul_pow2(v_shift as usize);
372}
373
374fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T {
375    if x < Big::from_u64(T::MIN_SIG) {
376        let q = num::to_u64(&x);
377        let z = rawfp::encode_subnormal(q);
378        return round_by_remainder(v, rem, q, z);
379    }
380    // Ratio isn't an in-range significand with the minimum exponent, so we need to round off
381    // excess bits and adjust the exponent accordingly. The real value now looks like this:
382    //
383    //        x        lsb
384    // /--------------\/
385    // 1010101010101010.10101010101010 * 2^k
386    // \-----/\-------/ \------------/
387    //    q     trunc.    (represented by rem)
388    //
389    // Therefore, when the rounded-off bits are != 0.5 ULP, they decide the rounding
390    // on their own. When they are equal and the remainder is non-zero, the value still
391    // needs to be rounded up. Only when the rounded off bits are 1/2 and the remainder
392    // is zero, we have a half-to-even situation.
393    let bits = x.bit_length();
394    let lsb = bits - T::SIG_BITS as usize;
395    let q = num::get_bits(&x, lsb, bits);
396    let k = T::MIN_EXP_INT + lsb as i16;
397    let z = rawfp::encode_normal(Unpacked::new(q, k));
398    let q_even = q % 2 == 0;
399    match num::compare_with_half_ulp(&x, lsb) {
400        Greater => next_float(z),
401        Less => z,
402        Equal if rem.is_zero() && q_even => z,
403        Equal => next_float(z),
404    }
405}
406
407/// Ordinary round-to-even, obfuscated by having to round based on the remainder of a division.
408fn round_by_remainder<T: RawFloat>(v: Big, r: Big, q: u64, z: T) -> T {
409    let mut v_minus_r = v;
410    v_minus_r.sub(&r);
411    if r < v_minus_r {
412        z
413    } else if r > v_minus_r {
414        next_float(z)
415    } else if q % 2 == 0 {
416        z
417    } else {
418        next_float(z)
419    }
420}
flt2dec2flt/core_num/dec2flt/algorithm.rs

flt2dec2flt/core_num/dec2flt/
algorithm.rs