minifloat/
lib.rs

1// This file is part of the minifloat project.
2//
3// Copyright (C) 2024-2025 Chen-Pang He <jdh8@skymizer.com>
4//
5// This Source Code Form is subject to the terms of the Mozilla
6// Public License v. 2.0. If a copy of the MPL was not distributed
7// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
9#![doc = include_str!("../README.md")]
10#![warn(missing_docs)]
11
12pub mod detail;
13pub mod example;
14
15use core::cmp::Ordering;
16use core::f64::consts::LOG10_2;
17use core::ops::Neg;
18
/// How a minifloat format encodes NaN
///
/// Variant names mirror the [LLVM/MLIR naming conventions][llvm], which
/// describe each scheme by how it departs from [IEEE 754][ieee].
///
/// [llvm]: https://llvm.org/doxygen/structllvm_1_1APFloatBase.html
/// [ieee]: https://en.wikipedia.org/wiki/IEEE_754
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[allow(clippy::upper_case_acronyms)]
pub enum NanStyle {
    /// Encoding as in IEEE 754
    ///
    /// Non-finite numbers own the maximum exponent.  Within it, a zero
    /// mantissa is infinity and every other mantissa is a NaN.
    IEEE,

    /// The `FN` suffix of LLVM/MLIR
    ///
    /// `F` means finite and `N` means a special NaN encoding.  Infinities
    /// do not exist.  NaN takes the maximum magnitude, i.e. the pattern
    /// whose exponent and mantissa bits are all ones.
    FN,

    /// The `FNUZ` suffix of LLVM/MLIR
    ///
    /// `F` means finite, `N` means a special NaN encoding, and `UZ` means
    /// unsigned zero.  Infinities do not exist.  NaN takes the bit pattern
    /// that would otherwise be negative zero (&minus;0.0), so the only zero
    /// is the unsigned +0.0.
    FNUZ,
}
50
/// Lookup table of log<sub>2</sub>(1 &minus; 2<sup>&minus;`p`</sup>), indexed by `p`
///
/// For `p` ≥ 1 the entry is the base-2 logarithm of the largest significand
/// with `p` bits, scaled into [0.5, 1): e.g. entry 1 is log<sub>2</sub>(1/2)
/// and entry 2 is log<sub>2</sub>(3/4).  [`Minifloat::MAX_10_EXP`] adds the
/// entry to an exponent to obtain log<sub>2</sub> of the maximum finite value.
///
/// Entry 0 does not follow the formula (it would be &minus;∞).  It is only
/// reached by `MAX_10_EXP` for [`NanStyle::FN`] types with `M` = 0, where the
/// all-ones pattern is NaN and the largest finite value is therefore two
/// exponent steps below 2<sup>2<sup>`E`</sup> &minus; `B`</sup>; hence &minus;2.
#[allow(clippy::excessive_precision)]
const LOG2_SIGNIFICAND: [f64; 16] = [
    -2.0,
    -1.0,
    -4.150_374_992_788_438_13e-1,
    -1.926_450_779_423_958_81e-1,
    -9.310_940_439_148_146_51e-2,
    -4.580_368_961_312_478_86e-2,
    -2.272_007_650_008_352_89e-2,
    -1.131_531_322_783_414_61e-2,
    -5.646_563_141_142_062_72e-3,
    -2.820_519_062_378_662_63e-3,
    -1.409_570_254_671_353_63e-3,
    -7.046_129_765_893_727_06e-4,
    -3.522_634_716_290_213_85e-4,
    -1.761_209_842_740_240_62e-4,
    -8.805_780_458_002_638_34e-5,
    -4.402_823_044_177_721_15e-5,
];
70
71/// Generic trait for minifloat types
72///
73/// I am **not** going to implement [`num_traits::Float`][flt] because:
74///
75/// 1. [`FN`][NanStyle::FN] and [`FNUZ`][NanStyle::FNUZ] types do not have infinities.
76/// 2. [`FNUZ`][NanStyle::FNUZ] types do not have a negative zero.
77/// 3. I don't have plans for [arithmetic operations][ops] yet.
78///
79/// [flt]: https://docs.rs/num-traits/latest/num_traits/float/trait.Float.html
80/// [ops]: https://docs.rs/num-traits/latest/num_traits/trait.NumOps.html
81pub trait Minifloat: Copy + PartialEq + PartialOrd + Neg<Output = Self> {
82    /// Storage type
83    type Bits;
84
85    /// Whether the type is signed
86    const S: bool = true;
87
88    /// Exponent bit-width
89    const E: u32;
90
91    /// Significand (mantissa) precision
92    const M: u32;
93
94    /// Exponent bias
95    const B: i32 = (1 << (Self::E - 1)) - 1;
96
97    /// NaN encoding style
98    const N: NanStyle = NanStyle::IEEE;
99
100    /// Total bitwidth
101    const BITWIDTH: u32 = Self::S as u32 + Self::E + Self::M;
102
103    /// The radix of the internal representation
104    const RADIX: u32 = 2;
105
106    /// The number of digits in the significand, including the implicit leading bit
107    ///
108    /// Equal to `M` + 1
109    const MANTISSA_DIGITS: u32 = Self::M + 1;
110
111    /// The maximum exponent
112    ///
113    /// Normal numbers < 1 &times; 2<sup>`MAX_EXP`</sup>.
114    const MAX_EXP: i32 = (1 << Self::E)
115        - Self::B
116        - match Self::N {
117            NanStyle::IEEE => 1,
118            NanStyle::FN => (Self::M == 0) as i32,
119            NanStyle::FNUZ => 0,
120        };
121
122    /// One greater than the minimum normal exponent
123    ///
124    /// Normal numbers ≥ 0.5 &times; 2<sup>`MIN_EXP`</sup>.
125    ///
126    /// This quirk comes from C macros `FLT_MIN_EXP` and friends.  However, it
127    /// is no big deal to mistake it since [[`MIN_POSITIVE`][Self::MIN_POSITIVE],
128    /// 2 &times; `MIN_POSITIVE`] is a buffer zone where numbers can be
129    /// interpreted as normal or subnormal.
130    const MIN_EXP: i32 = 2 - Self::B;
131
132    /// Approximate number of significant decimal digits
133    ///
134    /// Equal to floor([`M`][Self::M] log<sub>10</sub>(2))
135    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
136    const DIGITS: u32 = (Self::M as f64 * crate::LOG10_2) as u32;
137
138    /// Maximum <var>x</var> such that 10<sup>`x`</sup> is normal
139    ///
140    /// Equal to floor(log<sub>10</sub>([`MAX`][Self::MAX]))
141    #[allow(clippy::cast_possible_truncation)]
142    const MAX_10_EXP: i32 = {
143        let exponent = (1 << Self::E) - Self::B - matches!(Self::N, NanStyle::IEEE) as i32;
144        let precision = Self::M + !matches!(Self::N, NanStyle::FN) as u32;
145        let log2_max = exponent as f64 + crate::LOG2_SIGNIFICAND[precision as usize];
146        (log2_max * crate::LOG10_2) as i32
147    };
148
149    /// Minimum <var>x</var> such that 10<sup>`x`</sup> is normal
150    ///
151    /// Equal to ceil(log<sub>10</sub>([`MIN_POSITIVE`][Self::MIN_POSITIVE]))
152    #[allow(clippy::cast_possible_truncation)]
153    const MIN_10_EXP: i32 = ((Self::MIN_EXP - 1) as f64 * crate::LOG10_2) as i32;
154
155    /// One representation of NaN
156    const NAN: Self;
157
158    /// The largest number of this type
159    ///
160    /// This value would be +∞ if the type has infinities.  Otherwise, it is
161    /// the maximum finite representation.  This value is also the result of
162    /// a positive overflow.
163    const HUGE: Self;
164
165    /// The maximum finite number
166    const MAX: Self;
167
168    /// The smallest positive (subnormal) number
169    const TINY: Self;
170
171    /// The smallest positive normal number
172    ///
173    /// Equal to 2<sup>[`MIN_EXP`][Self::MIN_EXP]&minus;1</sup>
174    const MIN_POSITIVE: Self;
175
176    /// [Machine epsilon](https://en.wikipedia.org/wiki/Machine_epsilon)
177    ///
178    /// The difference between 1.0 and the next larger representable number.
179    ///
180    /// Equal to 2<sup>&minus;`M`</sup>.
181    const EPSILON: Self;
182
183    /// The minimum finite number
184    ///
185    /// Equal to &minus;[`MAX`][Self::MAX]
186    const MIN: Self;
187
188    /// Raw transmutation from [`Self::Bits`]
189    #[must_use]
190    fn from_bits(v: Self::Bits) -> Self;
191
192    /// Raw transmutation to [`Self::Bits`]
193    #[must_use]
194    fn to_bits(self) -> Self::Bits;
195
196    /// IEEE 754 total-ordering predicate
197    ///
198    /// The normative definition is lengthy, but it is essentially comparing
199    /// sign-magnitude notations.
200    ///
201    /// See also [`f32::total_cmp`],
202    /// <https://en.wikipedia.org/wiki/IEEE_754#Total-ordering_predicate>
203    #[must_use]
204    fn total_cmp(&self, other: &Self) -> Ordering;
205
206    /// Check if the value is NaN
207    #[must_use]
208    fn is_nan(self) -> bool;
209
210    /// Check if the value is positive or negative infinity
211    #[must_use]
212    fn is_infinite(self) -> bool;
213
214    /// Check if the value is finite, i.e. neither infinite nor NaN
215    #[must_use]
216    fn is_finite(self) -> bool;
217
218    /// Check if the value is [subnormal]
219    ///
220    /// [subnormal]: https://en.wikipedia.org/wiki/Subnormal_number
221    #[must_use]
222    fn is_subnormal(self) -> bool {
223        matches!(self.classify(), core::num::FpCategory::Subnormal)
224    }
225
226    /// Check if the value is normal, i.e. not zero, [subnormal], infinite, or NaN
227    ///
228    /// [subnormal]: https://en.wikipedia.org/wiki/Subnormal_number
229    #[must_use]
230    fn is_normal(self) -> bool {
231        matches!(self.classify(), core::num::FpCategory::Normal)
232    }
233
234    /// Classify the value into a floating-point category
235    ///
236    /// If only one property is going to be tested, it is generally faster to
237    /// use the specific predicate instead.
238    #[must_use]
239    fn classify(self) -> core::num::FpCategory;
240
241    /// Compute the absolute value
242    #[must_use]
243    fn abs(self) -> Self;
244
245    /// Check if the sign bit is clear
246    #[must_use]
247    fn is_sign_positive(self) -> bool;
248
249    /// Check if the sign bit is set
250    #[must_use]
251    fn is_sign_negative(self) -> bool;
252}
253
/// Internal macro that adds infinity constants to IEEE-style types only
///
/// The second argument is the [`NanStyle`] variant name.  `IEEE` expands to
/// an `impl` block with `INFINITY` and `NEG_INFINITY`; any other identifier
/// expands to nothing.  The IEEE rule must stay first because the fallback
/// rule would match `IEEE` as well.
#[doc(hidden)]
#[macro_export]
macro_rules! __conditionally_define_infinities {
    (impl $ty:ident, IEEE) => {
        impl $ty {
            /// Positive infinity
            pub const INFINITY: Self = Self::HUGE;

            /// Negative infinity
            pub const NEG_INFINITY: Self = Self((1 << (Self::E + Self::M)) | Self::INFINITY.0);
        }
    };
    (impl $ty:ident, $style:ident) => {};
}
269
/// Internal macro that picks the size-specific trait to implement
///
/// The first argument names the storage type: `u8` implements
/// `$crate::Most8<$m>` and `u16` implements `$crate::Most16<$m>` for
/// `$name`.  Every associated constant and method of the generated `impl`
/// forwards to the item of the same name on `$name`; `total_cmp` compares the
/// keys produced by `total_cmp_key`.  `$e` is accepted but not used by the
/// expansion.
///
/// This macro has to be exported so that it can be reached alongside
/// [`minifloat!`], but it is not intended for general use.
#[doc(hidden)]
#[macro_export]
macro_rules! __select_sized_trait {
    (u8, $name:ident, $e:expr, $m:expr) => {
        $crate::__select_sized_trait!(@forward Most8, u8, $name, $m);
    };
    (u16, $name:ident, $e:expr, $m:expr) => {
        $crate::__select_sized_trait!(@forward Most16, u16, $name, $m);
    };
    // Shared body of both public rules; only the trait and storage type vary.
    (@forward $sized:ident, $bits:ty, $name:ident, $m:expr) => {
        impl $crate::$sized<$m> for $name {
            const E: u32 = Self::E;
            const B: i32 = Self::B;
            const N: $crate::NanStyle = Self::N;

            const NAN: Self = Self::NAN;
            const HUGE: Self = Self::HUGE;
            const MAX: Self = Self::MAX;
            const TINY: Self = Self::TINY;
            const MIN_POSITIVE: Self = Self::MIN_POSITIVE;
            const EPSILON: Self = Self::EPSILON;
            const MIN: Self = Self::MIN;

            fn from_bits(v: $bits) -> Self {
                Self::from_bits(v)
            }

            fn to_bits(self) -> $bits {
                self.to_bits()
            }

            fn total_cmp(&self, other: &Self) -> core::cmp::Ordering {
                let lhs = Self::total_cmp_key(self.0);
                let rhs = Self::total_cmp_key(other.0);
                lhs.cmp(&rhs)
            }
        }
    };
}
332
/// Define a minifloat taking up to 16 bits
///
/// * `$name`: name of the type
/// * `$bits`: the underlying integer type, which must be [`u8`] or [`u16`]
/// * `$e`: exponent bit-width
/// * `$m`: explicit significand (mantissa) bit-width
/// * `$b`: exponent bias, which defaults to 2<sup>`$e`&minus;1</sup> &minus; 1
/// * `$n`: NaN encoding style, one of the [`NanStyle`] variants, which
///   defaults to [`IEEE`][NanStyle::IEEE]
///
/// `$b` and `$n` are independently optional, so the accepted argument lists
/// after the colon are `$e, $m`, `$e, $m, $n`, `$e, $m, $b`, and
/// `$e, $m, $b, $n`.
///
/// ## Constraints
///
/// The following are checked at compile time:
///
/// * `$e` + `$m` < 16 (there is always a sign bit)
/// * `$e` ≥ 2 (or use an integer type instead)
/// * `$m` > 0 if `$n` is [`IEEE`][NanStyle::IEEE] (∞ ≠ NaN)
/// * `MAX_EXP` ≥ 1 and `MIN_EXP` ≤ 1, which bounds the choice of `$b`
///
/// ## Example
///
/// ```
/// use minifloat::minifloat;
/// minifloat!(pub struct F8E4M3FN(u8): 4, 3, FN);
/// ```
#[macro_export]
macro_rules! minifloat {
    ($vis:vis struct $name:ident($bits:tt): $e:expr, $m:expr, $b:expr, $n:ident) => {
        #[allow(non_camel_case_types)]
        #[doc = concat!("A minifloat with bit-layout S1E", $e, "M", $m)]
        #[derive(Debug, Clone, Copy, Default)]
        $vis struct $name($bits);

        impl $name {
            /// Exponent bitwidth
            pub const E: u32 = $e;

            /// Explicit significand (mantissa) bitwidth
            ///
            /// This width excludes the implicit leading bit.
            pub const M: u32 = $m;

            /// Exponent bias
            pub const B: i32 = $b;

            /// NaN encoding style
            pub const N: $crate::NanStyle = $crate::NanStyle::$n;

            /// Total bitwidth
            pub const BITWIDTH: u32 = 1 + Self::E + Self::M;

            /// The radix of the internal representation
            pub const RADIX: u32 = 2;

            /// The number of digits in the significand, including the implicit leading bit
            ///
            /// Equal to [`M`][Self::M] + 1
            pub const MANTISSA_DIGITS: u32 = $m + 1;

            /// The maximum exponent
            ///
            /// Normal numbers < 1 &times; 2<sup>`MAX_EXP`</sup>.
            pub const MAX_EXP: i32 = (1 << Self::E)
                - Self::B
                - match Self::N {
                    $crate::NanStyle::IEEE => 1,
                    $crate::NanStyle::FN => (Self::M == 0) as i32,
                    $crate::NanStyle::FNUZ => 0,
                };

            /// One greater than the minimum normal exponent
            ///
            /// Normal numbers ≥ 0.5 &times; 2<sup>`MIN_EXP`</sup>.
            ///
            /// This quirk comes from C macros `FLT_MIN_EXP` and friends.  However, it
            /// is no big deal to mistake it since [[`MIN_POSITIVE`][Self::MIN_POSITIVE],
            /// 2 &times; `MIN_POSITIVE`] is a buffer zone where numbers can be
            /// interpreted as normal or subnormal.
            pub const MIN_EXP: i32 = 2 - Self::B;

            /// One representation of NaN
            pub const NAN: Self = Self(match Self::N {
                // All exponent bits plus the most significant mantissa bit
                $crate::NanStyle::IEEE => ((1 << (Self::E + 1)) - 1) << (Self::M - 1),
                // All exponent and mantissa bits
                $crate::NanStyle::FN => (1 << (Self::E + Self::M)) - 1,
                // The sign bit alone, i.e. the would-be negative zero
                $crate::NanStyle::FNUZ => 1 << (Self::E + Self::M),
            });

            /// The largest number of this type
            ///
            /// This value would be +∞ if the type has infinities.  Otherwise, it is
            /// the maximum finite representation.  This value is also the result of
            /// a positive overflow.
            pub const HUGE: Self = Self(match Self::N {
                $crate::NanStyle::IEEE => ((1 << Self::E) - 1) << Self::M,
                $crate::NanStyle::FN => (1 << (Self::E + Self::M)) - 2,
                $crate::NanStyle::FNUZ => (1 << (Self::E + Self::M)) - 1,
            });

            /// The maximum finite number
            ///
            /// One step below [`HUGE`][Self::HUGE] for IEEE types, where `HUGE` is
            /// +∞, and equal to `HUGE` otherwise.
            pub const MAX: Self = Self(Self::HUGE.0 - matches!(Self::N, $crate::NanStyle::IEEE) as $bits);

            /// The smallest positive (subnormal) number
            pub const TINY: Self = Self(1);

            /// The smallest positive normal number
            ///
            /// Equal to 2<sup>[`MIN_EXP`][Self::MIN_EXP]&minus;1</sup>.
            pub const MIN_POSITIVE: Self = Self(1 << Self::M);

            /// [Machine epsilon](https://en.wikipedia.org/wiki/Machine_epsilon)
            ///
            /// The difference between 1.0 and the next larger representable number.
            ///
            /// Equal to 2<sup>&minus;`M`</sup>.
            #[allow(clippy::cast_possible_wrap)]
            pub const EPSILON: Self = Self(match Self::B - Self::M as i32 {
                // Normal: `s` is the biased exponent field of 2^-M
                #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
                s @ 1.. => (s as $bits) << Self::M,
                // Subnormal: a single mantissa bit at position B - 1
                s => 1 << (Self::M as i32 - 1 + s),
            });

            /// The minimum finite number
            ///
            /// Equal to &minus;[`MAX`][Self::MAX].
            pub const MIN: Self = Self(Self::MAX.0 | 1 << (Self::E + Self::M));

            /// Magnitude mask for internal usage
            const ABS_MASK: $bits = (1 << (Self::E + Self::M)) - 1;

            #[doc = concat!("Raw transmutation from [`", stringify!($bits), "`]")]
            ///
            /// Bits above [`BITWIDTH`][Self::BITWIDTH] are discarded.
            #[must_use]
            pub const fn from_bits(v: $bits) -> Self {
                Self($bits::MAX >> ($bits::BITS - Self::BITWIDTH) & v)
            }

            #[doc = concat!("Raw transmutation to [`", stringify!($bits), "`]")]
            #[must_use]
            pub const fn to_bits(self) -> $bits {
                self.0
            }

            /// Check if the value is NaN
            #[must_use]
            pub const fn is_nan(self) -> bool {
                match Self::N {
                    #[allow(clippy::bad_bit_mask)]
                    $crate::NanStyle::IEEE => self.0 & Self::ABS_MASK > Self::HUGE.0,
                    $crate::NanStyle::FN => self.0 & Self::ABS_MASK == Self::NAN.0 & Self::ABS_MASK,
                    $crate::NanStyle::FNUZ => self.0 == Self::NAN.0,
                }
            }

            /// Check if the value is positive or negative infinity
            ///
            /// Always `false` unless the NaN style is [`IEEE`][$crate::NanStyle::IEEE].
            #[must_use]
            pub const fn is_infinite(self) -> bool {
                matches!(Self::N, $crate::NanStyle::IEEE) && self.0 & Self::ABS_MASK == Self::HUGE.0
            }

            /// Check if the value is finite, i.e. neither infinite nor NaN
            #[must_use]
            pub const fn is_finite(self) -> bool {
                match Self::N {
                    $crate::NanStyle::IEEE => self.0 & Self::ABS_MASK < Self::HUGE.0,
                    _ => !self.is_nan(),
                }
            }

            /// Check if the value is [subnormal]
            ///
            /// [subnormal]: https://en.wikipedia.org/wiki/Subnormal_number
            #[must_use]
            pub const fn is_subnormal(self) -> bool {
                matches!(self.classify(), core::num::FpCategory::Subnormal)
            }

            /// Check if the value is normal, i.e. not zero, [subnormal], infinite, or NaN
            ///
            /// [subnormal]: https://en.wikipedia.org/wiki/Subnormal_number
            #[must_use]
            pub const fn is_normal(self) -> bool {
                matches!(self.classify(), core::num::FpCategory::Normal)
            }

            /// Classify the value into a floating-point category
            ///
            /// If only one property is going to be tested, it is generally faster to
            /// use the specific predicate instead.
            #[must_use]
            pub const fn classify(self) -> core::num::FpCategory {
                if self.is_nan() {
                    core::num::FpCategory::Nan
                } else if self.is_infinite() {
                    core::num::FpCategory::Infinite
                } else {
                    let exp_mask = ((1 << Self::E) - 1) << Self::M;
                    let man_mask = (1 << Self::M) - 1;

                    match (self.0 & exp_mask, self.0 & man_mask) {
                        (0, 0) => core::num::FpCategory::Zero,
                        (0, _) => core::num::FpCategory::Subnormal,
                        (_, _) => core::num::FpCategory::Normal,
                    }
                }
            }

            /// Compute the absolute value
            ///
            /// The NaN of an [`FNUZ`][$crate::NanStyle::FNUZ] type is returned
            /// unchanged, because clearing its sign bit would turn it into zero.
            #[must_use]
            pub const fn abs(self) -> Self {
                if matches!(Self::N, $crate::NanStyle::FNUZ) && self.0 == Self::NAN.0 {
                    return Self::NAN;
                }
                Self::from_bits(self.to_bits() & Self::ABS_MASK)
            }

            /// Check if the sign bit is clear
            #[must_use]
            pub const fn is_sign_positive(self) -> bool {
                self.0 >> (Self::E + Self::M) & 1 == 0
            }

            /// Check if the sign bit is set
            #[must_use]
            pub const fn is_sign_negative(self) -> bool {
                self.0 >> (Self::E + Self::M) & 1 == 1
            }

            /// Map sign-magnitude notations to plain unsigned integers
            ///
            /// Non-negative patterns get their sign bit set; negative patterns
            /// have every bit flipped.  Comparing the resulting keys as
            /// unsigned integers therefore orders the original patterns as
            /// sign-magnitude numbers.
            ///
            /// This serves as a hook for the [`Minifloat`] trait.
            const fn total_cmp_key(x: $bits) -> $bits {
                let sign = 1 << (Self::E + Self::M);
                // All magnitude bits if `x` is negative, zero otherwise
                let mask = ((x & sign) >> (Self::E + Self::M)) * (sign - 1);
                x ^ (sign | mask)
            }

            /// Probably lossy conversion from [`f32`]
            ///
            /// NaNs are preserved.  Overflows result in ±[`HUGE`][Self::HUGE].
            /// Other values are rounded to the nearest representable value.
            #[must_use]
            #[allow(clippy::cast_possible_wrap)]
            pub fn from_f32(x: f32) -> Self {
                if x.is_nan() {
                    let sign_bit = <$bits>::from(x.is_sign_negative()) << (Self::E + Self::M);
                    return Self::from_bits(Self::NAN.0 | sign_bit);
                }

                let bits = $crate::detail::round_f32_to_precision::<$m>(x).to_bits();
                let sign_bit = ((bits >> 31) as $bits) << (Self::E + Self::M);
                // Difference between the exponent biases, aligned to our exponent field
                let diff = (Self::MIN_EXP - f32::MIN_EXP) << Self::M;
                // Drop the sign, then keep the exponent and the top `M` mantissa bits
                let magnitude = bits << 1 >> (f32::MANTISSA_DIGITS - Self::M);
                let magnitude = magnitude as i32 - diff;

                if magnitude < 1 << Self::M {
                    // Below our normal range: count multiples of the smallest subnormal
                    let ticks = f64::from(x.abs()) * $crate::detail::exp2i(Self::MANTISSA_DIGITS as i32 - Self::MIN_EXP);
                    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
                    let ticks = ticks.round_ties_even() as $bits;
                    // An FNUZ zero must not carry a sign bit, which would make it NaN
                    return Self::from_bits(
                        (<$bits>::from(Self::N != $crate::NanStyle::FNUZ || ticks != 0) * sign_bit) | ticks,
                    );
                }

                #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
                Self::from_bits(magnitude.min(i32::from(Self::HUGE.to_bits())) as $bits | sign_bit)
            }

            /// Probably lossy conversion from [`f64`]
            ///
            /// NaNs are preserved.  Overflows result in ±[`HUGE`][Self::HUGE].
            /// Other values are rounded to the nearest representable value.
            #[must_use]
            #[allow(clippy::cast_possible_wrap)]
            pub fn from_f64(x: f64) -> Self {
                if x.is_nan() {
                    let sign_bit = <$bits>::from(x.is_sign_negative()) << (Self::E + Self::M);
                    return Self::from_bits(Self::NAN.to_bits() | sign_bit);
                }

                let bits = $crate::detail::round_f64_to_precision::<$m>(x).to_bits();
                let sign_bit = ((bits >> 63) as $bits) << (Self::E + Self::M);
                // Difference between the exponent biases, aligned to our exponent field
                let diff = i64::from(Self::MIN_EXP - f64::MIN_EXP) << Self::M;
                // Drop the sign, then keep the exponent and the top `M` mantissa bits
                let magnitude = bits << 1 >> (f64::MANTISSA_DIGITS - Self::M);
                let magnitude = magnitude as i64 - diff;

                if magnitude < 1 << Self::M {
                    // Below our normal range: count multiples of the smallest subnormal
                    let ticks = x.abs() * $crate::detail::exp2i(Self::MANTISSA_DIGITS as i32 - Self::MIN_EXP);
                    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
                    let ticks = ticks.round_ties_even() as $bits;
                    // An FNUZ zero must not carry a sign bit, which would make it NaN
                    return Self::from_bits(
                        (<$bits>::from(Self::N != $crate::NanStyle::FNUZ || ticks != 0) * sign_bit) | ticks,
                    );
                }

                #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
                Self::from_bits(magnitude.min(i64::from(Self::HUGE.to_bits())) as $bits | sign_bit)
            }

            /// Fast conversion to [`f32`]
            ///
            /// This method serves as a shortcut if conversion to [`f32`] is
            /// lossless.
            fn fast_to_f32(self) -> f32 {
                let sign = if self.is_sign_negative() { -1.0 } else { 1.0 };
                let magnitude = self.to_bits() & Self::ABS_MASK;

                if self.is_nan() {
                    return f32::NAN.copysign(sign);
                }
                if self.is_infinite() {
                    return f32::INFINITY * sign;
                }
                if magnitude < 1 << Self::M {
                    // Zero or subnormal: scale the raw mantissa by the smallest subnormal
                    #[allow(clippy::cast_possible_wrap)]
                    let shift = Self::MIN_EXP - Self::MANTISSA_DIGITS as i32;
                    #[allow(clippy::cast_possible_truncation)]
                    return ($crate::detail::exp2i(shift) * f64::from(sign) * f64::from(magnitude)) as f32;
                }
                // Normal: align the mantissa and rebias the exponent field
                let shift = f32::MANTISSA_DIGITS - Self::MANTISSA_DIGITS;
                #[allow(clippy::cast_sign_loss)]
                let diff = (Self::MIN_EXP - f32::MIN_EXP) as u32;
                let diff = diff << (f32::MANTISSA_DIGITS - 1);
                let sign = u32::from(self.is_sign_negative()) << 31;
                f32::from_bits(((u32::from(magnitude) << shift) + diff) | sign)
            }

            /// Fast conversion to [`f64`]
            ///
            /// This method serves as a shortcut if conversion to [`f64`] is
            /// lossless.
            fn fast_to_f64(self) -> f64 {
                let sign = if self.is_sign_negative() { -1.0 } else { 1.0 };
                let magnitude = self.to_bits() & Self::ABS_MASK;

                if self.is_nan() {
                    return f64::NAN.copysign(sign);
                }
                if self.is_infinite() {
                    return f64::INFINITY * sign;
                }
                if magnitude < 1 << Self::M {
                    // Zero or subnormal: scale the raw mantissa by the smallest subnormal
                    #[allow(clippy::cast_possible_wrap)]
                    let shift = Self::MIN_EXP - Self::MANTISSA_DIGITS as i32;
                    return $crate::detail::exp2i(shift) * sign * f64::from(magnitude);
                }
                // Normal: align the mantissa and rebias the exponent field
                let shift = f64::MANTISSA_DIGITS - Self::MANTISSA_DIGITS;
                #[allow(clippy::cast_sign_loss)]
                let diff = (Self::MIN_EXP - f64::MIN_EXP) as u64;
                let diff = diff << (f64::MANTISSA_DIGITS - 1);
                let sign = u64::from(self.is_sign_negative()) << 63;
                f64::from_bits(((u64::from(magnitude) << shift) + diff) | sign)
            }

            /// Lossy conversion to [`f64`]
            ///
            /// This variant assumes that the conversion is lossy only when the exponent
            /// is out of range.  Magnitudes of at least 2<sup>`f64::MAX_EXP`</sup>
            /// become ±∞.  Magnitudes whose exponent is below the normal range of
            /// [`f64`] are computed by scaling the significand with a power of two.
            fn as_f64(self) -> f64 {
                // Use the actual bias of the type, which a custom `$b` may set
                // to something other than 2^(E-1) - 1.
                let bias = Self::B;
                let sign = if self.is_sign_negative() { -1.0 } else { 1.0 };
                let magnitude = self.abs().to_bits();

                if self.is_nan() {
                    return f64::NAN.copysign(sign);
                }
                if self.is_infinite() {
                    return f64::INFINITY * sign;
                }
                // Unbiased exponent >= f64::MAX_EXP: too large for `f64`
                if i32::from(magnitude) >= (f64::MAX_EXP + bias) << Self::M {
                    return f64::INFINITY * sign;
                }
                if magnitude < 1 << Self::M {
                    // Zero or subnormal: scale the raw mantissa by the smallest subnormal
                    #[allow(clippy::cast_possible_wrap)]
                    let shift = Self::MIN_EXP - Self::MANTISSA_DIGITS as i32;
                    return $crate::detail::exp2i(shift) * sign * f64::from(magnitude);
                }
                // Unbiased exponent < f64::MIN_EXP: bit manipulation would produce
                // an invalid exponent field, so multiply instead
                if i32::from(magnitude >> Self::M) < f64::MIN_EXP + bias {
                    let significand = (magnitude & ((1 << Self::M) - 1)) | 1 << Self::M;
                    let exponent = i32::from(magnitude >> Self::M) - bias;
                    #[allow(clippy::cast_possible_wrap)]
                    return $crate::detail::exp2i(exponent - Self::M as i32) * sign * f64::from(significand);
                }
                // In range: align the mantissa and rebias the exponent field
                let shift = f64::MANTISSA_DIGITS - Self::MANTISSA_DIGITS;
                #[allow(clippy::cast_sign_loss)]
                let diff = (Self::MIN_EXP - f64::MIN_EXP) as u64;
                let diff = diff << (f64::MANTISSA_DIGITS - 1);
                let sign = u64::from(self.is_sign_negative()) << 63;
                f64::from_bits(((u64::from(magnitude) << shift) + diff) | sign)
            }

            /// Best effort conversion to [`f64`]
            ///
            /// The conversion is exact whenever [`f64`] covers both the precision
            /// and the exponent range of this type.
            #[must_use]
            pub fn to_f64(self) -> f64 {
                let lossless = f64::MANTISSA_DIGITS >= Self::MANTISSA_DIGITS
                    && f64::MAX_EXP >= Self::MAX_EXP
                    && f64::MIN_EXP <= Self::MIN_EXP;

                if lossless {
                    self.fast_to_f64()
                } else {
                    self.as_f64()
                }
            }

            /// Best effort conversion to [`f32`]
            ///
            /// The conversion is exact whenever [`f32`] covers both the precision
            /// and the exponent range of this type.  Otherwise, the value goes
            /// through [`to_f64`][Self::to_f64] first.
            #[must_use]
            pub fn to_f32(self) -> f32 {
                let lossless = f32::MANTISSA_DIGITS >= Self::MANTISSA_DIGITS
                    && f32::MAX_EXP >= Self::MAX_EXP
                    && f32::MIN_EXP <= Self::MIN_EXP;

                if lossless {
                    return self.fast_to_f32();
                }
                // Conversion to `f64` is lossy only when the exponent width is
                // too large.  In this case, a second conversion to `f32` is
                // safe.
                #[allow(clippy::cast_possible_truncation)]
                return self.to_f64() as f32;
            }
        }

        // Compile-time validation of the requested layout
        const _: () = assert!($name::BITWIDTH <= 16);
        const _: () = assert!($name::E >= 2);
        const _: () = assert!($name::M > 0 || !matches!($name::N, $crate::NanStyle::IEEE));
        const _: () = assert!($name::MAX_EXP >= 1);
        const _: () = assert!($name::MIN_EXP <= 1);

        impl PartialEq for $name {
            fn eq(&self, other: &Self) -> bool {
                // Identical non-NaN patterns are equal
                let eq = self.0 == other.0 && !self.is_nan();
                // So are +0.0 and -0.0 where a negative zero exists
                eq || !matches!(Self::N, $crate::NanStyle::FNUZ) && (self.0 | other.0) & Self::ABS_MASK == 0
            }
        }

        impl PartialOrd for $name {
            fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
                if self.is_nan() || other.is_nan() {
                    return None;
                }
                if self == other {
                    return Some(core::cmp::Ordering::Equal);
                }

                // With any negative operand, the order of raw patterns is reversed
                let sign = (self.0 | other.0) >> (Self::E + Self::M) & 1 == 1;

                Some(if (self.0 > other.0) ^ sign {
                    core::cmp::Ordering::Greater
                } else {
                    core::cmp::Ordering::Less
                })
            }
        }

        impl core::ops::Neg for $name {
            type Output = Self;

            fn neg(self) -> Self::Output {
                // FNUZ patterns with a zero magnitude (zero and NaN) have no
                // negated counterpart and are left as they are
                let flag = matches!(Self::N, $crate::NanStyle::FNUZ) && self.0 & Self::ABS_MASK == 0;
                let switch = <$bits>::from(!flag) << (Self::E + Self::M);
                Self(self.0 ^ switch)
            }
        }

        impl $crate::Minifloat for $name {
            type Bits = $bits;
            const E: u32 = $e;
            const M: u32 = $m;
            const B: i32 = $b;
            const N: $crate::NanStyle = $crate::NanStyle::$n;

            const NAN: Self = Self::NAN;
            const HUGE: Self = Self::HUGE;
            const MAX: Self = Self::MAX;
            const TINY: Self = Self::TINY;
            const MIN_POSITIVE: Self = Self::MIN_POSITIVE;
            const EPSILON: Self = Self::EPSILON;
            const MIN: Self = Self::MIN;

            fn from_bits(v: Self::Bits) -> Self {
                Self::from_bits(v)
            }

            fn to_bits(self) -> Self::Bits {
                self.to_bits()
            }

            fn total_cmp(&self, other: &Self) -> core::cmp::Ordering {
                Self::total_cmp_key(self.0).cmp(&Self::total_cmp_key(other.0))
            }

            fn is_nan(self) -> bool {
                self.is_nan()
            }

            fn is_infinite(self) -> bool {
                self.is_infinite()
            }

            fn is_finite(self) -> bool {
                self.is_finite()
            }

            fn classify(self) -> core::num::FpCategory {
                self.classify()
            }

            fn abs(self) -> Self {
                self.abs()
            }

            fn is_sign_positive(self) -> bool {
                self.is_sign_positive()
            }

            fn is_sign_negative(self) -> bool {
                self.is_sign_negative()
            }
        }

        $crate::__conditionally_define_infinities!(impl $name, $n);
    };
    // Default bias, explicit NaN style
    ($vis:vis struct $name:ident($bits:tt): $e:expr, $m:expr, $n:ident) => {
        $crate::minifloat!($vis struct $name($bits): $e, $m, (1 << ($e - 1)) - 1, $n);
    };
    // Explicit bias, IEEE NaN style
    ($vis:vis struct $name:ident($bits:tt): $e:expr, $m:expr, $b:expr) => {
        $crate::minifloat!($vis struct $name($bits): $e, $m, $b, IEEE);
    };
    // Default bias, IEEE NaN style
    ($vis:vis struct $name:ident($bits:tt): $e:expr, $m:expr) => {
        $crate::minifloat!($vis struct $name($bits): $e, $m, (1 << ($e - 1)) - 1, IEEE);
    };
}
860
// 1 sign bit, 5 exponent bits, 10 significand bits, default bias, IEEE NaN
// style (the IEEE 754 binary16 layout).
minifloat!(pub struct F16(u16): 5, 10);
// 1 sign bit, 8 exponent bits, 7 significand bits, default bias, IEEE NaN
// style (the bfloat16 layout).
minifloat!(pub struct BF16(u16): 8, 7);
minifloat/lib.rs

minifloat/
lib.rs