float16/
binary16.rs

1use core::{
2    cmp::Ordering,
3    iter::{Product, Sum},
4    num::FpCategory,
5    ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Rem, RemAssign, Sub, SubAssign},
6};
7#[cfg(not(target_arch = "spirv"))]
8use core::{
9    fmt::{
10        Binary,
11        Debug,
12        Display,
13        Error,
14        Formatter,
15        LowerExp,
16        LowerHex,
17        Octal,
18        UpperExp,
19        UpperHex,
20    },
21    num::ParseFloatError,
22    str::FromStr,
23};
24
25use crate::error::TryFromFloatError;
26use crate::try_from::try_from_lossless;
27
28pub(crate) mod arch;
29
/// A 16-bit floating point type implementing the IEEE 754-2008 standard
/// [`binary16`], a.k.a. the "half" format.
///
/// This 16-bit floating point type is intended for efficient storage where the
/// full range and precision of a larger floating point value is not required.
///
/// The value is stored as its raw bit pattern in a single [`u16`]; the derived
/// [`Default`] therefore yields the all-zero bit pattern.
///
/// [`binary16`]: https://en.wikipedia.org/wiki/Half-precision_floating-point_format
#[repr(C)]
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Default)]
#[cfg_attr(kani, derive(kani::Arbitrary))]
pub struct f16(u16);
42
43impl f16 {
44    /// Constructs a 16-bit floating point value from the raw bits.
45    #[inline]
46    #[must_use]
47    pub const fn from_bits(bits: u16) -> f16 {
48        f16(bits)
49    }
50
51    /// Constructs a 16-bit floating point value from a 32-bit floating point
52    /// value.
53    ///
54    /// This operation is lossy. If the 32-bit value is to large to fit in
55    /// 16-bits, ±∞ will result. NaN values are preserved. 32-bit subnormal
56    /// values are too tiny to be represented in 16-bits and result in ±0.
57    /// Exponents that underflow the minimum 16-bit exponent will result in
58    /// 16-bit subnormals or ±0. All other values are truncated and rounded
59    /// to the nearest representable 16-bit value.
60    ///
61    /// This will prefer correctness over speed. Currently, this always
62    /// uses an intrinsic if available.
63    #[inline]
64    #[must_use]
65    pub fn from_f32(value: f32) -> f16 {
66        Self::from_f32_instrinsic(value)
67    }
68
69    /// Constructs a 16-bit floating point value from a 32-bit floating point
70    /// value.
71    ///
72    /// This function is identical to [`from_f32`][Self::from_f32] except it
73    /// never uses hardware intrinsics, which allows it to be `const`.
74    /// [`from_f32`][Self::from_f32] should be preferred in any non-`const`
75    /// context.
76    ///
77    /// This operation is lossy. If the 32-bit value is to large to fit in
78    /// 16-bits, ±∞ will result. NaN values are preserved. 32-bit subnormal
79    /// values are too tiny to be represented in 16-bits and result in ±0.
80    /// Exponents that underflow the minimum 16-bit exponent will result in
81    /// 16-bit subnormals or ±0. All other values are truncated and rounded
82    /// to the nearest representable 16-bit value.
83    #[inline]
84    #[must_use]
85    pub const fn from_f32_const(value: f32) -> f16 {
86        f16(arch::f32_to_f16_fallback(value))
87    }
88
89    /// Constructs a 16-bit floating point value from a 32-bit floating point
90    /// value.
91    ///
92    /// This operation is lossy. If the 32-bit value is to large to fit in
93    /// 16-bits, ±∞ will result. NaN values are preserved. 32-bit subnormal
94    /// values are too tiny to be represented in 16-bits and result in ±0.
95    /// Exponents that underflow the minimum 16-bit exponent will result in
96    /// 16-bit subnormals or ±0. All other values are truncated and rounded
97    /// to the nearest representable 16-bit value.
98    #[inline]
99    #[must_use]
100    pub fn from_f32_instrinsic(value: f32) -> f16 {
101        f16(arch::f32_to_f16(value))
102    }
103
    /// Create a [`struct@f16`] losslessly from an [`f32`].
    ///
    /// The conversion only succeeds if the [`f32`] is non-finite
    /// (infinite or NaN), or the exponent can be represented
    /// by a normal [`struct@f16`] and no non-zero bits would
    /// be truncated.
    ///
    /// "Lossless" does not mean the data is represented the
    /// same as a decimal number. For example, an [`f32`]
    /// and that same value widened to an [`f64`] have the
    /// significant digits (excluding the hidden bit) for a value
    /// closest to `1e35` of:
    /// - `f32`: `110100001001100001100`
    /// - `f64`: `11010000100110000110000000000000000000000000000000`
    ///
    /// However, the widened [`f64`] is displayed as
    /// `1.0000000409184788e+35`, while the value closest to `1e35`
    /// in [`f64`] is
    /// `11010000100110000101110010110001110100110110000010`. This
    /// makes it look like precision has been lost but this is
    /// due to the approximations used to represent binary values as
    /// a decimal.
    ///
    /// This does not respect signalling NaNs: if the value
    /// is NaN or inf, then it will return that value.
    #[inline]
    pub const fn from_f32_lossless(value: f32) -> Option<f16> {
        // The whole check-and-convert is generated by the crate-internal
        // `try_from_lossless!` macro, parameterized on the float/bit types.
        try_from_lossless!(
            value => value,
            half => f16,
            full => f32,
            half_bits => u16,
            full_bits => u32,
            to_half => from_f32
        )
    }
138
139    /// Constructs a 16-bit floating point value from a 64-bit floating point
140    /// value.
141    ///
142    /// This operation is lossy. If the 64-bit value is to large to fit in
143    /// 16-bits, ±∞ will result. NaN values are preserved. 64-bit subnormal
144    /// values are too tiny to be represented in 16-bits and result in ±0.
145    /// Exponents that underflow the minimum 16-bit exponent will result in
146    /// 16-bit subnormals or ±0. All other values are truncated and rounded
147    /// to the nearest representable 16-bit value.
148    ///
149    /// This will prefer correctness over speed: on x86 systems, this currently
150    /// uses a software rather than an instrinsic implementation on x86.
151    #[inline]
152    #[must_use]
153    pub fn from_f64(value: f64) -> f16 {
154        // FIXME: Once `_mm_cvtpd_ph` is stablized, move to using the intrinsic.
155        if cfg!(any(target_arch = "x86", target_arch = "x86_64")) {
156            Self::from_f64_const(value)
157        } else {
158            Self::from_f64_instrinsic(value)
159        }
160    }
161
162    /// Constructs a 16-bit floating point value from a 64-bit floating point
163    /// value.
164    ///
165    /// This function is identical to [`from_f64`][Self::from_f64] except it
166    /// never uses hardware intrinsics, which allows it to be `const`.
167    /// [`from_f64`][Self::from_f64] should be preferred in any non-`const`
168    /// context.
169    ///
170    /// This operation is lossy. If the 64-bit value is to large to fit in
171    /// 16-bits, ±∞ will result. NaN values are preserved. 64-bit subnormal
172    /// values are too tiny to be represented in 16-bits and result in ±0.
173    /// Exponents that underflow the minimum 16-bit exponent will result in
174    /// 16-bit subnormals or ±0. All other values are truncated and rounded
175    /// to the nearest representable 16-bit value.
176    #[inline]
177    #[must_use]
178    pub const fn from_f64_const(value: f64) -> f16 {
179        f16(arch::f64_to_f16_fallback(value))
180    }
181
182    /// Constructs a 16-bit floating point value from a 64-bit floating point
183    /// value.
184    ///
185    /// This operation is lossy. If the 64-bit value is to large to fit in
186    /// 16-bits, ±∞ will result. NaN values are preserved. 64-bit subnormal
187    /// values are too tiny to be represented in 16-bits and result in ±0.
188    /// Exponents that underflow the minimum 16-bit exponent will result in
189    /// 16-bit subnormals or ±0. All other values are truncated and rounded
190    /// to the nearest representable 16-bit value.
191    ///
192    /// This prefers to use vendor instrinsics if possible, otherwise, it
193    /// goes to a fallback. On x86 and x86_64, this can be more lossy than
194    /// `from_f64`.
195    #[inline]
196    #[must_use]
197    pub fn from_f64_instrinsic(value: f64) -> f16 {
198        f16(arch::f64_to_f16(value))
199    }
200
    /// Create a [`struct@f16`] losslessly from an [`f64`].
    ///
    /// The conversion only succeeds if the [`f64`] is non-finite
    /// (infinite or NaN), or the exponent can be represented
    /// by a normal [`struct@f16`] and no non-zero bits would
    /// be truncated.
    ///
    /// "Lossless" does not mean the data is represented the
    /// same as a decimal number. For example, an [`f32`]
    /// and that same value widened to an [`f64`] have the
    /// significant digits (excluding the hidden bit) for a value
    /// closest to `1e35` of:
    /// - `f32`: `110100001001100001100`
    /// - `f64`: `11010000100110000110000000000000000000000000000000`
    ///
    /// However, the widened [`f64`] is displayed as
    /// `1.0000000409184788e+35`, while the value closest to `1e35`
    /// in [`f64`] is
    /// `11010000100110000101110010110001110100110110000010`. This
    /// makes it look like precision has been lost but this is
    /// due to the approximations used to represent binary values as
    /// a decimal.
    ///
    /// This does not respect signalling NaNs: if the value
    /// is NaN or inf, then it will return that value.
    #[inline]
    pub const fn from_f64_lossless(value: f64) -> Option<f16> {
        // The whole check-and-convert is generated by the crate-internal
        // `try_from_lossless!` macro, parameterized on the float/bit types.
        try_from_lossless!(
            value => value,
            half => f16,
            full => f64,
            half_bits => u16,
            full_bits => u64,
            to_half => from_f64
        )
    }
235
236    /// Converts a [`struct@f16`] into the underlying bit representation.
237    #[inline]
238    #[must_use]
239    pub const fn to_bits(self) -> u16 {
240        self.0
241    }
242
243    /// Returns the memory representation of the underlying bit representation
244    /// as a byte array in little-endian byte order.
245    ///
246    /// # Examples
247    ///
248    /// ```rust
249    /// # use float16::*;
250    /// let bytes = f16::from_f32(12.5).to_le_bytes();
251    /// assert_eq!(bytes, [0x40, 0x4A]);
252    /// ```
253    #[inline]
254    #[must_use]
255    pub const fn to_le_bytes(self) -> [u8; 2] {
256        self.0.to_le_bytes()
257    }
258
259    /// Returns the memory representation of the underlying bit representation
260    /// as a byte array in big-endian (network) byte order.
261    ///
262    /// # Examples
263    ///
264    /// ```rust
265    /// # use float16::*;
266    /// let bytes = f16::from_f32(12.5).to_be_bytes();
267    /// assert_eq!(bytes, [0x4A, 0x40]);
268    /// ```
269    #[inline]
270    #[must_use]
271    pub const fn to_be_bytes(self) -> [u8; 2] {
272        self.0.to_be_bytes()
273    }
274
275    /// Returns the memory representation of the underlying bit representation
276    /// as a byte array in native byte order.
277    ///
278    /// As the target platform's native endianness is used, portable code should
279    /// use [`to_be_bytes`][Self::to_be_bytes] or
280    /// [`to_le_bytes`][Self::to_le_bytes], as appropriate, instead.
281    ///
282    /// # Examples
283    ///
284    /// ```rust
285    /// # use float16::*;
286    /// let bytes = f16::from_f32(12.5).to_ne_bytes();
287    /// assert_eq!(bytes, if cfg!(target_endian = "big") {
288    ///     [0x4A, 0x40]
289    /// } else {
290    ///     [0x40, 0x4A]
291    /// });
292    /// ```
293    #[inline]
294    #[must_use]
295    pub const fn to_ne_bytes(self) -> [u8; 2] {
296        self.0.to_ne_bytes()
297    }
298
299    /// Creates a floating point value from its representation as a byte array
300    /// in little endian.
301    ///
302    /// # Examples
303    ///
304    /// ```rust
305    /// # use float16::*;
306    /// let value = f16::from_le_bytes([0x40, 0x4A]);
307    /// assert_eq!(value, f16::from_f32(12.5));
308    /// ```
309    #[inline]
310    #[must_use]
311    pub const fn from_le_bytes(bytes: [u8; 2]) -> f16 {
312        f16::from_bits(u16::from_le_bytes(bytes))
313    }
314
315    /// Creates a floating point value from its representation as a byte array
316    /// in big endian.
317    ///
318    /// # Examples
319    ///
320    /// ```rust
321    /// # use float16::*;
322    /// let value = f16::from_be_bytes([0x4A, 0x40]);
323    /// assert_eq!(value, f16::from_f32(12.5));
324    /// ```
325    #[inline]
326    #[must_use]
327    pub const fn from_be_bytes(bytes: [u8; 2]) -> f16 {
328        f16::from_bits(u16::from_be_bytes(bytes))
329    }
330
331    /// Creates a floating point value from its representation as a byte array
332    /// in native endian.
333    ///
334    /// As the target platform's native endianness is used, portable code likely
335    /// wants to use [`from_be_bytes`][Self::from_be_bytes] or
336    /// [`from_le_bytes`][Self::from_le_bytes], as appropriate instead.
337    ///
338    /// # Examples
339    ///
340    /// ```rust
341    /// # use float16::*;
342    /// let value = f16::from_ne_bytes(if cfg!(target_endian = "big") {
343    ///     [0x4A, 0x40]
344    /// } else {
345    ///     [0x40, 0x4A]
346    /// });
347    /// assert_eq!(value, f16::from_f32(12.5));
348    /// ```
349    #[inline]
350    #[must_use]
351    pub const fn from_ne_bytes(bytes: [u8; 2]) -> f16 {
352        f16::from_bits(u16::from_ne_bytes(bytes))
353    }
354
355    /// Converts a [`struct@f16`] value into a `f32` value.
356    ///
357    /// This conversion is lossless as all 16-bit floating point values can be
358    /// represented exactly in 32-bit floating point.
359    ///
360    /// This will prefer correctness over speed. Currently, this always
361    /// uses an intrinsic if available.
362    #[inline]
363    #[must_use]
364    pub fn to_f32(self) -> f32 {
365        self.to_f32_intrinsic()
366    }
367
368    /// Converts a [`struct@f16`] value into a `f32` value.
369    ///
370    /// This function is identical to [`to_f32`][Self::to_f32] except it never
371    /// uses hardware intrinsics, which allows it to be `const`.
372    /// [`to_f32`][Self::to_f32] should be preferred in any non-`const`
373    /// context.
374    ///
375    /// This conversion is lossless as all 16-bit floating point values can be
376    /// represented exactly in 32-bit floating point.
377    #[inline]
378    #[must_use]
379    pub const fn to_f32_const(self) -> f32 {
380        arch::f16_to_f32_fallback(self.0)
381    }
382
383    /// Converts a [`struct@f16`] value into a `f32` value.
384    ///
385    /// This conversion is lossless as all 16-bit floating point values can be
386    /// represented exactly in 32-bit floating point.
387    #[inline]
388    #[must_use]
389    pub fn to_f32_intrinsic(self) -> f32 {
390        arch::f16_to_f32(self.0)
391    }
392
393    /// Convert the data to an `f32` type, used for numerical operations.
394    #[inline(always)]
395    pub fn as_f32(self) -> f32 {
396        self.to_f32_const()
397    }
398
399    /// Convert the data to an `f32` type, used for numerical operations.
400    #[inline(always)]
401    pub const fn as_f32_const(self) -> f32 {
402        self.to_f32_const()
403    }
404
405    /// Converts a [`struct@f16`] value into a `f64` value.
406    ///
407    /// This conversion is lossless as all 16-bit floating point values can be
408    /// represented exactly in 64-bit floating point.
409    ///
410    /// This will prefer correctness over speed: on x86 systems, this currently
411    /// uses a software rather than an instrinsic implementation on x86.
412    #[inline]
413    #[must_use]
414    pub fn to_f64(self) -> f64 {
415        self.to_f64_const()
416    }
417
418    /// Converts a [`struct@f16`] value into a `f64` value.
419    ///
420    /// This function is identical to [`to_f64`][Self::to_f64] except it never
421    /// uses hardware intrinsics, which allows it to be `const`.
422    /// [`to_f64`][Self::to_f64] should be preferred in any non-`const`
423    /// context.
424    ///
425    /// This conversion is lossless as all 16-bit floating point values can be
426    /// represented exactly in 64-bit floating point.
427    #[inline]
428    #[must_use]
429    pub const fn to_f64_const(self) -> f64 {
430        arch::f16_to_f64_fallback(self.0)
431    }
432
433    /// Converts a [`struct@f16`] value into a `f32` value.
434    ///
435    /// This conversion is lossless as all 16-bit floating point values can be
436    /// represented exactly in 32-bit floating point.
437    #[inline]
438    #[must_use]
439    pub fn to_f64_intrinsic(self) -> f64 {
440        arch::f16_to_f64(self.0)
441    }
442
443    /// Convert the data to an `f64` type, used for numerical operations.
444    #[inline(always)]
445    pub fn as_f64(self) -> f64 {
446        self.to_f64_const()
447    }
448
449    /// Convert the data to an `f64` type, used for numerical operations.
450    #[inline(always)]
451    pub const fn as_f64_const(self) -> f64 {
452        self.to_f64_const()
453    }
454
455    /// Returns `true` if this value is `NaN` and `false` otherwise.
456    ///
457    /// # Examples
458    ///
459    /// ```rust
460    /// # use float16::*;
461    ///
462    /// let nan = f16::NAN;
463    /// let f = f16::from_f32(7.0_f32);
464    ///
465    /// assert!(nan.is_nan());
466    /// assert!(!f.is_nan());
467    /// ```
468    #[inline]
469    #[must_use]
470    pub const fn is_nan(self) -> bool {
471        self.0 & Self::NOT_SIGN > Self::EXP_MASK
472    }
473
474    /// Computes the absolute value of `self`.
475    #[must_use]
476    #[inline(always)]
477    pub const fn abs(self) -> Self {
478        Self(self.0 & !Self::SIGN_MASK)
479    }
480
481    /// Returns `true` if this value is ±∞ and `false`.
482    /// otherwise.
483    ///
484    /// # Examples
485    ///
486    /// ```rust
487    /// # use float16::*;
488    ///
489    /// let f = f16::from_f32(7.0f32);
490    /// let inf = f16::INFINITY;
491    /// let neg_inf = f16::NEG_INFINITY;
492    /// let nan = f16::NAN;
493    ///
494    /// assert!(!f.is_infinite());
495    /// assert!(!nan.is_infinite());
496    ///
497    /// assert!(inf.is_infinite());
498    /// assert!(neg_inf.is_infinite());
499    /// ```
500    #[inline]
501    #[must_use]
502    pub const fn is_infinite(self) -> bool {
503        self.0 & Self::NOT_SIGN == Self::EXP_MASK
504    }
505
506    /// Returns `true` if this number is neither infinite nor `NaN`.
507    ///
508    /// # Examples
509    ///
510    /// ```rust
511    /// # use float16::*;
512    ///
513    /// let f = f16::from_f32(7.0f32);
514    /// let inf = f16::INFINITY;
515    /// let neg_inf = f16::NEG_INFINITY;
516    /// let nan = f16::NAN;
517    ///
518    /// assert!(f.is_finite());
519    ///
520    /// assert!(!nan.is_finite());
521    /// assert!(!inf.is_finite());
522    /// assert!(!neg_inf.is_finite());
523    /// ```
524    #[inline]
525    #[must_use]
526    pub const fn is_finite(self) -> bool {
527        self.0 & Self::EXP_MASK != Self::EXP_MASK
528    }
529
530    /// Returns `true` if the number is [subnormal].
531    ///
532    /// [subnormal]: https://en.wikipedia.org/wiki/Denormal_number
533    #[must_use]
534    #[inline(always)]
535    pub const fn is_subnormal(self) -> bool {
536        matches!(self.classify(), FpCategory::Subnormal)
537    }
538
539    /// Returns `true` if the number is neither zero, infinite, subnormal, or
540    /// `NaN`.
541    ///
542    /// # Examples
543    ///
544    /// ```rust
545    /// # use float16::*;
546    ///
547    /// let min = f16::MIN_POSITIVE;
548    /// let max = f16::MAX;
549    /// let lower_than_min = f16::from_f32(1.0e-10_f32);
550    /// let zero = f16::from_f32(0.0_f32);
551    ///
552    /// assert!(min.is_normal());
553    /// assert!(max.is_normal());
554    ///
555    /// assert!(!zero.is_normal());
556    /// assert!(!f16::NAN.is_normal());
557    /// assert!(!f16::INFINITY.is_normal());
558    /// // Values between `0` and `min` are Subnormal.
559    /// assert!(!lower_than_min.is_normal());
560    /// ```
561    #[inline]
562    #[must_use]
563    pub const fn is_normal(self) -> bool {
564        let exp = self.0 & Self::EXP_MASK;
565        exp != Self::EXP_MASK && exp != 0
566    }
567
568    /// Returns the floating point category of the number.
569    ///
570    /// If only one property is going to be tested, it is generally faster to
571    /// use the specific predicate instead.
572    ///
573    /// # Examples
574    ///
575    /// ```rust
576    /// use std::num::FpCategory;
577    /// # use float16::*;
578    ///
579    /// let num = f16::from_f32(12.4_f32);
580    /// let inf = f16::INFINITY;
581    ///
582    /// assert_eq!(num.classify(), FpCategory::Normal);
583    /// assert_eq!(inf.classify(), FpCategory::Infinite);
584    /// ```
585    #[inline]
586    #[must_use]
587    pub const fn classify(self) -> FpCategory {
588        let exp = self.0 & Self::EXP_MASK;
589        let man = self.0 & Self::MAN_MASK;
590        match (exp, man) {
591            (0, 0) => FpCategory::Zero,
592            (0, _) => FpCategory::Subnormal,
593            (Self::EXP_MASK, 0) => FpCategory::Infinite,
594            (Self::EXP_MASK, _) => FpCategory::Nan,
595            _ => FpCategory::Normal,
596        }
597    }
598
599    /// Returns a number that represents the sign of `self`.
600    ///
601    /// * `1.0` if the number is positive, `+0.0` or [`INFINITY`][f16::INFINITY]
602    /// * `-1.0` if the number is negative, `-0.0` or
603    ///   [`NEG_INFINITY`][f16::NEG_INFINITY]
604    /// * [`NAN`][f16::NAN] if the number is `NaN`
605    ///
606    /// # Examples
607    ///
608    /// ```rust
609    /// # use float16::*;
610    ///
611    /// let f = f16::from_f32(3.5_f32);
612    ///
613    /// assert_eq!(f.signum(), f16::from_f32(1.0));
614    /// assert_eq!(f16::NEG_INFINITY.signum(), f16::from_f32(-1.0));
615    ///
616    /// assert!(f16::NAN.signum().is_nan());
617    /// ```
618    #[inline]
619    #[must_use]
620    pub const fn signum(self) -> f16 {
621        if self.is_nan() {
622            self
623        } else if self.0 & Self::SIGN_MASK != 0 {
624            Self::NEG_ONE
625        } else {
626            Self::ONE
627        }
628    }
629
630    /// Returns `true` if and only if `self` has a positive sign, including
631    /// `+0.0`, `NaNs` with a positive sign bit and +∞.
632    ///
633    /// # Examples
634    ///
635    /// ```rust
636    /// # use float16::*;
637    ///
638    /// let nan = f16::NAN;
639    /// let f = f16::from_f32(7.0_f32);
640    /// let g = f16::from_f32(-7.0_f32);
641    ///
642    /// assert!(f.is_sign_positive());
643    /// assert!(!g.is_sign_positive());
644    /// // `NaN` can be either positive or negative
645    /// assert!(nan.is_sign_positive() != nan.is_sign_negative());
646    /// ```
647    #[inline]
648    #[must_use]
649    pub const fn is_sign_positive(self) -> bool {
650        self.0 & Self::SIGN_MASK == 0
651    }
652
653    /// Returns `true` if and only if `self` has a negative sign, including
654    /// `-0.0`, `NaNs` with a negative sign bit and −∞.
655    ///
656    /// # Examples
657    ///
658    /// ```rust
659    /// # use float16::*;
660    ///
661    /// let nan = f16::NAN;
662    /// let f = f16::from_f32(7.0f32);
663    /// let g = f16::from_f32(-7.0f32);
664    ///
665    /// assert!(!f.is_sign_negative());
666    /// assert!(g.is_sign_negative());
667    /// // `NaN` can be either positive or negative
668    /// assert!(nan.is_sign_positive() != nan.is_sign_negative());
669    /// ```
670    #[inline]
671    #[must_use]
672    pub const fn is_sign_negative(self) -> bool {
673        self.0 & Self::SIGN_MASK != 0
674    }
675
676    /// Returns a number composed of the magnitude of `self` and the sign of
677    /// `sign`.
678    ///
679    /// Equal to `self` if the sign of `self` and `sign` are the same, otherwise
680    /// equal to `-self`. If `self` is NaN, then NaN with the sign of `sign`
681    /// is returned.
682    ///
683    /// # Examples
684    ///
685    /// ```
686    /// # use float16::*;
687    /// let f = f16::from_f32(3.5);
688    ///
689    /// assert_eq!(f.copysign(f16::from_f32(0.42)), f16::from_f32(3.5));
690    /// assert_eq!(f.copysign(f16::from_f32(-0.42)), f16::from_f32(-3.5));
691    /// assert_eq!((-f).copysign(f16::from_f32(0.42)), f16::from_f32(3.5));
692    /// assert_eq!((-f).copysign(f16::from_f32(-0.42)), f16::from_f32(-3.5));
693    ///
694    /// assert!(f16::NAN.copysign(f16::from_f32(1.0)).is_nan());
695    /// ```
696    #[inline]
697    #[must_use]
698    pub const fn copysign(self, sign: f16) -> f16 {
699        f16((sign.0 & Self::SIGN_MASK) | (self.0 & Self::NOT_SIGN))
700    }
701
702    /// Takes the reciprocal (inverse) of a number, `1/x`.
703    #[must_use]
704    #[inline(always)]
705    pub fn recip(self) -> Self {
706        Self::ONE / self
707    }
708
709    /// Converts radians to degrees.
710    #[must_use]
711    #[inline(always)]
712    pub fn to_degrees(self) -> Self {
713        self * Self::from(180u8) / Self::PI
714    }
715
716    /// Converts degrees to radians.
717    #[must_use]
718    #[inline(always)]
719    pub fn to_radians(self) -> Self {
720        self * Self::PI / Self::from(180u8)
721    }
722
723    /// Returns the maximum of the two numbers.
724    ///
725    /// If one of the arguments is NaN, then the other argument is returned.
726    ///
727    /// # Examples
728    ///
729    /// ```
730    /// # use float16::*;
731    /// let x = f16::from_f32(1.0);
732    /// let y = f16::from_f32(2.0);
733    ///
734    /// assert_eq!(x.max(y), y);
735    /// ```
736    #[inline]
737    #[must_use]
738    pub const fn max(self, other: f16) -> f16 {
739        if self.is_nan() || gt(other, self) {
740            other
741        } else {
742            self
743        }
744    }
745
746    /// Returns the minimum of the two numbers.
747    ///
748    /// If one of the arguments is NaN, then the other argument is returned.
749    ///
750    /// # Examples
751    ///
752    /// ```
753    /// # use float16::*;
754    /// let x = f16::from_f32(1.0);
755    /// let y = f16::from_f32(2.0);
756    ///
757    /// assert_eq!(x.min(y), x);
758    /// ```
759    #[inline]
760    #[must_use]
761    pub const fn min(self, other: f16) -> f16 {
762        if self.is_nan() || lt(other, self) {
763            other
764        } else {
765            self
766        }
767    }
768
769    /// Restrict a value to a certain interval unless it is NaN.
770    ///
771    /// Returns `max` if `self` is greater than `max`, and `min` if `self` is
772    /// less than `min`. Otherwise this returns `self`.
773    ///
774    /// Note that this function returns NaN if the initial value was NaN as
775    /// well.
776    ///
777    /// # Panics
778    /// Panics if `min > max`, `min` is NaN, or `max` is NaN.
779    ///
780    /// # Examples
781    ///
782    /// ```
783    /// # use float16::*;
784    /// assert!(f16::from_f32(-3.0).clamp(f16::from_f32(-2.0), f16::from_f32(1.0)) == f16::from_f32(-2.0));
785    /// assert!(f16::from_f32(0.0).clamp(f16::from_f32(-2.0), f16::from_f32(1.0)) == f16::from_f32(0.0));
786    /// assert!(f16::from_f32(2.0).clamp(f16::from_f32(-2.0), f16::from_f32(1.0)) == f16::from_f32(1.0));
787    /// assert!(f16::NAN.clamp(f16::from_f32(-2.0), f16::from_f32(1.0)).is_nan());
788    /// ```
789    #[inline]
790    #[must_use]
791    pub const fn clamp(self, min: f16, max: f16) -> f16 {
792        assert!(le(min, max));
793        let mut x = self;
794        if lt(x, min) {
795            x = min;
796        }
797        if gt(x, max) {
798            x = max;
799        }
800        x
801    }
802
803    /// Returns the ordering between `self` and `other`.
804    ///
805    /// Unlike the standard partial comparison between floating point numbers,
806    /// this comparison always produces an ordering in accordance to
807    /// the `totalOrder` predicate as defined in the IEEE 754 (2008 revision)
808    /// floating point standard. The values are ordered in the following
809    /// sequence:
810    ///
811    /// - negative quiet NaN
812    /// - negative signaling NaN
813    /// - negative infinity
814    /// - negative numbers
815    /// - negative subnormal numbers
816    /// - negative zero
817    /// - positive zero
818    /// - positive subnormal numbers
819    /// - positive numbers
820    /// - positive infinity
821    /// - positive signaling NaN
822    /// - positive quiet NaN.
823    ///
824    /// The ordering established by this function does not always agree with the
825    /// [`PartialOrd`] and [`PartialEq`] implementations of `f16`. For example,
826    /// they consider negative and positive zero equal, while `total_cmp`
827    /// doesn't.
828    ///
829    /// The interpretation of the signaling NaN bit follows the definition in
830    /// the IEEE 754 standard, which may not match the interpretation by some of
831    /// the older, non-conformant (e.g. MIPS) hardware implementations.
832    ///
833    /// # Examples
834    /// ```
835    /// # use float16::f16;
836    /// let mut v: Vec<f16> = vec![];
837    /// v.push(f16::ONE);
838    /// v.push(f16::INFINITY);
839    /// v.push(f16::NEG_INFINITY);
840    /// v.push(f16::NAN);
841    /// v.push(f16::MAX_SUBNORMAL);
842    /// v.push(-f16::MAX_SUBNORMAL);
843    /// v.push(f16::ZERO);
844    /// v.push(f16::NEG_ZERO);
845    /// v.push(f16::NEG_ONE);
846    /// v.push(f16::MIN_POSITIVE);
847    ///
848    /// v.sort_by(|a, b| a.total_cmp(&b));
849    ///
850    /// assert!(v
851    ///     .into_iter()
852    ///     .zip(
853    ///         [
854    ///             f16::NEG_INFINITY,
855    ///             f16::NEG_ONE,
856    ///             -f16::MAX_SUBNORMAL,
857    ///             f16::NEG_ZERO,
858    ///             f16::ZERO,
859    ///             f16::MAX_SUBNORMAL,
860    ///             f16::MIN_POSITIVE,
861    ///             f16::ONE,
862    ///             f16::INFINITY,
863    ///             f16::NAN
864    ///         ]
865    ///         .iter()
866    ///     )
867    ///     .all(|(a, b)| a.to_bits() == b.to_bits()));
868    /// ```
869    // Implementation based on: https://doc.rust-lang.org/std/primitive.f32.html#method.total_cmp
870    #[inline]
871    #[must_use]
872    pub fn total_cmp(&self, other: &Self) -> Ordering {
873        let mut left = self.to_bits() as i16;
874        let mut right = other.to_bits() as i16;
875        left ^= (((left >> 15) as u16) >> 1) as i16;
876        right ^= (((right >> 15) as u16) >> 1) as i16;
877        left.cmp(&right)
878    }
879
    /// Approximate number of [`struct@f16`] significant digits in base 10
    pub const DIGITS: u32 = 3;
    /// [`struct@f16`]
    /// [machine epsilon](https://en.wikipedia.org/wiki/Machine_epsilon) value
    ///
    /// This is the difference between 1.0 and the next largest representable
    /// number.
    pub const EPSILON: f16 = f16(0x1400u16);
    /// [`struct@f16`] positive infinity (+∞)
    pub const INFINITY: f16 = f16(0x7C00u16);
    /// Number of [`struct@f16`] significant digits in base 2
    pub const MANTISSA_DIGITS: u32 = 11;
    /// Largest finite [`struct@f16`] value
    pub const MAX: f16 = f16(0x7BFF);
    /// Maximum possible [`struct@f16`] power of 10 exponent
    pub const MAX_10_EXP: i32 = 4;
    /// Maximum possible [`struct@f16`] power of 2 exponent
    pub const MAX_EXP: i32 = 16;
    /// Smallest (most negative) finite [`struct@f16`] value
    pub const MIN: f16 = f16(0xFBFF);
    /// Minimum possible normal [`struct@f16`] power of 10 exponent
    pub const MIN_10_EXP: i32 = -4;
    /// One greater than the minimum possible normal [`struct@f16`] power of 2
    /// exponent
    pub const MIN_EXP: i32 = -13;
    /// Smallest positive normal [`struct@f16`] value
    pub const MIN_POSITIVE: f16 = f16(0x0400u16);
    /// [`struct@f16`] Not a Number (NaN)
    pub const NAN: f16 = f16(0x7E00u16);
    /// [`struct@f16`] negative infinity (-∞)
    pub const NEG_INFINITY: f16 = f16(0xFC00u16);
    /// The radix or base of the internal representation of [`struct@f16`]
    pub const RADIX: u32 = 2;

    /// Minimum positive subnormal [`struct@f16`] value
    pub const MIN_POSITIVE_SUBNORMAL: f16 = f16(0x0001u16);
    /// Maximum (positive) subnormal [`struct@f16`] value
    pub const MAX_SUBNORMAL: f16 = f16(0x03FFu16);

    /// [`struct@f16`] 1
    pub const ONE: f16 = f16(0x3C00u16);
    /// [`struct@f16`] 0
    pub const ZERO: f16 = f16(0x0000u16);
    /// [`struct@f16`] -0
    pub const NEG_ZERO: f16 = f16(0x8000u16);
    /// [`struct@f16`] -1
    pub const NEG_ONE: f16 = f16(0xBC00u16);

    /// [`struct@f16`] Euler's number (ℯ)
    pub const E: f16 = f16(0x4170u16);
    /// [`struct@f16`] Archimedes' constant (π)
    pub const PI: f16 = f16(0x4248u16);
    /// [`struct@f16`] 1/π
    pub const FRAC_1_PI: f16 = f16(0x3518u16);
    /// [`struct@f16`] 1/√2
    pub const FRAC_1_SQRT_2: f16 = f16(0x39A8u16);
    /// [`struct@f16`] 2/π
    pub const FRAC_2_PI: f16 = f16(0x3918u16);
    /// [`struct@f16`] 2/√π
    pub const FRAC_2_SQRT_PI: f16 = f16(0x3C83u16);
    /// [`struct@f16`] π/2
    pub const FRAC_PI_2: f16 = f16(0x3E48u16);
    /// [`struct@f16`] π/3
    pub const FRAC_PI_3: f16 = f16(0x3C30u16);
    /// [`struct@f16`] π/4
    pub const FRAC_PI_4: f16 = f16(0x3A48u16);
    /// [`struct@f16`] π/6
    pub const FRAC_PI_6: f16 = f16(0x3830u16);
    /// [`struct@f16`] π/8
    pub const FRAC_PI_8: f16 = f16(0x3648u16);
    /// [`struct@f16`] 𝗅𝗇 10
    pub const LN_10: f16 = f16(0x409Bu16);
    /// [`struct@f16`] 𝗅𝗇 2
    pub const LN_2: f16 = f16(0x398Cu16);
954    /// [`struct@f16`] 𝗅𝗈𝗀₁₀ℯ
955    pub const LOG10_E: f16 = f16(0x36F3u16);
956    /// [`struct@f16`] 𝗅𝗈𝗀₁₀2
957    pub const LOG10_2: f16 = f16(0x34D1u16);
958    /// [`struct@f16`] 𝗅𝗈𝗀₂ℯ
959    pub const LOG2_E: f16 = f16(0x3DC5u16);
960    /// [`struct@f16`] 𝗅𝗈𝗀₂10
961    pub const LOG2_10: f16 = f16(0x42A5u16);
962    /// [`struct@f16`] √2
963    pub const SQRT_2: f16 = f16(0x3DA8u16);
964
965    /// Sign bit
966    pub const SIGN_MASK: u16 = 0x8000;
967    // Private helper for comparisons.
968    const NOT_SIGN: u16 = !Self::SIGN_MASK;
969
970    /// Exponent mask
971    pub const EXP_MASK: u16 = 0x7C00;
972
973    /// Mask for the hidden bit.
974    pub const HIDDEN_BIT_MASK: u16 = 0x0400;
975
976    /// Mantissa mask
977    pub const MAN_MASK: u16 = 0x03FF;
978
979    /// Minimum representable positive value (min subnormal)
980    pub const TINY_BITS: u16 = 0x1;
981
982    /// Minimum representable negative value (min negative subnormal)
983    pub const NEG_TINY_BITS: u16 = Self::TINY_BITS | Self::SIGN_MASK;
984}
985
// Generates a `const fn` constructor named `$name` that converts the integer
// type `$int` by casting it to `f32` and handing the result to
// `from_f32_const`.
macro_rules! from_int_impl {
    ($int:ty, $name:ident) => {
        /// Converts the integer to this type, as if by an `as` cast.
        #[inline(always)]
        pub const fn $name(value: $int) -> Self {
            let widened = value as f32;
            Self::from_f32_const(widened)
        }
    };
}
995
996impl f16 {
997    from_int_impl!(u8, from_u8);
998    from_int_impl!(u16, from_u16);
999    from_int_impl!(u32, from_u32);
1000    from_int_impl!(u64, from_u64);
1001    from_int_impl!(u128, from_u128);
1002    from_int_impl!(i8, from_i8);
1003    from_int_impl!(i16, from_i16);
1004    from_int_impl!(i32, from_i32);
1005    from_int_impl!(i64, from_i64);
1006    from_int_impl!(i128, from_i128);
1007}
1008
1009impl From<f16> for f32 {
1010    #[inline]
1011    fn from(x: f16) -> f32 {
1012        x.to_f32()
1013    }
1014}
1015
1016impl From<f16> for f64 {
1017    #[inline]
1018    fn from(x: f16) -> f64 {
1019        x.to_f64()
1020    }
1021}
1022
1023impl From<i8> for f16 {
1024    #[inline]
1025    fn from(x: i8) -> f16 {
1026        // Convert to f32, then to f16
1027        f16::from_f32(f32::from(x))
1028    }
1029}
1030
1031impl From<u8> for f16 {
1032    #[inline]
1033    fn from(x: u8) -> f16 {
1034        // Convert to f32, then to f16
1035        f16::from_f32(f32::from(x))
1036    }
1037}
1038
1039impl TryFrom<f32> for f16 {
1040    type Error = TryFromFloatError;
1041
1042    #[inline]
1043    fn try_from(x: f32) -> Result<Self, Self::Error> {
1044        Self::from_f32_lossless(x).ok_or(TryFromFloatError(()))
1045    }
1046}
1047
1048impl TryFrom<f64> for f16 {
1049    type Error = TryFromFloatError;
1050
1051    #[inline]
1052    fn try_from(x: f64) -> Result<Self, Self::Error> {
1053        Self::from_f64_lossless(x).ok_or(TryFromFloatError(()))
1054    }
1055}
1056
1057impl PartialEq for f16 {
1058    #[inline]
1059    fn eq(&self, other: &f16) -> bool {
1060        eq(*self, *other)
1061    }
1062}
1063
1064impl PartialOrd for f16 {
1065    #[inline]
1066    fn partial_cmp(&self, other: &f16) -> Option<Ordering> {
1067        if self.is_nan() || other.is_nan() {
1068            None
1069        } else {
1070            let neg = self.0 & Self::SIGN_MASK != 0;
1071            let other_neg = other.0 & Self::SIGN_MASK != 0;
1072            match (neg, other_neg) {
1073                (false, false) => Some(self.0.cmp(&other.0)),
1074                (false, true) => {
1075                    if (self.0 | other.0) & Self::NOT_SIGN == 0 {
1076                        Some(Ordering::Equal)
1077                    } else {
1078                        Some(Ordering::Greater)
1079                    }
1080                },
1081                (true, false) => {
1082                    if (self.0 | other.0) & Self::NOT_SIGN == 0 {
1083                        Some(Ordering::Equal)
1084                    } else {
1085                        Some(Ordering::Less)
1086                    }
1087                },
1088                (true, true) => Some(other.0.cmp(&self.0)),
1089            }
1090        }
1091    }
1092
1093    #[inline]
1094    fn lt(&self, other: &f16) -> bool {
1095        lt(*self, *other)
1096    }
1097
1098    #[inline]
1099    fn le(&self, other: &f16) -> bool {
1100        le(*self, *other)
1101    }
1102
1103    #[inline]
1104    fn gt(&self, other: &f16) -> bool {
1105        gt(*self, *other)
1106    }
1107
1108    #[inline]
1109    fn ge(&self, other: &f16) -> bool {
1110        ge(*self, *other)
1111    }
1112}
1113
1114#[cfg(not(target_arch = "spirv"))]
1115impl FromStr for f16 {
1116    type Err = ParseFloatError;
1117
1118    #[inline]
1119    fn from_str(src: &str) -> Result<f16, ParseFloatError> {
1120        f32::from_str(src).map(f16::from_f32)
1121    }
1122}
1123
1124#[cfg(not(target_arch = "spirv"))]
1125impl Debug for f16 {
1126    #[inline]
1127    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
1128        Debug::fmt(&self.to_f32(), f)
1129    }
1130}
1131
1132#[cfg(not(target_arch = "spirv"))]
1133impl Display for f16 {
1134    #[inline]
1135    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
1136        Display::fmt(&self.to_f32(), f)
1137    }
1138}
1139
1140#[cfg(not(target_arch = "spirv"))]
1141impl LowerExp for f16 {
1142    #[inline]
1143    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
1144        write!(f, "{:e}", self.to_f32())
1145    }
1146}
1147
1148#[cfg(not(target_arch = "spirv"))]
1149impl UpperExp for f16 {
1150    #[inline]
1151    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
1152        write!(f, "{:E}", self.to_f32())
1153    }
1154}
1155
1156#[cfg(not(target_arch = "spirv"))]
1157impl Binary for f16 {
1158    #[inline]
1159    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
1160        write!(f, "{:b}", self.0)
1161    }
1162}
1163
1164#[cfg(not(target_arch = "spirv"))]
1165impl Octal for f16 {
1166    #[inline]
1167    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
1168        write!(f, "{:o}", self.0)
1169    }
1170}
1171
1172#[cfg(not(target_arch = "spirv"))]
1173impl LowerHex for f16 {
1174    #[inline]
1175    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
1176        write!(f, "{:x}", self.0)
1177    }
1178}
1179
1180#[cfg(not(target_arch = "spirv"))]
1181impl UpperHex for f16 {
1182    #[inline]
1183    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
1184        write!(f, "{:X}", self.0)
1185    }
1186}
1187
1188impl Neg for f16 {
1189    type Output = Self;
1190
1191    #[inline]
1192    fn neg(self) -> Self::Output {
1193        Self(self.0 ^ Self::SIGN_MASK)
1194    }
1195}
1196
1197impl Neg for &f16 {
1198    type Output = <f16 as Neg>::Output;
1199
1200    #[inline]
1201    fn neg(self) -> Self::Output {
1202        Neg::neg(*self)
1203    }
1204}
1205
1206impl Add for f16 {
1207    type Output = Self;
1208
1209    #[inline]
1210    fn add(self, rhs: Self) -> Self::Output {
1211        f16(arch::add_f16(self.0, rhs.0))
1212    }
1213}
1214
1215impl Add<&f16> for f16 {
1216    type Output = <f16 as Add<f16>>::Output;
1217
1218    #[inline]
1219    fn add(self, rhs: &f16) -> Self::Output {
1220        self.add(*rhs)
1221    }
1222}
1223
1224impl Add<&f16> for &f16 {
1225    type Output = <f16 as Add<f16>>::Output;
1226
1227    #[inline]
1228    fn add(self, rhs: &f16) -> Self::Output {
1229        (*self).add(*rhs)
1230    }
1231}
1232
1233impl Add<f16> for &f16 {
1234    type Output = <f16 as Add<f16>>::Output;
1235
1236    #[inline]
1237    fn add(self, rhs: f16) -> Self::Output {
1238        (*self).add(rhs)
1239    }
1240}
1241
1242impl AddAssign for f16 {
1243    #[inline]
1244    fn add_assign(&mut self, rhs: Self) {
1245        *self = (*self).add(rhs);
1246    }
1247}
1248
1249impl AddAssign<&f16> for f16 {
1250    #[inline]
1251    fn add_assign(&mut self, rhs: &f16) {
1252        *self = (*self).add(rhs);
1253    }
1254}
1255
1256impl Sub for f16 {
1257    type Output = Self;
1258
1259    #[inline]
1260    fn sub(self, rhs: Self) -> Self::Output {
1261        f16(arch::subtract_f16(self.0, rhs.0))
1262    }
1263}
1264
1265impl Sub<&f16> for f16 {
1266    type Output = <f16 as Sub<f16>>::Output;
1267
1268    #[inline]
1269    fn sub(self, rhs: &f16) -> Self::Output {
1270        self.sub(*rhs)
1271    }
1272}
1273
1274impl Sub<&f16> for &f16 {
1275    type Output = <f16 as Sub<f16>>::Output;
1276
1277    #[inline]
1278    fn sub(self, rhs: &f16) -> Self::Output {
1279        (*self).sub(*rhs)
1280    }
1281}
1282
1283impl Sub<f16> for &f16 {
1284    type Output = <f16 as Sub<f16>>::Output;
1285
1286    #[inline]
1287    fn sub(self, rhs: f16) -> Self::Output {
1288        (*self).sub(rhs)
1289    }
1290}
1291
1292impl SubAssign for f16 {
1293    #[inline]
1294    fn sub_assign(&mut self, rhs: Self) {
1295        *self = (*self).sub(rhs);
1296    }
1297}
1298
1299impl SubAssign<&f16> for f16 {
1300    #[inline]
1301    fn sub_assign(&mut self, rhs: &f16) {
1302        *self = (*self).sub(rhs);
1303    }
1304}
1305
1306impl Mul for f16 {
1307    type Output = Self;
1308
1309    #[inline]
1310    fn mul(self, rhs: Self) -> Self::Output {
1311        f16(arch::multiply_f16(self.0, rhs.0))
1312    }
1313}
1314
1315impl Mul<&f16> for f16 {
1316    type Output = <f16 as Mul<f16>>::Output;
1317
1318    #[inline]
1319    fn mul(self, rhs: &f16) -> Self::Output {
1320        self.mul(*rhs)
1321    }
1322}
1323
1324impl Mul<&f16> for &f16 {
1325    type Output = <f16 as Mul<f16>>::Output;
1326
1327    #[inline]
1328    fn mul(self, rhs: &f16) -> Self::Output {
1329        (*self).mul(*rhs)
1330    }
1331}
1332
1333impl Mul<f16> for &f16 {
1334    type Output = <f16 as Mul<f16>>::Output;
1335
1336    #[inline]
1337    fn mul(self, rhs: f16) -> Self::Output {
1338        (*self).mul(rhs)
1339    }
1340}
1341
1342impl MulAssign for f16 {
1343    #[inline]
1344    fn mul_assign(&mut self, rhs: Self) {
1345        *self = (*self).mul(rhs);
1346    }
1347}
1348
1349impl MulAssign<&f16> for f16 {
1350    #[inline]
1351    fn mul_assign(&mut self, rhs: &f16) {
1352        *self = (*self).mul(rhs);
1353    }
1354}
1355
1356impl Div for f16 {
1357    type Output = Self;
1358
1359    #[inline]
1360    fn div(self, rhs: Self) -> Self::Output {
1361        f16(arch::divide_f16(self.0, rhs.0))
1362    }
1363}
1364
1365impl Div<&f16> for f16 {
1366    type Output = <f16 as Div<f16>>::Output;
1367
1368    #[inline]
1369    fn div(self, rhs: &f16) -> Self::Output {
1370        self.div(*rhs)
1371    }
1372}
1373
1374impl Div<&f16> for &f16 {
1375    type Output = <f16 as Div<f16>>::Output;
1376
1377    #[inline]
1378    fn div(self, rhs: &f16) -> Self::Output {
1379        (*self).div(*rhs)
1380    }
1381}
1382
1383impl Div<f16> for &f16 {
1384    type Output = <f16 as Div<f16>>::Output;
1385
1386    #[inline]
1387    fn div(self, rhs: f16) -> Self::Output {
1388        (*self).div(rhs)
1389    }
1390}
1391
1392impl DivAssign for f16 {
1393    #[inline]
1394    fn div_assign(&mut self, rhs: Self) {
1395        *self = (*self).div(rhs);
1396    }
1397}
1398
1399impl DivAssign<&f16> for f16 {
1400    #[inline]
1401    fn div_assign(&mut self, rhs: &f16) {
1402        *self = (*self).div(rhs);
1403    }
1404}
1405
1406impl Rem for f16 {
1407    type Output = Self;
1408
1409    #[inline]
1410    fn rem(self, rhs: Self) -> Self::Output {
1411        f16(arch::remainder_f16(self.0, rhs.0))
1412    }
1413}
1414
1415impl Rem<&f16> for f16 {
1416    type Output = <f16 as Rem<f16>>::Output;
1417
1418    #[inline]
1419    fn rem(self, rhs: &f16) -> Self::Output {
1420        self.rem(*rhs)
1421    }
1422}
1423
1424impl Rem<&f16> for &f16 {
1425    type Output = <f16 as Rem<f16>>::Output;
1426
1427    #[inline]
1428    fn rem(self, rhs: &f16) -> Self::Output {
1429        (*self).rem(*rhs)
1430    }
1431}
1432
1433impl Rem<f16> for &f16 {
1434    type Output = <f16 as Rem<f16>>::Output;
1435
1436    #[inline]
1437    fn rem(self, rhs: f16) -> Self::Output {
1438        (*self).rem(rhs)
1439    }
1440}
1441
1442impl RemAssign for f16 {
1443    #[inline]
1444    fn rem_assign(&mut self, rhs: Self) {
1445        *self = (*self).rem(rhs);
1446    }
1447}
1448
1449impl RemAssign<&f16> for f16 {
1450    #[inline]
1451    fn rem_assign(&mut self, rhs: &f16) {
1452        *self = (*self).rem(rhs);
1453    }
1454}
1455
1456impl Product for f16 {
1457    #[inline]
1458    fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
1459        f16(arch::product_f16(iter.map(|f| f.to_bits())))
1460    }
1461}
1462
1463impl<'a> Product<&'a f16> for f16 {
1464    #[inline]
1465    fn product<I: Iterator<Item = &'a f16>>(iter: I) -> Self {
1466        f16(arch::product_f16(iter.map(|f| f.to_bits())))
1467    }
1468}
1469
1470impl Sum for f16 {
1471    #[inline]
1472    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
1473        f16(arch::sum_f16(iter.map(|f| f.to_bits())))
1474    }
1475}
1476
1477impl<'a> Sum<&'a f16> for f16 {
1478    #[inline]
1479    fn sum<I: Iterator<Item = &'a f16>>(iter: I) -> Self {
1480        f16(arch::sum_f16(iter.map(|f| f.to_bits())))
1481    }
1482}
1483
1484#[inline]
1485const fn eq(lhs: f16, rhs: f16) -> bool {
1486    if lhs.is_nan() || rhs.is_nan() {
1487        false
1488    } else {
1489        (lhs.0 == rhs.0) || ((lhs.0 | rhs.0) & f16::NOT_SIGN == 0)
1490    }
1491}
1492
1493#[inline]
1494const fn lt(lhs: f16, rhs: f16) -> bool {
1495    if lhs.is_nan() || rhs.is_nan() {
1496        false
1497    } else {
1498        let neg = lhs.0 & f16::SIGN_MASK != 0;
1499        let rhs_neg = rhs.0 & f16::SIGN_MASK != 0;
1500        match (neg, rhs_neg) {
1501            (false, false) => lhs.0 < rhs.0,
1502            (false, true) => false,
1503            (true, false) => (lhs.0 | rhs.0) & f16::NOT_SIGN != 0,
1504            (true, true) => lhs.0 > rhs.0,
1505        }
1506    }
1507}
1508
1509#[inline]
1510const fn le(lhs: f16, rhs: f16) -> bool {
1511    if lhs.is_nan() || rhs.is_nan() {
1512        false
1513    } else {
1514        let neg = lhs.0 & f16::SIGN_MASK != 0;
1515        let rhs_neg = rhs.0 & f16::SIGN_MASK != 0;
1516        match (neg, rhs_neg) {
1517            (false, false) => lhs.0 <= rhs.0,
1518            (false, true) => (lhs.0 | rhs.0) & f16::NOT_SIGN == 0,
1519            (true, false) => true,
1520            (true, true) => lhs.0 >= rhs.0,
1521        }
1522    }
1523}
1524
1525#[inline]
1526const fn gt(lhs: f16, rhs: f16) -> bool {
1527    if lhs.is_nan() || rhs.is_nan() {
1528        false
1529    } else {
1530        let neg = lhs.0 & f16::SIGN_MASK != 0;
1531        let rhs_neg = rhs.0 & f16::SIGN_MASK != 0;
1532        match (neg, rhs_neg) {
1533            (false, false) => lhs.0 > rhs.0,
1534            (false, true) => (lhs.0 | rhs.0) & f16::NOT_SIGN != 0,
1535            (true, false) => false,
1536            (true, true) => lhs.0 < rhs.0,
1537        }
1538    }
1539}
1540
1541#[inline]
1542const fn ge(lhs: f16, rhs: f16) -> bool {
1543    if lhs.is_nan() || rhs.is_nan() {
1544        false
1545    } else {
1546        let neg = lhs.0 & f16::SIGN_MASK != 0;
1547        let rhs_neg = rhs.0 & f16::SIGN_MASK != 0;
1548        match (neg, rhs_neg) {
1549            (false, false) => lhs.0 >= rhs.0,
1550            (false, true) => true,
1551            (true, false) => (lhs.0 | rhs.0) & f16::NOT_SIGN == 0,
1552            (true, true) => lhs.0 <= rhs.0,
1553        }
1554    }
1555}
1556
1557#[allow(clippy::cognitive_complexity, clippy::float_cmp, clippy::neg_cmp_op_on_partial_ord)]
1558#[cfg(test)]
1559mod test {
1560    use core::cmp::Ordering;
1561    use core::mem;
1562
1563    use super::*;
1564
1565    #[test]
1566    #[cfg_attr(miri, ignore)]
1567    fn test_f16_consts() {
1568        // DIGITS
1569        let digits = ((f16::MANTISSA_DIGITS as f32 - 1.0) * 2f32.log10()).floor() as u32;
1570        assert_eq!(f16::DIGITS, digits);
1571        // sanity check to show test is good
1572        let digits32 = ((core::f32::MANTISSA_DIGITS as f32 - 1.0) * 2f32.log10()).floor() as u32;
1573        assert_eq!(core::f32::DIGITS, digits32);
1574
1575        // EPSILON
1576        let one = f16::from_f32(1.0);
1577        let one_plus_epsilon = f16::from_bits(one.to_bits() + 1);
1578        let epsilon = f16::from_f32(one_plus_epsilon.to_f32() - 1.0);
1579        assert_eq!(f16::EPSILON, epsilon);
1580        // sanity check to show test is good
1581        let one_plus_epsilon32 = f32::from_bits(1.0f32.to_bits() + 1);
1582        let epsilon32 = one_plus_epsilon32 - 1f32;
1583        assert_eq!(core::f32::EPSILON, epsilon32);
1584
1585        // MAX, MIN and MIN_POSITIVE
1586        let max = f16::from_bits(f16::INFINITY.to_bits() - 1);
1587        let min = f16::from_bits(f16::NEG_INFINITY.to_bits() - 1);
1588        let min_pos = f16::from_f32(2f32.powi(f16::MIN_EXP - 1));
1589        assert_eq!(f16::MAX, max);
1590        assert_eq!(f16::MIN, min);
1591        assert_eq!(f16::MIN_POSITIVE, min_pos);
1592        // sanity check to show test is good
1593        let max32 = f32::from_bits(core::f32::INFINITY.to_bits() - 1);
1594        let min32 = f32::from_bits(core::f32::NEG_INFINITY.to_bits() - 1);
1595        let min_pos32 = 2f32.powi(core::f32::MIN_EXP - 1);
1596        assert_eq!(core::f32::MAX, max32);
1597        assert_eq!(core::f32::MIN, min32);
1598        assert_eq!(core::f32::MIN_POSITIVE, min_pos32);
1599
1600        // MIN_10_EXP and MAX_10_EXP
1601        let ten_to_min = 10f32.powi(f16::MIN_10_EXP);
1602        assert!(ten_to_min / 10.0 < f16::MIN_POSITIVE.to_f32());
1603        assert!(ten_to_min > f16::MIN_POSITIVE.to_f32());
1604        let ten_to_max = 10f32.powi(f16::MAX_10_EXP);
1605        assert!(ten_to_max < f16::MAX.to_f32());
1606        assert!(ten_to_max * 10.0 > f16::MAX.to_f32());
1607        // sanity check to show test is good
1608        let ten_to_min32 = 10f64.powi(core::f32::MIN_10_EXP);
1609        assert!(ten_to_min32 / 10.0 < f64::from(core::f32::MIN_POSITIVE));
1610        assert!(ten_to_min32 > f64::from(core::f32::MIN_POSITIVE));
1611        let ten_to_max32 = 10f64.powi(core::f32::MAX_10_EXP);
1612        assert!(ten_to_max32 < f64::from(core::f32::MAX));
1613        assert!(ten_to_max32 * 10.0 > f64::from(core::f32::MAX));
1614    }
1615
1616    #[test]
1617    fn test_f16_consts_from_f32() {
1618        let one = f16::from_f32(1.0);
1619        let zero = f16::from_f32(0.0);
1620        let neg_zero = f16::from_f32(-0.0);
1621        let neg_one = f16::from_f32(-1.0);
1622        let inf = f16::from_f32(core::f32::INFINITY);
1623        let neg_inf = f16::from_f32(core::f32::NEG_INFINITY);
1624        let nan = f16::from_f32(core::f32::NAN);
1625
1626        assert_eq!(f16::ONE, one);
1627        assert_eq!(f16::ZERO, zero);
1628        assert!(zero.is_sign_positive());
1629        assert_eq!(f16::NEG_ZERO, neg_zero);
1630        assert!(neg_zero.is_sign_negative());
1631        assert_eq!(f16::NEG_ONE, neg_one);
1632        assert!(neg_one.is_sign_negative());
1633        assert_eq!(f16::INFINITY, inf);
1634        assert_eq!(f16::NEG_INFINITY, neg_inf);
1635        assert!(nan.is_nan());
1636        assert!(f16::NAN.is_nan());
1637
1638        let e = f16::from_f32(core::f32::consts::E);
1639        let pi = f16::from_f32(core::f32::consts::PI);
1640        let frac_1_pi = f16::from_f32(core::f32::consts::FRAC_1_PI);
1641        let frac_1_sqrt_2 = f16::from_f32(core::f32::consts::FRAC_1_SQRT_2);
1642        let frac_2_pi = f16::from_f32(core::f32::consts::FRAC_2_PI);
1643        let frac_2_sqrt_pi = f16::from_f32(core::f32::consts::FRAC_2_SQRT_PI);
1644        let frac_pi_2 = f16::from_f32(core::f32::consts::FRAC_PI_2);
1645        let frac_pi_3 = f16::from_f32(core::f32::consts::FRAC_PI_3);
1646        let frac_pi_4 = f16::from_f32(core::f32::consts::FRAC_PI_4);
1647        let frac_pi_6 = f16::from_f32(core::f32::consts::FRAC_PI_6);
1648        let frac_pi_8 = f16::from_f32(core::f32::consts::FRAC_PI_8);
1649        let ln_10 = f16::from_f32(core::f32::consts::LN_10);
1650        let ln_2 = f16::from_f32(core::f32::consts::LN_2);
1651        let log10_e = f16::from_f32(core::f32::consts::LOG10_E);
1652        // core::f32::consts::LOG10_2 requires rustc 1.43.0
1653        let log10_2 = f16::from_f32(2f32.log10());
1654        let log2_e = f16::from_f32(core::f32::consts::LOG2_E);
1655        // core::f32::consts::LOG2_10 requires rustc 1.43.0
1656        let log2_10 = f16::from_f32(10f32.log2());
1657        let sqrt_2 = f16::from_f32(core::f32::consts::SQRT_2);
1658
1659        assert_eq!(f16::E, e);
1660        assert_eq!(f16::PI, pi);
1661        assert_eq!(f16::FRAC_1_PI, frac_1_pi);
1662        assert_eq!(f16::FRAC_1_SQRT_2, frac_1_sqrt_2);
1663        assert_eq!(f16::FRAC_2_PI, frac_2_pi);
1664        assert_eq!(f16::FRAC_2_SQRT_PI, frac_2_sqrt_pi);
1665        assert_eq!(f16::FRAC_PI_2, frac_pi_2);
1666        assert_eq!(f16::FRAC_PI_3, frac_pi_3);
1667        assert_eq!(f16::FRAC_PI_4, frac_pi_4);
1668        assert_eq!(f16::FRAC_PI_6, frac_pi_6);
1669        assert_eq!(f16::FRAC_PI_8, frac_pi_8);
1670        assert_eq!(f16::LN_10, ln_10);
1671        assert_eq!(f16::LN_2, ln_2);
1672        assert_eq!(f16::LOG10_E, log10_e);
1673        assert_eq!(f16::LOG10_2, log10_2);
1674        assert_eq!(f16::LOG2_E, log2_e);
1675        assert_eq!(f16::LOG2_10, log2_10);
1676        assert_eq!(f16::SQRT_2, sqrt_2);
1677    }
1678
1679    #[test]
1680    fn test_f16_consts_from_f64() {
1681        let one = f16::from_f64(1.0);
1682        let zero = f16::from_f64(0.0);
1683        let neg_zero = f16::from_f64(-0.0);
1684        let inf = f16::from_f64(core::f64::INFINITY);
1685        let neg_inf = f16::from_f64(core::f64::NEG_INFINITY);
1686        let nan = f16::from_f64(core::f64::NAN);
1687
1688        assert_eq!(f16::ONE, one);
1689        assert_eq!(f16::ZERO, zero);
1690        assert!(zero.is_sign_positive());
1691        assert_eq!(f16::NEG_ZERO, neg_zero);
1692        assert!(neg_zero.is_sign_negative());
1693        assert_eq!(f16::INFINITY, inf);
1694        assert_eq!(f16::NEG_INFINITY, neg_inf);
1695        assert!(nan.is_nan());
1696        assert!(f16::NAN.is_nan());
1697
1698        let e = f16::from_f64(core::f64::consts::E);
1699        let pi = f16::from_f64(core::f64::consts::PI);
1700        let frac_1_pi = f16::from_f64(core::f64::consts::FRAC_1_PI);
1701        let frac_1_sqrt_2 = f16::from_f64(core::f64::consts::FRAC_1_SQRT_2);
1702        let frac_2_pi = f16::from_f64(core::f64::consts::FRAC_2_PI);
1703        let frac_2_sqrt_pi = f16::from_f64(core::f64::consts::FRAC_2_SQRT_PI);
1704        let frac_pi_2 = f16::from_f64(core::f64::consts::FRAC_PI_2);
1705        let frac_pi_3 = f16::from_f64(core::f64::consts::FRAC_PI_3);
1706        let frac_pi_4 = f16::from_f64(core::f64::consts::FRAC_PI_4);
1707        let frac_pi_6 = f16::from_f64(core::f64::consts::FRAC_PI_6);
1708        let frac_pi_8 = f16::from_f64(core::f64::consts::FRAC_PI_8);
1709        let ln_10 = f16::from_f64(core::f64::consts::LN_10);
1710        let ln_2 = f16::from_f64(core::f64::consts::LN_2);
1711        let log10_e = f16::from_f64(core::f64::consts::LOG10_E);
1712        // core::f64::consts::LOG10_2 requires rustc 1.43.0
1713        let log10_2 = f16::from_f64(2f64.log10());
1714        let log2_e = f16::from_f64(core::f64::consts::LOG2_E);
1715        // core::f64::consts::LOG2_10 requires rustc 1.43.0
1716        let log2_10 = f16::from_f64(10f64.log2());
1717        let sqrt_2 = f16::from_f64(core::f64::consts::SQRT_2);
1718
1719        assert_eq!(f16::E, e);
1720        assert_eq!(f16::PI, pi);
1721        assert_eq!(f16::FRAC_1_PI, frac_1_pi);
1722        assert_eq!(f16::FRAC_1_SQRT_2, frac_1_sqrt_2);
1723        assert_eq!(f16::FRAC_2_PI, frac_2_pi);
1724        assert_eq!(f16::FRAC_2_SQRT_PI, frac_2_sqrt_pi);
1725        assert_eq!(f16::FRAC_PI_2, frac_pi_2);
1726        assert_eq!(f16::FRAC_PI_3, frac_pi_3);
1727        assert_eq!(f16::FRAC_PI_4, frac_pi_4);
1728        assert_eq!(f16::FRAC_PI_6, frac_pi_6);
1729        assert_eq!(f16::FRAC_PI_8, frac_pi_8);
1730        assert_eq!(f16::LN_10, ln_10);
1731        assert_eq!(f16::LN_2, ln_2);
1732        assert_eq!(f16::LOG10_E, log10_e);
1733        assert_eq!(f16::LOG10_2, log10_2);
1734        assert_eq!(f16::LOG2_E, log2_e);
1735        assert_eq!(f16::LOG2_10, log2_10);
1736        assert_eq!(f16::SQRT_2, sqrt_2);
1737    }
1738
1739    #[test]
1740    fn test_nan_conversion_to_smaller() {
1741        let nan64 = f64::from_bits(0x7FF0_0000_0000_0001u64);
1742        let neg_nan64 = f64::from_bits(0xFFF0_0000_0000_0001u64);
1743        let nan32 = f32::from_bits(0x7F80_0001u32);
1744        let neg_nan32 = f32::from_bits(0xFF80_0001u32);
1745        let nan32_from_64 = nan64 as f32;
1746        let neg_nan32_from_64 = neg_nan64 as f32;
1747        let nan16_from_64 = f16::from_f64(nan64);
1748        let neg_nan16_from_64 = f16::from_f64(neg_nan64);
1749        let nan16_from_32 = f16::from_f32(nan32);
1750        let neg_nan16_from_32 = f16::from_f32(neg_nan32);
1751
1752        assert!(nan64.is_nan() && nan64.is_sign_positive());
1753        assert!(neg_nan64.is_nan() && neg_nan64.is_sign_negative());
1754        assert!(nan32.is_nan() && nan32.is_sign_positive());
1755        assert!(neg_nan32.is_nan() && neg_nan32.is_sign_negative());
1756
1757        // f32/f64 NaN conversion sign is non-deterministic: https://github.com/VoidStarKat/half-rs/issues/103
1758        assert!(nan32_from_64.is_nan());
1759        assert!(neg_nan32_from_64.is_nan());
1760        assert!(nan16_from_64.is_nan());
1761        assert!(neg_nan16_from_64.is_nan());
1762        assert!(nan16_from_32.is_nan());
1763        assert!(neg_nan16_from_32.is_nan());
1764    }
1765
1766    #[test]
1767    fn test_nan_conversion_to_larger() {
1768        let nan16 = f16::from_bits(0x7C01u16);
1769        let neg_nan16 = f16::from_bits(0xFC01u16);
1770        let nan32 = f32::from_bits(0x7F80_0001u32);
1771        let neg_nan32 = f32::from_bits(0xFF80_0001u32);
1772        let nan32_from_16 = f32::from(nan16);
1773        let neg_nan32_from_16 = f32::from(neg_nan16);
1774        let nan64_from_16 = f64::from(nan16);
1775        let neg_nan64_from_16 = f64::from(neg_nan16);
1776        let nan64_from_32 = f64::from(nan32);
1777        let neg_nan64_from_32 = f64::from(neg_nan32);
1778
1779        assert!(nan16.is_nan() && nan16.is_sign_positive());
1780        assert!(neg_nan16.is_nan() && neg_nan16.is_sign_negative());
1781        assert!(nan32.is_nan() && nan32.is_sign_positive());
1782        assert!(neg_nan32.is_nan() && neg_nan32.is_sign_negative());
1783
1784        // f32/f64 NaN conversion sign is non-deterministic: https://github.com/VoidStarKat/half-rs/issues/103
1785        assert!(nan32_from_16.is_nan());
1786        assert!(neg_nan32_from_16.is_nan());
1787        assert!(nan64_from_16.is_nan());
1788        assert!(neg_nan64_from_16.is_nan());
1789        assert!(nan64_from_32.is_nan());
1790        assert!(neg_nan64_from_32.is_nan());
1791    }
1792
1793    #[test]
1794    #[cfg_attr(miri, ignore)]
1795    fn test_f16_to_f32() {
1796        let f = f16::from_f32(7.0);
1797        assert_eq!(f.to_f32(), 7.0f32);
1798
1799        // 7.1 is NOT exactly representable in 16-bit, it's rounded
1800        let f = f16::from_f32(7.1);
1801        let diff = (f.to_f32() - 7.1f32).abs();
1802        // diff must be <= 4 * EPSILON, as 7 has two more significant bits than 1
1803        assert!(diff <= 4.0 * f16::EPSILON.to_f32());
1804
1805        assert_eq!(f16::from_bits(0x0000_0001).to_f32(), 2.0f32.powi(-24));
1806        assert_eq!(f16::from_bits(0x0000_0005).to_f32(), 5.0 * 2.0f32.powi(-24));
1807
1808        assert_eq!(f16::from_bits(0x0000_0001), f16::from_f32(2.0f32.powi(-24)));
1809        assert_eq!(f16::from_bits(0x0000_0005), f16::from_f32(5.0 * 2.0f32.powi(-24)));
1810    }
1811
1812    #[test]
1813    #[cfg_attr(miri, ignore)]
1814    fn test_f16_to_f64() {
1815        let f = f16::from_f64(7.0);
1816        assert_eq!(f.to_f64(), 7.0f64);
1817
1818        // 7.1 is NOT exactly representable in 16-bit, it's rounded
1819        let f = f16::from_f64(7.1);
1820        let diff = (f.to_f64() - 7.1f64).abs();
1821        // diff must be <= 4 * EPSILON, as 7 has two more significant bits than 1
1822        assert!(diff <= 4.0 * f16::EPSILON.to_f64());
1823
1824        assert_eq!(f16::from_bits(0x0000_0001).to_f64(), 2.0f64.powi(-24));
1825        assert_eq!(f16::from_bits(0x0000_0005).to_f64(), 5.0 * 2.0f64.powi(-24));
1826
1827        assert_eq!(f16::from_bits(0x0000_0001), f16::from_f64(2.0f64.powi(-24)));
1828        assert_eq!(f16::from_bits(0x0000_0005), f16::from_f64(5.0 * 2.0f64.powi(-24)));
1829    }
1830
/// Exercises `partial_cmp` and every comparison operator (`==`, `!=`, `<`,
/// `<=`, `>`, `>=`) in both operand orders for three pairs of values:
/// +0 vs -0, 1 vs -0 and 1 vs -1.
#[test]
fn test_comparisons() {
    let pos_zero = f16::from_f64(0.0);
    let pos_one = f16::from_f64(1.0);
    let minus_zero = f16::from_f64(-0.0);
    let minus_one = f16::from_f64(-1.0);

    // +0 and -0 must compare as equal, whatever operator is used.
    assert_eq!(pos_zero.partial_cmp(&minus_zero), Some(Ordering::Equal));
    assert_eq!(minus_zero.partial_cmp(&pos_zero), Some(Ordering::Equal));
    assert!(pos_zero == minus_zero);
    assert!(minus_zero == pos_zero);
    assert!(!(pos_zero != minus_zero));
    assert!(!(minus_zero != pos_zero));
    assert!(!(pos_zero < minus_zero));
    assert!(!(minus_zero < pos_zero));
    assert!(pos_zero <= minus_zero);
    assert!(minus_zero <= pos_zero);
    assert!(!(pos_zero > minus_zero));
    assert!(!(minus_zero > pos_zero));
    assert!(pos_zero >= minus_zero);
    assert!(minus_zero >= pos_zero);

    // 1 is strictly greater than -0.
    assert_eq!(pos_one.partial_cmp(&minus_zero), Some(Ordering::Greater));
    assert_eq!(minus_zero.partial_cmp(&pos_one), Some(Ordering::Less));
    assert!(!(pos_one == minus_zero));
    assert!(!(minus_zero == pos_one));
    assert!(pos_one != minus_zero);
    assert!(minus_zero != pos_one);
    assert!(!(pos_one < minus_zero));
    assert!(minus_zero < pos_one);
    assert!(!(pos_one <= minus_zero));
    assert!(minus_zero <= pos_one);
    assert!(pos_one > minus_zero);
    assert!(!(minus_zero > pos_one));
    assert!(pos_one >= minus_zero);
    assert!(!(minus_zero >= pos_one));

    // 1 is strictly greater than -1.
    assert_eq!(pos_one.partial_cmp(&minus_one), Some(Ordering::Greater));
    assert_eq!(minus_one.partial_cmp(&pos_one), Some(Ordering::Less));
    assert!(!(pos_one == minus_one));
    assert!(!(minus_one == pos_one));
    assert!(pos_one != minus_one);
    assert!(minus_one != pos_one);
    assert!(!(pos_one < minus_one));
    assert!(minus_one < pos_one);
    assert!(!(pos_one <= minus_one));
    assert!(minus_one <= pos_one);
    assert!(pos_one > minus_one);
    assert!(!(minus_one > pos_one));
    assert!(pos_one >= minus_one);
    assert!(!(minus_one >= pos_one));
}
1883
/// Verifies that `f16::from_f32` rounds to nearest with ties going to the
/// even neighbour, both around the smallest subnormals and around 2000.
#[test]
#[allow(clippy::erasing_op, clippy::identity_op)]
#[cfg_attr(miri, ignore)]
fn round_to_even_f32() {
    // Shorthand: convert an `f32` and look at the resulting raw bits.
    let half_bits = |value: f32| f16::from_f32(value).to_bits();

    // smallest positive subnormal = 0b0.0000_0000_01 * 2^-14 = 2^-24
    let min_sub = f16::from_bits(1);
    let min_sub_f = (-24f32).exp2();
    let unit = min_sub.to_bits();
    assert_eq!(half_bits(min_sub_f), unit);
    assert_eq!(f32::from(min_sub).to_bits(), min_sub_f.to_bits());

    // Between 0 and 1 unit:
    //   0.0000000000_011111 -> 0.0000000000 (below the tie, rounds down)
    //   0.0000000000_100000 -> 0.0000000000 (tie, stays on the even value)
    //   0.0000000000_100001 -> 0.0000000001 (above the tie, rounds up)
    assert_eq!(half_bits(min_sub_f * 0.49), unit * 0);
    assert_eq!(half_bits(min_sub_f * 0.50), unit * 0);
    assert_eq!(half_bits(min_sub_f * 0.51), unit * 1);

    // Between 1 and 2 units:
    //   0.0000000001_011111 -> 0.0000000001 (below the tie, rounds down)
    //   0.0000000001_100000 -> 0.0000000010 (tie on an odd value, goes up to even)
    //   0.0000000001_100001 -> 0.0000000010 (above the tie, rounds up)
    assert_eq!(half_bits(min_sub_f * 1.49), unit * 1);
    assert_eq!(half_bits(min_sub_f * 1.50), unit * 2);
    assert_eq!(half_bits(min_sub_f * 1.51), unit * 2);

    // Between 2 and 3 units:
    //   0.0000000010_011111 -> 0.0000000010 (below the tie, rounds down)
    //   0.0000000010_100000 -> 0.0000000010 (tie, stays on the even value)
    //   0.0000000010_100001 -> 0.0000000011 (above the tie, rounds up)
    assert_eq!(half_bits(min_sub_f * 2.49), unit * 2);
    assert_eq!(half_bits(min_sub_f * 2.50), unit * 2);
    assert_eq!(half_bits(min_sub_f * 2.51), unit * 3);

    // Same pattern around 2000, 2001 and 2002.
    assert_eq!(half_bits(2000.49f32), half_bits(2000.0));
    assert_eq!(half_bits(2000.50f32), half_bits(2000.0));
    assert_eq!(half_bits(2000.51f32), half_bits(2001.0));
    assert_eq!(half_bits(2001.49f32), half_bits(2001.0));
    assert_eq!(half_bits(2001.50f32), half_bits(2002.0));
    assert_eq!(half_bits(2001.51f32), half_bits(2002.0));
    assert_eq!(half_bits(2002.49f32), half_bits(2002.0));
    assert_eq!(half_bits(2002.50f32), half_bits(2002.0));
    assert_eq!(half_bits(2002.51f32), half_bits(2003.0));
}
1925
/// Verifies that `f16::from_f64` rounds to nearest with ties going to the
/// even neighbour, both around the smallest subnormals and around 2000.
#[test]
#[allow(clippy::erasing_op, clippy::identity_op)]
#[cfg_attr(miri, ignore)]
fn round_to_even_f64() {
    // Shorthand: convert an `f64` and look at the resulting raw bits.
    let half_bits = |value: f64| f16::from_f64(value).to_bits();

    // smallest positive subnormal = 0b0.0000_0000_01 * 2^-14 = 2^-24
    let min_sub = f16::from_bits(1);
    let min_sub_f = (-24f64).exp2();
    let unit = min_sub.to_bits();
    assert_eq!(half_bits(min_sub_f), unit);
    assert_eq!(f64::from(min_sub).to_bits(), min_sub_f.to_bits());

    // Between 0 and 1 unit:
    //   0.0000000000_011111 -> 0.0000000000 (below the tie, rounds down)
    //   0.0000000000_100000 -> 0.0000000000 (tie, stays on the even value)
    //   0.0000000000_100001 -> 0.0000000001 (above the tie, rounds up)
    assert_eq!(half_bits(min_sub_f * 0.49), unit * 0);
    assert_eq!(half_bits(min_sub_f * 0.50), unit * 0);
    assert_eq!(half_bits(min_sub_f * 0.51), unit * 1);

    // Between 1 and 2 units:
    //   0.0000000001_011111 -> 0.0000000001 (below the tie, rounds down)
    //   0.0000000001_100000 -> 0.0000000010 (tie on an odd value, goes up to even)
    //   0.0000000001_100001 -> 0.0000000010 (above the tie, rounds up)
    assert_eq!(half_bits(min_sub_f * 1.49), unit * 1);
    assert_eq!(half_bits(min_sub_f * 1.50), unit * 2);
    assert_eq!(half_bits(min_sub_f * 1.51), unit * 2);

    // Between 2 and 3 units:
    //   0.0000000010_011111 -> 0.0000000010 (below the tie, rounds down)
    //   0.0000000010_100000 -> 0.0000000010 (tie, stays on the even value)
    //   0.0000000010_100001 -> 0.0000000011 (above the tie, rounds up)
    assert_eq!(half_bits(min_sub_f * 2.49), unit * 2);
    assert_eq!(half_bits(min_sub_f * 2.50), unit * 2);
    assert_eq!(half_bits(min_sub_f * 2.51), unit * 3);

    // Same pattern around 2000, 2001 and 2002.
    assert_eq!(half_bits(2000.49f64), half_bits(2000.0));
    assert_eq!(half_bits(2000.50f64), half_bits(2000.0));
    assert_eq!(half_bits(2000.51f64), half_bits(2001.0));
    assert_eq!(half_bits(2001.49f64), half_bits(2001.0));
    assert_eq!(half_bits(2001.50f64), half_bits(2002.0));
    assert_eq!(half_bits(2001.51f64), half_bits(2002.0));
    assert_eq!(half_bits(2002.49f64), half_bits(2002.0));
    assert_eq!(half_bits(2002.50f64), half_bits(2002.0));
    assert_eq!(half_bits(2002.51f64), half_bits(2003.0));
}
1967
/// Smoke test for the `+`, `-`, `*`, `/` and `%` operators on small values.
#[test]
fn arithmetic() {
    let one = f16::ONE;
    let two = f16::from_f32(2.);
    let three = f16::from_f32(3.);
    let four = f16::from_f32(4.);

    // addition / subtraction
    assert_eq!(one + one, two);
    assert_eq!(one - one, f16::ZERO);

    // multiplication
    assert_eq!(one * one, one);
    assert_eq!(two * two, four);

    // division / remainder
    assert_eq!(one / one, one);
    assert_eq!(four / two, two);
    assert_eq!(four % three, f16::from_f32(1.));
}
1978
/// Regression test for half-rs issue #116.
///
/// For two `f64` bit patterns, the default `f16::from_f64` must agree exactly
/// with `f16::from_f64_const`, and the intrinsic path must be within
/// `max_diff` raw-bit steps of it.
#[test]
fn issue_116() {
    // SEE: https://github.com/starkat99/half-rs/issues/116
    //  This is lossy until `_mm_cvtpd_ph` will be stable on x86.
    let max_diff = if cfg!(any(target_arch = "x86", target_arch = "x86_64")) {
        1
    } else {
        0
    };

    // Runs the same three-way comparison for one raw `f64` bit pattern.
    // `f64::from_bits` is the safe equivalent of transmuting a `u64`, so no
    // `unsafe` block is needed here.
    let check = |raw: u64| {
        let x = f64::from_bits(raw);
        let bits = f16::from_f64(x).to_bits();
        let const_bits = f16::from_f64_const(x).to_bits();
        let inst_bits = f16::from_f64_instrinsic(x).to_bits();
        assert_eq!(const_bits, bits, "const conversion differs for {:#018x}", raw);
        assert!(
            inst_bits.abs_diff(bits) <= max_diff,
            "intrinsic conversion off by more than {} for {:#018x}",
            max_diff,
            raw
        );
    };

    // from the round-to-even section of the test case
    check(0x3f0f_fbff_ffff_fffc);

    // from the double rounding section of the test case
    // comment from the cpython test case: should be 2047, if double-rounded
    // 64>32>16, becomes 2048
    check(0x409f_fdff_ffff_0000);
}
2007
/// Checks `f16::from_f32_lossless`: it must return `Some(half)` only when the
/// `f32` input is exactly representable, and `None` otherwise.
#[test]
fn from_f32_lossless() {
    let from_f32 = |v: f32| f16::from_f32_lossless(v);
    // Asserts the conversion result and, when a lossless result is expected,
    // that widening it back reproduces the input exactly.
    let roundtrip = |v: f32, expected: Option<f16>| {
        assert_eq!(from_f32(v), expected);
        if let Some(half) = expected {
            assert_eq!(v, half.to_f32_const());
        }
    };

    // NaN never compares equal, so only check that NaN maps to some NaN.
    assert_eq!(from_f32(f32::NAN).map(f16::is_nan), Some(true));
    roundtrip(f32::INFINITY, Some(f16::INFINITY));
    roundtrip(f32::NEG_INFINITY, Some(f16::NEG_INFINITY));
    // +0.0 and -0.0
    roundtrip(f32::from_bits(0b0_00000000_00000000000000000000000), Some(f16(0)));
    roundtrip(f32::from_bits(0b1_00000000_00000000000000000000000), Some(f16(f16::SIGN_MASK)));
    // smallest positive f32 subnormal: not representable
    roundtrip(f32::from_bits(1), None);

    // special truncation with denormals, etc.
    // 2^-24 and 2^-23 are 1 and 2 units of the smallest f16 subnormal
    roundtrip(f32::from_bits(0b0_01100111_00000000000000000000000), Some(f16(1)));
    roundtrip(f32::from_bits(0b0_01101000_00000000000000000000000), Some(f16(2)));
    roundtrip(f32::from_bits(0b0_01101000_10000000000000000000000), Some(f16(3)));
    // 1.5 and 3.5 units fall between representable values
    roundtrip(f32::from_bits(0b0_01100111_10000000000000000000000), None);
    roundtrip(f32::from_bits(0b0_01101000_11000000000000000000000), None);
    // 2^-16 (~1.5258789e-5) is still a denormal as f16: the leading one lands
    // on bit 8 (0x100), so only the top 8 mantissa bits of the f32 can be kept
    // and a set 9th bit makes the value unrepresentable.
    roundtrip(f32::from_bits(0b0_01101111_00000000000000000000000), Some(f16(0x100)));
    roundtrip(f32::from_bits(0b0_01101111_10000000000000000000000), Some(f16(0x180)));
    roundtrip(f32::from_bits(0b0_01101111_11000000000000000000000), Some(f16(0x1c0)));
    roundtrip(f32::from_bits(0b0_01101111_11000001000000000000000), Some(f16(0x1c1)));
    roundtrip(f32::from_bits(0b0_01101111_11000001100000000000000), None);
    // 2.0f32: the top 10 mantissa bits are kept, a set 11th bit is rejected
    roundtrip(f32::from_bits(0b0_10000000_00000000000000000000000), Some(f16(0x4000)));
    roundtrip(f32::from_bits(0b0_10000000_10000000000000000000000), Some(f16(0x4200)));
    roundtrip(f32::from_bits(0b0_10000000_10000000010000000000000), Some(f16(0x4201)));
    roundtrip(f32::from_bits(0b0_10000000_10000000011000000000000), None);
    // check overflow: 2^16 is rejected, 2^15 still fits
    roundtrip(f32::from_bits(0b0_10001111_00000000000000000000000), None);
    roundtrip(f32::from_bits(0b0_10001110_00000000000000000000000), Some(f16(0x7800)));
}
2049
/// Checks `f16::from_f64_lossless`: it must return `Some(half)` only when the
/// `f64` input is exactly representable, and `None` otherwise.
#[test]
fn from_f64_lossless() {
    let from_f64 = |v: f64| f16::from_f64_lossless(v);
    // Asserts the conversion result and, when a lossless result is expected,
    // that widening it back reproduces the input exactly.
    let roundtrip = |v: f64, expected: Option<f16>| {
        assert_eq!(from_f64(v), expected);
        if let Some(half) = expected {
            assert_eq!(v, half.to_f64_const());
        }
    };

    // NaN never compares equal, so only check that NaN maps to some NaN.
    assert_eq!(from_f64(f64::NAN).map(f16::is_nan), Some(true));
    roundtrip(f64::INFINITY, Some(f16::INFINITY));
    roundtrip(f64::NEG_INFINITY, Some(f16::NEG_INFINITY));
    // +0.0
    roundtrip(
        f64::from_bits(0b0_00000000000_0000000000000000000000000000000000000000000000000000),
        Some(f16(0)),
    );
    // -0.0
    roundtrip(
        f64::from_bits(0b1_00000000000_0000000000000000000000000000000000000000000000000000),
        Some(f16(f16::SIGN_MASK)),
    );
    // tiny value (exponent 2^-117) with a busy mantissa: not representable
    roundtrip(
        f64::from_bits(0b0_01110001010_1010100101011010010110110111111110000111101000001111),
        None,
    );
    // check overflow to inf: 1.5 * 2^15 still fits, 1.5 * 2^16 does not
    roundtrip(
        f64::from_bits(0b0_10000001110_1000000000000000000000000000000000000000000000000000),
        Some(f16(0x7a00)),
    );
    roundtrip(
        f64::from_bits(0b0_10000001111_1000000000000000000000000000000000000000000000000000),
        None,
    );
    // check denormals and truncation: 2^-24 is one unit, halves of a unit are
    // rejected
    roundtrip(
        f64::from_bits(0b0_01111100111_0000000000000000000000000000000000000000000000000000),
        Some(f16(1)),
    );
    roundtrip(
        f64::from_bits(0b0_01111100111_1000000000000000000000000000000000000000000000000000),
        None,
    );
    roundtrip(
        f64::from_bits(0b0_01111101000_0000000000000000000000000000000000000000000000000000),
        Some(f16(2)),
    );
    roundtrip(
        f64::from_bits(0b0_01111101000_1000000000000000000000000000000000000000000000000000),
        Some(f16(3)),
    );
    roundtrip(
        f64::from_bits(0b0_01111101000_1100000000000000000000000000000000000000000000000000),
        None,
    );
    // check basic, normal and positive numbers: the top 10 mantissa bits are
    // kept, a set 11th bit is rejected
    roundtrip(
        f64::from_bits(0b0_01111111000_0000000000000000000000000000000000000000000000000000),
        Some(f16(0x2000)),
    );
    roundtrip(
        f64::from_bits(0b0_01111111000_1000000000000000000000000000000000000000000000000000),
        Some(f16(0x2200)),
    );
    roundtrip(
        f64::from_bits(0b0_01111111000_1110000000000000000000000000000000000000000000000000),
        Some(f16(0x2380)),
    );
    roundtrip(
        f64::from_bits(0b0_01111111000_1110000001000000000000000000000000000000000000000000),
        Some(f16(0x2381)),
    );
    roundtrip(
        f64::from_bits(0b0_01111111000_1110000001100000000000000000000000000000000000000000),
        None,
    );
}
2129
/// Checks `f16::max` for ordinary operands in both orders and for NaN on
/// either or both sides.
#[test]
fn test_max() {
    let zero = f16::from_f32(0.0);
    let forty_two = f16::from_f32(42.0);
    let nan = f16::NAN;

    // The larger operand wins regardless of argument order.
    assert_eq!(zero.max(forty_two), forty_two);
    assert_eq!(forty_two.max(zero), forty_two);

    // With exactly one NaN operand, the other operand is returned.
    assert_eq!(nan.max(forty_two), forty_two);
    assert_eq!(forty_two.max(nan), forty_two);

    // Two NaNs give NaN.
    assert!(nan.max(nan).is_nan());
}
2152
/// Checks `f16::min` for ordinary operands in both orders and for NaN on
/// either or both sides.
#[test]
fn test_min() {
    let zero = f16::from_f32(0.0);
    let forty_two = f16::from_f32(42.0);
    let nan = f16::NAN;

    // The smaller operand wins regardless of argument order.
    assert_eq!(zero.min(forty_two), zero);
    assert_eq!(forty_two.min(zero), zero);

    // With exactly one NaN operand, the other operand is returned.
    assert_eq!(nan.min(forty_two), forty_two);
    assert_eq!(forty_two.min(nan), forty_two);

    // Two NaNs give NaN.
    assert!(nan.min(nan).is_nan());
}
2175}
float16/binary16.rs

float16/
binary16.rs