float8/lib.rs
1//! Eight bit floating point types in Rust.
2//!
3//! This crate provides 2 types:
4//! - [`F8E4M3`]: Sign + 4-bit exponent + 3-bit mantissa. More precise but less dynamic range.
5//! - [`F8E5M2`]: Sign + 5-bit exponent + 2-bit mantissa. Less precise but more dynamic range (same exponent as [`struct@f16`]).
6//!
7//! Generally, this crate is modelled after the [`half`] crate, so it can be
8//! used alongside and with minimal code changes.
9//!
10//! # Serialization
11//!
12//! When the `serde` feature is enabled, [`F8E4M3`] and [`F8E5M2`] will be serialized as a newtype of
//! [`u8`] by default. In binary formats this is ideal, as it will generally use just one byte for
14//! storage. For string formats like JSON, however, this isn't as useful, and due to design
15//! limitations of serde, it's not possible for the default `Serialize` implementation to support
16//! different serialization for different formats.
17//!
18//! It is up to the container type of the floats to control how it is serialized. This can
19//! easily be controlled when using the derive macros using `#[serde(serialize_with="")]`
20//! attributes. For both [`F8E4M3`] and [`F8E5M2`], a `serialize_as_f32` and `serialize_as_string` are
21//! provided for use with this attribute.
22//!
23//! Deserialization of both float types supports deserializing from the default serialization,
24//! strings, and `f32`/`f64` values, so no additional work is required.
25//!
26//! # Cargo Features
27//!
28//! This crate supports a number of optional cargo features. None of these features are enabled by
29//! default, even `std`.
30//!
31//! - **`std`** — Enable features that depend on the Rust [`std`] library.
32//!
33//! - **`serde`** — Adds support for the [`serde`] crate by implementing [`Serialize`] and
34//! [`Deserialize`] traits for both [`F8E4M3`] and [`F8E5M2`].
35//!
36//! - **`num-traits`** — Adds support for the [`num-traits`] crate by implementing [`ToPrimitive`],
37//! [`FromPrimitive`], [`AsPrimitive`], [`Num`], [`Float`], [`FloatCore`], and [`Bounded`] traits
38//! for both [`F8E4M3`] and [`F8E5M2`].
39//!
40//! - **`bytemuck`** — Adds support for the [`bytemuck`] crate by implementing [`Zeroable`] and
41//! [`Pod`] traits for both [`F8E4M3`] and [`F8E5M2`].
42//!
43//! - **`zerocopy`** — Adds support for the [`zerocopy`] crate by implementing [`AsBytes`] and
44//! [`FromBytes`] traits for both [`F8E4M3`] and [`F8E5M2`].
45//!
46//! - **`rand_distr`** — Adds support for the [`rand_distr`] crate by implementing [`Distribution`]
47//! and other traits for both [`F8E4M3`] and [`F8E5M2`].
48//!
49//! - **`rkyv`** -- Enable zero-copy deserialization with [`rkyv`] crate.
50//!
51//! [`alloc`]: https://doc.rust-lang.org/alloc/
52//! [`std`]: https://doc.rust-lang.org/std/
53//! [`binary16`]: https://en.wikipedia.org/wiki/Half-precision_floating-point_format
54//! [`bfloat16`]: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
55//! [`serde`]: https://crates.io/crates/serde
56//! [`bytemuck`]: https://crates.io/crates/bytemuck
57//! [`num-traits`]: https://crates.io/crates/num-traits
58//! [`zerocopy`]: https://crates.io/crates/zerocopy
59//! [`rand_distr`]: https://crates.io/crates/rand_distr
60//! [`rkyv`]: https://crates.io/crates/rkyv
61//! [`FromBytes`]: https://docs.rs/zerocopy/latest/zerocopy/trait.FromBytes.html
62//! [`Distribution`]: https://docs.rs/rand/latest/rand/distributions/trait.Distribution.html
63//! [`AsBytes`]: https://docs.rs/zerocopy/0.6.6/zerocopy/trait.AsBytes.html
64//! [`Pod`]: https://docs.rs/bytemuck/latest/bytemuck/trait.Pod.html
65//! [`Zeroable`]: https://docs.rs/bytemuck/latest/bytemuck/trait.Zeroable.html
66//! [`Bounded`]: https://docs.rs/num-traits/latest/num_traits/bounds/trait.Bounded.html
67//! [`FloatCore`]: https://docs.rs/num-traits/latest/num_traits/float/trait.FloatCore.html
68//! [`Float`]: https://docs.rs/num-traits/latest/num_traits/float/trait.Float.html
69//! [`Num`]: https://docs.rs/num-traits/latest/num_traits/trait.Num.html
70//! [`AsPrimitive`]: https://docs.rs/num-traits/latest/num_traits/cast/trait.AsPrimitive.html
71//! [`ToPrimitive`]: https://docs.rs/num-traits/latest/num_traits/cast/trait.ToPrimitive.html
72//! [`FromPrimitive`]: https://docs.rs/num-traits/latest/num_traits/cast/trait.FromPrimitive.html
73//! [`Deserialize`]: https://docs.rs/serde/latest/serde/trait.Deserialize.html
74//! [`Serialize`]: https://docs.rs/serde/latest/serde/trait.Serialize.html
75
76#![no_std]
77
78#[cfg(feature = "num-traits")]
79mod num_traits;
80#[cfg(feature = "rand_distr")]
81mod rand_distr;
82
83use core::{
84 cmp::Ordering,
85 f64,
86 fmt::{self, Debug, Display, LowerExp, LowerHex, UpperExp, UpperHex},
87 mem,
88 num::{FpCategory, ParseFloatError},
89 ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Rem, RemAssign, Sub, SubAssign},
90 str::FromStr,
91};
92use half::f16;
93
94#[cfg(feature = "bytemuck")]
95use bytemuck::{Pod, Zeroable};
96#[cfg(feature = "serde")]
97use serde::{Deserialize, Serialize};
98#[cfg(feature = "zerocopy")]
99use zerocopy::{AsBytes, FromBytes};
100
/// Selects which 8-bit floating point layout the shared conversion helpers operate on.
#[derive(Clone, Copy, PartialEq)]
enum Kind {
    /// Sign + 4-bit exponent + 3-bit mantissa (the layout of `F8E4M3`).
    E4M3,
    /// Sign + 5-bit exponent + 2-bit mantissa (the layout of `F8E5M2`).
    E5M2,
}
106
#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq, Default)]
/// Saturation type. If `NoSat`, allow NaN and inf.
///
/// Controls what `convert_to_fp8` produces for finite or infinite inputs whose magnitude
/// exceeds the target format's overflow threshold.
enum SaturationType {
    /// Do not clamp: overflow becomes NaN (`0x7F`) for E4M3 and infinity (`0x7C`) for E5M2.
    NoSat,
    /// Clamp overflow to the format's largest finite magnitude. This is the default.
    #[default]
    SatFinite,
}
115
116// https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/blob/main/cuda_fp8.hpp?ref_type=heads#L97
117const fn convert_to_fp8(x: f64, saturate: SaturationType, fp8_interpretation: Kind) -> u8 {
118 // TODO: use x.to_bits() with MSRV 1.83
119 #[allow(unknown_lints, unnecessary_transmutes)]
120 let xbits: u64 = unsafe { mem::transmute::<f64, u64>(x) };
121
122 let (
123 fp8_maxnorm,
124 fp8_mantissa_mask,
125 fp8_exp_bias,
126 fp8_significand_bits,
127 fp8_mindenorm_o2,
128 fp8_overflow_threshold,
129 fp8_minnorm,
130 ) = match fp8_interpretation {
131 Kind::E4M3 => (
132 0x7E_u8,
133 0x7_u8,
134 7_u16,
135 4_u64,
136 0x3F50000000000000_u64,
137 0x407D000000000000_u64,
138 0x3F90000000000000_u64,
139 ),
140 Kind::E5M2 => (
141 0x7B_u8,
142 0x3_u8,
143 15_u16,
144 3_u64,
145 0x3EE0000000000000_u64,
146 0x40EE000000000000_u64 - 1,
147 0x3F10000000000000_u64,
148 ),
149 };
150
151 const DP_INF_BITS: u64 = 0x7FF0000000000000;
152 let fp8_dp_half_ulp: u64 = 1 << (53 - fp8_significand_bits - 1);
153 let sign: u8 = ((xbits >> 63) << 7) as u8;
154 let exp: u8 = ((((xbits >> 52) as u16) & 0x7FF)
155 .wrapping_sub(1023)
156 .wrapping_add(fp8_exp_bias)) as u8;
157 let mantissa: u8 = ((xbits >> (53 - fp8_significand_bits)) & (fp8_mantissa_mask as u64)) as u8;
158 let absx: u64 = xbits & 0x7FFFFFFFFFFFFFFF;
159
160 let res = if absx <= fp8_mindenorm_o2 {
161 // Zero or underflow
162 0
163 } else if absx > DP_INF_BITS {
164 // Preserve NaNs
165 match fp8_interpretation {
166 Kind::E4M3 => 0x7F,
167 Kind::E5M2 => 0x7E | mantissa,
168 }
169 } else if absx > fp8_overflow_threshold {
170 // Saturate
171 match saturate {
172 SaturationType::SatFinite => fp8_maxnorm,
173 SaturationType::NoSat => match fp8_interpretation {
174 Kind::E4M3 => 0x7F, // NaN
175 Kind::E5M2 => 0x7C, // Inf in E5M2
176 },
177 }
178 } else if absx >= fp8_minnorm {
179 // Round, normal range
180 let mut res = (exp << (fp8_significand_bits - 1)) | mantissa;
181
182 // Round off bits and round-to-nearest-even adjustment
183 let round = xbits & ((fp8_dp_half_ulp << 1) - 1);
184 if (round > fp8_dp_half_ulp) || ((round == fp8_dp_half_ulp) && (mantissa & 1 != 0)) {
185 res = res.wrapping_add(1);
186 }
187 res
188 } else {
189 // Denormal numbers
190 let shift = 1_u8.wrapping_sub(exp);
191 let mantissa = mantissa | (1 << (fp8_significand_bits - 1));
192 let mut res = mantissa >> shift;
193
194 // Round off bits and round-to-nearest-even adjustment
195 let round = (xbits | (1 << (53 - 1))) & ((fp8_dp_half_ulp << (shift as u64 + 1)) - 1);
196 if (round > (fp8_dp_half_ulp << shift as u64))
197 || ((round == (fp8_dp_half_ulp << shift as u64)) && (res & 1 != 0))
198 {
199 res = res.wrapping_add(1);
200 }
201 res
202 };
203
204 res | sign
205}
206
207// https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/blob/main/cuda_fp8.hpp?ref_type=heads#L463
208const fn convert_fp8_to_fp16(x: u8, fp8_interpretation: Kind) -> u16 {
209 let mut ur = (x as u16) << 8;
210
211 match fp8_interpretation {
212 Kind::E5M2 => {
213 if (ur & 0x7FFF) > 0x7C00 {
214 // If NaN, return canonical NaN
215 ur = 0x7FFF;
216 }
217 }
218 Kind::E4M3 => {
219 let sign = ur & 0x8000;
220 let mut exponent = ((ur & 0x7800) >> 1).wrapping_add(0x2000);
221 let mut mantissa = (ur & 0x0700) >> 1;
222 let absx = 0x7F & x;
223
224 if absx == 0x7F {
225 // FP16 canonical NaN, discard sign
226 ur = 0x7FFF;
227 } else if exponent == 0x2000 {
228 // Zero or denormal
229 if mantissa != 0 {
230 // Normalize
231 mantissa <<= 1;
232 while (mantissa & 0x0400) == 0 {
233 mantissa <<= 1;
234 exponent = exponent.wrapping_sub(0x0400);
235 }
236 // Discard implicit leading bit
237 mantissa &= 0x03FF;
238 } else {
239 // Zero
240 exponent = 0;
241 }
242 ur = sign | exponent | mantissa;
243 } else {
244 ur = sign | exponent | mantissa;
245 }
246 }
247 };
248
249 ur
250}
251
#[derive(Clone, Copy, Default)]
#[cfg_attr(feature = "serde", derive(Serialize))]
#[cfg_attr(
    feature = "rkyv",
    derive(rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)
)]
#[cfg_attr(feature = "rkyv", archive(resolver = "F8E4M3Resolver"))]
#[cfg_attr(feature = "bytemuck", derive(Zeroable, Pod))]
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
#[repr(transparent)]
/// Eight bit floating point type with 4-bit exponent and 3-bit mantissa.
///
/// The value is a transparent wrapper around its raw byte: bit 7 is the sign, bits 6..=3
/// the exponent and bits 2..=0 the mantissa. The derived `Default` is the all-zero bit
/// pattern, i.e. positive zero.
pub struct F8E4M3(u8);
264
265impl F8E4M3 {
    /// Layout tag passed to the shared conversion helpers for this type.
    const INTERPRETATION: Kind = Kind::E4M3;
267
268 /// Construct an 8-bit floating point value from the raw bits.
269 pub const fn from_bits(bits: u8) -> Self {
270 Self(bits)
271 }
272
273 /// Return the raw bits.
274 pub const fn to_bits(&self) -> u8 {
275 self.0
276 }
277
278 /// Convert a [`prim@f64`] type into [`F8E4M3`].
279 ///
280 /// This operation is lossy.
281 ///
    /// - If the 64-bit value is too large to fit in 8 bits (including ±∞), it saturates to the
    ///   largest finite value with the same sign.
283 /// - NaN values are preserved.
284 /// - 64-bit subnormal values are too tiny to be represented in 8-bits and result in ±0.
285 /// - Exponents that underflow the minimum 8-bit exponent will result in 8-bit subnormals or ±0.
286 /// - All other values are truncated and rounded to the nearest representable 8-bit value.
287 pub const fn from_f64(x: f64) -> Self {
288 Self(convert_to_fp8(
289 x,
290 SaturationType::SatFinite,
291 Self::INTERPRETATION,
292 ))
293 }
294
295 /// Convert a [`f32`] type into [`F8E4M3`].
296 ///
297 /// This operation is lossy.
298 ///
    /// - If the 32-bit value is too large to fit in 8 bits (including ±∞), it saturates to the
    ///   largest finite value with the same sign.
300 /// - NaN values are preserved.
301 /// - 32-bit subnormal values are too tiny to be represented in 8-bits and result in ±0.
302 /// - Exponents that underflow the minimum 8-bit exponent will result in 8-bit subnormals or ±0.
303 /// - All other values are truncated and rounded to the nearest representable 8-bit value.
304 pub const fn from_f32(x: f32) -> Self {
305 Self::from_f64(x as f64)
306 }
307
308 /// Convert this [`F8E4M3`] type into a [`struct@f16`] type.
309 ///
310 /// This operation may be lossy.
311 ///
312 /// - NaN and zero values are preserved.
313 /// - Subnormal values are normalized.
314 /// - Otherwise, the values are mapped to the appropriate 16-bit value.
315 pub const fn to_f16(&self) -> f16 {
316 f16::from_bits(convert_fp8_to_fp16(self.0, Self::INTERPRETATION))
317 }
318
319 /// Convert this [`F8E4M3`] type into a [`f32`] type.
320 ///
321 /// This operation may be lossy.
322 ///
323 /// - NaN and zero values are preserved.
324 /// - Subnormal values are normalized.
    /// - Otherwise, the values are mapped to the appropriate 32-bit value.
326 pub const fn to_f32(&self) -> f32 {
327 self.to_f16().to_f32_const()
328 }
329
330 /// Convert this [`F8E4M3`] type into a [`prim@f64`] type.
331 ///
332 /// This operation may be lossy.
333 ///
334 /// - NaN and zero values are preserved.
335 /// - Subnormal values are normalized.
    /// - Otherwise, the values are mapped to the appropriate 64-bit value.
337 pub const fn to_f64(&self) -> f64 {
338 self.to_f16().to_f64_const()
339 }
340
341 /// Returns the ordering between `self` and `other`.
342 ///
343 /// - negative quiet NaN
344 /// - negative signaling NaN
345 /// - negative infinity
346 /// - negative numbers
347 /// - negative subnormal numbers
348 /// - negative zero
349 /// - positive zero
350 /// - positive subnormal numbers
351 /// - positive numbers
352 /// - positive infinity
353 /// - positive signaling NaN
354 /// - positive quiet NaN.
355 ///
356 /// The ordering established by this function does not always agree with the
357 /// [`PartialOrd`] and [`PartialEq`] implementations. For example,
358 /// they consider negative and positive zero equal, while `total_cmp`
359 /// doesn't.
360 ///
361 /// # Example
362 /// ```
363 /// # use float8::F8E4M3;
364 ///
365 /// let mut v: Vec<F8E4M3> = vec![];
366 /// v.push(F8E4M3::ONE);
367 /// v.push(F8E4M3::INFINITY);
368 /// v.push(F8E4M3::NEG_INFINITY);
369 /// v.push(F8E4M3::NAN);
370 /// v.push(F8E4M3::MAX_SUBNORMAL);
371 /// v.push(-F8E4M3::MAX_SUBNORMAL);
372 /// v.push(F8E4M3::ZERO);
373 /// v.push(F8E4M3::NEG_ZERO);
374 /// v.push(F8E4M3::NEG_ONE);
375 /// v.push(F8E4M3::MIN_POSITIVE);
376 ///
377 /// v.sort_by(|a, b| a.total_cmp(&b));
378 ///
379 /// assert!(v
380 /// .into_iter()
381 /// .zip(
382 /// [
383 /// F8E4M3::NEG_INFINITY,
384 /// F8E4M3::NEG_ONE,
385 /// -F8E4M3::MAX_SUBNORMAL,
386 /// F8E4M3::NEG_ZERO,
387 /// F8E4M3::ZERO,
388 /// F8E4M3::MAX_SUBNORMAL,
389 /// F8E4M3::MIN_POSITIVE,
390 /// F8E4M3::ONE,
391 /// F8E4M3::INFINITY,
392 /// F8E4M3::NAN
393 /// ]
394 /// .iter()
395 /// )
396 /// .all(|(a, b)| a.to_bits() == b.to_bits()));
397 /// ```
398 pub fn total_cmp(&self, other: &Self) -> Ordering {
399 let mut left = self.to_bits() as i8;
400 let mut right = other.to_bits() as i8;
401 left ^= (((left >> 7) as u8) >> 1) as i8;
402 right ^= (((right >> 7) as u8) >> 1) as i8;
403 left.cmp(&right)
404 }
405
406 /// Returns `true` if and only if `self` has a positive sign, including +0.0, NaNs with a
407 /// positive sign bit and +∞.
408 pub const fn is_sign_positive(&self) -> bool {
409 self.0 & 0x80u8 == 0
410 }
411
412 /// Returns `true` if and only if `self` has a negative sign, including −0.0, NaNs with a
413 /// negative sign bit and −∞.
414 pub const fn is_sign_negative(&self) -> bool {
415 self.0 & 0x80u8 != 0
416 }
417
418 /// Returns `true` if this value is NaN and `false` otherwise.
419 ///
420 /// # Examples
421 ///
422 /// ```rust
423 /// # use float8::*;
424 ///
425 /// let nan = F8E4M3::NAN;
426 /// let f = F8E4M3::from_f32(7.0_f32);
427 ///
428 /// assert!(nan.is_nan());
429 /// assert!(!f.is_nan());
430 /// ```
431 pub const fn is_nan(&self) -> bool {
432 self.0 == 0x7Fu8 || self.0 == 0xFFu8
433 }
434
435 /// Returns `true` if this value is ±∞ and `false` otherwise.
436 ///
437 /// # Examples
438 ///
439 /// ```rust
440 /// # use float8::*;
441 ///
442 /// let f = F8E4M3::from_f32(7.0f32);
443 /// let inf = F8E4M3::INFINITY;
444 /// let neg_inf = F8E4M3::NEG_INFINITY;
445 /// let nan = F8E4M3::NAN;
446 ///
447 /// assert!(!f.is_infinite());
448 /// assert!(!nan.is_infinite());
449 ///
450 /// assert!(inf.is_infinite());
451 /// assert!(neg_inf.is_infinite());
452 /// ```
453 pub const fn is_infinite(&self) -> bool {
454 self.0 & 0x7Fu8 == 0x7Eu8
455 }
456
457 /// Returns true if this number is neither infinite nor NaN.
458 ///
459 /// # Examples
460 ///
461 /// ```rust
462 /// # use float8::*;
463 ///
464 /// let f = F8E4M3::from_f32(7.0f32);
465 /// let inf = F8E4M3::INFINITY;
466 /// let neg_inf = F8E4M3::NEG_INFINITY;
467 /// let nan = F8E4M3::NAN;
468 ///
469 /// assert!(f.is_finite());
470 ///
471 /// assert!(!nan.is_finite());
472 /// assert!(!inf.is_finite());
473 /// assert!(!neg_inf.is_finite());
474 /// ```
475 pub const fn is_finite(&self) -> bool {
476 !(self.is_infinite() || self.is_nan())
477 }
478
479 /// Returns `true` if the number is neither zero, infinite, subnormal, or `NaN` and `false` otherwise.
480 ///
481 /// # Examples
482 ///
483 /// ```rust
484 /// # use float8::*;
485 ///
486 /// let min = F8E4M3::MIN_POSITIVE;
487 /// let max = F8E4M3::MAX;
488 /// let lower_than_min = F8E4M3::from_f32(1.0e-10_f32);
489 /// let zero = F8E4M3::from_f32(0.0_f32);
490 ///
491 /// assert!(min.is_normal());
492 /// assert!(max.is_normal());
493 ///
494 /// assert!(!zero.is_normal());
495 /// assert!(!F8E4M3::NAN.is_normal());
496 /// assert!(!F8E4M3::INFINITY.is_normal());
497 /// // Values between `0` and `min` are Subnormal.
498 /// assert!(!lower_than_min.is_normal());
499 /// ```
500 pub const fn is_normal(&self) -> bool {
501 #[allow(clippy::unusual_byte_groupings)]
502 let exp = self.0 & 0b0_1111_000;
503 exp != 0 && self.is_finite()
504 }
505
506 /// Returns the minimum of the two numbers.
507 ///
508 /// If one of the arguments is NaN, then the other argument is returned.
509 ///
510 /// # Examples
511 ///
512 /// ```
513 /// # use float8::*;
514 /// let x = F8E4M3::from_f32(1.0);
515 /// let y = F8E4M3::from_f32(2.0);
516 ///
517 /// assert_eq!(x.min(y), x);
518 /// ```
519 pub fn min(self, other: Self) -> Self {
520 if other < self && !other.is_nan() {
521 other
522 } else {
523 self
524 }
525 }
526
527 /// Returns the minimum of the two numbers.
528 ///
529 /// If one of the arguments is NaN, then the other argument is returned.
530 ///
531 /// # Examples
532 ///
533 /// ```
534 /// # use float8::*;
535 /// let x = F8E4M3::from_f32(1.0);
536 /// let y = F8E4M3::from_f32(2.0);
537 ///
538 /// assert_eq!(x.min(y), x);
539 /// ```
540 pub fn max(self, other: Self) -> Self {
541 if other > self && !other.is_nan() {
542 other
543 } else {
544 self
545 }
546 }
547
548 /// Restrict a value to a certain interval unless it is NaN.
549 ///
550 /// Returns `max` if `self` is greater than `max`, and `min` if `self` is less than `min`.
551 /// Otherwise this returns `self`.
552 ///
553 /// Note that this function returns NaN if the initial value was NaN as well.
554 ///
555 /// # Panics
556 /// Panics if `min > max`, `min` is NaN, or `max` is NaN.
557 ///
558 /// # Examples
559 ///
560 /// ```
561 /// # use float8::*;
562 /// assert!(F8E4M3::from_f32(-3.0).clamp(F8E4M3::from_f32(-2.0), F8E4M3::from_f32(1.0)) == F8E4M3::from_f32(-2.0));
563 /// assert!(F8E4M3::from_f32(0.0).clamp(F8E4M3::from_f32(-2.0), F8E4M3::from_f32(1.0)) == F8E4M3::from_f32(0.0));
564 /// assert!(F8E4M3::from_f32(2.0).clamp(F8E4M3::from_f32(-2.0), F8E4M3::from_f32(1.0)) == F8E4M3::from_f32(1.0));
565 /// assert!(F8E4M3::NAN.clamp(F8E4M3::from_f32(-2.0), F8E4M3::from_f32(1.0)).is_nan());
566 /// ```
567 pub fn clamp(self, min: Self, max: Self) -> Self {
568 assert!(min <= max);
569 let mut x = self;
570 if x < min {
571 x = min;
572 }
573 if x > max {
574 x = max;
575 }
576 x
577 }
578
579 /// Returns a number composed of the magnitude of `self` and the sign of `sign`.
580 ///
581 /// Equal to `self` if the sign of `self` and `sign` are the same, otherwise equal to `-self`.
582 /// If `self` is NaN, then NaN with the sign of `sign` is returned.
583 ///
584 /// # Examples
585 ///
586 /// ```
587 /// # use float8::*;
588 /// let f = F8E4M3::from_f32(3.5);
589 ///
590 /// assert_eq!(f.copysign(F8E4M3::from_f32(0.42)), F8E4M3::from_f32(3.5));
591 /// assert_eq!(f.copysign(F8E4M3::from_f32(-0.42)), F8E4M3::from_f32(-3.5));
592 /// assert_eq!((-f).copysign(F8E4M3::from_f32(0.42)), F8E4M3::from_f32(3.5));
593 /// assert_eq!((-f).copysign(F8E4M3::from_f32(-0.42)), F8E4M3::from_f32(-3.5));
594 ///
595 /// assert!(F8E4M3::NAN.copysign(F8E4M3::from_f32(1.0)).is_nan());
596 /// ```
597 pub const fn copysign(self, sign: Self) -> Self {
598 Self((sign.0 & 0x80u8) | (self.0 & 0x7Fu8))
599 }
600
601 /// Returns a number that represents the sign of `self`.
602 ///
603 /// * `1.0` if the number is positive, `+0.0` or [`INFINITY`][Self::INFINITY]
604 /// * `-1.0` if the number is negative, `-0.0` or [`NEG_INFINITY`][Self::NEG_INFINITY]
605 /// * [`NAN`][Self::NAN] if the number is `NaN`
606 ///
607 /// # Examples
608 ///
609 /// ```rust
610 /// # use float8::*;
611 ///
612 /// let f = F8E4M3::from_f32(3.5_f32);
613 ///
614 /// assert_eq!(f.signum(), F8E4M3::from_f32(1.0));
615 /// assert_eq!(F8E4M3::NEG_INFINITY.signum(), F8E4M3::from_f32(-1.0));
616 ///
617 /// assert!(F8E4M3::NAN.signum().is_nan());
618 /// ```
619 pub const fn signum(self) -> Self {
620 if self.is_nan() {
621 self
622 } else if self.0 & 0x80u8 != 0 {
623 Self::NEG_ONE
624 } else {
625 Self::ONE
626 }
627 }
628
629 /// Returns the floating point category of the number.
630 ///
631 /// If only one property is going to be tested, it is generally faster to use the specific
632 /// predicate instead.
633 ///
634 /// # Examples
635 ///
636 /// ```rust
637 /// use std::num::FpCategory;
638 /// # use float8::*;
639 ///
640 /// let num = F8E4M3::from_f32(12.4_f32);
641 /// let inf = F8E4M3::INFINITY;
642 ///
643 /// assert_eq!(num.classify(), FpCategory::Normal);
644 /// assert_eq!(inf.classify(), FpCategory::Infinite);
645 /// ```
646 pub const fn classify(&self) -> FpCategory {
647 if self.is_infinite() {
648 FpCategory::Infinite
649 } else if !self.is_normal() {
650 FpCategory::Subnormal
651 } else if self.is_nan() {
652 FpCategory::Nan
653 } else if self.0 & 0x7Fu8 == 0 {
654 FpCategory::Zero
655 } else {
656 FpCategory::Normal
657 }
658 }
659}
660
/// Serde visitor that builds an [`F8E4M3`] from a newtype struct, a string, or an
/// `f32`/`f64` value.
#[cfg(feature = "serde")]
struct VisitorF8E4M3;
663
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for F8E4M3 {
    /// Deserializes an [`F8E4M3`] by requesting a newtype struct and delegating every
    /// accepted input shape to [`VisitorF8E4M3`].
    fn deserialize<D>(deserializer: D) -> Result<F8E4M3, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        let visitor = VisitorF8E4M3;
        deserializer.deserialize_newtype_struct("f8e4m3", visitor)
    }
}
673
#[cfg(feature = "serde")]
impl<'de> serde::de::Visitor<'de> for VisitorF8E4M3 {
    type Value = F8E4M3;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str("tuple struct f8e4m3")
    }

    /// Default representation: a newtype wrapping the raw `u8` bits.
    fn visit_newtype_struct<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let bits = <u8 as Deserialize>::deserialize(deserializer)?;
        Ok(F8E4M3::from_bits(bits))
    }

    /// String representation: parsed through the type's `FromStr` implementation.
    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        match v.parse::<F8E4M3>() {
            Ok(value) => Ok(value),
            Err(_) => Err(E::invalid_value(
                serde::de::Unexpected::Str(v),
                &"a float string",
            )),
        }
    }

    /// Numeric representation: rounded from `f32`.
    fn visit_f32<E>(self, v: f32) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        let value = F8E4M3::from_f32(v);
        Ok(value)
    }

    /// Numeric representation: rounded from `f64`.
    fn visit_f64<E>(self, v: f64) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        let value = F8E4M3::from_f64(v);
        Ok(value)
    }
}
712
#[derive(Clone, Copy, Default)]
#[cfg_attr(feature = "serde", derive(Serialize))]
#[cfg_attr(
    feature = "rkyv",
    derive(rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)
)]
#[cfg_attr(feature = "rkyv", archive(resolver = "F8E5M2Resolver"))]
#[cfg_attr(feature = "bytemuck", derive(Zeroable, Pod))]
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
#[repr(transparent)]
/// Eight bit floating point type with 5-bit exponent and 2-bit mantissa.
///
/// The value is a transparent wrapper around its raw byte: bit 7 is the sign, bits 6..=2
/// the exponent and bits 1..=0 the mantissa. The derived `Default` is the all-zero bit
/// pattern, i.e. positive zero.
pub struct F8E5M2(u8);
725
726impl F8E5M2 {
    /// Layout tag passed to the shared conversion helpers for this type.
    const INTERPRETATION: Kind = Kind::E5M2;
728
729 /// Construct an 8-bit floating point value from the raw bits.
730 pub const fn from_bits(bits: u8) -> Self {
731 Self(bits)
732 }
733
734 /// Return the raw bits.
735 pub const fn to_bits(&self) -> u8 {
736 self.0
737 }
738
739 /// Convert a [`prim@f64`] type into [`F8E5M2`].
740 ///
741 /// This operation is lossy.
742 ///
    /// - If the 64-bit value is too large to fit in 8 bits (including ±∞), it saturates to the
    ///   largest finite value with the same sign.
744 /// - NaN values are preserved.
745 /// - 64-bit subnormal values are too tiny to be represented in 8-bits and result in ±0.
746 /// - Exponents that underflow the minimum 8-bit exponent will result in 8-bit subnormals or ±0.
747 /// - All other values are truncated and rounded to the nearest representable 8-bit value.
748 pub const fn from_f64(x: f64) -> Self {
749 Self(convert_to_fp8(
750 x,
751 SaturationType::SatFinite,
752 Self::INTERPRETATION,
753 ))
754 }
755
756 /// Convert a [`f32`] type into [`F8E5M2`].
757 ///
758 /// This operation is lossy.
759 ///
    /// - If the 32-bit value is too large to fit in 8 bits (including ±∞), it saturates to the
    ///   largest finite value with the same sign.
761 /// - NaN values are preserved.
762 /// - 32-bit subnormal values are too tiny to be represented in 8-bits and result in ±0.
763 /// - Exponents that underflow the minimum 8-bit exponent will result in 8-bit subnormals or ±0.
764 /// - All other values are truncated and rounded to the nearest representable 8-bit value.
765 pub const fn from_f32(x: f32) -> Self {
766 Self::from_f64(x as f64)
767 }
768
769 /// Convert this [`F8E5M2`] type into a [`struct@f16`] type.
770 ///
771 /// This operation may be lossy.
772 ///
773 /// - NaN and zero values are preserved.
774 /// - Subnormal values are normalized.
775 /// - Otherwise, the values are mapped to the appropriate 16-bit value.
776 pub const fn to_f16(&self) -> f16 {
777 f16::from_bits(convert_fp8_to_fp16(self.0, Self::INTERPRETATION))
778 }
779
780 /// Convert this [`F8E5M2`] type into a [`prim@f32`] type.
781 ///
782 /// This operation may be lossy.
783 ///
784 /// - NaN and zero values are preserved.
785 /// - Subnormal values are normalized.
    /// - Otherwise, the values are mapped to the appropriate 32-bit value.
787 pub const fn to_f32(&self) -> f32 {
788 self.to_f16().to_f32_const()
789 }
790
791 /// Convert this [`F8E5M2`] type into a [`prim@f64`] type.
792 ///
793 /// This operation may be lossy.
794 ///
795 /// - NaN and zero values are preserved.
796 /// - Subnormal values are normalized.
    /// - Otherwise, the values are mapped to the appropriate 64-bit value.
798 pub const fn to_f64(&self) -> f64 {
799 self.to_f16().to_f64_const()
800 }
801
802 /// Returns the ordering between `self` and `other`.
803 ///
804 /// - negative quiet NaN
805 /// - negative signaling NaN
806 /// - negative infinity
807 /// - negative numbers
808 /// - negative subnormal numbers
809 /// - negative zero
810 /// - positive zero
811 /// - positive subnormal numbers
812 /// - positive numbers
813 /// - positive infinity
814 /// - positive signaling NaN
815 /// - positive quiet NaN.
816 ///
817 /// The ordering established by this function does not always agree with the
818 /// [`PartialOrd`] and [`PartialEq`] implementations. For example,
819 /// they consider negative and positive zero equal, while `total_cmp`
820 /// doesn't.
821 ///
822 /// # Example
823 /// ```
824 /// # use float8::F8E5M2;
825 ///
826 /// let mut v: Vec<F8E5M2> = vec![];
827 /// v.push(F8E5M2::ONE);
828 /// v.push(F8E5M2::INFINITY);
829 /// v.push(F8E5M2::NEG_INFINITY);
830 /// v.push(F8E5M2::NAN);
831 /// v.push(F8E5M2::MAX_SUBNORMAL);
832 /// v.push(-F8E5M2::MAX_SUBNORMAL);
833 /// v.push(F8E5M2::ZERO);
834 /// v.push(F8E5M2::NEG_ZERO);
835 /// v.push(F8E5M2::NEG_ONE);
836 /// v.push(F8E5M2::MIN_POSITIVE);
837 ///
838 /// v.sort_by(|a, b| a.total_cmp(&b));
839 ///
840 /// assert!(v
841 /// .into_iter()
842 /// .zip(
843 /// [
844 /// F8E5M2::NEG_INFINITY,
845 /// F8E5M2::NEG_ONE,
846 /// -F8E5M2::MAX_SUBNORMAL,
847 /// F8E5M2::NEG_ZERO,
848 /// F8E5M2::ZERO,
849 /// F8E5M2::MAX_SUBNORMAL,
850 /// F8E5M2::MIN_POSITIVE,
851 /// F8E5M2::ONE,
852 /// F8E5M2::INFINITY,
853 /// F8E5M2::NAN
854 /// ]
855 /// .iter()
856 /// )
857 /// .all(|(a, b)| a.to_bits() == b.to_bits()));
858 /// ```
859 pub fn total_cmp(&self, other: &Self) -> Ordering {
860 let mut left = self.to_bits() as i8;
861 let mut right = other.to_bits() as i8;
862 left ^= (((left >> 7) as u8) >> 1) as i8;
863 right ^= (((right >> 7) as u8) >> 1) as i8;
864 left.cmp(&right)
865 }
866
867 /// Returns `true` if and only if `self` has a positive sign, including +0.0, NaNs with a
868 /// positive sign bit and +∞.
869 pub const fn is_sign_positive(&self) -> bool {
870 self.0 & 0x80u8 == 0
871 }
872
873 /// Returns `true` if and only if `self` has a negative sign, including −0.0, NaNs with a
874 /// negative sign bit and −∞.
875 pub const fn is_sign_negative(&self) -> bool {
876 self.0 & 0x80u8 != 0
877 }
878
879 /// Returns `true` if this value is NaN and `false` otherwise.
880 ///
881 /// # Examples
882 ///
883 /// ```rust
884 /// # use float8::*;
885 ///
886 /// let nan = F8E5M2::NAN;
887 /// let f = F8E5M2::from_f32(7.0_f32);
888 ///
889 /// assert!(nan.is_nan());
890 /// assert!(!f.is_nan());
891 /// ```
892 pub const fn is_nan(&self) -> bool {
893 self.0 == 0x7Eu8 || self.0 == 0xFEu8
894 }
895
896 /// Returns `true` if this value is ±∞ and `false` otherwise.
897 ///
898 /// # Examples
899 ///
900 /// ```rust
901 /// # use float8::*;
902 ///
903 /// let f = F8E5M2::from_f32(7.0f32);
904 /// let inf = F8E5M2::INFINITY;
905 /// let neg_inf = F8E5M2::NEG_INFINITY;
906 /// let nan = F8E5M2::NAN;
907 ///
908 /// assert!(!f.is_infinite());
909 /// assert!(!nan.is_infinite());
910 ///
911 /// assert!(inf.is_infinite());
912 /// assert!(neg_inf.is_infinite());
913 /// ```
914 pub const fn is_infinite(&self) -> bool {
915 self.0 & 0x7Fu8 == 0x7Bu8
916 }
917
918 /// Returns true if this number is neither infinite nor NaN.
919 ///
920 /// # Examples
921 ///
922 /// ```rust
923 /// # use float8::*;
924 ///
925 /// let f = F8E5M2::from_f32(7.0f32);
926 /// let inf = F8E5M2::INFINITY;
927 /// let neg_inf = F8E5M2::NEG_INFINITY;
928 /// let nan = F8E5M2::NAN;
929 ///
930 /// assert!(f.is_finite());
931 ///
932 /// assert!(!nan.is_finite());
933 /// assert!(!inf.is_finite());
934 /// assert!(!neg_inf.is_finite());
935 /// ```
936 pub const fn is_finite(&self) -> bool {
937 !(self.is_infinite() || self.is_nan())
938 }
939
940 /// Returns `true` if the number is neither zero, infinite, subnormal, or `NaN` and `false` otherwise.
941 ///
942 /// # Examples
943 ///
944 /// ```rust
945 /// # use float8::*;
946 ///
947 /// let min = F8E5M2::MIN_POSITIVE;
948 /// let max = F8E5M2::MAX;
949 /// let lower_than_min = F8E5M2::from_f32(1.0e-10_f32);
950 /// let zero = F8E5M2::from_f32(0.0_f32);
951 ///
952 /// assert!(min.is_normal());
953 /// assert!(max.is_normal());
954 ///
955 /// assert!(!zero.is_normal());
956 /// assert!(!F8E5M2::NAN.is_normal());
957 /// assert!(!F8E5M2::INFINITY.is_normal());
958 /// // Values between `0` and `min` are Subnormal.
959 /// assert!(!lower_than_min.is_normal());
960 /// ```
961 pub const fn is_normal(&self) -> bool {
962 #[allow(clippy::unusual_byte_groupings)]
963 let exp = self.0 & 0b0_11111_00;
964 exp != 0 && self.is_finite()
965 }
966
967 /// Returns the minimum of the two numbers.
968 ///
969 /// If one of the arguments is NaN, then the other argument is returned.
970 ///
971 /// # Examples
972 ///
973 /// ```
974 /// # use float8::*;
975 /// let x = F8E5M2::from_f32(1.0);
976 /// let y = F8E5M2::from_f32(2.0);
977 ///
978 /// assert_eq!(x.min(y), x);
979 /// ```
980 pub fn min(self, other: Self) -> Self {
981 if other < self && !other.is_nan() {
982 other
983 } else {
984 self
985 }
986 }
987
988 /// Returns the minimum of the two numbers.
989 ///
990 /// If one of the arguments is NaN, then the other argument is returned.
991 ///
992 /// # Examples
993 ///
994 /// ```
995 /// # use float8::*;
996 /// let x = F8E5M2::from_f32(1.0);
997 /// let y = F8E5M2::from_f32(2.0);
998 ///
999 /// assert_eq!(x.min(y), x);
1000 /// ```
1001 pub fn max(self, other: Self) -> Self {
1002 if other > self && !other.is_nan() {
1003 other
1004 } else {
1005 self
1006 }
1007 }
1008
1009 /// Restrict a value to a certain interval unless it is NaN.
1010 ///
1011 /// Returns `max` if `self` is greater than `max`, and `min` if `self` is less than `min`.
1012 /// Otherwise this returns `self`.
1013 ///
1014 /// Note that this function returns NaN if the initial value was NaN as well.
1015 ///
1016 /// # Panics
1017 /// Panics if `min > max`, `min` is NaN, or `max` is NaN.
1018 ///
1019 /// # Examples
1020 ///
1021 /// ```
1022 /// # use float8::*;
1023 /// assert!(F8E5M2::from_f32(-3.0).clamp(F8E5M2::from_f32(-2.0), F8E5M2::from_f32(1.0)) == F8E5M2::from_f32(-2.0));
1024 /// assert!(F8E5M2::from_f32(0.0).clamp(F8E5M2::from_f32(-2.0), F8E5M2::from_f32(1.0)) == F8E5M2::from_f32(0.0));
1025 /// assert!(F8E5M2::from_f32(2.0).clamp(F8E5M2::from_f32(-2.0), F8E5M2::from_f32(1.0)) == F8E5M2::from_f32(1.0));
1026 /// assert!(F8E5M2::NAN.clamp(F8E5M2::from_f32(-2.0), F8E5M2::from_f32(1.0)).is_nan());
1027 /// ```
1028 pub fn clamp(self, min: Self, max: Self) -> Self {
1029 assert!(min <= max);
1030 let mut x = self;
1031 if x < min {
1032 x = min;
1033 }
1034 if x > max {
1035 x = max;
1036 }
1037 x
1038 }
1039
1040 /// Returns a number composed of the magnitude of `self` and the sign of `sign`.
1041 ///
1042 /// Equal to `self` if the sign of `self` and `sign` are the same, otherwise equal to `-self`.
1043 /// If `self` is NaN, then NaN with the sign of `sign` is returned.
1044 ///
1045 /// # Examples
1046 ///
1047 /// ```
1048 /// # use float8::*;
1049 /// let f = F8E5M2::from_f32(3.5);
1050 ///
1051 /// assert_eq!(f.copysign(F8E5M2::from_f32(0.42)), F8E5M2::from_f32(3.5));
1052 /// assert_eq!(f.copysign(F8E5M2::from_f32(-0.42)), F8E5M2::from_f32(-3.5));
1053 /// assert_eq!((-f).copysign(F8E5M2::from_f32(0.42)), F8E5M2::from_f32(3.5));
1054 /// assert_eq!((-f).copysign(F8E5M2::from_f32(-0.42)), F8E5M2::from_f32(-3.5));
1055 ///
1056 /// assert!(F8E5M2::NAN.copysign(F8E5M2::from_f32(1.0)).is_nan());
1057 /// ```
1058 pub const fn copysign(self, sign: Self) -> Self {
1059 Self((sign.0 & 0x80u8) | (self.0 & 0x7Fu8))
1060 }
1061
1062 /// Returns a number that represents the sign of `self`.
1063 ///
1064 /// * `1.0` if the number is positive, `+0.0` or [`INFINITY`][Self::INFINITY]
1065 /// * `-1.0` if the number is negative, `-0.0` or [`NEG_INFINITY`][Self::NEG_INFINITY]
1066 /// * [`NAN`][Self::NAN] if the number is `NaN`
1067 ///
1068 /// # Examples
1069 ///
1070 /// ```rust
1071 /// # use float8::*;
1072 ///
1073 /// let f = F8E5M2::from_f32(3.5_f32);
1074 ///
1075 /// assert_eq!(f.signum(), F8E5M2::from_f32(1.0));
1076 /// assert_eq!(F8E5M2::NEG_INFINITY.signum(), F8E5M2::from_f32(-1.0));
1077 ///
1078 /// assert!(F8E5M2::NAN.signum().is_nan());
1079 /// ```
1080 pub const fn signum(self) -> Self {
1081 if self.is_nan() {
1082 self
1083 } else if self.0 & 0x80u8 != 0 {
1084 Self::NEG_ONE
1085 } else {
1086 Self::ONE
1087 }
1088 }
1089
1090 /// Returns the floating point category of the number.
1091 ///
1092 /// If only one property is going to be tested, it is generally faster to use the specific
1093 /// predicate instead.
1094 ///
1095 /// # Examples
1096 ///
1097 /// ```rust
1098 /// use std::num::FpCategory;
1099 /// # use float8::*;
1100 ///
1101 /// let num = F8E5M2::from_f32(12.4_f32);
1102 /// let inf = F8E5M2::INFINITY;
1103 ///
1104 /// assert_eq!(num.classify(), FpCategory::Normal);
1105 /// assert_eq!(inf.classify(), FpCategory::Infinite);
1106 /// ```
1107 pub const fn classify(&self) -> FpCategory {
1108 if self.is_infinite() {
1109 FpCategory::Infinite
1110 } else if !self.is_normal() {
1111 FpCategory::Subnormal
1112 } else if self.is_nan() {
1113 FpCategory::Nan
1114 } else if self.0 & 0x7Fu8 == 0 {
1115 FpCategory::Zero
1116 } else {
1117 FpCategory::Normal
1118 }
1119 }
1120}
1121
/// Serde visitor used by the [`Deserialize`] implementation of [`F8E5M2`].
#[cfg(feature = "serde")]
struct VisitorF8E5M2;

#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for F8E5M2 {
    /// Requests a newtype struct named `f8e5m2` from the deserializer and lets
    /// `VisitorF8E5M2` build the value.
    fn deserialize<D: serde::de::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
        deserializer.deserialize_newtype_struct("f8e5m2", VisitorF8E5M2)
    }
}
1134
#[cfg(feature = "serde")]
impl<'de> serde::de::Visitor<'de> for VisitorF8E5M2 {
    type Value = F8E5M2;

    /// Describes the expected input for serde error messages.
    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str("tuple struct f8e5m2")
    }

    /// Reads the wrapped `u8` and uses it as the raw bit pattern.
    fn visit_newtype_struct<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let bits = <u8 as Deserialize>::deserialize(deserializer)?;
        Ok(F8E5M2(bits))
    }

    /// Parses the value from a string through the type's `FromStr` implementation.
    fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<Self::Value, E> {
        match v.parse::<F8E5M2>() {
            Ok(parsed) => Ok(parsed),
            Err(_) => Err(E::invalid_value(
                serde::de::Unexpected::Str(v),
                &"a float string",
            )),
        }
    }

    /// Converts an `f32` with `F8E5M2::from_f32`.
    fn visit_f32<E: serde::de::Error>(self, v: f32) -> Result<Self::Value, E> {
        let converted = F8E5M2::from_f32(v);
        Ok(converted)
    }

    /// Converts an `f64` with `F8E5M2::from_f64`.
    fn visit_f64<E: serde::de::Error>(self, v: f64) -> Result<Self::Value, E> {
        let converted = F8E5M2::from_f64(v);
        Ok(converted)
    }
}
1173
// Implements `PartialEq` and `PartialOrd` for an 8-bit float newtype directly on its bits.
//
// The encoding is sign-magnitude: bit 7 is the sign and the low seven bits grow with the
// magnitude. NaN never compares equal or ordered, and `+0.0` equals `-0.0`.
macro_rules! comparison {
    ($t:ident) => {
        impl PartialEq for $t {
            fn eq(&self, other: &Self) -> bool {
                if self.is_nan() || other.is_nan() {
                    return false;
                }
                // Both operands are zero when no magnitude bit is set in either of them.
                let both_zero = (self.0 | other.0) & 0x7Fu8 == 0;
                both_zero || self.0 == other.0
            }
        }

        impl PartialOrd for $t {
            fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
                if self.is_nan() || other.is_nan() {
                    return None;
                }
                let lhs_neg = self.0 & 0x80u8 != 0;
                let rhs_neg = other.0 & 0x80u8 != 0;
                let ordering = if lhs_neg == rhs_neg {
                    if lhs_neg {
                        // Both negative: the larger bit pattern is the smaller value.
                        other.0.cmp(&self.0)
                    } else {
                        self.0.cmp(&other.0)
                    }
                } else if (self.0 | other.0) & 0x7Fu8 == 0 {
                    // Zeros of opposite sign are equal.
                    Ordering::Equal
                } else if lhs_neg {
                    Ordering::Less
                } else {
                    Ordering::Greater
                };
                Some(ordering)
            }

            fn lt(&self, other: &Self) -> bool {
                if self.is_nan() || other.is_nan() {
                    return false;
                }
                let lhs_neg = self.0 & 0x80u8 != 0;
                let rhs_neg = other.0 & 0x80u8 != 0;
                if lhs_neg != rhs_neg {
                    // Mixed signs: the negative side is smaller unless both are zero.
                    lhs_neg && (self.0 | other.0) & 0x7Fu8 != 0
                } else if lhs_neg {
                    self.0 > other.0
                } else {
                    self.0 < other.0
                }
            }

            fn le(&self, other: &Self) -> bool {
                if self.is_nan() || other.is_nan() {
                    return false;
                }
                let lhs_neg = self.0 & 0x80u8 != 0;
                let rhs_neg = other.0 & 0x80u8 != 0;
                if lhs_neg != rhs_neg {
                    // Mixed signs: true for negative <= positive, or when both are zero.
                    lhs_neg || (self.0 | other.0) & 0x7Fu8 == 0
                } else if lhs_neg {
                    self.0 >= other.0
                } else {
                    self.0 <= other.0
                }
            }

            fn gt(&self, other: &Self) -> bool {
                if self.is_nan() || other.is_nan() {
                    return false;
                }
                let lhs_neg = self.0 & 0x80u8 != 0;
                let rhs_neg = other.0 & 0x80u8 != 0;
                if lhs_neg != rhs_neg {
                    // Mixed signs: the positive side is larger unless both are zero.
                    rhs_neg && (self.0 | other.0) & 0x7Fu8 != 0
                } else if lhs_neg {
                    self.0 < other.0
                } else {
                    self.0 > other.0
                }
            }

            fn ge(&self, other: &Self) -> bool {
                if self.is_nan() || other.is_nan() {
                    return false;
                }
                let lhs_neg = self.0 & 0x80u8 != 0;
                let rhs_neg = other.0 & 0x80u8 != 0;
                if lhs_neg != rhs_neg {
                    // Mixed signs: true for positive >= negative, or when both are zero.
                    rhs_neg || (self.0 | other.0) & 0x7Fu8 == 0
                } else if lhs_neg {
                    self.0 <= other.0
                } else {
                    self.0 >= other.0
                }
            }
        }
    };
}
1276
// Generate the `PartialEq` and `PartialOrd` implementations for both 8-bit float types.
comparison!(F8E4M3);
comparison!(F8E5M2);
1279
// Adds the usual mathematical constants to an 8-bit float type. Each constant is the
// `f64::consts` constant of the same name converted with the type's `from_f64`.
macro_rules! constants {
    // Internal rule: emit one associated constant per listed name, forwarding its docs.
    (@from_f64 $t:ident; $($(#[$doc:meta])* $name:ident),+ $(,)?) => {
        impl $t {
            $(
                $(#[$doc])*
                pub const $name: Self = Self::from_f64(f64::consts::$name);
            )+
        }
    };
    ($t:ident) => {
        constants! {
            @from_f64 $t;
            /// π
            PI,
            /// The full circle constant (τ)
            ///
            /// Equal to 2π.
            TAU,
            /// π/2
            FRAC_PI_2,
            /// π/3
            FRAC_PI_3,
            /// π/4
            FRAC_PI_4,
            /// π/6
            FRAC_PI_6,
            /// π/8
            FRAC_PI_8,
            /// 1/π
            FRAC_1_PI,
            /// 2/π
            FRAC_2_PI,
            /// 2/sqrt(π)
            FRAC_2_SQRT_PI,
            /// sqrt(2)
            SQRT_2,
            /// 1/sqrt(2)
            FRAC_1_SQRT_2,
            /// Euler's number (e)
            E,
            /// log<sub>2</sub>(10)
            LOG2_10,
            /// log<sub>2</sub>(e)
            LOG2_E,
            /// log<sub>10</sub>(2)
            LOG10_2,
            /// log<sub>10</sub>(e)
            LOG10_E,
            /// ln(2)
            LN_2,
            /// ln(10)
            LN_10,
        }
    };
}
1344
// Give both 8-bit float types the mathematical constants (`PI`, `E`, `LN_2`, ...).
constants!(F8E4M3);
constants!(F8E5M2);
1347
// Bit-level constants for `F8E4M3`. The binary literals are grouped as
// sign_exponent_mantissa (1 + 4 + 3 bits); presumably that grouping is why the
// byte-grouping lint is silenced here.
#[allow(clippy::unusual_byte_groupings)]
impl F8E4M3 {
    /// Number of mantissa digits
    pub const MANTISSA_DIGITS: u32 = 3;
    /// Maximum possible value (the bit pattern directly below [`INFINITY`][Self::INFINITY])
    pub const MAX: Self = Self::from_bits(0x7E - 1);
    /// Minimum possible value ([`MAX`][Self::MAX] with the sign bit set)
    pub const MIN: Self = Self::from_bits(0xFE - 1);
    /// Positive infinity ∞
    pub const INFINITY: Self = Self::from_bits(0x7E);
    /// Negative infinity -∞
    pub const NEG_INFINITY: Self = Self::from_bits(0xFE);
    /// Smallest possible normal value
    pub const MIN_POSITIVE: Self = Self::from_bits(0b0_0001_000);
    /// Smallest possible subnormal value
    pub const MIN_POSITIVE_SUBNORMAL: Self = Self::from_bits(0b0_0000_001);
    /// Largest possible subnormal value
    pub const MAX_SUBNORMAL: Self = Self::from_bits(0b0_0000_111);
    /// This is the difference between 1.0 and the next largest representable number.
    pub const EPSILON: Self = Self::from_bits(0b0_0100_000);
    /// NaN value
    pub const NAN: Self = Self::from_bits(0x7F);
    /// 1
    pub const ONE: Self = Self::from_bits(0b0_0111_000);
    /// 0
    pub const ZERO: Self = Self::from_bits(0b0_0000_000);
    /// -1
    pub const NEG_ONE: Self = Self::from_bits(0b1_0111_000);
    /// -0
    pub const NEG_ZERO: Self = Self::from_bits(0b1_0000_000);
    /// One greater than the minimum possible normal power of 2 exponent
    pub const MIN_EXP: i32 = -5;
    /// Minimum possible normal power of 10 exponent
    pub const MIN_10_EXP: i32 = -1;
    /// Maximum possible normal power of 2 exponent
    pub const MAX_EXP: i32 = 7;
    /// Maximum possible normal power of 10 exponent
    pub const MAX_10_EXP: i32 = 2;
    /// Approximate number of significant digits in base 10
    pub const DIGITS: u32 = 0;
}
1389
// Bit-level constants for `F8E5M2`. The binary literals are grouped as
// sign_exponent_mantissa (1 + 5 + 2 bits); presumably that grouping is why the
// byte-grouping lint is silenced here.
#[allow(clippy::unusual_byte_groupings)]
impl F8E5M2 {
    /// Number of mantissa digits
    pub const MANTISSA_DIGITS: u32 = 2;
    /// Maximum possible value (the bit pattern directly below [`INFINITY`][Self::INFINITY])
    pub const MAX: Self = Self::from_bits(0x7B - 1);
    /// Minimum possible value ([`MAX`][Self::MAX] with the sign bit set)
    pub const MIN: Self = Self::from_bits(0xFB - 1);
    /// Positive infinity ∞
    pub const INFINITY: Self = Self::from_bits(0x7B);
    /// Negative infinity -∞
    pub const NEG_INFINITY: Self = Self::from_bits(0xFB);
    /// Smallest possible normal value
    pub const MIN_POSITIVE: Self = Self::from_bits(0b0_00001_00);
    /// Smallest possible subnormal value
    pub const MIN_POSITIVE_SUBNORMAL: Self = Self::from_bits(0b0_00000_01);
    /// Largest possible subnormal value
    pub const MAX_SUBNORMAL: Self = Self::from_bits(0b0_00000_11);
    /// This is the difference between 1.0 and the next largest representable number.
    pub const EPSILON: Self = Self::from_bits(0b0_01101_00);
    /// NaN value
    pub const NAN: Self = Self::from_bits(0x7E);
    /// 1
    pub const ONE: Self = Self::from_bits(0b0_01111_00);
    /// 0
    pub const ZERO: Self = Self::from_bits(0b0_00000_00);
    /// -1
    pub const NEG_ONE: Self = Self::from_bits(0b1_01111_00);
    /// -0
    pub const NEG_ZERO: Self = Self::from_bits(0b1_00000_00);
    /// One greater than the minimum possible normal power of 2 exponent
    pub const MIN_EXP: i32 = -13;
    /// Minimum possible normal power of 10 exponent
    pub const MIN_10_EXP: i32 = -4;
    /// Maximum possible normal power of 2 exponent
    pub const MAX_EXP: i32 = 15;
    /// Maximum possible normal power of 10 exponent
    pub const MAX_10_EXP: i32 = 4;
    /// Approximate number of significant digits in base 10
    pub const DIGITS: u32 = 0;
}
1431
// Implements formatting, parsing and `From` conversions for an 8-bit float type.
//
// Decimal and exponent formatting go through the `f32` value; hex formatting shows the raw
// bits. Every formatting impl delegates to the matching trait on the inner value, so the
// caller's width, precision, `#` and zero-padding flags are honoured.
macro_rules! io {
    ($t:ident) => {
        impl Display for $t {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                Display::fmt(&self.to_f32(), f)
            }
        }
        impl Debug for $t {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                Debug::fmt(&self.to_f32(), f)
            }
        }
        impl FromStr for $t {
            type Err = ParseFloatError;
            // Parses as `f32` first, then narrows with `from_f32`.
            fn from_str(src: &str) -> Result<$t, ParseFloatError> {
                f32::from_str(src).map($t::from_f32)
            }
        }
        impl From<f16> for $t {
            fn from(x: f16) -> $t {
                Self::from_f32(x.to_f32())
            }
        }
        impl From<f32> for $t {
            fn from(x: f32) -> $t {
                Self::from_f32(x)
            }
        }
        impl From<f64> for $t {
            fn from(x: f64) -> $t {
                Self::from_f64(x)
            }
        }
        impl LowerExp for $t {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                // Delegating (rather than `write!("{:e}")`) keeps e.g. `{:.2e}` working.
                LowerExp::fmt(&self.to_f32(), f)
            }
        }
        impl LowerHex for $t {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                // Delegating keeps e.g. `{:#04x}` working.
                LowerHex::fmt(&self.0, f)
            }
        }
        impl UpperExp for $t {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                UpperExp::fmt(&self.to_f32(), f)
            }
        }
        impl UpperHex for $t {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                UpperHex::fmt(&self.0, f)
            }
        }
    };
}
1487
// Generate the formatting, parsing and `From<f16/f32/f64>` impls for both 8-bit float types.
io!(F8E4M3);
io!(F8E5M2);
1490
// Implements a binary operator trait for `$t` by widening both operands to `f32`,
// applying `$op` there, and narrowing the result back with `from_f32`.
macro_rules! binary {
    ($trait:ident, $fn_name:ident, $t:ident, $op:tt) => {
        impl $trait for $t {
            type Output = Self;

            fn $fn_name(self, rhs: Self) -> Self::Output {
                let lhs_wide = self.to_f32();
                let rhs_wide = rhs.to_f32();
                Self::from_f32(lhs_wide $op rhs_wide)
            }
        }
    };
}
1502
// Implements a compound-assignment operator trait for `$t`: the operation is carried out
// on the `f32` values and the narrowed result is stored back into `self`.
macro_rules! assign_binary {
    ($trait:ident, $fn_name:ident, $t:ident, $op:tt) => {
        impl $trait for $t {
            fn $fn_name(&mut self, rhs: Self) {
                let updated = self.to_f32() $op rhs.to_f32();
                *self = Self::from_f32(updated);
            }
        }
    };
}
1512
// Implements a unary operator trait for `$t` by applying `$op` to the `f32` value and
// narrowing the result back with `from_f32`.
macro_rules! unary {
    ($trait:ident, $fn_name:ident, $t:ident, $op:tt) => {
        impl $trait for $t {
            type Output = Self;

            fn $fn_name(self) -> Self::Output {
                let widened = self.to_f32();
                Self::from_f32($op widened)
            }
        }
    };
}
1524
// Arithmetic for `F8E4M3`. Every operator widens to `f32`, applies the `f32` operator and
// narrows the result back (see `binary!`, `assign_binary!` and `unary!`).
binary!(Add, add, F8E4M3, +);
binary!(Sub, sub, F8E4M3, -);
binary!(Mul, mul, F8E4M3, *);
binary!(Div, div, F8E4M3, /);
binary!(Rem, rem, F8E4M3, %);
assign_binary!(AddAssign, add_assign, F8E4M3, +);
assign_binary!(SubAssign, sub_assign, F8E4M3, -);
assign_binary!(MulAssign, mul_assign, F8E4M3, *);
assign_binary!(DivAssign, div_assign, F8E4M3, /);
assign_binary!(RemAssign, rem_assign, F8E4M3, %);
unary!(Neg, neg, F8E4M3, -);

// The same operator set for `F8E5M2`.
binary!(Add, add, F8E5M2, +);
binary!(Sub, sub, F8E5M2, -);
binary!(Mul, mul, F8E5M2, *);
binary!(Div, div, F8E5M2, /);
binary!(Rem, rem, F8E5M2, %);
assign_binary!(AddAssign, add_assign, F8E5M2, +);
assign_binary!(SubAssign, sub_assign, F8E5M2, -);
assign_binary!(MulAssign, mul_assign, F8E5M2, *);
assign_binary!(DivAssign, div_assign, F8E5M2, /);
assign_binary!(RemAssign, rem_assign, F8E5M2, %);
unary!(Neg, neg, F8E5M2, -);
1548
// Implements `From<$t>` for `f64`, `f32` and `f16`, each forwarding to the matching
// `to_*` method of the 8-bit float type.
macro_rules! from_t {
    // Internal rule: one `From` impl for a single target type and conversion method.
    (@widen $t:ident => $target:ty, $convert:ident) => {
        impl From<$t> for $target {
            fn from(value: $t) -> Self {
                value.$convert()
            }
        }
    };
    ($t:ident) => {
        from_t!(@widen $t => f64, to_f64);
        from_t!(@widen $t => f32, to_f32);
        from_t!(@widen $t => f16, to_f16);
    };
}
1570
// Generate `From<F8E4M3>` and `From<F8E5M2>` for `f64`, `f32` and `f16`.
from_t!(F8E4M3);
from_t!(F8E5M2);
1573
// SAFETY (review note): these impls promise cudarc that `F8E4M3` may be handed to the
// device as-is and that an all-zero bit pattern is a valid value. The type appears to wrap
// a single `u8` (`self.0`), and `F8E4M3::ZERO` is `from_bits(0)`. The exact trait contracts
// and the type's `repr` are not visible here — TODO confirm against cudarc's documentation.
#[cfg(feature = "cuda")]
unsafe impl cudarc::driver::DeviceRepr for F8E4M3 {}
#[cfg(feature = "cuda")]
unsafe impl cudarc::driver::ValidAsZeroBits for F8E4M3 {}
1578
// SAFETY (review note): these impls promise cudarc that `F8E5M2` may be handed to the
// device as-is and that an all-zero bit pattern is a valid value. The type appears to wrap
// a single `u8` (`self.0`), and `F8E5M2::ZERO` is `from_bits(0)`. The exact trait contracts
// and the type's `repr` are not visible here — TODO confirm against cudarc's documentation.
//
// `DeviceRepr` is named through `cudarc::driver`, the same path used for `F8E4M3`.
#[cfg(feature = "cuda")]
unsafe impl cudarc::driver::DeviceRepr for F8E5M2 {}
#[cfg(feature = "cuda")]
unsafe impl cudarc::driver::ValidAsZeroBits for F8E5M2 {}