1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//! IEEE 754 half-precision (binary16) floating-point type.
//!
//! This is a minimal implementation providing only the operations needed for JPEG XL decoding,
//! avoiding external dependencies like `half` which pulls in `zerocopy`.
/// IEEE 754 binary16 half-precision floating-point type.
///
/// Format: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
///
/// The value is stored as its raw 16-bit pattern in the single `u16` field.
/// Equality and hashing are derived from that field, so they are *bitwise*,
/// not numeric: `+0.0` (`0x0000`) and `-0.0` (`0x8000`) compare unequal, and a
/// NaN compares equal to another NaN carrying the same bits.
///
/// The derived `Default` is the all-zero bit pattern.
// NOTE(review): the lowercase name presumably mirrors the primitive float
// types (`f32`, `f64`), which is why the naming lint is silenced here.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Default, PartialEq, Eq, Hash, bytemuck::Pod, bytemuck::Zeroable)]
#[repr(transparent)]
pub struct f16(u16);
impl f16 {
/// Positive zero.
pub const ZERO: Self = Self(0);
/// Creates an f16 from its raw bit representation.
#[inline]
pub const fn from_bits(bits: u16) -> Self {
Self(bits)
}
/// Returns the raw bit representation.
#[inline]
pub const fn to_bits(self) -> u16 {
self.0
}
/// Converts to f32.
#[inline]
pub fn to_f32(self) -> f32 {
let bits = self.0;
let sign = ((bits >> 15) & 1) as u32;
let exp = ((bits >> 10) & 0x1F) as u32;
let mant = (bits & 0x3FF) as u32;
let f32_bits = if exp == 0 {
if mant == 0 {
// Zero (signed)
sign << 31
} else {
// Denormal f16 -> normalized f32
// Find the leading 1 bit in mantissa
let mut m = mant;
let mut e = 0u32;
while (m & 0x400) == 0 {
m <<= 1;
e += 1;
}
m &= 0x3FF; // Remove the implicit leading 1
let new_exp = 127 - 15 - e; // Rebias: f16 bias=15, f32 bias=127
(sign << 31) | (new_exp << 23) | (m << 13)
}
} else if exp == 31 {
// Infinity or NaN
if mant == 0 {
// Infinity
(sign << 31) | (0xFF << 23)
} else {
// NaN - preserve some payload bits, ensure quiet NaN
(sign << 31) | (0xFF << 23) | (mant << 13) | 0x0040_0000
}
} else {
// Normal number
// Rebias: f16 uses bias 15, f32 uses bias 127
// new_exp = exp - 15 + 127 = exp + 112
let new_exp = exp + 112;
(sign << 31) | (new_exp << 23) | (mant << 13)
};
f32::from_bits(f32_bits)
}
/// Creates an f16 from an f32.
#[inline]
pub fn from_f32(f: f32) -> Self {
let bits = f.to_bits();
let sign = ((bits >> 31) & 1) as u16;
let exp = ((bits >> 23) & 0xFF) as i32;
let mant = bits & 0x007F_FFFF;
let h_bits = if exp == 0 {
// Zero or f32 denormal -> f16 zero (too small)
sign << 15
} else if exp == 255 {
// Infinity or NaN
if mant == 0 {
(sign << 15) | (0x1F << 10) // Infinity
} else {
(sign << 15) | (0x1F << 10) | 0x0200 // Quiet NaN
}
} else {
let unbiased = exp - 127;
if unbiased < -24 {
// Too small, underflow to zero
sign << 15
} else if unbiased < -14 {
// Denormal f16
let shift = (-14 - unbiased) as u32;
let m = ((mant | 0x0080_0000) >> (shift + 14)) as u16;
(sign << 15) | m
} else if unbiased > 15 {
// Overflow to infinity
(sign << 15) | (0x1F << 10)
} else {
// Normal f16
let h_exp = (unbiased + 15) as u16;
let h_mant = (mant >> 13) as u16;
// Round to nearest, ties to even
let round_bit = (mant >> 12) & 1;
let sticky = mant & 0x0FFF;
let h_mant = if round_bit == 1 && (sticky != 0 || (h_mant & 1) == 1) {
h_mant + 1
} else {
h_mant
};
// Handle mantissa overflow from rounding
if h_mant > 0x3FF {
if h_exp >= 30 {
// Overflow to infinity
(sign << 15) | (0x1F << 10)
} else {
(sign << 15) | ((h_exp + 1) << 10)
}
} else {
(sign << 15) | (h_exp << 10) | h_mant
}
}
};
Self(h_bits)
}
/// Creates an f16 from an f64.
#[inline]
pub fn from_f64(f: f64) -> Self {
// Convert via f32 - sufficient precision for f16
Self::from_f32(f as f32)
}
/// Converts to f64.
#[inline]
pub fn to_f64(self) -> f64 {
self.to_f32() as f64
}
/// Returns true if this is neither infinite nor NaN.
#[inline]
pub fn is_finite(self) -> bool {
// Exponent of 31 means infinity or NaN
((self.0 >> 10) & 0x1F) != 31
}
/// Returns the bytes in little-endian order.
#[inline]
pub const fn to_le_bytes(self) -> [u8; 2] {
self.0.to_le_bytes()
}
/// Returns the bytes in big-endian order.
#[inline]
pub const fn to_be_bytes(self) -> [u8; 2] {
self.0.to_be_bytes()
}
}
impl From<f16> for f32 {
#[inline]
fn from(f: f16) -> f32 {
f.to_f32()
}
}
impl From<f16> for f64 {
#[inline]
fn from(f: f16) -> f64 {
f.to_f64()
}
}
impl core::fmt::Debug for f16 {
    /// Formats the value as its `f32` equivalent.
    ///
    /// The formatter is forwarded rather than wrapped in a fresh `write!`, so
    /// width, precision, fill and sign flags given by the caller are honoured.
    /// `f32`'s `Display` is used on purpose: plain `{:?}` output stays what it
    /// was before (for example `1`, not `1.0`).
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Display::fmt(&self.to_f32(), f)
    }
}
impl core::fmt::Display for f16 {
    /// Formats the value as its `f32` equivalent.
    ///
    /// The formatter is forwarded rather than wrapped in a fresh `write!`, so
    /// width, precision, fill and sign flags given by the caller (for example
    /// `{:.3}` or `{:>8}`) are honoured instead of being silently dropped.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Display::fmt(&self.to_f32(), f)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `ZERO` is the all-zero bit pattern, widens to 0.0 and is finite.
    #[test]
    fn test_zero() {
        let zero = f16::ZERO;
        assert_eq!(zero.to_bits(), 0);
        assert_eq!(zero.to_f32(), 0.0);
        assert!(zero.is_finite());
    }

    /// 0x3C00 (sign 0, biased exponent 15, mantissa 0) encodes 1.0.
    #[test]
    fn test_one() {
        let value = f16::from_bits(0x3C00);
        let widened = value.to_f32();
        assert!((widened - 1.0).abs() < 1e-6);
        assert!(value.is_finite());
    }

    /// 0xBC00 is 1.0 with the sign bit set.
    #[test]
    fn test_negative_one() {
        let widened = f16::from_bits(0xBC00).to_f32();
        assert!((widened - (-1.0)).abs() < 1e-6);
    }

    /// Exponent 31 with a zero mantissa is infinity, for either sign.
    #[test]
    fn test_infinity() {
        // 0x7C00 is +Inf, 0xFC00 is -Inf.
        for &bits in &[0x7C00u16, 0xFC00u16] {
            let inf = f16::from_bits(bits);
            assert!(inf.to_f32().is_infinite());
            assert!(!inf.is_finite());
        }
    }

    /// Exponent 31 with any non-zero mantissa is NaN.
    #[test]
    fn test_nan() {
        let not_a_number = f16::from_bits(0x7C01);
        assert!(not_a_number.to_f32().is_nan());
        assert!(!not_a_number.is_finite());
    }

    /// The smallest positive denormal (0x0001) is tiny, positive and finite.
    #[test]
    fn test_denormal() {
        let smallest = f16::from_bits(0x0001);
        let widened = smallest.to_f32();
        assert!(widened > 0.0);
        assert!(widened < 1e-6);
        assert!(smallest.is_finite());
    }

    /// Normal values survive f32 -> f16 -> f32 within f16 precision.
    #[test]
    fn test_roundtrip_normal() {
        const SAMPLES: [f32; 8] = [0.5, 1.0, 2.0, 100.0, 0.001, -0.5, -1.0, -100.0];
        for v in SAMPLES.iter().copied() {
            let back = f16::from_f32(v).to_f32();
            // f16 carries 11 significant bits, so allow roughly 0.1% relative error.
            let rel_err = ((v - back) / v).abs();
            assert!(
                rel_err < 0.002,
                "Roundtrip failed for {}: got {}, rel_err {}",
                v,
                back,
                rel_err
            );
        }
    }

    /// Zero, both infinities and NaN keep their class through a round trip.
    #[test]
    fn test_roundtrip_special() {
        let round_trip = |x: f32| f16::from_f32(x).to_f32();
        assert_eq!(round_trip(0.0), 0.0);
        assert!(round_trip(f32::INFINITY).is_infinite());
        assert!(round_trip(f32::NEG_INFINITY).is_infinite());
        assert!(round_trip(f32::NAN).is_nan());
    }

    /// Values above the f16 maximum (~65504) overflow to infinity.
    #[test]
    fn test_overflow_to_infinity() {
        let widened = f16::from_f32(100000.0).to_f32();
        assert!(widened.is_infinite());
    }

    /// Values far below the f16 range underflow to zero.
    #[test]
    fn test_underflow_to_zero() {
        let widened = f16::from_f32(1e-10).to_f32();
        assert_eq!(widened, 0.0);
    }

    /// Byte accessors expose the raw bits in the requested endianness.
    #[test]
    fn test_bytes() {
        let value = f16::from_bits(0x1234);
        assert_eq!(value.to_le_bytes(), [0x34, 0x12]);
        assert_eq!(value.to_be_bytes(), [0x12, 0x34]);
    }
}