1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
//! IEEE 754 binary16 <-> binary32 conversion helpers.
//!
//! Pure-Rust conversions used by the lib (e.g. `training_runner`
//! wiring the batched binary AdamW step) and by the HIP
//! fp16 <-> fp32 boundary. No platform intrinsics; the conversions
//! are bit-level and round-trip safe.
//!
// IEEE 754 binary16 <-> binary32 conversion helpers shared between
// the lib (e.g. `training_runner` wiring the batched binary AdamW
// into the per-parameter step loop) and the `hip_adamw` test
// suite. The HIP kernel takes fp16 weights/gradients and fp32
// moments; the model layer stores everything as fp32, so every
// wire-format round-trip needs a host-side conversion.
//
// The conversion matches HIP's `__half2float` (decode) and
// `__float2half_rn` (encode round-to-nearest-even) bit-exactly so
// a value uploaded to the GPU and read back to the host is
// byte-identical to the original.
/// IEEE 754 binary16 -> binary32 conversion (matches HIP
/// `__half2float`).
pub fn f16_to_f32(bits: u16) -> f32 {
let sign = ((bits >> 15) & 1) as u32;
let exp = ((bits >> 10) & 0x1f) as u32;
let frac = (bits & 0x3ff) as u32;
let f32_bits = if exp == 0 {
if frac == 0 {
sign << 31
} else {
let mut e: i32 = -14;
let mut f = frac;
while (f & 0x400) == 0 {
f <<= 1;
e -= 1;
}
f &= 0x3ff;
let f32_exp = (e + 127) as u32;
(sign << 31) | (f32_exp << 23) | (f << 13)
}
} else if exp == 0x1f {
(sign << 31) | (0xff << 23) | (frac << 13) | u32::from(frac != 0)
} else {
let f32_exp = exp + 112;
(sign << 31) | (f32_exp << 23) | (frac << 13)
};
f32::from_bits(f32_bits)
}
/// IEEE 754 binary32 -> binary16 round-to-nearest-even conversion
/// (matches HIP `__float2half_rn`).
pub fn f32_to_f16(value: f32) -> u16 {
let bits = value.to_bits();
let sign = ((bits >> 31) & 1) as u16;
let exp = ((bits >> 23) & 0xff) as i32;
let frac = bits & 0x7fffff;
if exp == 0xff {
return (sign << 15) | 0x7c00 | u16::from(frac != 0);
}
let new_exp = exp - 127 + 15;
if new_exp >= 0x1f {
return (sign << 15) | 0x7c00;
}
if new_exp <= 0 {
if new_exp < -10 {
return sign << 15;
}
// Subnormal fp16: the 24-bit mantissa (frac | 0x800000)
// shifts right by (14 - new_exp) to land in the 10-bit
// subnormal fraction field. For new_exp = 0 the shift
// is 14 (largest subnormal exponent); for new_exp = -10
// the shift is 24 (the full mantissa width, so the
// rounded result is 0 or 1). The rounding bit sits at
// `shift - 1` and the sticky mask is everything below
// it. If rounding pushes the mantissa over `0x400`, the
// result is the smallest NORMAL fp16 (2^-14), not a
// subnormal — that case is handled below.
let m = frac | 0x800000u32;
let shift = (14 - new_exp) as u32;
let round_bit = 1u32 << (shift - 1);
let sticky = (m & (round_bit - 1)) != 0;
let mut rounded = m >> shift;
if (m & round_bit) != 0 && (sticky || (rounded & 1) == 1) {
rounded += 1;
}
if rounded == 0x400 {
// Rounded up across the subnormal -> normal boundary.
// Smallest normal fp16 is 2^-14, encoded as
// sign=., exp_field=1, frac=0 = `1u16 << 10` (= 0x0400).
// The previous code returned `16u16 << 10` (= 0x4000),
// which is the encoding for value ±2.0. That bug
// silently rounded any fp32 value just below 2^-14 to
// ±2.0, producing 2.0 absolute errors on subnormal-boundary
// reductions in cpu_gemm_bw_grad_a etc.
return (sign << 15) | (1u16 << 10);
}
return (sign << 15) | (rounded as u16 & 0x3ff);
}
let round_bit = 1u32 << 12;
let sticky = (frac & (round_bit - 1)) != 0;
let mut rounded = (frac >> 13) & 0x3ff;
if (frac & round_bit) != 0 && (sticky || (rounded & 1) == 1) {
rounded += 1;
if rounded == 0x400 {
return (sign << 15) | ((new_exp as u16 + 1) << 10);
}
}
(sign << 15) | ((new_exp as u16) << 10) | (rounded as u16)
}
#[cfg(test)]
mod tests {
use super::*;
fn roundtrip(value: f32) -> f32 {
f16_to_f32(f32_to_f16(value))
}
#[test]
fn roundtrip_preserves_zero_and_one() {
assert_eq!(roundtrip(0.0f32).to_bits(), 0.0f32.to_bits());
assert_eq!(roundtrip(-0.0f32).to_bits(), (-0.0f32).to_bits());
assert_eq!(roundtrip(1.0f32).to_bits(), 1.0f32.to_bits());
assert_eq!(roundtrip(-1.0f32).to_bits(), (-1.0f32).to_bits());
}
#[test]
fn roundtrip_finite_values_within_one_ulp() {
// Pick a few values that exercise subnormal/exponent
// boundary behavior; the round-trip should land within 1
// ULP of the input.
for v in [
0.1f32, 0.5, 0.99999, 1.0e-3, 1.0e3, -3.14, 65504.0, -65504.0,
] {
let rt = roundtrip(v);
let ulp = (v - rt).abs();
assert!(
ulp <= v.abs() * 5e-4 + 1e-7,
"roundtrip of {v} gave {rt}, ulp={ulp}"
);
}
}
#[test]
fn f16_to_f32_decode_matches_known_patterns() {
// 1.0 in fp16 is sign=0, exp=15, frac=0 -> 0x3C00
assert_eq!(f16_to_f32(0x3C00).to_bits(), 1.0f32.to_bits());
// 2.0 in fp16 is sign=0, exp=16, frac=0 -> 0x4000
assert_eq!(f16_to_f32(0x4000).to_bits(), 2.0f32.to_bits());
// -1.0 is 0xBC00
assert_eq!(f16_to_f32(0xBC00).to_bits(), (-1.0f32).to_bits());
// 0.5 is 0x3800
assert_eq!(f16_to_f32(0x3800).to_bits(), 0.5f32.to_bits());
}
#[test]
fn f32_to_f16_encode_matches_known_patterns() {
assert_eq!(f32_to_f16(1.0), 0x3C00);
assert_eq!(f32_to_f16(2.0), 0x4000);
assert_eq!(f32_to_f16(-1.0), 0xBC00);
assert_eq!(f32_to_f16(0.5), 0x3800);
}
#[test]
fn f32_to_f16_subnormal_encode_matches_known_patterns() {
// Regression for the subnormal-shift bug. Before the fix,
// the encoder used `(frac|0x800000) >> (1-new_exp)` then
// `>> (13-new_exp)`, which combined shift by
// `14 - 2*new_exp` instead of `14 - new_exp`. For
// `2^-23` (new_exp=-8) that returned 0 instead of
// `0x0002`, silently zeroing any weight/grad in
// `(0, 2^-14)` and stalling training at initialization
// (init scales like `1/sqrt(fan_in)` for an
// ~1024-fan-in layer land near `2^-10`, well within
// the buggy range).
//
// Verified reference values match HIP `__float2half_rn`:
// 2^-24 -> 0x0001 (smallest subnormal)
// 2^-23 -> 0x0002
// 2^-22 -> 0x0004
// 2^-16 -> 0x0100
// 2^-15 -> 0x0200 (largest subnormal)
// 2^-14 -> 0x0400 (smallest normal)
// 2^-10 -> 0x1400 (normal; exp_field=5)
let p2 = |e: i32| -> f32 { f32::from_bits(((127 - e) as u32) << 23) };
assert_eq!(f32_to_f16(p2(24)), 0x0001, "2^-24 (smallest subnormal)");
assert_eq!(f32_to_f16(p2(23)), 0x0002, "2^-23");
assert_eq!(f32_to_f16(p2(22)), 0x0004, "2^-22");
assert_eq!(f32_to_f16(p2(16)), 0x0100, "2^-16");
assert_eq!(f32_to_f16(p2(15)), 0x0200, "2^-15 (largest subnormal)");
assert_eq!(f32_to_f16(p2(14)), 0x0400, "2^-14 (smallest normal)");
assert_eq!(f32_to_f16(p2(10)), 0x1400, "2^-10 (normal)");
// Negative subnormals preserve sign.
assert_eq!(f32_to_f16(-p2(23)), 0x8002, "-2^-23");
}
#[test]
fn f32_to_f16_subnormal_rounds_to_nearest_even() {
// Midpoint between 2^-24 and 2^-23: value = 1.5 * 2^-24.
// Round-to-nearest-even picks the EVEN mantissa, i.e.
// 0x0002 (mantissa 2) rather than 0x0001 (mantissa 1).
// This matches the normal-path's RNE semantics in the
// encoder.
let v = 1.5f32 * 2f32.powi(-24);
assert_eq!(f32_to_f16(v), 0x0002, "1.5 * 2^-24 -> 0x0002 (RNE)");
}
#[test]
fn f32_to_f16_subnormal_rounds_up_to_smallest_normal_not_to_two() {
// Regression: a previous bug returned (16u16 << 10) = 0x4000
// (encoding for value 2.0) instead of (1u16 << 10) = 0x0400
// (encoding for smallest normal fp16, 2^-14) when an fp32
// value just below the subnormal/normal boundary rounded up.
//
// Concrete repro: cpu_gemm_bw_grad_a's reduction at
// (row=58, col=69) of a 1024x1024 random fp16 input lands
// at -0.0000610128 (fp32). Its f32_bits are exp=112,
// mantissa=0x7ffeb1, which after shift-14 and RNE rounds
// up to mantissa 0x400, the smallest-normal boundary. The
// correct fp16 is 0x8400 (-2^-14 ≈ -0.0000610352); the bug
// produced 0xc000 (-2.0), masking itself as a 2.0 GPU error.
let v = -0.0000610128f32;
assert_eq!(
f32_to_f16(v),
0x8400,
"fp32 value just below -2^-14 should round to smallest negative normal fp16 (0x8400), not -2.0 (0xc000)"
);
// The positive side too: just under +2^-14 should also
// round up to +smallest_normal, not +2.0.
let vp = 0.0000610128f32;
assert_eq!(
f32_to_f16(vp),
0x0400,
"fp32 value just below +2^-14 should round to smallest positive normal fp16 (0x0400), not 2.0 (0x4000)"
);
}
#[test]
fn f32_to_f16_boundary_exhaustive() {
// Exhaustive sweep of every fp32 value with `exp_field=112`
// (the largest subnormal exponent in fp16, spanning
// `[2^-15, 2^-14)` in fp32). For each of the 2^24 values
// (frac in `[0, 0x7fffff]`, both signs), verify the
// returned fp16 has `exp_field` in `{0, 1}` — i.e. either a
// subnormal (exp_field=0) or the smallest normal
// (exp_field=1). It must NEVER be 16, which would encode
// value 2.0 (the 0x4000 vs 0x0400 bug from Task #75).
//
// The previous encoder returned `(16u16 << 10)` when an
// fp32 value just below the boundary rounded up across it
// to the smallest-normal mantissa 0x400, instead of
// `(1u16 << 10)`. That off-by-15 in the exponent field
// silently turned `±2^-14` reductions into `±2.0` and
// masked itself as a 2.0 GPU error.
let mut first_violation: Option<(u32, u16)> = None;
'outer: for frac in 0u32..=0x7fffffu32 {
let pos_bits: u32 = (112u32 << 23) | frac;
let neg_bits: u32 = pos_bits | (1u32 << 31);
for bits in [pos_bits, neg_bits] {
let v = f32::from_bits(bits);
let h = f32_to_f16(v);
let exp_field = h & 0x7c00;
if exp_field > 0x0400 {
first_violation = Some((bits, h));
break 'outer;
}
}
}
if let Some((bits, h)) = first_violation {
panic!(
"f32_to_f16({:#010x}) = {:#06x}: exp_field={:#x} > 0x0400. \
For fp32 values strictly below 2^-14, the returned fp16 \
must have exp_field in {{0, 1}} (subnormal or smallest \
normal), never 16 (which encodes value 2.0 — the \
subnormal-boundary round-up bug from Task #75).",
bits,
h,
h & 0x7c00
);
}
}
#[test]
fn f32_to_f16_roundtrip_property_random() {
// Property test: ~1000 random positive fp32 values across the
// full range (NaN/Inf and >65504 skipped), seeded with an
// LCG so the test is reproducible without adding a `rand`
// dependency. For each value v we check:
// 1. Round-trip correctness: f32 -> f16 -> f32' satisfies
// |v - v'| <= |v| * 5e-4 + 1e-7 (the same relative
// bound as the existing `roundtrip_finite_values_*`
// test, which is tight against fp16's 10-bit mantissa).
// 2. Boundary property: when v has fp32 exp_field=112
// (i.e. v in [2^-15, 2^-14)), the fp16 exp_field is
// in {0, 1} — never 16.
let mut rng_state: u64 = 0x12345678_9abcdef0_u64;
let mut next_u32 = || -> u32 {
// Numerical-Recipes LCG (same family as MMIX).
rng_state = rng_state
.wrapping_mul(6364136223846793005_u64)
.wrapping_add(1442695040888963407_u64);
(rng_state >> 33) as u32
};
let mut checked: u32 = 0;
let mut attempts: u32 = 0;
while checked < 1000 && attempts < 5000 {
attempts += 1;
let pos_bits = next_u32() & 0x7fff_ffff_u32; // strip sign
// Skip NaN / +inf (sign stripped already, so only +inf).
if (pos_bits & 0x7f80_0000_u32) == 0x7f80_0000_u32 {
continue;
}
let v = f32::from_bits(pos_bits);
// Skip values that round up to +inf in fp16 (largest
// finite fp16 is 65504.0, so anything above that will
// have an infinite round-trip and the relative bound
// is meaningless). The encoder correctly returns
// 0x7c00 in this case.
if v > 65504.0_f32 {
checked += 1;
continue;
}
let h = f32_to_f16(v);
let v_round = f16_to_f32(h);
let err = (v - v_round).abs();
assert!(
err <= v.abs() * 5e-4 + 1e-7,
"roundtrip of v={} (bits {:#010x}) gave v'={}, err={}",
v,
pos_bits,
v_round,
err
);
// For v in [2^-15, 2^-14), the fp16 exp field is 0
// (subnormal) or 1 (smallest normal) — never 16.
let exp = (pos_bits >> 23) & 0xff;
if exp == 112 {
let exp_field = h & 0x7c00;
assert!(
exp_field <= 0x0400,
"fp32 bits {:#010x} (v={}) in [2^-15, 2^-14) must \
round to fp16 exp_field in {{0, 1}}, got {:#06x}",
pos_bits,
v,
h
);
}
checked += 1;
}
assert!(
checked >= 500,
"not enough valid random samples collected: {}",
checked
);
}
}