Skip to main content

jxl_encoder_simd/
entropy.rs

1// Copyright (c) Imazen LLC and the JPEG XL Project Authors.
2// Algorithms and constants derived from libjxl (BSD-3-Clause).
3// Licensed under AGPL-3.0-or-later. Commercial licenses at https://www.imazen.io/pricing
4
5//! SIMD-accelerated coefficient processing for entropy estimation.
6//!
7//! The inner coefficient loop of `estimate_entropy_full` is the single biggest
8//! encoder hotspot (~7.5% CPU). This kernel vectorizes the per-coefficient math:
9//!   val = (block_c[i] - block_y[i] * cmap_factor) / weights[i] * quant
10//!   rval = round(val)
11//!   entropy_sum += sqrt(|rval|) * cost_delta
12//!   nzeros += (rval != 0)
13
14/// Results from vectorized entropy coefficient processing.
15#[derive(Debug, Clone, Copy)]
16pub struct EntropyCoeffResult {
17    /// Sum of sqrt(|round(val)|) * cost_delta for all coefficients.
18    pub entropy_sum: f32,
19    /// Count of non-zero quantized coefficients.
20    pub nzeros_sum: f32,
21    /// Sum of |val - round(val)| (coefficient-domain mode only).
22    pub info_loss_sum: f32,
23    /// Sum of (val - round(val))^2 (coefficient-domain mode only).
24    pub info_loss2_sum: f32,
25}
26
27impl EntropyCoeffResult {
28    pub const ZERO: Self = Self {
29        entropy_sum: 0.0,
30        nzeros_sum: 0.0,
31        info_loss_sum: 0.0,
32        info_loss2_sum: 0.0,
33    };
34}
35
36/// Vectorized entropy coefficient processing.
37///
38/// For each coefficient `i` in 0..n:
39///   `val = (block_c[i] - block_y[i] * cmap_factor) * inv_weights[i] * quant`
40///   rval = round(val)
41///   entropy_sum += sqrt(|rval|) * k_cost_delta
42///   nzeros += (rval != 0)
43///
44/// `inv_weights` contains precomputed reciprocals (1/quant_weight) to replace
45/// per-coefficient SIMD division with multiplication.
46///
47/// In pixel-domain mode: writes `error_coeffs[i] = weights[i] * (val - rval)`
48/// In coefficient-domain mode: accumulates info_loss stats and k_cost2 penalty.
49#[inline]
50#[allow(clippy::too_many_arguments)]
51pub fn entropy_estimate_coeffs(
52    block_c: &[f32],
53    block_y: &[f32],
54    weights: &[f32],
55    inv_weights: &[f32],
56    n: usize,
57    cmap_factor: f32,
58    quant: f32,
59    k_cost_delta: f32,
60    k_cost2: f32,
61    pixel_domain: bool,
62    error_coeffs: &mut [f32],
63) -> EntropyCoeffResult {
64    #[cfg(target_arch = "x86_64")]
65    {
66        use archmage::SimdToken;
67        if let Some(token) = archmage::X64V3Token::summon() {
68            return entropy_coeffs_avx2(
69                token,
70                block_c,
71                block_y,
72                weights,
73                inv_weights,
74                n,
75                cmap_factor,
76                quant,
77                k_cost_delta,
78                k_cost2,
79                pixel_domain,
80                error_coeffs,
81            );
82        }
83    }
84
85    #[cfg(target_arch = "aarch64")]
86    {
87        use archmage::SimdToken;
88        if let Some(token) = archmage::NeonToken::summon() {
89            return entropy_coeffs_neon(
90                token,
91                block_c,
92                block_y,
93                weights,
94                inv_weights,
95                n,
96                cmap_factor,
97                quant,
98                k_cost_delta,
99                k_cost2,
100                pixel_domain,
101                error_coeffs,
102            );
103        }
104    }
105
106    #[cfg(target_arch = "wasm32")]
107    {
108        use archmage::SimdToken;
109        if let Some(token) = archmage::Wasm128Token::summon() {
110            return entropy_coeffs_wasm128(
111                token,
112                block_c,
113                block_y,
114                weights,
115                inv_weights,
116                n,
117                cmap_factor,
118                quant,
119                k_cost_delta,
120                k_cost2,
121                pixel_domain,
122                error_coeffs,
123            );
124        }
125    }
126
127    entropy_coeffs_scalar(
128        block_c,
129        block_y,
130        weights,
131        inv_weights,
132        n,
133        cmap_factor,
134        quant,
135        k_cost_delta,
136        k_cost2,
137        pixel_domain,
138        error_coeffs,
139    )
140}
141
142#[inline]
143#[allow(clippy::too_many_arguments)]
144pub fn entropy_coeffs_scalar(
145    block_c: &[f32],
146    block_y: &[f32],
147    weights: &[f32],
148    inv_weights: &[f32],
149    n: usize,
150    cmap_factor: f32,
151    quant: f32,
152    k_cost_delta: f32,
153    k_cost2: f32,
154    pixel_domain: bool,
155    error_coeffs: &mut [f32],
156) -> EntropyCoeffResult {
157    let mut entropy_sum = 0.0f32;
158    let mut nzeros_sum = 0.0f32;
159    let mut info_loss_sum = 0.0f32;
160    let mut info_loss2_sum = 0.0f32;
161
162    for i in 0..n {
163        let val_in = block_c[i];
164        let val_y = block_y[i] * cmap_factor;
165        let val = (val_in - val_y) * inv_weights[i] * quant;
166        let rval = val.round();
167        let diff = val - rval;
168
169        if pixel_domain {
170            error_coeffs[i] = weights[i] * diff;
171        }
172
173        let q = rval.abs();
174        entropy_sum = q.sqrt().mul_add(k_cost_delta, entropy_sum);
175        if q != 0.0 {
176            nzeros_sum += 1.0;
177        }
178
179        if !pixel_domain {
180            let diff_abs = diff.abs();
181            info_loss_sum += diff_abs;
182            info_loss2_sum = diff_abs.mul_add(diff_abs, info_loss2_sum);
183            if q >= 1.5 {
184                entropy_sum += k_cost2;
185            }
186        }
187    }
188
189    EntropyCoeffResult {
190        entropy_sum,
191        nzeros_sum,
192        info_loss_sum,
193        info_loss2_sum,
194    }
195}
196
197#[cfg(target_arch = "x86_64")]
198#[inline]
199#[archmage::arcane]
200#[allow(clippy::too_many_arguments)]
201pub fn entropy_coeffs_avx2(
202    token: archmage::X64V3Token,
203    block_c: &[f32],
204    block_y: &[f32],
205    weights: &[f32],
206    inv_weights: &[f32],
207    n: usize,
208    cmap_factor: f32,
209    quant: f32,
210    k_cost_delta: f32,
211    k_cost2: f32,
212    pixel_domain: bool,
213    error_coeffs: &mut [f32],
214) -> EntropyCoeffResult {
215    use magetypes::simd::f32x8;
216
217    let cmap_v = f32x8::splat(token, cmap_factor);
218    let quant_v = f32x8::splat(token, quant);
219    let cost_delta_v = f32x8::splat(token, k_cost_delta);
220    let cost2_v = f32x8::splat(token, k_cost2);
221    let zero = f32x8::zero(token);
222    let one = f32x8::splat(token, 1.0);
223    let thr_1_5 = f32x8::splat(token, 1.5);
224
225    let mut entropy_acc = f32x8::zero(token);
226    let mut nzeros_acc = f32x8::zero(token);
227    let mut info_loss_acc = f32x8::zero(token);
228    let mut info_loss2_acc = f32x8::zero(token);
229    let mut cost2_acc = f32x8::zero(token);
230
231    let chunks = n / 8;
232    let simd_n = chunks * 8;
233    let block_c_s = &block_c[..simd_n];
234    let block_y_s = &block_y[..simd_n];
235    let weights_s = &weights[..simd_n];
236    let inv_weights_s = &inv_weights[..simd_n];
237    let error_coeffs_s = &mut error_coeffs[..simd_n];
238    for chunk in 0..chunks {
239        let base = chunk * 8;
240
241        let bc = crate::load_f32x8(token, block_c_s, base);
242        let by_v = crate::load_f32x8(token, block_y_s, base);
243        let w = crate::load_f32x8(token, weights_s, base);
244        let iw = crate::load_f32x8(token, inv_weights_s, base);
245
246        // val = (block_c - block_y * cmap_factor) * inv_weights * quant
247        let adjusted = bc - by_v * cmap_v;
248        let val = adjusted * iw * quant_v;
249
250        let rval = val.round();
251        let diff = val - rval;
252
253        // Write error coefficients for pixel-domain loss
254        if pixel_domain {
255            let err = w * diff;
256            crate::store_f32x8(error_coeffs_s, base, err);
257        }
258
259        // Entropy accumulation: entropy += sqrt(|rval|) * cost_delta
260        let q = rval.abs();
261        entropy_acc = q.sqrt().mul_add(cost_delta_v, entropy_acc);
262
263        // nzeros: count non-zero rounded values
264        let nz_mask = q.simd_ne(zero);
265        nzeros_acc += f32x8::blend(nz_mask, one, zero);
266
267        // Coefficient-domain statistics
268        if !pixel_domain {
269            let diff_abs = diff.abs();
270            info_loss_acc += diff_abs;
271            info_loss2_acc = diff_abs.mul_add(diff_abs, info_loss2_acc);
272
273            // q >= 1.5 penalty
274            let ge_mask = q.simd_ge(thr_1_5);
275            cost2_acc += f32x8::blend(ge_mask, cost2_v, zero);
276        }
277    }
278
279    // Handle remainder with scalar fallback (skip when n is multiple of 8)
280    let start = chunks * 8;
281    let remainder = if start < n {
282        entropy_coeffs_scalar(
283            &block_c[start..n],
284            &block_y[start..n],
285            &weights[start..n],
286            &inv_weights[start..n],
287            n - start,
288            cmap_factor,
289            quant,
290            k_cost_delta,
291            k_cost2,
292            pixel_domain,
293            &mut error_coeffs[start..n],
294        )
295    } else {
296        EntropyCoeffResult::ZERO
297    };
298
299    let mut entropy_sum = entropy_acc.reduce_add() + remainder.entropy_sum;
300    if !pixel_domain {
301        entropy_sum += cost2_acc.reduce_add();
302    }
303
304    EntropyCoeffResult {
305        entropy_sum,
306        nzeros_sum: nzeros_acc.reduce_add() + remainder.nzeros_sum,
307        info_loss_sum: info_loss_acc.reduce_add() + remainder.info_loss_sum,
308        info_loss2_sum: info_loss2_acc.reduce_add() + remainder.info_loss2_sum,
309    }
310}
311
312// ============================================================================
313// aarch64 NEON implementation
314// ============================================================================
315
316#[cfg(target_arch = "aarch64")]
317#[inline]
318#[archmage::arcane]
319#[allow(clippy::too_many_arguments)]
320pub fn entropy_coeffs_neon(
321    token: archmage::NeonToken,
322    block_c: &[f32],
323    block_y: &[f32],
324    weights: &[f32],
325    inv_weights: &[f32],
326    n: usize,
327    cmap_factor: f32,
328    quant: f32,
329    k_cost_delta: f32,
330    k_cost2: f32,
331    pixel_domain: bool,
332    error_coeffs: &mut [f32],
333) -> EntropyCoeffResult {
334    use magetypes::simd::f32x4;
335
336    let cmap_v = f32x4::splat(token, cmap_factor);
337    let quant_v = f32x4::splat(token, quant);
338    let cost_delta_v = f32x4::splat(token, k_cost_delta);
339    let cost2_v = f32x4::splat(token, k_cost2);
340    let zero = f32x4::zero(token);
341    let one = f32x4::splat(token, 1.0);
342    let thr_1_5 = f32x4::splat(token, 1.5);
343
344    let mut entropy_acc = f32x4::zero(token);
345    let mut nzeros_acc = f32x4::zero(token);
346    let mut info_loss_acc = f32x4::zero(token);
347    let mut info_loss2_acc = f32x4::zero(token);
348    let mut cost2_acc = f32x4::zero(token);
349
350    let chunks = n / 4;
351    let simd_n = chunks * 4;
352    let block_c_s = &block_c[..simd_n];
353    let block_y_s = &block_y[..simd_n];
354    let weights_s = &weights[..simd_n];
355    let inv_weights_s = &inv_weights[..simd_n];
356    for chunk in 0..chunks {
357        let base = chunk * 4;
358
359        let bc = f32x4::from_slice(token, &block_c_s[base..]);
360        let by_v = f32x4::from_slice(token, &block_y_s[base..]);
361        let w = f32x4::from_slice(token, &weights_s[base..]);
362        let iw = f32x4::from_slice(token, &inv_weights_s[base..]);
363
364        // val = (block_c - block_y * cmap_factor) * inv_weights * quant
365        let adjusted = bc - by_v * cmap_v;
366        let val = adjusted * iw * quant_v;
367
368        let rval = val.round();
369        let diff = val - rval;
370
371        if pixel_domain {
372            let err = w * diff;
373            let out: &mut [f32; 4] = (&mut error_coeffs[base..base + 4]).try_into().unwrap();
374            err.store(out);
375        }
376
377        let q = rval.abs();
378        entropy_acc = q.sqrt().mul_add(cost_delta_v, entropy_acc);
379
380        let nz_mask = q.simd_ne(zero);
381        nzeros_acc += f32x4::blend(nz_mask, one, zero);
382
383        if !pixel_domain {
384            let diff_abs = diff.abs();
385            info_loss_acc += diff_abs;
386            info_loss2_acc = diff_abs.mul_add(diff_abs, info_loss2_acc);
387
388            let ge_mask = q.simd_ge(thr_1_5);
389            cost2_acc += f32x4::blend(ge_mask, cost2_v, zero);
390        }
391    }
392
393    // Scalar remainder
394    let start = chunks * 4;
395    let remainder = entropy_coeffs_scalar(
396        &block_c[start..n],
397        &block_y[start..n],
398        &weights[start..n],
399        &inv_weights[start..n],
400        n - start,
401        cmap_factor,
402        quant,
403        k_cost_delta,
404        k_cost2,
405        pixel_domain,
406        &mut error_coeffs[start..n],
407    );
408
409    let mut entropy_sum = entropy_acc.reduce_add() + remainder.entropy_sum;
410    if !pixel_domain {
411        entropy_sum += cost2_acc.reduce_add();
412    }
413
414    EntropyCoeffResult {
415        entropy_sum,
416        nzeros_sum: nzeros_acc.reduce_add() + remainder.nzeros_sum,
417        info_loss_sum: info_loss_acc.reduce_add() + remainder.info_loss_sum,
418        info_loss2_sum: info_loss2_acc.reduce_add() + remainder.info_loss2_sum,
419    }
420}
421
422// ============================================================================
423// wasm32 SIMD128 implementation
424// ============================================================================
425
426#[cfg(target_arch = "wasm32")]
427#[inline]
428#[archmage::arcane]
429#[allow(clippy::too_many_arguments)]
430pub fn entropy_coeffs_wasm128(
431    token: archmage::Wasm128Token,
432    block_c: &[f32],
433    block_y: &[f32],
434    weights: &[f32],
435    inv_weights: &[f32],
436    n: usize,
437    cmap_factor: f32,
438    quant: f32,
439    k_cost_delta: f32,
440    k_cost2: f32,
441    pixel_domain: bool,
442    error_coeffs: &mut [f32],
443) -> EntropyCoeffResult {
444    use magetypes::simd::f32x4;
445
446    let cmap_v = f32x4::splat(token, cmap_factor);
447    let quant_v = f32x4::splat(token, quant);
448    let cost_delta_v = f32x4::splat(token, k_cost_delta);
449    let cost2_v = f32x4::splat(token, k_cost2);
450    let zero = f32x4::zero(token);
451    let one = f32x4::splat(token, 1.0);
452    let thr_1_5 = f32x4::splat(token, 1.5);
453
454    let mut entropy_acc = f32x4::zero(token);
455    let mut nzeros_acc = f32x4::zero(token);
456    let mut info_loss_acc = f32x4::zero(token);
457    let mut info_loss2_acc = f32x4::zero(token);
458    let mut cost2_acc = f32x4::zero(token);
459
460    let chunks = n / 4;
461    let simd_n = chunks * 4;
462    let block_c_s = &block_c[..simd_n];
463    let block_y_s = &block_y[..simd_n];
464    let weights_s = &weights[..simd_n];
465    let inv_weights_s = &inv_weights[..simd_n];
466    for chunk in 0..chunks {
467        let base = chunk * 4;
468
469        let bc = f32x4::from_slice(token, &block_c_s[base..]);
470        let by_v = f32x4::from_slice(token, &block_y_s[base..]);
471        let w = f32x4::from_slice(token, &weights_s[base..]);
472        let iw = f32x4::from_slice(token, &inv_weights_s[base..]);
473
474        // val = (block_c - block_y * cmap_factor) * inv_weights * quant
475        let adjusted = bc - by_v * cmap_v;
476        let val = adjusted * iw * quant_v;
477
478        let rval = val.round();
479        let diff = val - rval;
480
481        if pixel_domain {
482            let err = w * diff;
483            let out: &mut [f32; 4] = (&mut error_coeffs[base..base + 4]).try_into().unwrap();
484            err.store(out);
485        }
486
487        let q = rval.abs();
488        entropy_acc = q.sqrt().mul_add(cost_delta_v, entropy_acc);
489
490        let nz_mask = q.simd_ne(zero);
491        nzeros_acc += f32x4::blend(nz_mask, one, zero);
492
493        if !pixel_domain {
494            let diff_abs = diff.abs();
495            info_loss_acc += diff_abs;
496            info_loss2_acc = diff_abs.mul_add(diff_abs, info_loss2_acc);
497
498            let ge_mask = q.simd_ge(thr_1_5);
499            cost2_acc += f32x4::blend(ge_mask, cost2_v, zero);
500        }
501    }
502
503    // Scalar remainder
504    let start = chunks * 4;
505    let remainder = entropy_coeffs_scalar(
506        &block_c[start..n],
507        &block_y[start..n],
508        &weights[start..n],
509        &inv_weights[start..n],
510        n - start,
511        cmap_factor,
512        quant,
513        k_cost_delta,
514        k_cost2,
515        pixel_domain,
516        &mut error_coeffs[start..n],
517    );
518
519    let mut entropy_sum = entropy_acc.reduce_add() + remainder.entropy_sum;
520    if !pixel_domain {
521        entropy_sum += cost2_acc.reduce_add();
522    }
523
524    EntropyCoeffResult {
525        entropy_sum,
526        nzeros_sum: nzeros_acc.reduce_add() + remainder.nzeros_sum,
527        info_loss_sum: info_loss_acc.reduce_add() + remainder.info_loss_sum,
528        info_loss2_sum: info_loss2_acc.reduce_add() + remainder.info_loss2_sum,
529    }
530}
531
532// ============================================================================
533// Shannon entropy computation (P6: histogram entropy for clustering)
534// ============================================================================
535
536// fast_log2f polynomial coefficients (shared with mask1x1, adaptive_quant).
537// Used by arch-gated Shannon entropy functions and scalar fallback.
538const LOG2_P0: f32 = -1.850_383_3e-6;
539const LOG2_P1: f32 = 1.428_716;
540const LOG2_P2: f32 = 0.742_458_7;
541const LOG2_Q0: f32 = 0.990_328_14;
542const LOG2_Q1: f32 = 1.009_671_9;
543const LOG2_Q2: f32 = 0.174_093_43;
544
545/// Fast log2 approximation. Max relative error ~3e-7. Input must be > 0.
546///
547/// Uses integer bit manipulation on f32 with a Padé approximant for the
548/// fractional part. Matches libjxl's `FastLog2f` from `fast_math-inl.h`.
549#[inline(always)]
550pub fn fast_log2f(x: f32) -> f32 {
551    let x_bits = x.to_bits() as i32;
552    let exp_bits = x_bits.wrapping_sub(0x3f2a_aaab_u32 as i32);
553    let exp_shifted = exp_bits >> 23;
554    let mantissa = f32::from_bits((x_bits.wrapping_sub(exp_shifted << 23)) as u32);
555    let exp_val = exp_shifted as f32;
556    let frac = mantissa - 1.0;
557    let num = LOG2_P0 + frac * (LOG2_P1 + frac * LOG2_P2);
558    let den = LOG2_Q0 + frac * (LOG2_Q1 + frac * LOG2_Q2);
559    num / den + exp_val
560}
561
562/// Fast base-2 exponentiation. Max relative error ~3e-7.
563///
564/// Matches libjxl's `FastPow2f` from `fast_math-inl.h` (line 72).
565/// Uses integer bit manipulation for the integer exponent part and a (3,3)
566/// rational polynomial for the fractional part.
567#[inline(always)]
568#[allow(clippy::excessive_precision)]
569pub fn fast_pow2f(x: f32) -> f32 {
570    let floorx = x.floor();
571    // Integer part → IEEE 754 exponent via bit shift
572    let exp = f32::from_bits(((floorx as i32 + 127) << 23) as u32);
573    let frac = x - floorx;
574    // (3,3) rational polynomial for 2^frac, frac in [0, 1)
575    // Coefficients from libjxl fast_math-inl.h — must match exactly.
576    // Numerator: Horner form
577    let mut num = frac + 1.01749063e+01;
578    num = num * frac + 4.88687798e+01;
579    num = num * frac + 9.85506591e+01;
580    num *= exp;
581    // Denominator: Horner form
582    let mut den = frac * 2.10242958e-01 + (-2.22328856e-02);
583    den = den * frac + (-1.94414990e+01);
584    den = den * frac + 9.85506633e+01;
585    num / den
586}
587
588/// Fast power function: `base^exponent`. Max relative error ~3e-5.
589///
590/// Matches libjxl's `FastPowf` from `fast_math-inl.h` (line 90).
591/// Computes `2^(log2(base) * exponent)` using [`fast_log2f`] and [`fast_pow2f`].
592/// Input `base` must be > 0.
593#[inline(always)]
594pub fn fast_powf(base: f32, exponent: f32) -> f32 {
595    fast_pow2f(fast_log2f(base) * exponent)
596}
597
598/// Compute Shannon entropy of a histogram: -sum(count * log2(count / total)).
599///
600/// Returns total entropy in bits. Excludes zero counts and the case where
601/// a single count equals total (entropy contribution = 0).
602///
603/// Uses fast_log2f approximation (~3e-7 relative error per log2 call).
604#[inline]
605pub fn shannon_entropy_bits(counts: &[i32], total_count: usize) -> f32 {
606    if total_count == 0 {
607        return 0.0;
608    }
609
610    #[cfg(target_arch = "x86_64")]
611    {
612        use archmage::SimdToken;
613        if let Some(token) = archmage::X64V3Token::summon() {
614            return shannon_entropy_avx2(token, counts, total_count);
615        }
616    }
617
618    #[cfg(target_arch = "aarch64")]
619    {
620        use archmage::SimdToken;
621        if let Some(token) = archmage::NeonToken::summon() {
622            return shannon_entropy_neon(token, counts, total_count);
623        }
624    }
625
626    #[cfg(target_arch = "wasm32")]
627    {
628        use archmage::SimdToken;
629        if let Some(token) = archmage::Wasm128Token::summon() {
630            return shannon_entropy_wasm128(token, counts, total_count);
631        }
632    }
633
634    shannon_entropy_scalar(counts, total_count)
635}
636
637/// Scalar Shannon entropy using fast_log2f.
638#[inline]
639pub fn shannon_entropy_scalar(counts: &[i32], total_count: usize) -> f32 {
640    let inv_total = 1.0 / total_count as f32;
641    let total_f = total_count as f32;
642    let mut entropy = 0.0f32;
643
644    for &count in counts {
645        if count > 0 {
646            let c = count as f32;
647            if c != total_f {
648                entropy -= c * fast_log2f(c * inv_total);
649            }
650        }
651    }
652
653    entropy
654}
655
656#[cfg(target_arch = "x86_64")]
657#[inline]
658#[archmage::arcane]
659pub fn shannon_entropy_avx2(
660    token: archmage::X64V3Token,
661    counts: &[i32],
662    total_count: usize,
663) -> f32 {
664    use magetypes::simd::{f32x8, i32x8};
665
666    let inv_total_v = f32x8::splat(token, 1.0 / total_count as f32);
667    let total_f_v = f32x8::splat(token, total_count as f32);
668    let zero_f = f32x8::zero(token);
669    let mut acc = f32x8::zero(token);
670
671    // fast_log2f constants
672    let offset = i32x8::splat(token, 0x3f2a_aaab_u32 as i32);
673    let one = f32x8::splat(token, 1.0);
674    let p0 = f32x8::splat(token, LOG2_P0);
675    let p1 = f32x8::splat(token, LOG2_P1);
676    let p2 = f32x8::splat(token, LOG2_P2);
677    let q0 = f32x8::splat(token, LOG2_Q0);
678    let q1 = f32x8::splat(token, LOG2_Q1);
679    let q2 = f32x8::splat(token, LOG2_Q2);
680
681    #[allow(clippy::too_many_arguments)]
682    #[inline(always)]
683    fn fast_log2f_x8(
684        x: f32x8,
685        offset: i32x8,
686        p0: f32x8,
687        p1: f32x8,
688        p2: f32x8,
689        q0: f32x8,
690        q1: f32x8,
691        q2: f32x8,
692        one: f32x8,
693    ) -> f32x8 {
694        let x_bits: i32x8 = x.bitcast_i32x8();
695        let exp_bits = x_bits - offset;
696        let exp_shifted = exp_bits.shr_arithmetic::<23>();
697        let mantissa_bits = x_bits - exp_shifted.shl::<23>();
698        let mantissa = mantissa_bits.bitcast_f32x8();
699        let exp_val = exp_shifted.to_f32x8();
700        let frac = mantissa - one;
701        let num = frac.mul_add(p2, p1).mul_add(frac, p0);
702        let den = frac.mul_add(q2, q1).mul_add(frac, q0);
703        num / den + exp_val
704    }
705
706    let chunks = counts.len() / 8;
707    for chunk in 0..chunks {
708        let base = chunk * 8;
709        let c_i = i32x8::from_slice(token, &counts[base..]);
710        let c_f = c_i.to_f32x8();
711
712        // Compare in f32 space (blend needs f32 masks)
713        let nonzero_mask = c_f.simd_gt(zero_f);
714        let not_total_mask = c_f.simd_ne(total_f_v);
715        // Combine masks: multiply 1.0/0.0 selects
716        let nz_float = f32x8::blend(nonzero_mask, one, zero_f);
717        let nt_float = f32x8::blend(not_total_mask, one, zero_f);
718        let valid_mask = nz_float * nt_float; // 1.0 where valid, 0.0 where not
719
720        // For log2, we need count > 0 to avoid log2(0). Use max(count, 1) for safe input.
721        let safe_c = f32x8::blend(nonzero_mask, c_f, one);
722        let prob = safe_c * inv_total_v;
723        let log2_prob = fast_log2f_x8(prob, offset, p0, p1, p2, q0, q1, q2, one);
724
725        // contribution = -count * log2(count/total), masked to 0 where invalid
726        let contribution = c_f * log2_prob * valid_mask;
727        acc -= contribution;
728    }
729
730    // Handle remainder (scalar)
731    let mut scalar_sum = 0.0f32;
732    let inv_total = 1.0 / total_count as f32;
733    let total_f = total_count as f32;
734    for &count in &counts[chunks * 8..] {
735        if count > 0 {
736            let c = count as f32;
737            if c != total_f {
738                scalar_sum -= c * fast_log2f(c * inv_total);
739            }
740        }
741    }
742
743    acc.reduce_add() + scalar_sum
744}
745
746#[cfg(target_arch = "aarch64")]
747#[inline]
748#[archmage::arcane]
749pub fn shannon_entropy_neon(token: archmage::NeonToken, counts: &[i32], total_count: usize) -> f32 {
750    use magetypes::simd::{f32x4, i32x4};
751
752    let inv_total_v = f32x4::splat(token, 1.0 / total_count as f32);
753    let total_f_v = f32x4::splat(token, total_count as f32);
754    let zero_f = f32x4::zero(token);
755    let mut acc = f32x4::zero(token);
756
757    let offset = i32x4::splat(token, 0x3f2a_aaab_u32 as i32);
758    let one = f32x4::splat(token, 1.0);
759    let p0 = f32x4::splat(token, LOG2_P0);
760    let p1 = f32x4::splat(token, LOG2_P1);
761    let p2 = f32x4::splat(token, LOG2_P2);
762    let q0 = f32x4::splat(token, LOG2_Q0);
763    let q1 = f32x4::splat(token, LOG2_Q1);
764    let q2 = f32x4::splat(token, LOG2_Q2);
765
766    #[inline(always)]
767    fn fast_log2f_x4(
768        x: f32x4,
769        offset: i32x4,
770        p0: f32x4,
771        p1: f32x4,
772        p2: f32x4,
773        q0: f32x4,
774        q1: f32x4,
775        q2: f32x4,
776        one: f32x4,
777    ) -> f32x4 {
778        let x_bits: i32x4 = x.bitcast_i32x4();
779        let exp_bits = x_bits - offset;
780        let exp_shifted = exp_bits.shr_arithmetic::<23>();
781        let mantissa_bits = x_bits - exp_shifted.shl::<23>();
782        let mantissa = mantissa_bits.bitcast_f32x4();
783        let exp_val = exp_shifted.to_f32x4();
784        let frac = mantissa - one;
785        let num = frac.mul_add(p2, p1).mul_add(frac, p0);
786        let den = frac.mul_add(q2, q1).mul_add(frac, q0);
787        num / den + exp_val
788    }
789
790    let chunks = counts.len() / 4;
791    for chunk in 0..chunks {
792        let base = chunk * 4;
793        let c_i = i32x4::from_slice(token, &counts[base..]);
794        let c_f = c_i.to_f32x4();
795
796        // Compare in f32 space (blend needs f32 masks)
797        let nonzero_mask = c_f.simd_gt(zero_f);
798        let not_total_mask = c_f.simd_ne(total_f_v);
799        let nz_float = f32x4::blend(nonzero_mask, one, zero_f);
800        let nt_float = f32x4::blend(not_total_mask, one, zero_f);
801        let valid_mask = nz_float * nt_float;
802
803        let safe_c = f32x4::blend(nonzero_mask, c_f, one);
804        let prob = safe_c * inv_total_v;
805        let log2_prob = fast_log2f_x4(prob, offset, p0, p1, p2, q0, q1, q2, one);
806
807        let contribution = c_f * log2_prob * valid_mask;
808        acc -= contribution;
809    }
810
811    let mut scalar_sum = 0.0f32;
812    let inv_total = 1.0 / total_count as f32;
813    let total_f = total_count as f32;
814    for &count in &counts[chunks * 4..] {
815        if count > 0 {
816            let c = count as f32;
817            if c != total_f {
818                scalar_sum -= c * fast_log2f(c * inv_total);
819            }
820        }
821    }
822
823    acc.reduce_add() + scalar_sum
824}
825
826#[cfg(target_arch = "wasm32")]
827#[inline]
828#[archmage::arcane]
829pub fn shannon_entropy_wasm128(
830    token: archmage::Wasm128Token,
831    counts: &[i32],
832    total_count: usize,
833) -> f32 {
834    use magetypes::simd::{f32x4, i32x4};
835
836    let inv_total_v = f32x4::splat(token, 1.0 / total_count as f32);
837    let total_f_v = f32x4::splat(token, total_count as f32);
838    let zero_f = f32x4::zero(token);
839    let mut acc = f32x4::zero(token);
840
841    let offset = i32x4::splat(token, 0x3f2a_aaab_u32 as i32);
842    let one = f32x4::splat(token, 1.0);
843    let p0 = f32x4::splat(token, LOG2_P0);
844    let p1 = f32x4::splat(token, LOG2_P1);
845    let p2 = f32x4::splat(token, LOG2_P2);
846    let q0 = f32x4::splat(token, LOG2_Q0);
847    let q1 = f32x4::splat(token, LOG2_Q1);
848    let q2 = f32x4::splat(token, LOG2_Q2);
849
850    #[inline(always)]
851    fn fast_log2f_x4(
852        x: f32x4,
853        offset: i32x4,
854        p0: f32x4,
855        p1: f32x4,
856        p2: f32x4,
857        q0: f32x4,
858        q1: f32x4,
859        q2: f32x4,
860        one: f32x4,
861    ) -> f32x4 {
862        let x_bits: i32x4 = x.bitcast_i32x4();
863        let exp_bits = x_bits - offset;
864        let exp_shifted = exp_bits.shr_arithmetic::<23>();
865        let mantissa_bits = x_bits - exp_shifted.shl::<23>();
866        let mantissa = mantissa_bits.bitcast_f32x4();
867        let exp_val = exp_shifted.to_f32x4();
868        let frac = mantissa - one;
869        let num = frac.mul_add(p2, p1).mul_add(frac, p0);
870        let den = frac.mul_add(q2, q1).mul_add(frac, q0);
871        num / den + exp_val
872    }
873
874    let chunks = counts.len() / 4;
875    for chunk in 0..chunks {
876        let base = chunk * 4;
877        let c_i = i32x4::from_slice(token, &counts[base..]);
878        let c_f = c_i.to_f32x4();
879
880        // Compare in f32 space (blend needs f32 masks)
881        let nonzero_mask = c_f.simd_gt(zero_f);
882        let not_total_mask = c_f.simd_ne(total_f_v);
883        let nz_float = f32x4::blend(nonzero_mask, one, zero_f);
884        let nt_float = f32x4::blend(not_total_mask, one, zero_f);
885        let valid_mask = nz_float * nt_float;
886
887        let safe_c = f32x4::blend(nonzero_mask, c_f, one);
888        let prob = safe_c * inv_total_v;
889        let log2_prob = fast_log2f_x4(prob, offset, p0, p1, p2, q0, q1, q2, one);
890
891        let contribution = c_f * log2_prob * valid_mask;
892        acc -= contribution;
893    }
894
895    let mut scalar_sum = 0.0f32;
896    let inv_total = 1.0 / total_count as f32;
897    let total_f = total_count as f32;
898    for &count in &counts[chunks * 4..] {
899        if count > 0 {
900            let c = count as f32;
901            if c != total_f {
902                scalar_sum -= c * fast_log2f(c * inv_total);
903            }
904        }
905    }
906
907    acc.reduce_add() + scalar_sum
908}
909
910#[cfg(test)]
911mod tests {
912    use super::*;
913    extern crate alloc;
914    extern crate std;
915    use alloc::vec;
916    use alloc::vec::Vec;
917
918    /// Verify SIMD matches scalar for pixel-domain mode.
919    #[test]
920    fn test_entropy_coeffs_pixel_domain() {
921        let n = 64;
922        let block_c: Vec<f32> = (0..n).map(|i| (i as f32 * 0.7 - 20.0) * 0.1).collect();
923        let block_y: Vec<f32> = (0..n).map(|i| (i as f32 * 0.5 - 15.0) * 0.1).collect();
924        let weights: Vec<f32> = (0..n).map(|i| 0.01 + (i as f32) * 0.005).collect();
925        let inv_weights: Vec<f32> = weights.iter().map(|&w| 1.0 / w).collect();
926
927        let cmap_factor = 0.15f32;
928        let quant = 3.5f32;
929        let k_cost_delta = 5.335f32;
930        let k_cost2 = 4.463f32;
931
932        // Reference: scalar
933        let mut error_ref = vec![0.0f32; n];
934        let ref_result = entropy_coeffs_scalar(
935            &block_c,
936            &block_y,
937            &weights,
938            &inv_weights,
939            n,
940            cmap_factor,
941            quant,
942            k_cost_delta,
943            k_cost2,
944            true,
945            &mut error_ref,
946        );
947
948        let report = archmage::testing::for_each_token_permutation(
949            archmage::testing::CompileTimePolicy::Warn,
950            |perm| {
951                let mut error_simd = vec![0.0f32; n];
952                let simd_result = entropy_estimate_coeffs(
953                    &block_c,
954                    &block_y,
955                    &weights,
956                    &inv_weights,
957                    n,
958                    cmap_factor,
959                    quant,
960                    k_cost_delta,
961                    k_cost2,
962                    true,
963                    &mut error_simd,
964                );
965                let rel_eps = 0.005;
966                let entropy_rel = (simd_result.entropy_sum - ref_result.entropy_sum).abs()
967                    / ref_result.entropy_sum.abs();
968                assert!(
969                    entropy_rel < rel_eps,
970                    "entropy_sum rel_err={entropy_rel:.4} [{perm}]"
971                );
972                let nz_rel = (simd_result.nzeros_sum - ref_result.nzeros_sum).abs()
973                    / ref_result.nzeros_sum.abs().max(1.0);
974                assert!(nz_rel < 0.05, "nzeros_sum rel_err={nz_rel:.4} [{perm}]");
975                let mut max_err = 0.0f32;
976                for i in 0..n {
977                    max_err = max_err.max((error_simd[i] - error_ref[i]).abs());
978                }
979                assert!(
980                    max_err < 0.5,
981                    "Error coeffs max diff: {max_err:.2e} [{perm}]"
982                );
983            },
984        );
985        std::eprintln!("{report}");
986    }
987
988    /// Verify SIMD matches scalar for coefficient-domain mode.
989    #[test]
990    fn test_entropy_coeffs_coeff_domain() {
991        let n = 64;
992        let block_c: Vec<f32> = (0..n).map(|i| (i as f32 * 1.3 - 40.0) * 0.05).collect();
993        let block_y: Vec<f32> = (0..n).map(|i| (i as f32 * 0.9 - 30.0) * 0.05).collect();
994        let weights: Vec<f32> = (0..n).map(|i| 0.02 + (i as f32) * 0.003).collect();
995        let inv_weights: Vec<f32> = weights.iter().map(|&w| 1.0 / w).collect();
996
997        let cmap_factor = 0.0f32;
998        let quant = 5.0f32;
999        let k_cost_delta = 5.335f32;
1000        let k_cost2 = 4.463f32;
1001
1002        let mut error_ref = vec![0.0f32; n];
1003        let ref_result = entropy_coeffs_scalar(
1004            &block_c,
1005            &block_y,
1006            &weights,
1007            &inv_weights,
1008            n,
1009            cmap_factor,
1010            quant,
1011            k_cost_delta,
1012            k_cost2,
1013            false,
1014            &mut error_ref,
1015        );
1016
1017        let report = archmage::testing::for_each_token_permutation(
1018            archmage::testing::CompileTimePolicy::Warn,
1019            |perm| {
1020                let mut error_simd = vec![0.0f32; n];
1021                let simd_result = entropy_estimate_coeffs(
1022                    &block_c,
1023                    &block_y,
1024                    &weights,
1025                    &inv_weights,
1026                    n,
1027                    cmap_factor,
1028                    quant,
1029                    k_cost_delta,
1030                    k_cost2,
1031                    false,
1032                    &mut error_simd,
1033                );
1034                let rel_eps = 0.005;
1035                let entropy_rel = (simd_result.entropy_sum - ref_result.entropy_sum).abs()
1036                    / ref_result.entropy_sum.abs();
1037                assert!(entropy_rel < rel_eps, "entropy_sum [{perm}]");
1038                let nz_rel = (simd_result.nzeros_sum - ref_result.nzeros_sum).abs()
1039                    / ref_result.nzeros_sum.abs().max(1.0);
1040                assert!(nz_rel < 0.05, "nzeros_sum [{perm}]");
1041                let il_rel = (simd_result.info_loss_sum - ref_result.info_loss_sum).abs()
1042                    / ref_result.info_loss_sum.abs().max(1.0);
1043                assert!(il_rel < rel_eps, "info_loss_sum [{perm}]");
1044                let il2_rel = (simd_result.info_loss2_sum - ref_result.info_loss2_sum).abs()
1045                    / ref_result.info_loss2_sum.abs().max(1.0);
1046                assert!(il2_rel < rel_eps, "info_loss2_sum [{perm}]");
1047            },
1048        );
1049        std::eprintln!("{report}");
1050    }
1051
1052    /// Test with non-multiple-of-8 sizes (remainder handling).
1053    #[test]
1054    fn test_entropy_coeffs_remainder() {
1055        let n = 67;
1056        let block_c: Vec<f32> = (0..n).map(|i| (i as f32) * 0.1 - 3.0).collect();
1057        let block_y: Vec<f32> = (0..n).map(|i| (i as f32) * 0.08 - 2.5).collect();
1058        let weights: Vec<f32> = (0..n).map(|i| 0.01 + (i as f32) * 0.002).collect();
1059        let inv_weights: Vec<f32> = weights.iter().map(|&w| 1.0 / w).collect();
1060
1061        let mut error_ref = vec![0.0f32; n];
1062        let ref_result = entropy_coeffs_scalar(
1063            &block_c,
1064            &block_y,
1065            &weights,
1066            &inv_weights,
1067            n,
1068            0.2,
1069            4.0,
1070            5.335,
1071            4.463,
1072            true,
1073            &mut error_ref,
1074        );
1075
1076        let report = archmage::testing::for_each_token_permutation(
1077            archmage::testing::CompileTimePolicy::Warn,
1078            |perm| {
1079                let mut error_simd = vec![0.0f32; n];
1080                let simd_result = entropy_estimate_coeffs(
1081                    &block_c,
1082                    &block_y,
1083                    &weights,
1084                    &inv_weights,
1085                    n,
1086                    0.2,
1087                    4.0,
1088                    5.335,
1089                    4.463,
1090                    true,
1091                    &mut error_simd,
1092                );
1093                let rel_eps = 0.005;
1094                let entropy_rel = (simd_result.entropy_sum - ref_result.entropy_sum).abs()
1095                    / ref_result.entropy_sum.abs().max(1.0);
1096                assert!(entropy_rel < rel_eps, "entropy_sum [{perm}]");
1097                let nz_rel = (simd_result.nzeros_sum - ref_result.nzeros_sum).abs()
1098                    / ref_result.nzeros_sum.abs().max(1.0);
1099                assert!(nz_rel < 0.05, "nzeros_sum [{perm}]");
1100                let max_err = error_simd
1101                    .iter()
1102                    .zip(error_ref.iter())
1103                    .take(n)
1104                    .map(|(a, b)| (a - b).abs())
1105                    .fold(0.0f32, f32::max);
1106                assert!(
1107                    max_err < 0.01,
1108                    "Error coeffs max diff: {max_err:.2e} [{perm}]"
1109                );
1110            },
1111        );
1112        std::eprintln!("{report}");
1113    }
1114
1115    /// Test with large blocks (DCT64x64 = 4096 coefficients).
1116    #[test]
1117    fn test_entropy_coeffs_large_block() {
1118        let n = 4096;
1119        let block_c: Vec<f32> = (0..n).map(|i| ((i as f32) * 0.01).sin() * 5.0).collect();
1120        let block_y: Vec<f32> = (0..n).map(|i| ((i as f32) * 0.013).cos() * 4.0).collect();
1121        let weights: Vec<f32> = (0..n).map(|i| 0.005 + (i as f32) * 0.001).collect();
1122        let inv_weights: Vec<f32> = weights.iter().map(|&w| 1.0 / w).collect();
1123
1124        let mut error_ref = vec![0.0f32; n];
1125        let ref_result = entropy_coeffs_scalar(
1126            &block_c,
1127            &block_y,
1128            &weights,
1129            &inv_weights,
1130            n,
1131            0.1,
1132            2.0,
1133            5.335,
1134            4.463,
1135            true,
1136            &mut error_ref,
1137        );
1138
1139        let report = archmage::testing::for_each_token_permutation(
1140            archmage::testing::CompileTimePolicy::Warn,
1141            |perm| {
1142                let mut error_simd = vec![0.0f32; n];
1143                let simd_result = entropy_estimate_coeffs(
1144                    &block_c,
1145                    &block_y,
1146                    &weights,
1147                    &inv_weights,
1148                    n,
1149                    0.1,
1150                    2.0,
1151                    5.335,
1152                    4.463,
1153                    true,
1154                    &mut error_simd,
1155                );
1156
1157                // Large block: use relative tolerance
1158                let rel_eps = 0.005;
1159                let entropy_rel = (simd_result.entropy_sum - ref_result.entropy_sum).abs()
1160                    / ref_result.entropy_sum.abs();
1161                assert!(
1162                    entropy_rel < rel_eps,
1163                    "entropy_sum: SIMD={}, ref={}, rel_err={:.4}% [{perm}]",
1164                    simd_result.entropy_sum,
1165                    ref_result.entropy_sum,
1166                    entropy_rel * 100.0
1167                );
1168
1169                let max_err = error_simd
1170                    .iter()
1171                    .zip(error_ref.iter())
1172                    .take(n)
1173                    .map(|(a, b)| (a - b).abs())
1174                    .fold(0.0f32, f32::max);
1175                assert!(
1176                    max_err < 1e-3,
1177                    "Error coeffs max diff: {:.2e} [{perm}]",
1178                    max_err
1179                );
1180            },
1181        );
1182        std::eprintln!("{report}");
1183    }
1184
1185    // =====================================================================
1186    // Shannon entropy tests
1187    // =====================================================================
1188
1189    /// Reference Shannon entropy using f32::log2 (not fast_log2f).
1190    fn reference_shannon_entropy(counts: &[i32], total_count: usize) -> f32 {
1191        if total_count == 0 {
1192            return 0.0;
1193        }
1194        let inv_total = 1.0 / total_count as f32;
1195        let total_f = total_count as f32;
1196        let mut entropy = 0.0f32;
1197        for &count in counts {
1198            if count > 0 {
1199                let c = count as f32;
1200                if c != total_f {
1201                    entropy -= c * (c * inv_total).log2();
1202                }
1203            }
1204        }
1205        entropy
1206    }
1207
1208    #[test]
1209    fn test_shannon_entropy_uniform() {
1210        // Uniform distribution: entropy = n * log2(n) bits total
1211        let counts = [100i32, 100, 100, 100, 0, 0, 0, 0];
1212        let total = 400;
1213        let ref_ent = reference_shannon_entropy(&counts, total);
1214        let simd_ent = shannon_entropy_bits(&counts, total);
1215        let scalar_ent = shannon_entropy_scalar(&counts, total);
1216
1217        // Expected: 400 * log2(4) = 800
1218        assert!((ref_ent - 800.0).abs() < 0.1, "ref = {ref_ent}");
1219        assert!(
1220            (simd_ent - ref_ent).abs() < 0.5,
1221            "simd={simd_ent} ref={ref_ent}"
1222        );
1223        assert!(
1224            (scalar_ent - ref_ent).abs() < 0.5,
1225            "scalar={scalar_ent} ref={ref_ent}"
1226        );
1227    }
1228
1229    #[test]
1230    fn test_shannon_entropy_single_symbol() {
1231        // All counts in one symbol: entropy = 0
1232        let counts = [1000i32, 0, 0, 0, 0, 0, 0, 0];
1233        let total = 1000;
1234        let ent = shannon_entropy_bits(&counts, total);
1235        assert!(ent.abs() < 0.01, "entropy should be 0, got {ent}");
1236    }
1237
1238    #[test]
1239    fn test_shannon_entropy_realistic_histogram() {
1240        // Realistic distribution like AC coefficient magnitudes
1241        let mut counts = alloc::vec![0i32; 64];
1242        counts[0] = 5000; // lots of zeros (but treated as symbol 0)
1243        counts[1] = 2000;
1244        counts[2] = 1000;
1245        counts[3] = 500;
1246        counts[4] = 200;
1247        counts[5] = 100;
1248        counts[6] = 50;
1249        counts[7] = 20;
1250        let total: usize = counts.iter().map(|&c| c as usize).sum();
1251
1252        let ref_ent = reference_shannon_entropy(&counts, total);
1253
1254        let report = archmage::testing::for_each_token_permutation(
1255            archmage::testing::CompileTimePolicy::Warn,
1256            |perm| {
1257                let simd_ent = shannon_entropy_bits(&counts, total);
1258                let rel_err = (simd_ent - ref_ent).abs() / ref_ent.abs().max(1.0);
1259                assert!(
1260                    rel_err < 0.001,
1261                    "Shannon entropy: simd={simd_ent}, ref={ref_ent}, rel_err={rel_err:.4} [{perm}]"
1262                );
1263            },
1264        );
1265        std::eprintln!("{report}");
1266    }
1267
1268    #[test]
1269    fn test_shannon_entropy_large_alphabet() {
1270        // Large alphabet (256 symbols) with Zipf-like distribution
1271        let mut counts = alloc::vec![0i32; 256];
1272        let mut total = 0usize;
1273        for (i, count) in counts.iter_mut().enumerate() {
1274            *count = 10000 / (i as i32 + 1);
1275            total += *count as usize;
1276        }
1277
1278        let ref_ent = reference_shannon_entropy(&counts, total);
1279
1280        let report = archmage::testing::for_each_token_permutation(
1281            archmage::testing::CompileTimePolicy::Warn,
1282            |perm| {
1283                let simd_ent = shannon_entropy_bits(&counts, total);
1284                let rel_err = (simd_ent - ref_ent).abs() / ref_ent.abs().max(1.0);
1285                assert!(
1286                    rel_err < 0.001,
1287                    "Large alphabet: simd={simd_ent}, ref={ref_ent}, rel_err={rel_err:.4} [{perm}]"
1288                );
1289            },
1290        );
1291        std::eprintln!("{report}");
1292    }
1293
1294    #[test]
1295    fn test_shannon_entropy_empty() {
1296        let counts = [0i32; 8];
1297        let ent = shannon_entropy_bits(&counts, 0);
1298        assert_eq!(ent, 0.0);
1299    }
1300
1301    #[test]
1302    fn test_fast_pow2f_accuracy() {
1303        // Test exact powers of 2
1304        assert!((fast_pow2f(0.0) - 1.0).abs() < 1e-5);
1305        assert!((fast_pow2f(1.0) - 2.0).abs() < 1e-4);
1306        assert!((fast_pow2f(3.0) - 8.0).abs() < 1e-3);
1307        assert!((fast_pow2f(-1.0) - 0.5).abs() < 1e-5);
1308
1309        // Test fractional exponents
1310        let val = fast_pow2f(0.5);
1311        let expected = core::f32::consts::SQRT_2;
1312        assert!(
1313            (val - expected).abs() / expected < 5e-7,
1314            "2^0.5: got {val}, expected {expected}"
1315        );
1316    }
1317
1318    #[test]
1319    fn test_fast_powf_accuracy() {
1320        // Test basic powers
1321        let val = fast_powf(2.0, 3.0);
1322        assert!(
1323            (val - 8.0).abs() / 8.0 < 5e-5,
1324            "2^3: got {val}, expected 8.0"
1325        );
1326
1327        // Test sRGB TF: (0.5)^2.4
1328        let base = 0.5f32;
1329        let exact = base.powf(2.4);
1330        let fast = fast_powf(base, 2.4);
1331        assert!(
1332            (fast - exact).abs() / exact < 5e-5,
1333            "0.5^2.4: got {fast}, expected {exact}"
1334        );
1335
1336        // Test ratio^K_POW (the compute_scaled_constants case)
1337        let ratio = 1.5f32;
1338        let exact = ratio.powf(0.337);
1339        let fast = fast_powf(ratio, 0.337);
1340        assert!(
1341            (fast - exact).abs() / exact < 5e-5,
1342            "1.5^0.337: got {fast}, expected {exact}"
1343        );
1344    }
1345}