1#[derive(Debug, Clone, Copy)]
16pub struct EntropyCoeffResult {
17 pub entropy_sum: f32,
19 pub nzeros_sum: f32,
21 pub info_loss_sum: f32,
23 pub info_loss2_sum: f32,
25}
26
27impl EntropyCoeffResult {
28 pub const ZERO: Self = Self {
29 entropy_sum: 0.0,
30 nzeros_sum: 0.0,
31 info_loss_sum: 0.0,
32 info_loss2_sum: 0.0,
33 };
34}
35
36#[inline]
50#[allow(clippy::too_many_arguments)]
51pub fn entropy_estimate_coeffs(
52 block_c: &[f32],
53 block_y: &[f32],
54 weights: &[f32],
55 inv_weights: &[f32],
56 n: usize,
57 cmap_factor: f32,
58 quant: f32,
59 k_cost_delta: f32,
60 k_cost2: f32,
61 pixel_domain: bool,
62 error_coeffs: &mut [f32],
63) -> EntropyCoeffResult {
64 #[cfg(target_arch = "x86_64")]
65 {
66 use archmage::SimdToken;
67 if let Some(token) = archmage::X64V3Token::summon() {
68 return entropy_coeffs_avx2(
69 token,
70 block_c,
71 block_y,
72 weights,
73 inv_weights,
74 n,
75 cmap_factor,
76 quant,
77 k_cost_delta,
78 k_cost2,
79 pixel_domain,
80 error_coeffs,
81 );
82 }
83 }
84
85 #[cfg(target_arch = "aarch64")]
86 {
87 use archmage::SimdToken;
88 if let Some(token) = archmage::NeonToken::summon() {
89 return entropy_coeffs_neon(
90 token,
91 block_c,
92 block_y,
93 weights,
94 inv_weights,
95 n,
96 cmap_factor,
97 quant,
98 k_cost_delta,
99 k_cost2,
100 pixel_domain,
101 error_coeffs,
102 );
103 }
104 }
105
106 #[cfg(target_arch = "wasm32")]
107 {
108 use archmage::SimdToken;
109 if let Some(token) = archmage::Wasm128Token::summon() {
110 return entropy_coeffs_wasm128(
111 token,
112 block_c,
113 block_y,
114 weights,
115 inv_weights,
116 n,
117 cmap_factor,
118 quant,
119 k_cost_delta,
120 k_cost2,
121 pixel_domain,
122 error_coeffs,
123 );
124 }
125 }
126
127 entropy_coeffs_scalar(
128 block_c,
129 block_y,
130 weights,
131 inv_weights,
132 n,
133 cmap_factor,
134 quant,
135 k_cost_delta,
136 k_cost2,
137 pixel_domain,
138 error_coeffs,
139 )
140}
141
142#[inline]
143#[allow(clippy::too_many_arguments)]
144pub fn entropy_coeffs_scalar(
145 block_c: &[f32],
146 block_y: &[f32],
147 weights: &[f32],
148 inv_weights: &[f32],
149 n: usize,
150 cmap_factor: f32,
151 quant: f32,
152 k_cost_delta: f32,
153 k_cost2: f32,
154 pixel_domain: bool,
155 error_coeffs: &mut [f32],
156) -> EntropyCoeffResult {
157 let mut entropy_sum = 0.0f32;
158 let mut nzeros_sum = 0.0f32;
159 let mut info_loss_sum = 0.0f32;
160 let mut info_loss2_sum = 0.0f32;
161
162 for i in 0..n {
163 let val_in = block_c[i];
164 let val_y = block_y[i] * cmap_factor;
165 let val = (val_in - val_y) * inv_weights[i] * quant;
166 let rval = val.round();
167 let diff = val - rval;
168
169 if pixel_domain {
170 error_coeffs[i] = weights[i] * diff;
171 }
172
173 let q = rval.abs();
174 entropy_sum = q.sqrt().mul_add(k_cost_delta, entropy_sum);
175 if q != 0.0 {
176 nzeros_sum += 1.0;
177 }
178
179 if !pixel_domain {
180 let diff_abs = diff.abs();
181 info_loss_sum += diff_abs;
182 info_loss2_sum = diff_abs.mul_add(diff_abs, info_loss2_sum);
183 if q >= 1.5 {
184 entropy_sum += k_cost2;
185 }
186 }
187 }
188
189 EntropyCoeffResult {
190 entropy_sum,
191 nzeros_sum,
192 info_loss_sum,
193 info_loss2_sum,
194 }
195}
196
197#[cfg(target_arch = "x86_64")]
198#[inline]
199#[archmage::arcane]
200#[allow(clippy::too_many_arguments)]
201pub fn entropy_coeffs_avx2(
202 token: archmage::X64V3Token,
203 block_c: &[f32],
204 block_y: &[f32],
205 weights: &[f32],
206 inv_weights: &[f32],
207 n: usize,
208 cmap_factor: f32,
209 quant: f32,
210 k_cost_delta: f32,
211 k_cost2: f32,
212 pixel_domain: bool,
213 error_coeffs: &mut [f32],
214) -> EntropyCoeffResult {
215 use magetypes::simd::f32x8;
216
217 let cmap_v = f32x8::splat(token, cmap_factor);
218 let quant_v = f32x8::splat(token, quant);
219 let cost_delta_v = f32x8::splat(token, k_cost_delta);
220 let cost2_v = f32x8::splat(token, k_cost2);
221 let zero = f32x8::zero(token);
222 let one = f32x8::splat(token, 1.0);
223 let thr_1_5 = f32x8::splat(token, 1.5);
224
225 let mut entropy_acc = f32x8::zero(token);
226 let mut nzeros_acc = f32x8::zero(token);
227 let mut info_loss_acc = f32x8::zero(token);
228 let mut info_loss2_acc = f32x8::zero(token);
229 let mut cost2_acc = f32x8::zero(token);
230
231 let chunks = n / 8;
232 let simd_n = chunks * 8;
233 let block_c_s = &block_c[..simd_n];
234 let block_y_s = &block_y[..simd_n];
235 let weights_s = &weights[..simd_n];
236 let inv_weights_s = &inv_weights[..simd_n];
237 let error_coeffs_s = &mut error_coeffs[..simd_n];
238 for chunk in 0..chunks {
239 let base = chunk * 8;
240
241 let bc = crate::load_f32x8(token, block_c_s, base);
242 let by_v = crate::load_f32x8(token, block_y_s, base);
243 let w = crate::load_f32x8(token, weights_s, base);
244 let iw = crate::load_f32x8(token, inv_weights_s, base);
245
246 let adjusted = bc - by_v * cmap_v;
248 let val = adjusted * iw * quant_v;
249
250 let rval = val.round();
251 let diff = val - rval;
252
253 if pixel_domain {
255 let err = w * diff;
256 crate::store_f32x8(error_coeffs_s, base, err);
257 }
258
259 let q = rval.abs();
261 entropy_acc = q.sqrt().mul_add(cost_delta_v, entropy_acc);
262
263 let nz_mask = q.simd_ne(zero);
265 nzeros_acc += f32x8::blend(nz_mask, one, zero);
266
267 if !pixel_domain {
269 let diff_abs = diff.abs();
270 info_loss_acc += diff_abs;
271 info_loss2_acc = diff_abs.mul_add(diff_abs, info_loss2_acc);
272
273 let ge_mask = q.simd_ge(thr_1_5);
275 cost2_acc += f32x8::blend(ge_mask, cost2_v, zero);
276 }
277 }
278
279 let start = chunks * 8;
281 let remainder = if start < n {
282 entropy_coeffs_scalar(
283 &block_c[start..n],
284 &block_y[start..n],
285 &weights[start..n],
286 &inv_weights[start..n],
287 n - start,
288 cmap_factor,
289 quant,
290 k_cost_delta,
291 k_cost2,
292 pixel_domain,
293 &mut error_coeffs[start..n],
294 )
295 } else {
296 EntropyCoeffResult::ZERO
297 };
298
299 let mut entropy_sum = entropy_acc.reduce_add() + remainder.entropy_sum;
300 if !pixel_domain {
301 entropy_sum += cost2_acc.reduce_add();
302 }
303
304 EntropyCoeffResult {
305 entropy_sum,
306 nzeros_sum: nzeros_acc.reduce_add() + remainder.nzeros_sum,
307 info_loss_sum: info_loss_acc.reduce_add() + remainder.info_loss_sum,
308 info_loss2_sum: info_loss2_acc.reduce_add() + remainder.info_loss2_sum,
309 }
310}
311
312#[cfg(target_arch = "aarch64")]
317#[inline]
318#[archmage::arcane]
319#[allow(clippy::too_many_arguments)]
320pub fn entropy_coeffs_neon(
321 token: archmage::NeonToken,
322 block_c: &[f32],
323 block_y: &[f32],
324 weights: &[f32],
325 inv_weights: &[f32],
326 n: usize,
327 cmap_factor: f32,
328 quant: f32,
329 k_cost_delta: f32,
330 k_cost2: f32,
331 pixel_domain: bool,
332 error_coeffs: &mut [f32],
333) -> EntropyCoeffResult {
334 use magetypes::simd::f32x4;
335
336 let cmap_v = f32x4::splat(token, cmap_factor);
337 let quant_v = f32x4::splat(token, quant);
338 let cost_delta_v = f32x4::splat(token, k_cost_delta);
339 let cost2_v = f32x4::splat(token, k_cost2);
340 let zero = f32x4::zero(token);
341 let one = f32x4::splat(token, 1.0);
342 let thr_1_5 = f32x4::splat(token, 1.5);
343
344 let mut entropy_acc = f32x4::zero(token);
345 let mut nzeros_acc = f32x4::zero(token);
346 let mut info_loss_acc = f32x4::zero(token);
347 let mut info_loss2_acc = f32x4::zero(token);
348 let mut cost2_acc = f32x4::zero(token);
349
350 let chunks = n / 4;
351 let simd_n = chunks * 4;
352 let block_c_s = &block_c[..simd_n];
353 let block_y_s = &block_y[..simd_n];
354 let weights_s = &weights[..simd_n];
355 let inv_weights_s = &inv_weights[..simd_n];
356 for chunk in 0..chunks {
357 let base = chunk * 4;
358
359 let bc = f32x4::from_slice(token, &block_c_s[base..]);
360 let by_v = f32x4::from_slice(token, &block_y_s[base..]);
361 let w = f32x4::from_slice(token, &weights_s[base..]);
362 let iw = f32x4::from_slice(token, &inv_weights_s[base..]);
363
364 let adjusted = bc - by_v * cmap_v;
366 let val = adjusted * iw * quant_v;
367
368 let rval = val.round();
369 let diff = val - rval;
370
371 if pixel_domain {
372 let err = w * diff;
373 let out: &mut [f32; 4] = (&mut error_coeffs[base..base + 4]).try_into().unwrap();
374 err.store(out);
375 }
376
377 let q = rval.abs();
378 entropy_acc = q.sqrt().mul_add(cost_delta_v, entropy_acc);
379
380 let nz_mask = q.simd_ne(zero);
381 nzeros_acc += f32x4::blend(nz_mask, one, zero);
382
383 if !pixel_domain {
384 let diff_abs = diff.abs();
385 info_loss_acc += diff_abs;
386 info_loss2_acc = diff_abs.mul_add(diff_abs, info_loss2_acc);
387
388 let ge_mask = q.simd_ge(thr_1_5);
389 cost2_acc += f32x4::blend(ge_mask, cost2_v, zero);
390 }
391 }
392
393 let start = chunks * 4;
395 let remainder = entropy_coeffs_scalar(
396 &block_c[start..n],
397 &block_y[start..n],
398 &weights[start..n],
399 &inv_weights[start..n],
400 n - start,
401 cmap_factor,
402 quant,
403 k_cost_delta,
404 k_cost2,
405 pixel_domain,
406 &mut error_coeffs[start..n],
407 );
408
409 let mut entropy_sum = entropy_acc.reduce_add() + remainder.entropy_sum;
410 if !pixel_domain {
411 entropy_sum += cost2_acc.reduce_add();
412 }
413
414 EntropyCoeffResult {
415 entropy_sum,
416 nzeros_sum: nzeros_acc.reduce_add() + remainder.nzeros_sum,
417 info_loss_sum: info_loss_acc.reduce_add() + remainder.info_loss_sum,
418 info_loss2_sum: info_loss2_acc.reduce_add() + remainder.info_loss2_sum,
419 }
420}
421
422#[cfg(target_arch = "wasm32")]
427#[inline]
428#[archmage::arcane]
429#[allow(clippy::too_many_arguments)]
430pub fn entropy_coeffs_wasm128(
431 token: archmage::Wasm128Token,
432 block_c: &[f32],
433 block_y: &[f32],
434 weights: &[f32],
435 inv_weights: &[f32],
436 n: usize,
437 cmap_factor: f32,
438 quant: f32,
439 k_cost_delta: f32,
440 k_cost2: f32,
441 pixel_domain: bool,
442 error_coeffs: &mut [f32],
443) -> EntropyCoeffResult {
444 use magetypes::simd::f32x4;
445
446 let cmap_v = f32x4::splat(token, cmap_factor);
447 let quant_v = f32x4::splat(token, quant);
448 let cost_delta_v = f32x4::splat(token, k_cost_delta);
449 let cost2_v = f32x4::splat(token, k_cost2);
450 let zero = f32x4::zero(token);
451 let one = f32x4::splat(token, 1.0);
452 let thr_1_5 = f32x4::splat(token, 1.5);
453
454 let mut entropy_acc = f32x4::zero(token);
455 let mut nzeros_acc = f32x4::zero(token);
456 let mut info_loss_acc = f32x4::zero(token);
457 let mut info_loss2_acc = f32x4::zero(token);
458 let mut cost2_acc = f32x4::zero(token);
459
460 let chunks = n / 4;
461 let simd_n = chunks * 4;
462 let block_c_s = &block_c[..simd_n];
463 let block_y_s = &block_y[..simd_n];
464 let weights_s = &weights[..simd_n];
465 let inv_weights_s = &inv_weights[..simd_n];
466 for chunk in 0..chunks {
467 let base = chunk * 4;
468
469 let bc = f32x4::from_slice(token, &block_c_s[base..]);
470 let by_v = f32x4::from_slice(token, &block_y_s[base..]);
471 let w = f32x4::from_slice(token, &weights_s[base..]);
472 let iw = f32x4::from_slice(token, &inv_weights_s[base..]);
473
474 let adjusted = bc - by_v * cmap_v;
476 let val = adjusted * iw * quant_v;
477
478 let rval = val.round();
479 let diff = val - rval;
480
481 if pixel_domain {
482 let err = w * diff;
483 let out: &mut [f32; 4] = (&mut error_coeffs[base..base + 4]).try_into().unwrap();
484 err.store(out);
485 }
486
487 let q = rval.abs();
488 entropy_acc = q.sqrt().mul_add(cost_delta_v, entropy_acc);
489
490 let nz_mask = q.simd_ne(zero);
491 nzeros_acc += f32x4::blend(nz_mask, one, zero);
492
493 if !pixel_domain {
494 let diff_abs = diff.abs();
495 info_loss_acc += diff_abs;
496 info_loss2_acc = diff_abs.mul_add(diff_abs, info_loss2_acc);
497
498 let ge_mask = q.simd_ge(thr_1_5);
499 cost2_acc += f32x4::blend(ge_mask, cost2_v, zero);
500 }
501 }
502
503 let start = chunks * 4;
505 let remainder = entropy_coeffs_scalar(
506 &block_c[start..n],
507 &block_y[start..n],
508 &weights[start..n],
509 &inv_weights[start..n],
510 n - start,
511 cmap_factor,
512 quant,
513 k_cost_delta,
514 k_cost2,
515 pixel_domain,
516 &mut error_coeffs[start..n],
517 );
518
519 let mut entropy_sum = entropy_acc.reduce_add() + remainder.entropy_sum;
520 if !pixel_domain {
521 entropy_sum += cost2_acc.reduce_add();
522 }
523
524 EntropyCoeffResult {
525 entropy_sum,
526 nzeros_sum: nzeros_acc.reduce_add() + remainder.nzeros_sum,
527 info_loss_sum: info_loss_acc.reduce_add() + remainder.info_loss_sum,
528 info_loss2_sum: info_loss2_acc.reduce_add() + remainder.info_loss2_sum,
529 }
530}
531
532const LOG2_P0: f32 = -1.850_383_3e-6;
539const LOG2_P1: f32 = 1.428_716;
540const LOG2_P2: f32 = 0.742_458_7;
541const LOG2_Q0: f32 = 0.990_328_14;
542const LOG2_Q1: f32 = 1.009_671_9;
543const LOG2_Q2: f32 = 0.174_093_43;
544
545#[inline(always)]
550pub fn fast_log2f(x: f32) -> f32 {
551 let x_bits = x.to_bits() as i32;
552 let exp_bits = x_bits.wrapping_sub(0x3f2a_aaab_u32 as i32);
553 let exp_shifted = exp_bits >> 23;
554 let mantissa = f32::from_bits((x_bits.wrapping_sub(exp_shifted << 23)) as u32);
555 let exp_val = exp_shifted as f32;
556 let frac = mantissa - 1.0;
557 let num = LOG2_P0 + frac * (LOG2_P1 + frac * LOG2_P2);
558 let den = LOG2_Q0 + frac * (LOG2_Q1 + frac * LOG2_Q2);
559 num / den + exp_val
560}
561
562#[inline(always)]
568#[allow(clippy::excessive_precision)]
569pub fn fast_pow2f(x: f32) -> f32 {
570 let floorx = x.floor();
571 let exp = f32::from_bits(((floorx as i32 + 127) << 23) as u32);
573 let frac = x - floorx;
574 let mut num = frac + 1.01749063e+01;
578 num = num * frac + 4.88687798e+01;
579 num = num * frac + 9.85506591e+01;
580 num *= exp;
581 let mut den = frac * 2.10242958e-01 + (-2.22328856e-02);
583 den = den * frac + (-1.94414990e+01);
584 den = den * frac + 9.85506633e+01;
585 num / den
586}
587
588#[inline(always)]
594pub fn fast_powf(base: f32, exponent: f32) -> f32 {
595 fast_pow2f(fast_log2f(base) * exponent)
596}
597
598#[inline]
605pub fn shannon_entropy_bits(counts: &[i32], total_count: usize) -> f32 {
606 if total_count == 0 {
607 return 0.0;
608 }
609
610 #[cfg(target_arch = "x86_64")]
611 {
612 use archmage::SimdToken;
613 if let Some(token) = archmage::X64V3Token::summon() {
614 return shannon_entropy_avx2(token, counts, total_count);
615 }
616 }
617
618 #[cfg(target_arch = "aarch64")]
619 {
620 use archmage::SimdToken;
621 if let Some(token) = archmage::NeonToken::summon() {
622 return shannon_entropy_neon(token, counts, total_count);
623 }
624 }
625
626 #[cfg(target_arch = "wasm32")]
627 {
628 use archmage::SimdToken;
629 if let Some(token) = archmage::Wasm128Token::summon() {
630 return shannon_entropy_wasm128(token, counts, total_count);
631 }
632 }
633
634 shannon_entropy_scalar(counts, total_count)
635}
636
637#[inline]
639pub fn shannon_entropy_scalar(counts: &[i32], total_count: usize) -> f32 {
640 let inv_total = 1.0 / total_count as f32;
641 let total_f = total_count as f32;
642 let mut entropy = 0.0f32;
643
644 for &count in counts {
645 if count > 0 {
646 let c = count as f32;
647 if c != total_f {
648 entropy -= c * fast_log2f(c * inv_total);
649 }
650 }
651 }
652
653 entropy
654}
655
656#[cfg(target_arch = "x86_64")]
657#[inline]
658#[archmage::arcane]
659pub fn shannon_entropy_avx2(
660 token: archmage::X64V3Token,
661 counts: &[i32],
662 total_count: usize,
663) -> f32 {
664 use magetypes::simd::{f32x8, i32x8};
665
666 let inv_total_v = f32x8::splat(token, 1.0 / total_count as f32);
667 let total_f_v = f32x8::splat(token, total_count as f32);
668 let zero_f = f32x8::zero(token);
669 let mut acc = f32x8::zero(token);
670
671 let offset = i32x8::splat(token, 0x3f2a_aaab_u32 as i32);
673 let one = f32x8::splat(token, 1.0);
674 let p0 = f32x8::splat(token, LOG2_P0);
675 let p1 = f32x8::splat(token, LOG2_P1);
676 let p2 = f32x8::splat(token, LOG2_P2);
677 let q0 = f32x8::splat(token, LOG2_Q0);
678 let q1 = f32x8::splat(token, LOG2_Q1);
679 let q2 = f32x8::splat(token, LOG2_Q2);
680
681 #[allow(clippy::too_many_arguments)]
682 #[inline(always)]
683 fn fast_log2f_x8(
684 x: f32x8,
685 offset: i32x8,
686 p0: f32x8,
687 p1: f32x8,
688 p2: f32x8,
689 q0: f32x8,
690 q1: f32x8,
691 q2: f32x8,
692 one: f32x8,
693 ) -> f32x8 {
694 let x_bits: i32x8 = x.bitcast_i32x8();
695 let exp_bits = x_bits - offset;
696 let exp_shifted = exp_bits.shr_arithmetic::<23>();
697 let mantissa_bits = x_bits - exp_shifted.shl::<23>();
698 let mantissa = mantissa_bits.bitcast_f32x8();
699 let exp_val = exp_shifted.to_f32x8();
700 let frac = mantissa - one;
701 let num = frac.mul_add(p2, p1).mul_add(frac, p0);
702 let den = frac.mul_add(q2, q1).mul_add(frac, q0);
703 num / den + exp_val
704 }
705
706 let chunks = counts.len() / 8;
707 for chunk in 0..chunks {
708 let base = chunk * 8;
709 let c_i = i32x8::from_slice(token, &counts[base..]);
710 let c_f = c_i.to_f32x8();
711
712 let nonzero_mask = c_f.simd_gt(zero_f);
714 let not_total_mask = c_f.simd_ne(total_f_v);
715 let nz_float = f32x8::blend(nonzero_mask, one, zero_f);
717 let nt_float = f32x8::blend(not_total_mask, one, zero_f);
718 let valid_mask = nz_float * nt_float; let safe_c = f32x8::blend(nonzero_mask, c_f, one);
722 let prob = safe_c * inv_total_v;
723 let log2_prob = fast_log2f_x8(prob, offset, p0, p1, p2, q0, q1, q2, one);
724
725 let contribution = c_f * log2_prob * valid_mask;
727 acc -= contribution;
728 }
729
730 let mut scalar_sum = 0.0f32;
732 let inv_total = 1.0 / total_count as f32;
733 let total_f = total_count as f32;
734 for &count in &counts[chunks * 8..] {
735 if count > 0 {
736 let c = count as f32;
737 if c != total_f {
738 scalar_sum -= c * fast_log2f(c * inv_total);
739 }
740 }
741 }
742
743 acc.reduce_add() + scalar_sum
744}
745
746#[cfg(target_arch = "aarch64")]
747#[inline]
748#[archmage::arcane]
749pub fn shannon_entropy_neon(token: archmage::NeonToken, counts: &[i32], total_count: usize) -> f32 {
750 use magetypes::simd::{f32x4, i32x4};
751
752 let inv_total_v = f32x4::splat(token, 1.0 / total_count as f32);
753 let total_f_v = f32x4::splat(token, total_count as f32);
754 let zero_f = f32x4::zero(token);
755 let mut acc = f32x4::zero(token);
756
757 let offset = i32x4::splat(token, 0x3f2a_aaab_u32 as i32);
758 let one = f32x4::splat(token, 1.0);
759 let p0 = f32x4::splat(token, LOG2_P0);
760 let p1 = f32x4::splat(token, LOG2_P1);
761 let p2 = f32x4::splat(token, LOG2_P2);
762 let q0 = f32x4::splat(token, LOG2_Q0);
763 let q1 = f32x4::splat(token, LOG2_Q1);
764 let q2 = f32x4::splat(token, LOG2_Q2);
765
766 #[inline(always)]
767 fn fast_log2f_x4(
768 x: f32x4,
769 offset: i32x4,
770 p0: f32x4,
771 p1: f32x4,
772 p2: f32x4,
773 q0: f32x4,
774 q1: f32x4,
775 q2: f32x4,
776 one: f32x4,
777 ) -> f32x4 {
778 let x_bits: i32x4 = x.bitcast_i32x4();
779 let exp_bits = x_bits - offset;
780 let exp_shifted = exp_bits.shr_arithmetic::<23>();
781 let mantissa_bits = x_bits - exp_shifted.shl::<23>();
782 let mantissa = mantissa_bits.bitcast_f32x4();
783 let exp_val = exp_shifted.to_f32x4();
784 let frac = mantissa - one;
785 let num = frac.mul_add(p2, p1).mul_add(frac, p0);
786 let den = frac.mul_add(q2, q1).mul_add(frac, q0);
787 num / den + exp_val
788 }
789
790 let chunks = counts.len() / 4;
791 for chunk in 0..chunks {
792 let base = chunk * 4;
793 let c_i = i32x4::from_slice(token, &counts[base..]);
794 let c_f = c_i.to_f32x4();
795
796 let nonzero_mask = c_f.simd_gt(zero_f);
798 let not_total_mask = c_f.simd_ne(total_f_v);
799 let nz_float = f32x4::blend(nonzero_mask, one, zero_f);
800 let nt_float = f32x4::blend(not_total_mask, one, zero_f);
801 let valid_mask = nz_float * nt_float;
802
803 let safe_c = f32x4::blend(nonzero_mask, c_f, one);
804 let prob = safe_c * inv_total_v;
805 let log2_prob = fast_log2f_x4(prob, offset, p0, p1, p2, q0, q1, q2, one);
806
807 let contribution = c_f * log2_prob * valid_mask;
808 acc -= contribution;
809 }
810
811 let mut scalar_sum = 0.0f32;
812 let inv_total = 1.0 / total_count as f32;
813 let total_f = total_count as f32;
814 for &count in &counts[chunks * 4..] {
815 if count > 0 {
816 let c = count as f32;
817 if c != total_f {
818 scalar_sum -= c * fast_log2f(c * inv_total);
819 }
820 }
821 }
822
823 acc.reduce_add() + scalar_sum
824}
825
826#[cfg(target_arch = "wasm32")]
827#[inline]
828#[archmage::arcane]
829pub fn shannon_entropy_wasm128(
830 token: archmage::Wasm128Token,
831 counts: &[i32],
832 total_count: usize,
833) -> f32 {
834 use magetypes::simd::{f32x4, i32x4};
835
836 let inv_total_v = f32x4::splat(token, 1.0 / total_count as f32);
837 let total_f_v = f32x4::splat(token, total_count as f32);
838 let zero_f = f32x4::zero(token);
839 let mut acc = f32x4::zero(token);
840
841 let offset = i32x4::splat(token, 0x3f2a_aaab_u32 as i32);
842 let one = f32x4::splat(token, 1.0);
843 let p0 = f32x4::splat(token, LOG2_P0);
844 let p1 = f32x4::splat(token, LOG2_P1);
845 let p2 = f32x4::splat(token, LOG2_P2);
846 let q0 = f32x4::splat(token, LOG2_Q0);
847 let q1 = f32x4::splat(token, LOG2_Q1);
848 let q2 = f32x4::splat(token, LOG2_Q2);
849
850 #[inline(always)]
851 fn fast_log2f_x4(
852 x: f32x4,
853 offset: i32x4,
854 p0: f32x4,
855 p1: f32x4,
856 p2: f32x4,
857 q0: f32x4,
858 q1: f32x4,
859 q2: f32x4,
860 one: f32x4,
861 ) -> f32x4 {
862 let x_bits: i32x4 = x.bitcast_i32x4();
863 let exp_bits = x_bits - offset;
864 let exp_shifted = exp_bits.shr_arithmetic::<23>();
865 let mantissa_bits = x_bits - exp_shifted.shl::<23>();
866 let mantissa = mantissa_bits.bitcast_f32x4();
867 let exp_val = exp_shifted.to_f32x4();
868 let frac = mantissa - one;
869 let num = frac.mul_add(p2, p1).mul_add(frac, p0);
870 let den = frac.mul_add(q2, q1).mul_add(frac, q0);
871 num / den + exp_val
872 }
873
874 let chunks = counts.len() / 4;
875 for chunk in 0..chunks {
876 let base = chunk * 4;
877 let c_i = i32x4::from_slice(token, &counts[base..]);
878 let c_f = c_i.to_f32x4();
879
880 let nonzero_mask = c_f.simd_gt(zero_f);
882 let not_total_mask = c_f.simd_ne(total_f_v);
883 let nz_float = f32x4::blend(nonzero_mask, one, zero_f);
884 let nt_float = f32x4::blend(not_total_mask, one, zero_f);
885 let valid_mask = nz_float * nt_float;
886
887 let safe_c = f32x4::blend(nonzero_mask, c_f, one);
888 let prob = safe_c * inv_total_v;
889 let log2_prob = fast_log2f_x4(prob, offset, p0, p1, p2, q0, q1, q2, one);
890
891 let contribution = c_f * log2_prob * valid_mask;
892 acc -= contribution;
893 }
894
895 let mut scalar_sum = 0.0f32;
896 let inv_total = 1.0 / total_count as f32;
897 let total_f = total_count as f32;
898 for &count in &counts[chunks * 4..] {
899 if count > 0 {
900 let c = count as f32;
901 if c != total_f {
902 scalar_sum -= c * fast_log2f(c * inv_total);
903 }
904 }
905 }
906
907 acc.reduce_add() + scalar_sum
908}
909
910#[cfg(test)]
911mod tests {
912 use super::*;
913 extern crate alloc;
914 extern crate std;
915 use alloc::vec;
916 use alloc::vec::Vec;
917
918 #[test]
920 fn test_entropy_coeffs_pixel_domain() {
921 let n = 64;
922 let block_c: Vec<f32> = (0..n).map(|i| (i as f32 * 0.7 - 20.0) * 0.1).collect();
923 let block_y: Vec<f32> = (0..n).map(|i| (i as f32 * 0.5 - 15.0) * 0.1).collect();
924 let weights: Vec<f32> = (0..n).map(|i| 0.01 + (i as f32) * 0.005).collect();
925 let inv_weights: Vec<f32> = weights.iter().map(|&w| 1.0 / w).collect();
926
927 let cmap_factor = 0.15f32;
928 let quant = 3.5f32;
929 let k_cost_delta = 5.335f32;
930 let k_cost2 = 4.463f32;
931
932 let mut error_ref = vec![0.0f32; n];
934 let ref_result = entropy_coeffs_scalar(
935 &block_c,
936 &block_y,
937 &weights,
938 &inv_weights,
939 n,
940 cmap_factor,
941 quant,
942 k_cost_delta,
943 k_cost2,
944 true,
945 &mut error_ref,
946 );
947
948 let report = archmage::testing::for_each_token_permutation(
949 archmage::testing::CompileTimePolicy::Warn,
950 |perm| {
951 let mut error_simd = vec![0.0f32; n];
952 let simd_result = entropy_estimate_coeffs(
953 &block_c,
954 &block_y,
955 &weights,
956 &inv_weights,
957 n,
958 cmap_factor,
959 quant,
960 k_cost_delta,
961 k_cost2,
962 true,
963 &mut error_simd,
964 );
965 let rel_eps = 0.005;
966 let entropy_rel = (simd_result.entropy_sum - ref_result.entropy_sum).abs()
967 / ref_result.entropy_sum.abs();
968 assert!(
969 entropy_rel < rel_eps,
970 "entropy_sum rel_err={entropy_rel:.4} [{perm}]"
971 );
972 let nz_rel = (simd_result.nzeros_sum - ref_result.nzeros_sum).abs()
973 / ref_result.nzeros_sum.abs().max(1.0);
974 assert!(nz_rel < 0.05, "nzeros_sum rel_err={nz_rel:.4} [{perm}]");
975 let mut max_err = 0.0f32;
976 for i in 0..n {
977 max_err = max_err.max((error_simd[i] - error_ref[i]).abs());
978 }
979 assert!(
980 max_err < 0.5,
981 "Error coeffs max diff: {max_err:.2e} [{perm}]"
982 );
983 },
984 );
985 std::eprintln!("{report}");
986 }
987
988 #[test]
990 fn test_entropy_coeffs_coeff_domain() {
991 let n = 64;
992 let block_c: Vec<f32> = (0..n).map(|i| (i as f32 * 1.3 - 40.0) * 0.05).collect();
993 let block_y: Vec<f32> = (0..n).map(|i| (i as f32 * 0.9 - 30.0) * 0.05).collect();
994 let weights: Vec<f32> = (0..n).map(|i| 0.02 + (i as f32) * 0.003).collect();
995 let inv_weights: Vec<f32> = weights.iter().map(|&w| 1.0 / w).collect();
996
997 let cmap_factor = 0.0f32;
998 let quant = 5.0f32;
999 let k_cost_delta = 5.335f32;
1000 let k_cost2 = 4.463f32;
1001
1002 let mut error_ref = vec![0.0f32; n];
1003 let ref_result = entropy_coeffs_scalar(
1004 &block_c,
1005 &block_y,
1006 &weights,
1007 &inv_weights,
1008 n,
1009 cmap_factor,
1010 quant,
1011 k_cost_delta,
1012 k_cost2,
1013 false,
1014 &mut error_ref,
1015 );
1016
1017 let report = archmage::testing::for_each_token_permutation(
1018 archmage::testing::CompileTimePolicy::Warn,
1019 |perm| {
1020 let mut error_simd = vec![0.0f32; n];
1021 let simd_result = entropy_estimate_coeffs(
1022 &block_c,
1023 &block_y,
1024 &weights,
1025 &inv_weights,
1026 n,
1027 cmap_factor,
1028 quant,
1029 k_cost_delta,
1030 k_cost2,
1031 false,
1032 &mut error_simd,
1033 );
1034 let rel_eps = 0.005;
1035 let entropy_rel = (simd_result.entropy_sum - ref_result.entropy_sum).abs()
1036 / ref_result.entropy_sum.abs();
1037 assert!(entropy_rel < rel_eps, "entropy_sum [{perm}]");
1038 let nz_rel = (simd_result.nzeros_sum - ref_result.nzeros_sum).abs()
1039 / ref_result.nzeros_sum.abs().max(1.0);
1040 assert!(nz_rel < 0.05, "nzeros_sum [{perm}]");
1041 let il_rel = (simd_result.info_loss_sum - ref_result.info_loss_sum).abs()
1042 / ref_result.info_loss_sum.abs().max(1.0);
1043 assert!(il_rel < rel_eps, "info_loss_sum [{perm}]");
1044 let il2_rel = (simd_result.info_loss2_sum - ref_result.info_loss2_sum).abs()
1045 / ref_result.info_loss2_sum.abs().max(1.0);
1046 assert!(il2_rel < rel_eps, "info_loss2_sum [{perm}]");
1047 },
1048 );
1049 std::eprintln!("{report}");
1050 }
1051
1052 #[test]
1054 fn test_entropy_coeffs_remainder() {
1055 let n = 67;
1056 let block_c: Vec<f32> = (0..n).map(|i| (i as f32) * 0.1 - 3.0).collect();
1057 let block_y: Vec<f32> = (0..n).map(|i| (i as f32) * 0.08 - 2.5).collect();
1058 let weights: Vec<f32> = (0..n).map(|i| 0.01 + (i as f32) * 0.002).collect();
1059 let inv_weights: Vec<f32> = weights.iter().map(|&w| 1.0 / w).collect();
1060
1061 let mut error_ref = vec![0.0f32; n];
1062 let ref_result = entropy_coeffs_scalar(
1063 &block_c,
1064 &block_y,
1065 &weights,
1066 &inv_weights,
1067 n,
1068 0.2,
1069 4.0,
1070 5.335,
1071 4.463,
1072 true,
1073 &mut error_ref,
1074 );
1075
1076 let report = archmage::testing::for_each_token_permutation(
1077 archmage::testing::CompileTimePolicy::Warn,
1078 |perm| {
1079 let mut error_simd = vec![0.0f32; n];
1080 let simd_result = entropy_estimate_coeffs(
1081 &block_c,
1082 &block_y,
1083 &weights,
1084 &inv_weights,
1085 n,
1086 0.2,
1087 4.0,
1088 5.335,
1089 4.463,
1090 true,
1091 &mut error_simd,
1092 );
1093 let rel_eps = 0.005;
1094 let entropy_rel = (simd_result.entropy_sum - ref_result.entropy_sum).abs()
1095 / ref_result.entropy_sum.abs().max(1.0);
1096 assert!(entropy_rel < rel_eps, "entropy_sum [{perm}]");
1097 let nz_rel = (simd_result.nzeros_sum - ref_result.nzeros_sum).abs()
1098 / ref_result.nzeros_sum.abs().max(1.0);
1099 assert!(nz_rel < 0.05, "nzeros_sum [{perm}]");
1100 let max_err = error_simd
1101 .iter()
1102 .zip(error_ref.iter())
1103 .take(n)
1104 .map(|(a, b)| (a - b).abs())
1105 .fold(0.0f32, f32::max);
1106 assert!(
1107 max_err < 0.01,
1108 "Error coeffs max diff: {max_err:.2e} [{perm}]"
1109 );
1110 },
1111 );
1112 std::eprintln!("{report}");
1113 }
1114
1115 #[test]
1117 fn test_entropy_coeffs_large_block() {
1118 let n = 4096;
1119 let block_c: Vec<f32> = (0..n).map(|i| ((i as f32) * 0.01).sin() * 5.0).collect();
1120 let block_y: Vec<f32> = (0..n).map(|i| ((i as f32) * 0.013).cos() * 4.0).collect();
1121 let weights: Vec<f32> = (0..n).map(|i| 0.005 + (i as f32) * 0.001).collect();
1122 let inv_weights: Vec<f32> = weights.iter().map(|&w| 1.0 / w).collect();
1123
1124 let mut error_ref = vec![0.0f32; n];
1125 let ref_result = entropy_coeffs_scalar(
1126 &block_c,
1127 &block_y,
1128 &weights,
1129 &inv_weights,
1130 n,
1131 0.1,
1132 2.0,
1133 5.335,
1134 4.463,
1135 true,
1136 &mut error_ref,
1137 );
1138
1139 let report = archmage::testing::for_each_token_permutation(
1140 archmage::testing::CompileTimePolicy::Warn,
1141 |perm| {
1142 let mut error_simd = vec![0.0f32; n];
1143 let simd_result = entropy_estimate_coeffs(
1144 &block_c,
1145 &block_y,
1146 &weights,
1147 &inv_weights,
1148 n,
1149 0.1,
1150 2.0,
1151 5.335,
1152 4.463,
1153 true,
1154 &mut error_simd,
1155 );
1156
1157 let rel_eps = 0.005;
1159 let entropy_rel = (simd_result.entropy_sum - ref_result.entropy_sum).abs()
1160 / ref_result.entropy_sum.abs();
1161 assert!(
1162 entropy_rel < rel_eps,
1163 "entropy_sum: SIMD={}, ref={}, rel_err={:.4}% [{perm}]",
1164 simd_result.entropy_sum,
1165 ref_result.entropy_sum,
1166 entropy_rel * 100.0
1167 );
1168
1169 let max_err = error_simd
1170 .iter()
1171 .zip(error_ref.iter())
1172 .take(n)
1173 .map(|(a, b)| (a - b).abs())
1174 .fold(0.0f32, f32::max);
1175 assert!(
1176 max_err < 1e-3,
1177 "Error coeffs max diff: {:.2e} [{perm}]",
1178 max_err
1179 );
1180 },
1181 );
1182 std::eprintln!("{report}");
1183 }
1184
1185 fn reference_shannon_entropy(counts: &[i32], total_count: usize) -> f32 {
1191 if total_count == 0 {
1192 return 0.0;
1193 }
1194 let inv_total = 1.0 / total_count as f32;
1195 let total_f = total_count as f32;
1196 let mut entropy = 0.0f32;
1197 for &count in counts {
1198 if count > 0 {
1199 let c = count as f32;
1200 if c != total_f {
1201 entropy -= c * (c * inv_total).log2();
1202 }
1203 }
1204 }
1205 entropy
1206 }
1207
1208 #[test]
1209 fn test_shannon_entropy_uniform() {
1210 let counts = [100i32, 100, 100, 100, 0, 0, 0, 0];
1212 let total = 400;
1213 let ref_ent = reference_shannon_entropy(&counts, total);
1214 let simd_ent = shannon_entropy_bits(&counts, total);
1215 let scalar_ent = shannon_entropy_scalar(&counts, total);
1216
1217 assert!((ref_ent - 800.0).abs() < 0.1, "ref = {ref_ent}");
1219 assert!(
1220 (simd_ent - ref_ent).abs() < 0.5,
1221 "simd={simd_ent} ref={ref_ent}"
1222 );
1223 assert!(
1224 (scalar_ent - ref_ent).abs() < 0.5,
1225 "scalar={scalar_ent} ref={ref_ent}"
1226 );
1227 }
1228
1229 #[test]
1230 fn test_shannon_entropy_single_symbol() {
1231 let counts = [1000i32, 0, 0, 0, 0, 0, 0, 0];
1233 let total = 1000;
1234 let ent = shannon_entropy_bits(&counts, total);
1235 assert!(ent.abs() < 0.01, "entropy should be 0, got {ent}");
1236 }
1237
1238 #[test]
1239 fn test_shannon_entropy_realistic_histogram() {
1240 let mut counts = alloc::vec![0i32; 64];
1242 counts[0] = 5000; counts[1] = 2000;
1244 counts[2] = 1000;
1245 counts[3] = 500;
1246 counts[4] = 200;
1247 counts[5] = 100;
1248 counts[6] = 50;
1249 counts[7] = 20;
1250 let total: usize = counts.iter().map(|&c| c as usize).sum();
1251
1252 let ref_ent = reference_shannon_entropy(&counts, total);
1253
1254 let report = archmage::testing::for_each_token_permutation(
1255 archmage::testing::CompileTimePolicy::Warn,
1256 |perm| {
1257 let simd_ent = shannon_entropy_bits(&counts, total);
1258 let rel_err = (simd_ent - ref_ent).abs() / ref_ent.abs().max(1.0);
1259 assert!(
1260 rel_err < 0.001,
1261 "Shannon entropy: simd={simd_ent}, ref={ref_ent}, rel_err={rel_err:.4} [{perm}]"
1262 );
1263 },
1264 );
1265 std::eprintln!("{report}");
1266 }
1267
1268 #[test]
1269 fn test_shannon_entropy_large_alphabet() {
1270 let mut counts = alloc::vec![0i32; 256];
1272 let mut total = 0usize;
1273 for (i, count) in counts.iter_mut().enumerate() {
1274 *count = 10000 / (i as i32 + 1);
1275 total += *count as usize;
1276 }
1277
1278 let ref_ent = reference_shannon_entropy(&counts, total);
1279
1280 let report = archmage::testing::for_each_token_permutation(
1281 archmage::testing::CompileTimePolicy::Warn,
1282 |perm| {
1283 let simd_ent = shannon_entropy_bits(&counts, total);
1284 let rel_err = (simd_ent - ref_ent).abs() / ref_ent.abs().max(1.0);
1285 assert!(
1286 rel_err < 0.001,
1287 "Large alphabet: simd={simd_ent}, ref={ref_ent}, rel_err={rel_err:.4} [{perm}]"
1288 );
1289 },
1290 );
1291 std::eprintln!("{report}");
1292 }
1293
1294 #[test]
1295 fn test_shannon_entropy_empty() {
1296 let counts = [0i32; 8];
1297 let ent = shannon_entropy_bits(&counts, 0);
1298 assert_eq!(ent, 0.0);
1299 }
1300
1301 #[test]
1302 fn test_fast_pow2f_accuracy() {
1303 assert!((fast_pow2f(0.0) - 1.0).abs() < 1e-5);
1305 assert!((fast_pow2f(1.0) - 2.0).abs() < 1e-4);
1306 assert!((fast_pow2f(3.0) - 8.0).abs() < 1e-3);
1307 assert!((fast_pow2f(-1.0) - 0.5).abs() < 1e-5);
1308
1309 let val = fast_pow2f(0.5);
1311 let expected = core::f32::consts::SQRT_2;
1312 assert!(
1313 (val - expected).abs() / expected < 5e-7,
1314 "2^0.5: got {val}, expected {expected}"
1315 );
1316 }
1317
1318 #[test]
1319 fn test_fast_powf_accuracy() {
1320 let val = fast_powf(2.0, 3.0);
1322 assert!(
1323 (val - 8.0).abs() / 8.0 < 5e-5,
1324 "2^3: got {val}, expected 8.0"
1325 );
1326
1327 let base = 0.5f32;
1329 let exact = base.powf(2.4);
1330 let fast = fast_powf(base, 2.4);
1331 assert!(
1332 (fast - exact).abs() / exact < 5e-5,
1333 "0.5^2.4: got {fast}, expected {exact}"
1334 );
1335
1336 let ratio = 1.5f32;
1338 let exact = ratio.powf(0.337);
1339 let fast = fast_powf(ratio, 0.337);
1340 assert!(
1341 (fast - exact).abs() / exact < 5e-5,
1342 "1.5^0.337: got {fast}, expected {exact}"
1343 );
1344 }
1345}