/// Sums produced by one pass of the entropy-estimation kernels over a block
/// of coefficients.
///
/// All fields are plain `f32` accumulators; a kernel that covers only part of
/// a block can have its result added field by field to another partial result.
#[derive(Debug, Clone, Copy)]
pub struct EntropyCoeffResult {
    /// Accumulated `sqrt(|q|) * k_cost_delta` over all coefficients, where `q`
    /// is the rounded quantized value. In coefficient-domain mode it also
    /// contains `k_cost2` for every coefficient with `|q| >= 1.5`.
    pub entropy_sum: f32,
    /// Number of coefficients whose rounded quantized value is nonzero,
    /// kept as a float.
    pub nzeros_sum: f32,
    /// Sum of the absolute rounding errors `|val - round(val)|`.
    /// Only accumulated in coefficient-domain mode; `0.0` otherwise.
    pub info_loss_sum: f32,
    /// Sum of the squared rounding errors.
    /// Only accumulated in coefficient-domain mode; `0.0` otherwise.
    pub info_loss2_sum: f32,
}
26
27#[inline]
38#[allow(clippy::too_many_arguments)]
39pub fn entropy_estimate_coeffs(
40 block_c: &[f32],
41 block_y: &[f32],
42 weights: &[f32],
43 n: usize,
44 cmap_factor: f32,
45 quant: f32,
46 k_cost_delta: f32,
47 k_cost2: f32,
48 pixel_domain: bool,
49 error_coeffs: &mut [f32],
50) -> EntropyCoeffResult {
51 #[cfg(target_arch = "x86_64")]
52 {
53 use archmage::SimdToken;
54 if let Some(token) = archmage::X64V3Token::summon() {
55 return entropy_coeffs_avx2(
56 token,
57 block_c,
58 block_y,
59 weights,
60 n,
61 cmap_factor,
62 quant,
63 k_cost_delta,
64 k_cost2,
65 pixel_domain,
66 error_coeffs,
67 );
68 }
69 }
70
71 #[cfg(target_arch = "aarch64")]
72 {
73 use archmage::SimdToken;
74 if let Some(token) = archmage::NeonToken::summon() {
75 return entropy_coeffs_neon(
76 token,
77 block_c,
78 block_y,
79 weights,
80 n,
81 cmap_factor,
82 quant,
83 k_cost_delta,
84 k_cost2,
85 pixel_domain,
86 error_coeffs,
87 );
88 }
89 }
90
91 entropy_coeffs_scalar(
92 block_c,
93 block_y,
94 weights,
95 n,
96 cmap_factor,
97 quant,
98 k_cost_delta,
99 k_cost2,
100 pixel_domain,
101 error_coeffs,
102 )
103}
104
105#[inline]
106#[allow(clippy::too_many_arguments)]
107pub fn entropy_coeffs_scalar(
108 block_c: &[f32],
109 block_y: &[f32],
110 weights: &[f32],
111 n: usize,
112 cmap_factor: f32,
113 quant: f32,
114 k_cost_delta: f32,
115 k_cost2: f32,
116 pixel_domain: bool,
117 error_coeffs: &mut [f32],
118) -> EntropyCoeffResult {
119 let mut entropy_sum = 0.0f32;
120 let mut nzeros_sum = 0.0f32;
121 let mut info_loss_sum = 0.0f32;
122 let mut info_loss2_sum = 0.0f32;
123
124 for i in 0..n {
125 let val_in = block_c[i];
126 let val_y = block_y[i] * cmap_factor;
127 let val = (val_in - val_y) * (1.0 / weights[i]) * quant;
128 let rval = val.round();
129 let diff = val - rval;
130
131 if pixel_domain {
132 error_coeffs[i] = weights[i] * diff;
133 }
134
135 let q = rval.abs();
136 entropy_sum += q.sqrt() * k_cost_delta;
137 if q != 0.0 {
138 nzeros_sum += 1.0;
139 }
140
141 if !pixel_domain {
142 let diff_abs = diff.abs();
143 info_loss_sum += diff_abs;
144 info_loss2_sum += diff_abs * diff_abs;
145 if q >= 1.5 {
146 entropy_sum += k_cost2;
147 }
148 }
149 }
150
151 EntropyCoeffResult {
152 entropy_sum,
153 nzeros_sum,
154 info_loss_sum,
155 info_loss2_sum,
156 }
157}
158
159#[cfg(target_arch = "x86_64")]
160#[inline]
161#[archmage::arcane]
162#[allow(clippy::too_many_arguments)]
163pub fn entropy_coeffs_avx2(
164 token: archmage::X64V3Token,
165 block_c: &[f32],
166 block_y: &[f32],
167 weights: &[f32],
168 n: usize,
169 cmap_factor: f32,
170 quant: f32,
171 k_cost_delta: f32,
172 k_cost2: f32,
173 pixel_domain: bool,
174 error_coeffs: &mut [f32],
175) -> EntropyCoeffResult {
176 use magetypes::simd::f32x8;
177
178 let cmap_v = f32x8::splat(token, cmap_factor);
179 let quant_v = f32x8::splat(token, quant);
180 let cost_delta_v = f32x8::splat(token, k_cost_delta);
181 let cost2_v = f32x8::splat(token, k_cost2);
182 let zero = f32x8::zero(token);
183 let one = f32x8::splat(token, 1.0);
184 let thr_1_5 = f32x8::splat(token, 1.5);
185
186 let mut entropy_acc = f32x8::zero(token);
187 let mut nzeros_acc = f32x8::zero(token);
188 let mut info_loss_acc = f32x8::zero(token);
189 let mut info_loss2_acc = f32x8::zero(token);
190 let mut cost2_acc = f32x8::zero(token);
191
192 let chunks = n / 8;
193 let simd_n = chunks * 8;
196 let block_c_s = &block_c[..simd_n];
197 let block_y_s = &block_y[..simd_n];
198 let weights_s = &weights[..simd_n];
199 for chunk in 0..chunks {
200 let base = chunk * 8;
201
202 let bc = f32x8::from_slice(token, &block_c_s[base..]);
203 let by_v = f32x8::from_slice(token, &block_y_s[base..]);
204 let w = f32x8::from_slice(token, &weights_s[base..]);
205
206 let adjusted = bc - by_v * cmap_v;
208 let val = adjusted / w * quant_v;
209
210 let rval = val.round();
211 let diff = val - rval;
212
213 if pixel_domain {
215 let err = w * diff;
216 let out: &mut [f32; 8] = (&mut error_coeffs[base..base + 8]).try_into().unwrap();
217 err.store(out);
218 }
219
220 let q = rval.abs();
222 entropy_acc = q.sqrt().mul_add(cost_delta_v, entropy_acc);
223
224 let nz_mask = q.simd_ne(zero);
226 nzeros_acc += f32x8::blend(nz_mask, one, zero);
227
228 if !pixel_domain {
230 let diff_abs = diff.abs();
231 info_loss_acc += diff_abs;
232 info_loss2_acc = diff_abs.mul_add(diff_abs, info_loss2_acc);
233
234 let ge_mask = q.simd_ge(thr_1_5);
236 cost2_acc += f32x8::blend(ge_mask, cost2_v, zero);
237 }
238 }
239
240 let start = chunks * 8;
242 let remainder = entropy_coeffs_scalar(
243 &block_c[start..n],
244 &block_y[start..n],
245 &weights[start..n],
246 n - start,
247 cmap_factor,
248 quant,
249 k_cost_delta,
250 k_cost2,
251 pixel_domain,
252 &mut error_coeffs[start..n],
253 );
254
255 let mut entropy_sum = entropy_acc.reduce_add() + remainder.entropy_sum;
256 if !pixel_domain {
257 entropy_sum += cost2_acc.reduce_add();
258 }
259
260 EntropyCoeffResult {
261 entropy_sum,
262 nzeros_sum: nzeros_acc.reduce_add() + remainder.nzeros_sum,
263 info_loss_sum: info_loss_acc.reduce_add() + remainder.info_loss_sum,
264 info_loss2_sum: info_loss2_acc.reduce_add() + remainder.info_loss2_sum,
265 }
266}
267
/// 4-lane (`f32x4`) kernel for the entropy estimate on `aarch64`.
///
/// Processes `n / 4` full groups of four coefficients with SIMD and hands
/// the remaining `n % 4` coefficients to [`entropy_coeffs_scalar`]; the two
/// partial results are added together. Arguments and mode semantics are the
/// same as for [`entropy_coeffs_scalar`].
///
/// The SIMD path divides by the weight (`adjusted / w * quant`) whereas the
/// scalar path multiplies by `1.0 / w`, so the two can differ by rounding.
///
/// In coefficient-domain mode (`pixel_domain == false`) `error_coeffs` is
/// never read, written or sliced, so it may be shorter than `n` (or empty),
/// matching the scalar implementation.
///
/// # Panics
///
/// Panics if `block_c`, `block_y` or `weights` hold fewer than `n` elements,
/// or if `pixel_domain` is set and `error_coeffs` holds fewer than `n`
/// elements.
#[cfg(target_arch = "aarch64")]
#[inline]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)] // flat argument list shared by every kernel
pub fn entropy_coeffs_neon(
    token: archmage::NeonToken,
    block_c: &[f32],
    block_y: &[f32],
    weights: &[f32],
    n: usize,
    cmap_factor: f32,
    quant: f32,
    k_cost_delta: f32,
    k_cost2: f32,
    pixel_domain: bool,
    error_coeffs: &mut [f32],
) -> EntropyCoeffResult {
    use magetypes::simd::f32x4;

    let cmap_v = f32x4::splat(token, cmap_factor);
    let quant_v = f32x4::splat(token, quant);
    let cost_delta_v = f32x4::splat(token, k_cost_delta);
    let cost2_v = f32x4::splat(token, k_cost2);
    let zero = f32x4::zero(token);
    let one = f32x4::splat(token, 1.0);
    let thr_1_5 = f32x4::splat(token, 1.5);

    let mut entropy_acc = f32x4::zero(token);
    let mut nzeros_acc = f32x4::zero(token);
    let mut info_loss_acc = f32x4::zero(token);
    let mut info_loss2_acc = f32x4::zero(token);
    // Kept separate from `entropy_acc`; folded in after the loop.
    let mut cost2_acc = f32x4::zero(token);

    let chunks = n / 4;
    let simd_n = chunks * 4;
    // Slice the inputs to the SIMD-covered prefix once, ahead of the loop.
    let block_c_s = &block_c[..simd_n];
    let block_y_s = &block_y[..simd_n];
    let weights_s = &weights[..simd_n];
    for chunk in 0..chunks {
        let base = chunk * 4;

        let bc = f32x4::from_slice(token, &block_c_s[base..]);
        let by_v = f32x4::from_slice(token, &block_y_s[base..]);
        let w = f32x4::from_slice(token, &weights_s[base..]);

        let adjusted = bc - by_v * cmap_v;
        let val = adjusted / w * quant_v;

        let rval = val.round();
        let diff = val - rval;

        if pixel_domain {
            let err = w * diff;
            let out: &mut [f32; 4] = (&mut error_coeffs[base..base + 4]).try_into().unwrap();
            err.store(out);
        }

        let q = rval.abs();
        entropy_acc = q.sqrt().mul_add(cost_delta_v, entropy_acc);

        // Mirrors the scalar `if q != 0.0 { nzeros_sum += 1.0 }`.
        let nz_mask = q.simd_ne(zero);
        nzeros_acc += f32x4::blend(nz_mask, one, zero);

        if !pixel_domain {
            let diff_abs = diff.abs();
            info_loss_acc += diff_abs;
            info_loss2_acc = diff_abs.mul_add(diff_abs, info_loss2_acc);

            // Mirrors the scalar `if q >= 1.5 { entropy_sum += k_cost2 }`.
            let ge_mask = q.simd_ge(thr_1_5);
            cost2_acc += f32x4::blend(ge_mask, cost2_v, zero);
        }
    }

    // The scalar tail only touches the error buffer in pixel-domain mode, so
    // only slice it then; otherwise a short buffer would panic here even
    // though the scalar implementation accepts it.
    let tail_errors: &mut [f32] = if pixel_domain {
        &mut error_coeffs[simd_n..n]
    } else {
        &mut []
    };
    let remainder = entropy_coeffs_scalar(
        &block_c[simd_n..n],
        &block_y[simd_n..n],
        &weights[simd_n..n],
        n - simd_n,
        cmap_factor,
        quant,
        k_cost_delta,
        k_cost2,
        pixel_domain,
        tail_errors,
    );

    let mut entropy_sum = entropy_acc.reduce_add() + remainder.entropy_sum;
    if !pixel_domain {
        entropy_sum += cost2_acc.reduce_add();
    }

    EntropyCoeffResult {
        entropy_sum,
        nzeros_sum: nzeros_acc.reduce_add() + remainder.nzeros_sum,
        info_loss_sum: info_loss_acc.reduce_add() + remainder.info_loss_sum,
        info_loss2_sum: info_loss2_acc.reduce_add() + remainder.info_loss2_sum,
    }
}
373
#[cfg(test)]
mod tests {
    use super::*;
    extern crate alloc;
    use alloc::vec;
    use alloc::vec::Vec;

    /// Cost constants used by every scenario below.
    const K_COST_DELTA: f32 = 5.335;
    const K_COST2: f32 = 4.463;

    /// Inputs for one comparison between the scalar reference and the
    /// dispatched (possibly SIMD) implementation.
    struct Scenario {
        block_c: Vec<f32>,
        block_y: Vec<f32>,
        weights: Vec<f32>,
        cmap_factor: f32,
        quant: f32,
        pixel_domain: bool,
    }

    impl Scenario {
        /// Number of coefficients in the scenario.
        fn n(&self) -> usize {
            self.block_c.len()
        }

        /// Runs `entropy_coeffs_scalar`; returns its sums and error buffer.
        fn reference(&self) -> (EntropyCoeffResult, Vec<f32>) {
            let mut errors = vec![0.0f32; self.n()];
            let sums = entropy_coeffs_scalar(
                &self.block_c,
                &self.block_y,
                &self.weights,
                self.n(),
                self.cmap_factor,
                self.quant,
                K_COST_DELTA,
                K_COST2,
                self.pixel_domain,
                &mut errors,
            );
            (sums, errors)
        }

        /// Runs `entropy_estimate_coeffs`; returns its sums and error buffer.
        fn dispatched(&self) -> (EntropyCoeffResult, Vec<f32>) {
            let mut errors = vec![0.0f32; self.n()];
            let sums = entropy_estimate_coeffs(
                &self.block_c,
                &self.block_y,
                &self.weights,
                self.n(),
                self.cmap_factor,
                self.quant,
                K_COST_DELTA,
                K_COST2,
                self.pixel_domain,
                &mut errors,
            );
            (sums, errors)
        }
    }

    /// Builds `n` samples by applying `f` to the indices `0.0, 1.0, ...`.
    fn ramp(n: usize, f: impl Fn(f32) -> f32) -> Vec<f32> {
        (0..n).map(|i| f(i as f32)).collect()
    }

    /// Relative error with the plain magnitude of `expected` as denominator.
    fn rel_err(actual: f32, expected: f32) -> f32 {
        (actual - expected).abs() / expected.abs()
    }

    /// Relative error whose denominator is floored at `1.0`.
    fn rel_err_floored(actual: f32, expected: f32) -> f32 {
        (actual - expected).abs() / expected.abs().max(1.0)
    }

    /// Largest element-wise absolute difference between two buffers.
    fn max_abs_diff(a: &[f32], b: &[f32]) -> f32 {
        a.iter()
            .zip(b)
            .map(|(x, y)| (x - y).abs())
            .fold(0.0f32, f32::max)
    }

    #[test]
    fn test_entropy_coeffs_pixel_domain() {
        let scenario = Scenario {
            block_c: ramp(64, |x| (x * 0.7 - 20.0) * 0.1),
            block_y: ramp(64, |x| (x * 0.5 - 15.0) * 0.1),
            weights: ramp(64, |x| 0.01 + x * 0.005),
            cmap_factor: 0.15,
            quant: 3.5,
            pixel_domain: true,
        };
        let (expected, expected_errors) = scenario.reference();
        let (actual, actual_errors) = scenario.dispatched();

        let rel_eps = 0.005;
        let entropy_rel = rel_err(actual.entropy_sum, expected.entropy_sum);
        assert!(
            entropy_rel < rel_eps,
            "entropy_sum: SIMD={}, ref={}, rel_err={:.4}%",
            actual.entropy_sum,
            expected.entropy_sum,
            entropy_rel * 100.0
        );

        let nz_rel = rel_err_floored(actual.nzeros_sum, expected.nzeros_sum);
        assert!(
            nz_rel < 0.05,
            "nzeros_sum: SIMD={}, ref={}, rel_err={:.4}%",
            actual.nzeros_sum,
            expected.nzeros_sum,
            nz_rel * 100.0
        );

        let max_err = max_abs_diff(&actual_errors, &expected_errors);
        assert!(max_err < 0.5, "Error coeffs max diff: {:.2e}", max_err);
    }

    #[test]
    fn test_entropy_coeffs_coeff_domain() {
        let scenario = Scenario {
            block_c: ramp(64, |x| (x * 1.3 - 40.0) * 0.05),
            block_y: ramp(64, |x| (x * 0.9 - 30.0) * 0.05),
            weights: ramp(64, |x| 0.02 + x * 0.003),
            cmap_factor: 0.0,
            quant: 5.0,
            pixel_domain: false,
        };
        let (expected, _) = scenario.reference();
        let (actual, _) = scenario.dispatched();

        let rel_eps = 0.005;
        let entropy_rel = rel_err(actual.entropy_sum, expected.entropy_sum);
        assert!(
            entropy_rel < rel_eps,
            "entropy_sum: SIMD={}, ref={}, rel_err={:.4}%",
            actual.entropy_sum,
            expected.entropy_sum,
            entropy_rel * 100.0
        );

        let nz_rel = rel_err_floored(actual.nzeros_sum, expected.nzeros_sum);
        assert!(
            nz_rel < 0.05,
            "nzeros_sum: SIMD={}, ref={}, rel_err={:.4}%",
            actual.nzeros_sum,
            expected.nzeros_sum,
            nz_rel * 100.0
        );

        let il_rel = rel_err_floored(actual.info_loss_sum, expected.info_loss_sum);
        assert!(
            il_rel < rel_eps,
            "info_loss_sum: SIMD={}, ref={}, rel_err={:.4}%",
            actual.info_loss_sum,
            expected.info_loss_sum,
            il_rel * 100.0
        );

        let il2_rel = rel_err_floored(actual.info_loss2_sum, expected.info_loss2_sum);
        assert!(
            il2_rel < rel_eps,
            "info_loss2_sum: SIMD={}, ref={}, rel_err={:.4}%",
            actual.info_loss2_sum,
            expected.info_loss2_sum,
            il2_rel * 100.0
        );
    }

    #[test]
    fn test_entropy_coeffs_remainder() {
        // 67 is a multiple of neither 4 nor 8, so the scalar tail is exercised.
        let scenario = Scenario {
            block_c: ramp(67, |x| x * 0.1 - 3.0),
            block_y: ramp(67, |x| x * 0.08 - 2.5),
            weights: ramp(67, |x| 0.01 + x * 0.002),
            cmap_factor: 0.2,
            quant: 4.0,
            pixel_domain: true,
        };
        let (expected, expected_errors) = scenario.reference();
        let (actual, actual_errors) = scenario.dispatched();

        let rel_eps = 0.005;
        let entropy_rel = rel_err_floored(actual.entropy_sum, expected.entropy_sum);
        assert!(
            entropy_rel < rel_eps,
            "entropy_sum: SIMD={}, ref={}, rel_err={:.4}%",
            actual.entropy_sum,
            expected.entropy_sum,
            entropy_rel * 100.0
        );

        let nz_rel = rel_err_floored(actual.nzeros_sum, expected.nzeros_sum);
        assert!(
            nz_rel < 0.05,
            "nzeros_sum: SIMD={}, ref={}",
            actual.nzeros_sum,
            expected.nzeros_sum
        );

        let max_err = max_abs_diff(&actual_errors, &expected_errors);
        assert!(max_err < 0.01, "Error coeffs max diff: {:.2e}", max_err);
    }

    #[test]
    fn test_entropy_coeffs_large_block() {
        let scenario = Scenario {
            block_c: ramp(4096, |x| (x * 0.01).sin() * 5.0),
            block_y: ramp(4096, |x| (x * 0.013).cos() * 4.0),
            weights: ramp(4096, |x| 0.005 + x * 0.001),
            cmap_factor: 0.1,
            quant: 2.0,
            pixel_domain: true,
        };
        let (expected, expected_errors) = scenario.reference();
        let (actual, actual_errors) = scenario.dispatched();

        let rel_eps = 0.005;
        let entropy_rel = rel_err(actual.entropy_sum, expected.entropy_sum);
        assert!(
            entropy_rel < rel_eps,
            "entropy_sum: SIMD={}, ref={}, rel_err={:.4}%",
            actual.entropy_sum,
            expected.entropy_sum,
            entropy_rel * 100.0
        );

        let max_err = max_abs_diff(&actual_errors, &expected_errors);
        assert!(max_err < 1e-3, "Error coeffs max diff: {:.2e}", max_err);
    }
}