zenanalyze 0.1.0

Image content analyzers (variance, edges, chroma sharpness, DCT energy, derived likelihoods) for adaptive codec decisions. Pulls rows on demand from any zenpixels PixelSlice.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
//! Tier 3: luma histogram entropy + high-frequency DCT energy ratio
//! + derived likelihoods (text / screen-content / natural).
//!
//! Direct ports of `coefficient::analysis::evalchroma_ext` (Tier 3)
//! and `image_adaptive::compute_derived_likelihoods`. Numbers are
//! kept identical so the parity example matches within an f32
//! epsilon.

use super::feature::RawAnalysis;
use super::row_stream::RowStream;
use archmage::{incant, magetypes};

/// Cap on sampled 8×8 luma blocks for the Tier 3 DCT pass
/// (`high_freq_energy_ratio`, `dct_compressibility_y/uv`,
/// `patch_fraction`).
///
/// **Tradeoff**: a 139-image convergence study showed:
/// - 256 blocks: 11-16 % p50 error on DCT-energy features, 100 % on
///   `patch_fraction`. ~1 ms at 4 MP.
/// - 1024 blocks: 4-6 % p50 error on DCT-energy, ~57 % on
///   `patch_fraction` (still structurally limited). ~1 ms at 4 MP
///   *with* the three-plane f32x8 DCT kernel (was ~4 ms pre-SIMD).
/// - 4096 blocks: convergence floor, ~+2 ms at 4 MP over 1024.
///
/// **Default = 1024.** The cross-block f32x8 DCT cut the per-block
/// cost ~8-15× vs the pre-SIMD scalar version, so the accuracy tier
/// (4-6 % error on the three DCT-energy features) costs only ~+1 ms
/// at 8 MP over the 256-block sloppy tier — well worth it. Earlier
/// revisions defaulted to 256 because the scalar DCT made 1024 cost
/// 3-4 ms; with SIMD that pressure is gone.
///
/// `patch_fraction` is still structurally sample-limited at any block
/// cap below dense-scan; bump to 4096+ if that feature matters, or
/// accept the 57 %-at-1024 / 100 %-at-256 error and rely on the
/// other DCT features for content classification.
///
/// Re-exported as `AnalyzerConfig::default().hf_max_blocks`.
pub(crate) const DEFAULT_HF_MAX_BLOCKS: usize = 1024; // per the convergence study above

/// Fill in `high_freq_energy_ratio` and `luma_histogram_entropy`.
///
/// `hf_max_blocks` caps the number of 8×8 luma blocks sampled for the
/// high-frequency DCT energy ratio. Pass [`DEFAULT_HF_MAX_BLOCKS`] to
/// match the oracle-trained reference; lower for proxy-server speed.
pub fn populate_tier3(out: &mut RawAnalysis, stream: &mut RowStream<'_>, hf_max_blocks: usize) {
    // Histogram pass first, DCT pass second — the same stream order as
    // before, so RowStream fetch/caching behaviour is unchanged.
    let hist = luma_histogram_stats(stream);
    out.luma_histogram_entropy = hist.entropy;
    #[cfg(feature = "composites")]
    {
        out.line_art_score = hist.line_art_score;
    }
    let _ = hist;
    let tier3 = dct_stats(stream, hf_max_blocks);
    out.high_freq_energy_ratio = tier3.high_freq_ratio;
    // The libwebp α metrics and patch_fraction land in cfg-gated
    // RawAnalysis fields; the DCT pass that produces them runs
    // unconditionally (its bulk cost is the DCT itself, which the
    // unflagged HighFreqEnergyRatio also needs).
    #[cfg(feature = "experimental")]
    {
        out.dct_compressibility_y = tier3.compressibility_y;
        out.dct_compressibility_uv = tier3.compressibility_uv;
        out.patch_fraction = tier3.patch_fraction;
        out.aq_map_mean = tier3.aq_map_mean;
        out.aq_map_std = tier3.aq_map_std;
        out.noise_floor_y = tier3.noise_floor_y;
        out.noise_floor_uv = tier3.noise_floor_uv;
        out.gradient_fraction = tier3.gradient_fraction;
    }
    #[cfg(not(feature = "experimental"))]
    {
        // Touch every cfg-gated stat so the non-experimental build
        // does not warn about unread struct fields.
        let _ = tier3.compressibility_y;
        let _ = tier3.compressibility_uv;
        let _ = tier3.patch_fraction;
        let _ = tier3.aq_map_mean;
        let _ = tier3.aq_map_std;
        let _ = tier3.noise_floor_y;
        let _ = tier3.noise_floor_uv;
        let _ = tier3.gradient_fraction;
    }
}

/// Aggregated DCT stats from one tier-3 sweep. Fused into a single
/// function so we don't pay for the DCT twice. All fields are
/// produced together by the fused sweep in `dct_stats`.
struct Tier3DctStats {
    /// `Σ AC[zz≥16] / max(1, Σ AC[zz∈1..16])` (luma).
    high_freq_ratio: f32,
    /// Mean per-block libwebp α (luma): `256 * last_non_zero / max_count`
    /// of the 64-bin |AC|/16 histogram. See `block_alpha`.
    compressibility_y: f32,
    /// Mean per-block libwebp α taken over `max(α_cb, α_cr)` per block —
    /// surfaces colour-detail content that luma α misses.
    compressibility_uv: f32,
    /// Fraction of sampled blocks whose 32-bit DCT signature matches at
    /// least one other sampled block. Real perceptual-hash patch
    /// detection — repeats indicate UI / icon / screen content.
    patch_fraction: f32,
    /// Mean of `log10(1 + Σ AC²)` over sampled luma blocks. Stand-in
    /// for the per-block AQ scale factor — orchestrators that want
    /// "image-average detail level" read this; analyzer-driven AQ
    /// scheduling reads `aq_map_std` for "how spread is the detail".
    aq_map_mean: f32,
    /// Standard deviation of the same per-block log-AC-energy. High
    /// std ⇒ heterogeneous content (mix of flat and busy regions) ⇒
    /// AQ pays off; low std ⇒ uniform busyness, AQ is a no-op.
    aq_map_std: f32,
    /// Robust luma noise floor estimate, normalized to `[0, 1]`. 10th
    /// percentile of per-block √(low-AC-energy / 15) over sampled
    /// luma 8×8 blocks, then divided by 32 for normalization. Flat
    /// blocks have minimal real AC content, so their residual low-AC
    /// is dominated by sensor / quantization noise. The 10th-
    /// percentile selector picks the flattest 10 % of blocks across
    /// the image, robust to busy-image content.
    noise_floor_y: f32,
    /// Same for chroma — `max(p10_cb, p10_cr)` of per-block √(low-
    /// AC-energy / 15) on the existing Cb / Cr DCT coefficients,
    /// normalized to `[0, 1]` by dividing by 32.
    noise_floor_uv: f32,
    /// Fraction `[0, 1]` of sampled luma 8×8 blocks where the
    /// low-zigzag (indices 1–15, the 15 AC positions matched by
    /// the same `zz < 16` predicate as `high_freq_energy_ratio`)
    /// energy is ≥ 90 % of the total AC energy. **Smooth-content /
    /// gradient signal** — drives JXL
    /// `with_force_strategy` (DCT16 / DCT32 selection — larger
    /// transforms pay off when most energy is in the lowest
    /// frequencies) and zenrav1e's deblock strength. Distinct from
    /// `high_freq_energy_ratio` (which is a global mean across all
    /// blocks): this is the per-block-thresholded fraction, robust
    /// to a few high-detail blocks dragging the mean.
    gradient_fraction: f32,
}

/// libwebp `GetAlpha`-style score on a single 8×8 DCT block. Higher
/// = harder to compress (more spread AC, fewer near-zero coefficients).
///
/// **Range:** the formula `256 * last_non_zero / max_count` returns
/// values in `[0, 256 × 63 / 2] = [0, 8064]`, not `[0, 255]` as
/// earlier docs claimed. `last_non_zero` ∈ `[0, 63]` is the highest
/// histogram bin with at least one coefficient; `max_count` ≥ 2 by
/// the guard below. On real corpora `compressibility_y` lands in
/// `[0, ~30]` for photos and `compressibility_uv` even lower —
/// nowhere near the theoretical max — but downstream calibration
/// must NOT clamp or normalise against 255.
///
/// Build a 64-bin histogram of `|AC[k]| / bin_div` (clipped to 63),
/// find `max_count` and `last_non_zero` index, return
/// `256 * last_non_zero / max_count`. Matches libwebp's `analysis_enc.c`
/// convention with `MAX_ALPHA=7`, `ALPHA_SCALE=256`,
/// `MAX_COEFF_THRESH=64`.
///
/// `bin_div` controls histogram granularity. Use `BIN_DIV_LUMA = 16`
/// for luma (the libwebp convention) and `BIN_DIV_CHROMA = 8` for
/// chroma. Chroma DCT coefficient magnitudes run ~half luma's; the
/// finer chroma bin spreads the histogram into the same dynamic
/// range as luma so chroma α isn't suppressed near zero.
#[inline(always)]
fn block_alpha(coeffs: &[[f32; 8]; 8], bin_div: f32) -> u32 {
    let scale = bin_div.recip();
    // 64-bin histogram of scaled |coefficient|, clipped to bin 63.
    // Flattened raster order; `.skip(1)` drops the DC coefficient.
    let mut histo = [0u32; 64];
    for &c in coeffs.iter().flatten().skip(1) {
        let bin = ((c.abs() * scale) as i32).clamp(0, 63) as usize;
        histo[bin] += 1;
    }
    // Single sweep for both reductions: the tallest bin and the
    // highest occupied bin index. `last_non_zero` starts at 1 to
    // mirror the libwebp convention (it is overwritten whenever any
    // bin — including bin 0 — is occupied).
    let mut max_count: u32 = 0;
    let mut last_non_zero: u32 = 1;
    for (bin, &count) in histo.iter().enumerate() {
        if count == 0 {
            continue;
        }
        max_count = max_count.max(count);
        last_non_zero = bin as u32;
    }
    // Guard: a degenerate histogram (max_count ≤ 1 is impossible with
    // 63 AC entries in 64 bins, but keep the libwebp-shaped check).
    if max_count > 1 {
        (256 * last_non_zero) / max_count
    } else {
        0
    }
}

/// Histogram bin divisor for luma DCT α — libwebp's conventional value.
/// Passed as `bin_div` to `block_alpha`.
const BIN_DIV_LUMA: f32 = 16.0;
/// Histogram bin divisor for chroma DCT α. Chroma coefficient
/// magnitudes run ~half luma's, so a finer bin (÷ 8 vs ÷ 16) spreads
/// the chroma histogram into the same dynamic range as luma. Empirical
/// CID22 / CLIC / gb82-sc validation shows this lifts chroma α from
/// p50 ≈ 1-2 (≪ luma p50 ≈ 16) into a similarly-discriminating range.
const BIN_DIV_CHROMA: f32 = 8.0;

/// 32-bit perceptual signature of an 8×8 DCT block. Used for
/// patch-fraction matching.
///
/// **Scale-invariant**: the per-coefficient threshold is 25% of the
/// block's own peak |AC|, not a fixed absolute value. Two blocks with
/// the same content at different resolutions (or with different
/// global brightness) produce the same signature because the DCT
/// coefficients scale uniformly. Pre-fix the threshold was a fixed
/// 32.0 in raw f32 magnitude, which made the signature depend on
/// image resolution (DCT amplitude shifts with downsampling).
///
/// Per coefficient (first 16 zigzag-positioned AC): 2 bits =
/// `(sign_bit << 1) | (|c| > 0.25·max_ac)`. 16 × 2 = 32 bits.
#[inline(always)]
fn block_signature(coeffs: &[[f32; 8]; 8]) -> u32 {
    // Per-block peak |AC| over the first 16 zigzag positions.
    let mut peak: f32 = 0.0;
    for k in 1..64 {
        if RASTER_TO_ZIGZAG[k] >= 17 {
            continue;
        }
        let u = k % 8;
        let v = k / 8;
        let m = coeffs[v][u].abs();
        if m > peak {
            peak = m;
        }
    }
    // Flat block (peak ≈ 0): all bits → signature 0. Flat blocks ALL
    // match each other — that's correct, they are mutual patches.
    let threshold = peak * 0.25;
    let mut sig: u32 = 0;
    let mut bit_pos: u32 = 0;
    for k in 1..64 {
        let zz = RASTER_TO_ZIGZAG[k];
        if zz >= 17 {
            continue;
        }
        let u = k % 8;
        let v = k / 8;
        let c = coeffs[v][u];
        if c.abs() > threshold {
            sig |= 1 << bit_pos;
        }
        bit_pos += 1;
        if c < 0.0 {
            sig |= 1 << bit_pos;
        }
        bit_pos += 1;
    }
    sig
}

/// Results of the single-pass luma histogram sweep in
/// `luma_histogram_stats`.
struct LumaHistStats {
    /// Shannon entropy of the 32-bin luma histogram, bits `[0, 5]`.
    entropy: f32,
    /// Soft line-art / engineering-drawing score `[0, 1]`. Combines:
    /// (a) Otsu bimodality (η = σ²B / σ²total — high when histogram
    ///     splits cleanly into two well-separated peaks).
    /// (b) Top-2-bin coverage (fraction of pixels in the two most
    ///     populated bins — typically ≥ 0.7 for line art on a flat
    ///     background, vs ≤ 0.3 for photos).
    /// (c) Low entropy (≤ 2 bits ≈ near-binary distribution).
    /// `line_art_score = min(η, top2_coverage, 1 - entropy/2.5)`
    /// clamped to `[0, 1]`. The min combinator makes the score
    /// conservative — it goes high only when ALL three conditions
    /// agree, sparing photos with one accidentally-bimodal channel
    /// from being misclassified.
    line_art_score: f32,
}

/// Walk the luma histogram once to produce both entropy and the
/// line-art-score signal — same single pass, two reductions.
/// Same 32-bin BT.601 histogram, every-4th-pixel sampling pattern as
/// the original `luma_histogram_entropy`, with cross-row carry.
fn luma_histogram_stats(stream: &mut RowStream<'_>) -> LumaHistStats {
    let width = stream.width() as usize;
    let height = stream.height() as usize;
    if width == 0 || height == 0 {
        // Degenerate image: no pixels, no signal.
        return LumaHistStats {
            entropy: 0.0,
            line_art_score: 0.0,
        };
    }
    // Per-primaries fixed-point luma weights — keeps wide-gamut
    // bytes interpreted with the right matrix (BT.2020 for Rec.2020,
    // etc.). sRGB/BT.709 keeps the BT.601 baseline (66/129/25) so
    // the trained histogram thresholds still apply.
    let w = crate::luma::LumaWeights::for_primaries(stream.primaries());
    let qr = w.qr as u32;
    let qg = w.qg as u32;
    let qb = w.qb as u32;
    let mut bins = [0u32; 32];
    let mut n = 0u32;
    // `carry` = (pixels in all previous rows) mod 4 — lets the
    // every-4th-pixel cadence continue seamlessly across row ends.
    let mut carry: usize = 0;
    for yy in 0..height {
        let row = stream.borrow_row(yy as u32);
        // First x in this row whose *global* pixel index is ≡ 0 (mod 4).
        let start = (4 - carry) % 4;
        let mut x = start;
        while x < width {
            let off = x * 3;
            // Packed RGB triple (3 bytes per pixel).
            let p = &row[off..off + 3];
            // Fixed-point luma: 8-bit-quantized weights; `+ 128`
            // rounds before the `>> 8` truncation.
            let y = ((qr * p[0] as u32 + qg * p[1] as u32 + qb * p[2] as u32 + 128) >> 8) as u8;
            // 32 bins of width 8 over the 0-255 luma range.
            bins[(y >> 3) as usize] += 1;
            n += 1;
            x += 4;
        }
        carry = (carry + width) % 4;
    }
    if n == 0 {
        // Defensive: with width, height > 0 row 0 starts at x = 0
        // (carry = 0), so at least one pixel is always sampled.
        return LumaHistStats {
            entropy: 0.0,
            line_art_score: 0.0,
        };
    }
    let n_f = n as f32;

    // Entropy: Shannon entropy of the 32-bin distribution (max 5 bits).
    let mut entropy = 0.0f32;
    for &c in &bins {
        if c > 0 {
            let p = c as f32 / n_f;
            entropy -= p * p.log2();
        }
    }

    // Otsu bimodality. For each split index k, compute between-class
    // variance σ²B(k) = ω0·ω1·(μ0 - μ1)². Pick max. Also compute
    // total variance σ²T to normalize. All in f64: bin-count moments
    // can exceed f32's 24-bit integer precision on large images.
    let mut total_sum: f64 = 0.0;
    for (i, &c) in bins.iter().enumerate() {
        total_sum += (i as f64) * (c as f64);
    }
    let mu_total = total_sum / n_f as f64;
    let mut total_var: f64 = 0.0;
    for (i, &c) in bins.iter().enumerate() {
        let d = i as f64 - mu_total;
        total_var += (c as f64) * d * d;
    }
    total_var /= n_f as f64;

    // Incremental sweep: sum0/count0 accumulate class-0 (bins 0..=k)
    // statistics so each candidate split costs O(1).
    let mut max_between_var: f64 = 0.0;
    let mut sum0: f64 = 0.0;
    let mut count0: f64 = 0.0;
    for k in 0..31 {
        sum0 += (k as f64) * (bins[k] as f64);
        count0 += bins[k] as f64;
        let count1 = n_f as f64 - count0;
        // Skip splits where either class is empty — class means undefined.
        if count0 < 1.0 || count1 < 1.0 {
            continue;
        }
        let mu0 = sum0 / count0;
        let mu1 = (total_sum - sum0) / count1;
        let omega0 = count0 / n_f as f64;
        let omega1 = count1 / n_f as f64;
        let between_var = omega0 * omega1 * (mu0 - mu1) * (mu0 - mu1);
        if between_var > max_between_var {
            max_between_var = between_var;
        }
    }
    // η = σ²B / σ²T, guarded against a near-constant image (σ²T ≈ 0).
    let bimodality = if total_var > 1e-6 {
        (max_between_var / total_var).clamp(0.0, 1.0) as f32
    } else {
        0.0
    };

    // Top-2-bin coverage: running max and runner-up in one scan.
    let mut top1 = 0u32;
    let mut top2 = 0u32;
    for &c in &bins {
        if c > top1 {
            top2 = top1;
            top1 = c;
        } else if c > top2 {
            top2 = c;
        }
    }
    let top2_coverage = (top1 + top2) as f32 / n_f;

    // Low-entropy gate: 1.0 at entropy=0, 0.0 at entropy ≥ 2.5 bits.
    let low_entropy = ((2.5 - entropy) / 2.5).clamp(0.0, 1.0);

    // min combinator: high only when all three signals agree (see
    // `LumaHistStats::line_art_score` docs).
    let line_art_score = bimodality.min(top2_coverage).min(low_entropy);

    LumaHistStats {
        entropy,
        line_art_score,
    }
}

/// Raster → zigzag position lookup for 8×8 DCT coefficients.
///
/// `RASTER_TO_ZIGZAG[u + v*8]` returns the zigzag index of the
/// coefficient at column `u`, row `v`. Standard JPEG ITU-T T.81
/// zigzag order: (0,0) → 0 (DC), (1,0) → 1, (0,1) → 2, (0,2) → 3,
/// (1,1) → 4, (2,0) → 5, etc. Entry 0 is always the DC coefficient.
const RASTER_TO_ZIGZAG: [u8; 64] = [
    0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, 3, 8, 12, 17, 25, 30, 41, 43, 9, 11,
    18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34,
    37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63,
];

/// 8×8 DCT-II coefficient matrix, scaled by `0.5 * c[k]` where
/// `c[0] = 1/sqrt(2)` and `c[k] = 1` for `k > 0`. Pre-baked so the
/// inner loops are just `mul_add` chains — no per-iteration `.cos()`
/// (which was the previous version's hottest hot path).
///
/// `DCT_COEF[k][n] = 0.5 * c[k] * cos((2n+1) * k * π / 16)`.
///
/// Forward DCT: `Y[k] = Σ_n DCT_COEF[k][n] * X[n]`.
///
/// Layout note: [`DCT_COEF`] is row-major (output `k` outer, input `n`
/// inner). The transposed copy [`DCT_COEF_T`] is needed by the
/// vectorized DCT below — its row `n` is `[DCT_COEF[0][n], …,
/// DCT_COEF[7][n]]`, i.e. column `n` of D. With that layout each f32x8
/// vector loaded from `DCT_COEF_T[n]` directly broadcasts the column-of-D
/// the row pass needs (lanes-as-output-k).
const DCT_COEF: [[f32; 8]; 8] = [
    // k=0: 0.5 * (1/√2) * cos(0) = 1/(2√2) = 0.353553...
    [
        0.353_553_4,
        0.353_553_4,
        0.353_553_4,
        0.353_553_4,
        0.353_553_4,
        0.353_553_4,
        0.353_553_4,
        0.353_553_4,
    ],
    // k=1..7: 0.5 * cos((2n+1)*k*π/16)
    // k=1
    [
        0.490_392_64,
        0.415_734_8,
        0.277_785_12,
        0.097_545_16,
        -0.097_545_16,
        -0.277_785_12,
        -0.415_734_8,
        -0.490_392_64,
    ],
    // k=2
    [
        0.461_939_77,
        0.191_341_72,
        -0.191_341_72,
        -0.461_939_77,
        -0.461_939_77,
        -0.191_341_72,
        0.191_341_72,
        0.461_939_77,
    ],
    // k=3
    [
        0.415_734_8,
        -0.097_545_16,
        -0.490_392_64,
        -0.277_785_12,
        0.277_785_12,
        0.490_392_64,
        0.097_545_16,
        -0.415_734_8,
    ],
    // k=4 (alternating ±1/(2√2))
    [
        0.353_553_4,
        -0.353_553_4,
        -0.353_553_4,
        0.353_553_4,
        0.353_553_4,
        -0.353_553_4,
        -0.353_553_4,
        0.353_553_4,
    ],
    // k=5
    [
        0.277_785_12,
        -0.490_392_64,
        0.097_545_16,
        0.415_734_8,
        -0.415_734_8,
        -0.097_545_16,
        0.490_392_64,
        -0.277_785_12,
    ],
    // k=6
    [
        0.191_341_72,
        -0.461_939_77,
        0.461_939_77,
        -0.191_341_72,
        -0.191_341_72,
        0.461_939_77,
        -0.461_939_77,
        0.191_341_72,
    ],
    // k=7
    [
        0.097_545_16,
        -0.277_785_12,
        0.415_734_8,
        -0.490_392_64,
        0.490_392_64,
        -0.415_734_8,
        0.277_785_12,
        -0.097_545_16,
    ],
];

/// Transposed DCT-II coefficient matrix: `DCT_COEF_T[n][k] = DCT_COEF[k][n]`.
///
/// Used by the magetypes f32x8 dct2d kernel — each row of `DCT_COEF_T`
/// is one column of `D`, packed lanes-as-output-k. Loaded once at the
/// top of the SIMD function and reused across both 1D DCT passes.
///
/// Computed at compile time from `DCT_COEF`; `while` loops because
/// iterator-based `for` is not available in const evaluation.
const DCT_COEF_T: [[f32; 8]; 8] = {
    let mut t = [[0.0f32; 8]; 8];
    let mut n = 0;
    while n < 8 {
        let mut k = 0;
        while k < 8 {
            t[n][k] = DCT_COEF[k][n];
            k += 1;
        }
        n += 1;
    }
    t
};

/// Ratio of high-frequency to low-frequency AC DCT energy on sampled
/// 8×8 luma blocks. `Σ AC[zz ≥ 16] / max(1, Σ AC[zz ∈ 1..=15])` where
/// `zz` is the JPEG ITU-T T.81 zigzag index — the same scan order
/// JPEG itself uses to drop high frequencies first. The split is at
/// the predicate `zz < 16` (low side ⇒ zigzag indices 1–15 = **15
/// AC positions** after DC; zigzag 16 and beyond go to the high
/// side). The 15 low positions cover most of the upper-left 4×4
/// triangle (`u + v ≤ 3`), keeping the split symmetric in
/// horizontal/vertical detail, unlike the older raster-order split
/// which biased toward vertical content.
///
/// Naive separable 1D DCT — exactness isn't required for a feature,
/// only stable scale and ordering. A faster approximate DCT could
/// substitute as long as the ratio is preserved within an f32 ULP or
/// two; bit-exact coefficient values are not relied on.
///
/// Pulls 8 rows at a time (one block-row's worth) and samples the
/// `bx` columns selected by `block_idx % stride`. Keeps memory at
/// 8 × width × 3 bytes regardless of image size.
fn dct_stats(stream: &mut RowStream<'_>, max_blocks: usize) -> Tier3DctStats {
    let width = stream.width() as usize;
    let height = stream.height() as usize;
    if width < 8 || height < 8 {
        return Tier3DctStats {
            high_freq_ratio: 0.0,
            compressibility_y: 0.0,
            compressibility_uv: 0.0,
            patch_fraction: 0.0,
            aq_map_mean: 0.0,
            aq_map_std: 0.0,
            noise_floor_y: 0.0,
            noise_floor_uv: 0.0,
            gradient_fraction: 0.0,
        };
    }
    // Per-primaries luma weights — used in the per-block fixed-point
    // YCbCr build below. Wide-gamut u8 sources go through the DCT
    // pipeline with the right matrix for their primaries; sRGB /
    // BT.709 keeps the BT.601 baseline (66/129/25) so trained
    // thresholds still apply.
    let lw = crate::luma::LumaWeights::for_primaries(stream.primaries());
    let qr = lw.qr;
    let qg = lw.qg;
    let qb = lw.qb;
    let blocks_x = width / 8;
    let blocks_y = height / 8;
    let total_blocks = blocks_x * blocks_y;
    if total_blocks == 0 {
        return Tier3DctStats {
            high_freq_ratio: 0.0,
            compressibility_y: 0.0,
            compressibility_uv: 0.0,
            patch_fraction: 0.0,
            aq_map_mean: 0.0,
            aq_map_std: 0.0,
            noise_floor_y: 0.0,
            noise_floor_uv: 0.0,
            gradient_fraction: 0.0,
        };
    }
    let stride = (total_blocks / max_blocks).max(1);

    let mut low_energy = 0.0f64;
    let mut high_energy = 0.0f64;
    let mut blocks_sampled: u32 = 0;
    let mut alpha_y_sum: u64 = 0;
    let mut alpha_uv_sum: u64 = 0;
    // Per-block log-AC-energy for AQ-map mean/std. The log
    // compresses the heavy right tail (texture blocks vs flat
    // blocks differ by 4-5 orders of magnitude in raw energy) and
    // gives a mean / std that lives on a useful 0-7 scale.
    //
    // We stage the raw `block_ac` values into a Vec during the main
    // DCT loop and batch the `log10(1 + ac)` reduction afterwards
    // via magetypes `log2_lowp` — vectorising 8 lanes per call vs
    // 8 sequential scalar `f64::ln()` invocations (~50 cycles each).
    let mut block_acs: Vec<f32> = Vec::with_capacity(max_blocks.min(4096));
    // Per-block low-AC-energy, retained for noise-floor estimation.
    // 10th percentile across blocks ≈ noise floor (flattest blocks'
    // residual AC). 4-byte storage × max_blocks = ~4 KB at default.
    let mut block_low_y: Vec<f32> = Vec::with_capacity(max_blocks.min(4096));
    let mut block_low_cb: Vec<f32> = Vec::with_capacity(max_blocks.min(4096));
    let mut block_low_cr: Vec<f32> = Vec::with_capacity(max_blocks.min(4096));
    // GradientFraction: count blocks where ≥ 90 % of AC energy lives
    // in the low-zigzag positions (smooth-content blocks where larger
    // DCT transforms pay off).
    let mut gradient_blocks: u32 = 0;
    // Bounded to `max_blocks` (default 256) — small enough that a
    // sort-and-sweep is faster than a hash map on this CPU.
    let mut signatures: Vec<u32> = Vec::with_capacity(max_blocks.min(2048));
    let row_bytes = width * 3;
    let mut block_buf = vec![0u8; 8 * row_bytes]; // 8 rows of one block-row
    let mut block_idx = 0usize;

    for by in 0..blocks_y {
        // Determine whether any sampled block lives in this block-row.
        let row_start = by * blocks_x;
        let row_end = row_start + blocks_x;
        let any_sampled = (row_start..row_end).any(|k| k % stride == 0);
        if !any_sampled {
            block_idx += blocks_x;
            continue;
        }

        // Pull 8 contiguous rows for the block-row.
        for i in 0..8 {
            stream.fetch_into(
                (by * 8 + i) as u32,
                &mut block_buf[i * row_bytes..(i + 1) * row_bytes],
            );
        }

        for bx in 0..blocks_x {
            if block_idx % stride != 0 {
                block_idx += 1;
                continue;
            }
            block_idx += 1;

            // Extract Y, Cb, Cr per pixel into three 8×8 blocks.
            // BT.601 integer-quantized luma matches the existing
            // convention; chroma uses the same scaled-integer form
            // (Cb = (B − Y) ≈ scaled b−g pattern; Cr = (R − Y)) so the
            // DCT inputs land in the same dynamic range as luma.
            let mut blk_y = [[0.0f32; 8]; 8];
            let mut blk_cb = [[0.0f32; 8]; 8];
            let mut blk_cr = [[0.0f32; 8]; 8];
            for y in 0..8 {
                let row = &block_buf[y * row_bytes..(y + 1) * row_bytes];
                for x in 0..8 {
                    let off = (bx * 8 + x) * 3;
                    let p = &row[off..off + 3];
                    let r = p[0] as i32;
                    let g = p[1] as i32;
                    let b = p[2] as i32;
                    let l_i = (qr * r + qg * g + qb * b + 128) >> 8;
                    // Cb / Cr keep their BT.601-derived integer
                    // matrix here. The per-primaries adjustment
                    // shifts luma; chroma differences (B−Y / R−Y)
                    // would also drift, but the chroma-DCT
                    // compressibility / noise-floor signals are
                    // ratio-based and small per-primaries drift on
                    // the chroma matrix doesn't materially move
                    // them. Revisit if a corpus eval shows wide-
                    // gamut chroma stats reading off vs sRGB.
                    let cb_i = ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
                    let cr_i = ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
                    blk_y[y][x] = l_i as f32 - 128.0;
                    blk_cb[y][x] = cb_i as f32 - 128.0;
                    blk_cr[y][x] = cr_i as f32 - 128.0;
                }
            }

            // 2D DCT for all three planes in one magetypes f32x8 batch.
            // 128 fmas per plane vs ~1920 scalar ops in the old
            // separable-scalar version.
            let mut coeffs_y = [[0.0f32; 8]; 8];
            let mut coeffs_cb = [[0.0f32; 8]; 8];
            let mut coeffs_cr = [[0.0f32; 8]; 8];
            dct2d_8_three_planes(
                &blk_y,
                &blk_cb,
                &blk_cr,
                &mut coeffs_y,
                &mut coeffs_cb,
                &mut coeffs_cr,
            );

            blocks_sampled += 1;
            // High/low AC energy on luma (existing high_freq_energy_ratio).
            // Per-block: total AC for AQ map + low-AC for noise floor.
            let mut block_ac: f64 = 0.0;
            let mut block_low_y_ac: f64 = 0.0;
            for k in 1..64 {
                let u = k % 8;
                let v = k / 8;
                let e = (coeffs_y[v][u] * coeffs_y[v][u]) as f64;
                block_ac += e;
                if RASTER_TO_ZIGZAG[k] < 16 {
                    low_energy += e;
                    block_low_y_ac += e;
                } else {
                    high_energy += e;
                }
            }
            block_acs.push(block_ac as f32);
            block_low_y.push(block_low_y_ac as f32);

            // Per-block gradient flag. Threshold 0.9 picks blocks
            // where almost all the AC energy is in the lowest 15
            // zigzag positions — those are smooth gradients / soft
            // textures where larger-than-8×8 DCT transforms (JXL
            // DCT16 / DCT32) pay off. Skip the test on near-flat
            // blocks (block_ac < 16 ≈ ~1 unit of average AC) so a
            // dead-flat region doesn't drag the fraction up.
            if block_ac > 16.0 && block_low_y_ac >= 0.9 * block_ac {
                gradient_blocks += 1;
            }

            // Chroma low-AC for chroma noise floor. Reuses the
            // already-computed coeffs_cb / coeffs_cr.
            let mut low_cb: f64 = 0.0;
            let mut low_cr: f64 = 0.0;
            for k in 1..64 {
                if RASTER_TO_ZIGZAG[k] < 16 {
                    let u = k % 8;
                    let v = k / 8;
                    low_cb += (coeffs_cb[v][u] * coeffs_cb[v][u]) as f64;
                    low_cr += (coeffs_cr[v][u] * coeffs_cr[v][u]) as f64;
                }
            }
            block_low_cb.push(low_cb as f32);
            block_low_cr.push(low_cr as f32);
            // libwebp α per block (luma): /16 bin divisor.
            alpha_y_sum += block_alpha(&coeffs_y, BIN_DIV_LUMA) as u64;
            // libwebp α per block (chroma): /8 bin divisor (finer bins
            // because chroma coefficient magnitudes are smaller; keeps
            // the chroma α in the same dynamic range as luma α). Max
            // across Cb/Cr per block to surface whichever channel
            // carries more detail.
            let a_cb = block_alpha(&coeffs_cb, BIN_DIV_CHROMA);
            let a_cr = block_alpha(&coeffs_cr, BIN_DIV_CHROMA);
            alpha_uv_sum += a_cb.max(a_cr) as u64;
            // Patch-detection signature on luma DCT coefficients.
            signatures.push(block_signature(&coeffs_y));
        }
    }

    let high_freq_ratio = if low_energy < 1e-6 {
        0.0
    } else {
        (high_energy / low_energy) as f32
    };

    let compressibility_y = if blocks_sampled > 0 {
        (alpha_y_sum as f64 / blocks_sampled as f64) as f32
    } else {
        0.0
    };
    let compressibility_uv = if blocks_sampled > 0 {
        (alpha_uv_sum as f64 / blocks_sampled as f64) as f32
    } else {
        0.0
    };

    // Patch fraction: count blocks whose 32-bit DCT signature appears
    // at least twice in the sample. Sort-and-sweep over the small
    // sampled set is O(N log N) — at N ≤ 256 this is ~10 µs, faster
    // than a hash map.
    let patch_fraction = if blocks_sampled > 1 {
        signatures.sort_unstable();
        let mut matched: u32 = 0;
        let mut i = 0;
        while i < signatures.len() {
            let mut j = i + 1;
            while j < signatures.len() && signatures[j] == signatures[i] {
                j += 1;
            }
            let run = (j - i) as u32;
            if run > 1 {
                matched += run;
            }
            i = j;
        }
        matched as f32 / blocks_sampled as f32
    } else {
        0.0
    };

    // Batched log10(1 + ac) over `block_acs` via magetypes `ln_lowp`,
    // dispatched to v4 / v3 / NEON / WASM128 / scalar. Replaces the
    // per-block scalar `f64::ln()` (was ~50 cycles each × N blocks)
    // with one `ln_lowp` call per 8 blocks at low precision (well
    // above the noise floor for an aq_map_std on the 0–7 log scale).
    let (aq_log_sum, aq_log_sq_sum) =
        log10_sum_and_sq_sum_dispatch(&block_acs);
    let (aq_map_mean, aq_map_std) = if blocks_sampled > 0 {
        let n = blocks_sampled as f64;
        let mean = aq_log_sum / n;
        let var = (aq_log_sq_sum / n - mean * mean).max(0.0);
        (mean as f32, var.sqrt() as f32)
    } else {
        (0.0, 0.0)
    };

    // Noise-floor estimate via 10th-percentile per-block low-AC-energy.
    // Flat blocks' low-AC is residual noise; the 10th percentile
    // selects the flattest 10 % across the image. Convert from
    // sum-of-squared-coefficients to a per-coefficient σ via
    // sqrt(low_ac / 15) (15 low-zigzag coefficients per block, since
    // index 0 = DC). Normalize by 32 to land on `[0, 1]`-ish — scale
    // chosen so a "noisy JPEG" reads ~0.5 and pristine reads <0.1.
    let p10 = |arr: &mut [f32]| -> f32 {
        if arr.is_empty() {
            return 0.0;
        }
        arr.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(core::cmp::Ordering::Equal));
        let idx = (arr.len() / 10).min(arr.len() - 1);
        arr[idx]
    };
    let noise_floor_y = ((p10(&mut block_low_y) / 15.0).sqrt() / 32.0).clamp(0.0, 1.0);
    let p10_cb = ((p10(&mut block_low_cb) / 15.0).sqrt() / 32.0).clamp(0.0, 1.0);
    let p10_cr = ((p10(&mut block_low_cr) / 15.0).sqrt() / 32.0).clamp(0.0, 1.0);
    let noise_floor_uv = p10_cb.max(p10_cr);

    let gradient_fraction = if blocks_sampled > 0 {
        gradient_blocks as f32 / blocks_sampled as f32
    } else {
        0.0
    };

    Tier3DctStats {
        high_freq_ratio,
        compressibility_y,
        compressibility_uv,
        patch_fraction,
        aq_map_mean,
        aq_map_std,
        noise_floor_y,
        noise_floor_uv,
        gradient_fraction,
    }
}

/// Runtime-dispatched f32x8 2D DCT-II for the three YCbCr planes of
/// one 8×8 block.
///
/// Inputs are the spatial-domain samples for each plane (the visible
/// caller level-shifts them by −128 before calling). Outputs are in
/// raster order: `coeffs_*[v][u]` is the coefficient at horizontal
/// frequency `u`, vertical frequency `v`.
///
/// All three planes go through one [`incant!`] call so dispatch cost is
/// amortized 3× and LLVM keeps the eight `DCT_COEF_T` lane vectors hot
/// in YMM registers across planes.
#[inline]
fn dct2d_8_three_planes(
    blk_y: &[[f32; 8]; 8],
    blk_cb: &[[f32; 8]; 8],
    blk_cr: &[[f32; 8]; 8],
    coeffs_y: &mut [[f32; 8]; 8],
    coeffs_cb: &mut [[f32; 8]; 8],
    coeffs_cr: &mut [[f32; 8]; 8],
) {
    // `incant!` selects the best compiled backend (v4 / v3 / NEON /
    // WASM128 / scalar) of `dct2d_8_three_planes_simd` at runtime.
    incant!(dct2d_8_three_planes_simd(
        blk_y, blk_cb, blk_cr, coeffs_y, coeffs_cb, coeffs_cr
    ));
}

/// f32x8 SIMD 2D DCT-II for three 8×8 blocks (Y, Cb, Cr).
///
/// Outputs land in raster order: `coeffs_*[v][u]` is horizontal
/// frequency `u`, vertical frequency `v`.
///
/// Each row of the output is one f32x8 with lane k = `Y[v][k]`. The
/// row pass computes `Y[v] = Σ_n splat(X[v][n]) * D_col_n_vec` (8 fmas
/// per row); the column pass computes `Z[v] = Σ_w splat(D[v][w]) *
/// Y[w]` (8 fmas per row). Total 128 fmas per plane vs ~1920 scalar ops
/// in the old `dct2d_8` (separable scalar + transpose), an 8-15× LLVM
/// op-count reduction. With three planes batched we saturate the FMA
/// units across each block's worth of work before the per-block scalar
/// (alpha / signature / energy split) takes over.
#[magetypes(define(f32x8), v4, v3, neon, wasm128, scalar)]
fn dct2d_8_three_planes_simd(
    token: Token,
    blk_y: &[[f32; 8]; 8],
    blk_cb: &[[f32; 8]; 8],
    blk_cr: &[[f32; 8]; 8],
    coeffs_y: &mut [[f32; 8]; 8],
    coeffs_cb: &mut [[f32; 8]; 8],
    coeffs_cr: &mut [[f32; 8]; 8],
) {
    // Load the eight column-of-D vectors once. d_col[n] holds
    // [D[0][n], …, D[7][n]] across lanes — reused for both passes
    // (the column pass uses the same matrix D, just along the v axis).
    let d_col = [
        f32x8::load(token, &DCT_COEF_T[0]),
        f32x8::load(token, &DCT_COEF_T[1]),
        f32x8::load(token, &DCT_COEF_T[2]),
        f32x8::load(token, &DCT_COEF_T[3]),
        f32x8::load(token, &DCT_COEF_T[4]),
        f32x8::load(token, &DCT_COEF_T[5]),
        f32x8::load(token, &DCT_COEF_T[6]),
        f32x8::load(token, &DCT_COEF_T[7]),
    ];

    // The body is identical across the three planes; macro keeps the
    // f32x8 type scope inside the magetypes-generated copy without
    // splitting into a helper (helpers don't share f32x8 across the
    // per-backend functions the macro emits).
    macro_rules! dct2d_one_plane {
        ($blk:ident, $out:ident) => {{
            // Row pass: Y_row[v] (lane k = Y[v][k]) =
            // Σ_n splat(X[v][n]) * d_col[n]. 8 fmas per row.
            let mut y_row: [f32x8; 8] = [
                f32x8::zero(token),
                f32x8::zero(token),
                f32x8::zero(token),
                f32x8::zero(token),
                f32x8::zero(token),
                f32x8::zero(token),
                f32x8::zero(token),
                f32x8::zero(token),
            ];
            let mut v = 0;
            while v < 8 {
                let r = &$blk[v];
                // First term is a plain mul; remaining seven fold in
                // via fused multiply-add.
                let mut acc = d_col[0] * f32x8::splat(token, r[0]);
                acc = d_col[1].mul_add(f32x8::splat(token, r[1]), acc);
                acc = d_col[2].mul_add(f32x8::splat(token, r[2]), acc);
                acc = d_col[3].mul_add(f32x8::splat(token, r[3]), acc);
                acc = d_col[4].mul_add(f32x8::splat(token, r[4]), acc);
                acc = d_col[5].mul_add(f32x8::splat(token, r[5]), acc);
                acc = d_col[6].mul_add(f32x8::splat(token, r[6]), acc);
                acc = d_col[7].mul_add(f32x8::splat(token, r[7]), acc);
                y_row[v] = acc;
                v += 1;
            }

            // Column pass: Z_row[v] (lane k = Z[v][k]) =
            // Σ_w D[v][w] * Y_row[w] (lane k). Each lane k runs an
            // independent 1D DCT down the column — no transpose, no
            // horizontal reduction. Store straight into the caller's
            // [f32; 8] rows.
            let mut v = 0;
            while v < 8 {
                let d_row = &DCT_COEF[v];
                let mut acc = y_row[0] * f32x8::splat(token, d_row[0]);
                acc = y_row[1].mul_add(f32x8::splat(token, d_row[1]), acc);
                acc = y_row[2].mul_add(f32x8::splat(token, d_row[2]), acc);
                acc = y_row[3].mul_add(f32x8::splat(token, d_row[3]), acc);
                acc = y_row[4].mul_add(f32x8::splat(token, d_row[4]), acc);
                acc = y_row[5].mul_add(f32x8::splat(token, d_row[5]), acc);
                acc = y_row[6].mul_add(f32x8::splat(token, d_row[6]), acc);
                acc = y_row[7].mul_add(f32x8::splat(token, d_row[7]), acc);
                acc.store(&mut $out[v]);
                v += 1;
            }
        }};
    }

    // Expand once per plane; d_col stays live in registers across all
    // three expansions.
    dct2d_one_plane!(blk_y, coeffs_y);
    dct2d_one_plane!(blk_cb, coeffs_cb);
    dct2d_one_plane!(blk_cr, coeffs_cr);
}

/// Populate derived likelihood scores from leaf features computed by
/// other tiers. The const bools `T3` and `PAL` say which tiers ran:
///
/// - `T3` ⇒ `luma_histogram_entropy` is real → can compute
///   [`text_likelihood`] and (with `PAL`) [`natural_likelihood`].
/// - `PAL` ⇒ `distinct_color_bins` is real → can compute
///   [`screen_content_likelihood`] and (with `T3`) [`natural_likelihood`].
///
/// Tier 1 outputs (`cb_sharpness`, `cr_sharpness`, `edge_density`,
/// `flat_color_block_ratio`) are always available since Tier 1 is
/// always-on.
///
/// **Layered defense.** Even if [`crate::analyze_features`]'s
/// dispatch-axis tables drift and pick the wrong axis, this gate
/// refuses to write a likelihood whose dependencies weren't computed.
/// The field stays at `Default` (0.0), `into_results` doesn't emit
/// it (because the caller's [`feature::FeatureSet`] still has to
/// list it), and the caller gets `None`. **Never garbage.**
///
/// In release `if T3 { … }` is straight-line const-eval'd into the
/// caller's monomorphized variant, so the unused branches contribute
/// zero code and zero runtime cost.
///
/// **Empirical saturation.** Each likelihood is a weighted sum of
/// clamped sub-components and is `clamp(0, 1)`'d again at the end.
/// On a 219-image labeled corpus the observed maxes are:
///
/// - `text_likelihood`: max 0.71 (entropy_low + edge_hi + chroma_lo
///   don't all max simultaneously on real text)
/// - `screen_content_likelihood`: max 0.70 (typical screens have
///   > 4000 distinct color bins, forcing `palette_small` to 0)
/// - `natural_likelihood`: max 0.69 (entropy_hi + palette_large +
///   chroma_moderate + not_flat don't all max simultaneously)
///
/// Recommended consumer thresholds (best F1 on the labeled corpus):
/// `text_likelihood >= 0.30`, `screen_content_likelihood >= 0.60`,
/// `natural_likelihood >= 0.06` (photo detection). Thresholds at
/// or above 0.8 fire on nothing. See
/// `docs/calibration-corpus-2026-04-27.md` for the full empirical
/// distribution and AUC table.
///
/// [`text_likelihood`]: crate::feature::AnalysisFeature::TextLikelihood
/// [`natural_likelihood`]: crate::feature::AnalysisFeature::NaturalLikelihood
/// [`screen_content_likelihood`]: crate::feature::AnalysisFeature::ScreenContentLikelihood
#[cfg(feature = "composites")]
pub fn compute_derived_likelihoods<const T3: bool, const PAL: bool>(out: &mut RawAnalysis) {
    // Tier-1 leaf features are always present, so these sub-scores can
    // be formed unconditionally. Each sub-score is squeezed into
    // [0, 1] before weighting.
    let chroma = out.cb_sharpness + out.cr_sharpness;
    // 1.0 when combined chroma sharpness is ~0, fading to 0.0 at 0.005.
    let low_chroma = (0.005 - chroma).clamp(0.0, 0.005) / 0.005;
    // Saturates at an edge density of 0.25.
    let dense_edges = (out.edge_density / 0.25).min(1.0);
    // Saturates once half the blocks are flat-color.
    let very_flat = (out.flat_color_block_ratio / 0.5).min(1.0);

    if T3 {
        // Text: low luma entropy + dense edges + desaturated chroma.
        let low_entropy = (4.0 - out.luma_histogram_entropy).clamp(0.0, 4.0) / 4.0;
        out.text_likelihood =
            (low_entropy * 0.4 + dense_edges * 0.3 + low_chroma * 0.3).clamp(0.0, 1.0);
    }

    if PAL {
        // Screen content: flat blocks + small palette + muted chroma.
        // A bin count of 0 contributes no "small palette" signal rather
        // than the maximum — presumably 0 marks an unmeasured/empty
        // image (TODO confirm against the palette tier).
        let small_palette = match out.distinct_color_bins {
            0 => 0.0,
            bins => (1.0 - (bins as f32 / 4000.0).min(1.0)).clamp(0.0, 1.0),
        };
        out.screen_content_likelihood =
            (very_flat * 0.6 + small_palette * 0.3 + low_chroma * 0.1).clamp(0.0, 1.0);
    }

    if T3 && PAL {
        // Natural photo: high entropy, large palette, some chroma
        // detail, and few flat blocks.
        let high_entropy = (out.luma_histogram_entropy - 3.5).clamp(0.0, 1.5) / 1.5;
        let bins = out.distinct_color_bins;
        let large_palette = if bins < 2000 {
            0.0
        } else {
            // Ramps from 0 at 2000 bins to 1 at 10 000 bins.
            ((bins as f32 - 2000.0) / 8000.0).clamp(0.0, 1.0)
        };
        let some_chroma = (chroma / 0.012).min(1.0);
        let not_flat = (1.0 - (out.flat_color_block_ratio / 0.3).min(1.0)).clamp(0.0, 1.0);
        out.natural_likelihood =
            (high_entropy * 0.3 + large_palette * 0.25 + some_chroma * 0.2 + not_flat * 0.25)
                .clamp(0.0, 1.0);
    }
}

/// `composites`-disabled stub — keeps the call site in `lib.rs`
/// unconditional. With `composites` off, no likelihood fields exist
/// on `RawAnalysis`, so the body collapses to a no-op. Signature
/// (const generics included) mirrors the `composites` version exactly
/// so callers compile identically under either feature set.
#[cfg(not(feature = "composites"))]
pub fn compute_derived_likelihoods<const T3: bool, const PAL: bool>(_out: &mut RawAnalysis) {}

/// Dispatcher for the batched `log10(1 + ac)` reduction over the
/// `block_acs` accumulator collected during the DCT-stats pass.
/// Returns `(Σ log10(1+ac), Σ log10(1+ac)²)` as f64 — same shape
/// the per-block scalar version produced. `incant!` picks the best
/// compiled backend of `log10_sum_and_sq_sum_simd` at runtime.
fn log10_sum_and_sq_sum_dispatch(block_acs: &[f32]) -> (f64, f64) {
    incant!(log10_sum_and_sq_sum_simd(block_acs))
}

/// SIMD batched `log10(1 + ac)` and its square-sum, vectorised
/// 8 lanes at a time via magetypes `log10_lowp`. The low-precision
/// variant is ~12-bit accurate — far above the noise floor on the
/// `aq_map_std` 0–7 log scale that consumes this output. Switching
/// from per-block scalar `f64::ln() / LN_10` (~50 cycles each) to
/// `log10_lowp` over an 8-wide vector removes a per-block transcendental
/// call from the hot Tier 3 DCT loop.
///
/// Returns `(Σ log10(1+ac), Σ log10(1+ac)²)` over all of `block_acs`,
/// accumulated in f64 so the total stays exact across flushes.
#[magetypes(define(f32x8), v4, v3, neon, wasm128, scalar)]
fn log10_sum_and_sq_sum_simd(token: Token, block_acs: &[f32]) -> (f64, f64) {
    let one_v = f32x8::splat(token, 1.0);
    // 8-lane f32 partial sums, periodically flushed into the f64
    // totals below to bound f32 accumulation error.
    let mut sum_v = f32x8::zero(token);
    let mut sq_sum_v = f32x8::zero(token);
    let mut sum_f64: f64 = 0.0;
    let mut sq_sum_f64: f64 = 0.0;
    // FLUSH cadence — same `f32`-mantissa argument as the row-stats
    // pass: log10 outputs land in `[0, ~7]` so partial sums of 32
    // 8-lane chunks reach ~1.8 K, well below the 16 M f32 mantissa
    // boundary.
    const FLUSH: usize = 32;
    let mut iters_since_flush = 0usize;
    let chunks = block_acs.chunks_exact(8);
    // Grab the tail slice up front — the `for` below consumes the
    // iterator.
    let remainder = chunks.remainder();
    for chunk in chunks {
        // chunks_exact guarantees len == 8, so this conversion cannot
        // fail.
        let arr: &[f32; 8] = chunk.try_into().unwrap();
        let ac_v = f32x8::load(token, arr);
        let log_v = (ac_v + one_v).log10_lowp();
        sum_v += log_v;
        sq_sum_v = log_v.mul_add(log_v, sq_sum_v);
        iters_since_flush += 1;
        if iters_since_flush >= FLUSH {
            sum_f64 += sum_v.reduce_add() as f64;
            sq_sum_f64 += sq_sum_v.reduce_add() as f64;
            sum_v = f32x8::zero(token);
            sq_sum_v = f32x8::zero(token);
            iters_since_flush = 0;
        }
    }
    // Final flush of whatever partial vector sums remain.
    sum_f64 += sum_v.reduce_add() as f64;
    sq_sum_f64 += sq_sum_v.reduce_add() as f64;
    // Scalar tail (≤ 7 leftover blocks) — use the same `log10_lowp`
    // semantics the SIMD pass produced so the lowp/scalar boundary
    // doesn't drift the aggregate. Native `f32::log10` is fine here
    // because the tail count is tiny (typical ≤ 7 calls per analysis).
    for &ac in remainder {
        let l = (ac + 1.0).log10();
        sum_f64 += l as f64;
        sq_sum_f64 += (l as f64) * (l as f64);
    }
    (sum_f64, sq_sum_f64)
}