jxl-encoder 0.3.0

JPEG XL encoder in pure Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
// Copyright (c) Imazen LLC and the JPEG XL Project Authors.
// Algorithms and constants derived from libjxl (BSD-3-Clause).
// Licensed under AGPL-3.0-or-later. Commercial licenses at https://www.imazen.io/pricing

//! ANS (Asymmetric Numeral Systems) entropy code building and serialization.
//!
//! Contains all ANS-specific code: distribution building, header writing,
//! token writing, and verification/roundtrip utilities.

#![allow(dead_code)]

use super::ans::{ANSEncodingHistogram, ANSHistogramStrategy, AnsDistribution, AnsEncoder};
use super::context_map::move_to_front_transform;
use super::encode::{
    ALPHABET_SIZE, PrefixCode, encode_token_value, encode_token_value_with_config,
    write_var_len_uint16,
};
use super::encode_huffman::{
    convert_bit_depths_to_symbols, create_huffman_tree, write_prefix_code,
};
use super::hybrid_uint::HybridUintConfig;
use super::lz77::Lz77Params;
use super::token::{Lz77UintCoder, Token, UintCoder};
use crate::bit_writer::BitWriter;
use crate::error::{Error, Result};

/// Log2 of alphabet size for ANS. With split=4, max token is 4+31=35, so we need 6 bits.
///
/// This is the default; the actual `log_alpha_size` stored in
/// [`OwnedAnsEntropyCode`] is raised to 8 when LZ77 is enabled, or grown
/// (clamped to 5..=8) when a histogram's alphabet exceeds 64 symbols.
pub const ANS_LOG_ALPHA_SIZE: usize = 6;

/// An owned ANS entropy code (context map + ANS distributions on the heap).
///
/// Produced by `build_entropy_code_from_accumulated_ans` and serialized with
/// `write_entropy_code_ans`. As built there, `histograms`, `distributions`,
/// and `uint_configs` all have one entry per clustered histogram (equal
/// lengths); other construction paths may leave `uint_configs` empty.
#[derive(Debug)]
pub struct OwnedAnsEntropyCode {
    /// Context map: maps context ID -> distribution index.
    pub context_map: Vec<u8>,
    /// ANS encoding histograms for header serialization.
    pub histograms: Vec<ANSEncodingHistogram>,
    /// ANS distributions for runtime encoding.
    pub distributions: Vec<AnsDistribution>,
    /// Log2 of alphabet size. 6 for normal, 8 when LZ77 is enabled.
    pub log_alpha_size: usize,
    /// Per-histogram HybridUint configs (one per histogram).
    /// When empty, all histograms use the default {4, 2, 0} config.
    pub uint_configs: Vec<HybridUintConfig>,
}

impl OwnedAnsEntropyCode {
    /// Get the number of distributions.
    ///
    /// Equals the number of clustered histograms, since one distribution is
    /// built per histogram.
    pub fn num_distributions(&self) -> usize {
        self.distributions.len()
    }
}

/// Accumulated histogram data from a token stream (or multiple token streams).
///
/// This struct captures all the statistical information needed to build an ANS
/// entropy code, without retaining the tokens themselves. This enables a two-phase
/// encode approach: accumulate statistics (Phase 1) → build codes → re-tokenize
/// and write (Phase 2), avoiding O(total_tokens) memory for large images.
///
/// Each of the three vectors is indexed by context ID and has exactly
/// `num_contexts` entries.
pub struct AccumulatedAnsData {
    /// Per-context symbol frequency counts.
    pub histograms: Vec<super::histogram::Histogram>,
    /// Per-context raw value frequencies (value → count).
    /// Used for HybridUint config optimization after histogram clustering.
    pub value_freqs: Vec<alloc::collections::BTreeMap<u32, u32>>,
    /// Per-context LZ77 symbol frequencies (symbol → count).
    /// Only populated for LZ77 length tokens, and only when LZ77 params are given.
    pub lz77_freqs: Vec<alloc::collections::BTreeMap<u32, u32>>,
    /// Number of contexts.
    pub num_contexts: usize,
}

impl AccumulatedAnsData {
    /// Create a new empty accumulator for the given number of contexts.
    pub fn new(num_contexts: usize) -> Self {
        let mut histograms = Vec::with_capacity(num_contexts);
        let mut value_freqs = Vec::with_capacity(num_contexts);
        let mut lz77_freqs = Vec::with_capacity(num_contexts);
        for _ in 0..num_contexts {
            histograms.push(super::histogram::Histogram::new());
            value_freqs.push(alloc::collections::BTreeMap::new());
            lz77_freqs.push(alloc::collections::BTreeMap::new());
        }
        Self {
            histograms,
            value_freqs,
            lz77_freqs,
            num_contexts,
        }
    }

    /// Record one token: bump the symbol histogram for its context, and track
    /// either the raw value frequency or (for LZ77 length tokens) the offset
    /// LZ77 symbol frequency.
    #[inline]
    pub fn add_token(&mut self, token: &Token, lz77: Option<&Lz77Params>) {
        let ctx = token.context() as usize;
        // Tokens whose context is out of range are silently ignored.
        if ctx >= self.num_contexts {
            return;
        }
        let (_encoded, sym) = encode_token_value(token, lz77);
        self.histograms[ctx].add(sym as usize);
        if token.is_lz77_length() {
            // LZ77 lengths are re-coded with the LZ77 coder and offset into
            // the symbol range above min_symbol.
            if let Some(params) = lz77 {
                let enc = Lz77UintCoder::encode(token.value);
                let symbol = enc.token + params.min_symbol;
                *self.lz77_freqs[ctx].entry(symbol).or_insert(0) += 1;
            }
        } else {
            *self.value_freqs[ctx].entry(token.value).or_insert(0) += 1;
        }
    }

    /// Accumulate all tokens from a slice.
    pub fn add_tokens(&mut self, tokens: &[Token], lz77: Option<&Lz77Params>) {
        tokens.iter().for_each(|t| self.add_token(t, lz77));
    }

    /// Merge another accumulator into this one (for combining per-thread results).
    pub fn merge(&mut self, other: &Self) {
        debug_assert_eq!(self.num_contexts, other.num_contexts);
        for ctx in 0..self.num_contexts {
            self.histograms[ctx].add_histogram(&other.histograms[ctx]);
            for (&val, &n) in other.value_freqs[ctx].iter() {
                *self.value_freqs[ctx].entry(val).or_insert(0) += n;
            }
            for (&sym, &n) in other.lz77_freqs[ctx].iter() {
                *self.lz77_freqs[ctx].entry(sym).or_insert(0) += n;
            }
        }
    }
}

/// Build an ANS entropy code from accumulated histogram data.
///
/// This is Phase B of the two-phase approach: takes pre-accumulated statistics
/// and builds the entropy code (clustering, HybridUint optimization, ANS distributions).
///
/// Steps:
/// 1. Cluster per-context histograms into at most `max_histograms` merged
///    histograms, yielding the context map.
/// 2. Re-merge per-context value/LZ77 frequencies along the context map.
/// 3. Pick a HybridUint config per merged histogram (optional).
/// 4. Normalize each histogram and build the runtime ANS distributions,
///    with a global `log_alpha_size` covering all of them.
///
/// # Panics
/// Panics via `expect` if clustering, histogram normalization, or
/// distribution building fails — treated as internal invariant violations.
pub fn build_entropy_code_from_accumulated_ans(
    data: AccumulatedAnsData,
    enhanced_clustering: bool,
    optimize_uint_configs: bool,
    lz77: Option<&Lz77Params>,
    total_pixel_hint: Option<usize>,
) -> OwnedAnsEntropyCode {
    use crate::entropy_coding::cluster::{
        ClusteringType, EntropyType, cluster_histograms as enhanced_cluster,
    };
    use crate::entropy_coding::histogram::Histogram as EnhancedHistogram;

    let num_contexts = data.num_contexts;

    // Cluster histograms
    let cluster_type = if enhanced_clustering {
        ClusteringType::Best
    } else {
        ClusteringType::Fast
    };

    // Cap histogram count: hard limit 128, and for small images roughly one
    // histogram per 2048 pixels (at least 1) so header overhead stays bounded.
    let mut max_histograms = num_contexts.min(128);
    if let Some(tp) = total_pixel_hint {
        max_histograms = max_histograms.min((tp / 2048).max(1));
    }
    let result = enhanced_cluster(
        cluster_type,
        EntropyType::Ans,
        &data.histograms,
        max_histograms,
    )
    .expect("ANS clustering failed");

    // One entry per original context, mapping it to its merged histogram.
    let context_map: Vec<u8> = result.symbols.iter().map(|&s| s as u8).collect();
    debug_assert_eq!(context_map.len(), num_contexts);

    // Merge per-context value frequencies into per-merged-histogram frequencies
    // using the context map from clustering.
    let num_histograms = result.histograms.len();
    let mut merged_value_freqs: Vec<alloc::collections::BTreeMap<u32, u32>> = (0..num_histograms)
        .map(|_| alloc::collections::BTreeMap::new())
        .collect();
    let mut merged_lz77_freqs: Vec<alloc::collections::BTreeMap<u32, u32>> = (0..num_histograms)
        .map(|_| alloc::collections::BTreeMap::new())
        .collect();
    for (ctx, &cm) in context_map.iter().enumerate() {
        let histo_idx = cm as usize;
        if histo_idx < num_histograms {
            for (&val, &count) in &data.value_freqs[ctx] {
                *merged_value_freqs[histo_idx].entry(val).or_insert(0) += count;
            }
            for (&sym, &count) in &data.lz77_freqs[ctx] {
                *merged_lz77_freqs[histo_idx].entry(sym).or_insert(0) += count;
            }
        }
    }

    // Optimize per-histogram HybridUint configs from merged value frequencies.
    // libjxl uses uint_method=kNone (no optimization) for VarDCT AC/DC at effort < 9.
    // The fast optimization can pick non-default configs whose signaling overhead
    // exceeds their coding benefit on VarDCT streams.
    let uint_configs = if !optimize_uint_configs {
        vec![HybridUintConfig::new(4, 2, 0); num_histograms]
    } else if enhanced_clustering {
        optimize_uint_configs_best_from_freqs(&merged_value_freqs, lz77)
    } else {
        optimize_uint_configs_fast_from_freqs(&merged_value_freqs, lz77)
    };

    // Build ANS histograms and distributions with the optimized configs.
    let mut ans_histograms = Vec::with_capacity(num_histograms);
    let mut ans_distributions = Vec::with_capacity(num_histograms);
    let allowed_cache = super::ans::AllowedCountsCache::new();

    for h in 0..num_histograms {
        let config = &uint_configs[h];
        // Re-bucket raw values through the chosen config; LZ77 symbols are
        // already final and are added directly.
        let mut counts: Vec<u32> = Vec::new();
        for (&val, &freq) in &merged_value_freqs[h] {
            let (tok, _, _) = config.encode(val);
            let sym = tok as usize;
            if sym >= counts.len() {
                counts.resize(sym + 1, 0);
            }
            counts[sym] += freq;
        }
        for (&sym, &freq) in &merged_lz77_freqs[h] {
            let s = sym as usize;
            if s >= counts.len() {
                counts.resize(s + 1, 0);
            }
            counts[s] += freq;
        }
        // A histogram that received no tokens still needs one (zero) slot.
        if counts.is_empty() {
            counts.push(0);
        }

        let i32_counts: Vec<i32> = counts.iter().map(|&c| c as i32).collect();
        let histo = EnhancedHistogram::from_counts(&i32_counts);

        let ans_histo = ANSEncodingHistogram::from_histogram_cached(
            &histo,
            ANSHistogramStrategy::Precise,
            &allowed_cache,
        )
        .expect("ANS histogram normalization failed");
        ans_histograms.push(ans_histo);
    }

    // Compute global log_alpha_size: 8 with LZ77, else 6 when every alphabet
    // fits in 64 symbols, else the minimum bit width clamped to 5..=8.
    let max_alphabet_size = ans_histograms
        .iter()
        .map(|h| h.counts.len())
        .max()
        .unwrap_or(1);
    let log_alpha_size = if lz77.is_some_and(|p| p.enabled) {
        8
    } else if max_alphabet_size <= (1 << ANS_LOG_ALPHA_SIZE) {
        ANS_LOG_ALPHA_SIZE
    } else {
        // NOTE(review): this arm only runs when max_alphabet_size > 64, so the
        // `<= 1` guard below is unreachable; kept as a defensive default.
        let min_bits = if max_alphabet_size <= 1 {
            5
        } else {
            (max_alphabet_size - 1).ilog2() as usize + 1
        };
        min_bits.clamp(5, 8)
    };

    for ans_histo in &ans_histograms {
        let ans_dist = AnsDistribution::from_normalized_counts_with_log_alpha(
            &ans_histo.counts,
            log_alpha_size,
        )
        .expect("ANS distribution building failed");
        ans_distributions.push(ans_dist);
    }

    OwnedAnsEntropyCode {
        context_map,
        histograms: ans_histograms,
        distributions: ans_distributions,
        log_alpha_size,
        uint_configs,
    }
}

/// Build an ANS entropy code from collected tokens.
///
/// Convenience wrapper around `build_entropy_code_ans_with_options` with
/// defaults: fast clustering, HybridUint config optimization enabled,
/// no LZ77, and no pixel-count hint.
///
/// 1. Creates per-context histograms from all tokens.
/// 2. Clusters histograms (capped at 128 clusters) to produce a context map.
/// 3. Normalizes each cluster histogram.
/// 4. Builds ANS distributions for encoding.
pub fn build_entropy_code_ans(tokens: &[Token], num_contexts: usize) -> OwnedAnsEntropyCode {
    build_entropy_code_ans_with_options(tokens, num_contexts, false, true, None, None)
}

/// Build an ANS entropy code with optional enhanced clustering.
///
/// Thin wrapper that forwards a single token slice to
/// `build_entropy_code_ans_from_token_groups` as a one-element group list.
///
/// When `lz77` is Some, LZ77 length tokens use Lz77UintCoder and are offset by min_symbol.
/// When `total_pixel_hint` is Some, max_histograms is capped to `total_pixels / 2048` (min 1)
/// to prevent header overhead from dominating on small images.
pub fn build_entropy_code_ans_with_options(
    tokens: &[Token],
    num_contexts: usize,
    enhanced_clustering: bool,
    optimize_uint_configs: bool,
    lz77: Option<&Lz77Params>,
    total_pixel_hint: Option<usize>,
) -> OwnedAnsEntropyCode {
    build_entropy_code_ans_from_token_groups(
        &[tokens],
        num_contexts,
        enhanced_clustering,
        optimize_uint_configs,
        lz77,
        total_pixel_hint,
    )
}

/// Build an ANS entropy code from multiple token groups without merging.
///
/// Like `build_entropy_code_ans_with_options`, but accepts separate token slices
/// (e.g., per-group tokens) and iterates them without creating a merged copy.
/// This avoids allocating a merged Vec that can be hundreds of MB for large images.
///
/// Internally uses the two-phase accumulate + build approach: collects per-context
/// histograms and value frequencies in a single pass, then builds codes from the
/// accumulated data.
///
/// # Panics
/// In debug builds only, panics if any token maps to a symbol outside its
/// distribution's alphabet or to a zero-frequency symbol (which would make
/// the stream undecodable).
pub fn build_entropy_code_ans_from_token_groups(
    groups: &[&[Token]],
    num_contexts: usize,
    enhanced_clustering: bool,
    optimize_uint_configs: bool,
    lz77: Option<&Lz77Params>,
    total_pixel_hint: Option<usize>,
) -> OwnedAnsEntropyCode {
    // Phase A: Accumulate per-context histograms and value frequencies.
    let mut accumulated = AccumulatedAnsData::new(num_contexts);
    for group in groups {
        accumulated.add_tokens(group, lz77);
    }

    // Phase B: Build entropy code from accumulated data.
    let code = build_entropy_code_from_accumulated_ans(
        accumulated,
        enhanced_clustering,
        optimize_uint_configs,
        lz77,
        total_pixel_hint,
    );

    // Validate: every token in the stream must have a valid, non-zero frequency
    // in the distribution it maps to. Only in debug builds — this is O(n) over all tokens.
    #[cfg(debug_assertions)]
    for group in groups {
        for (i, token) in group.iter().enumerate() {
            let ctx = token.context() as usize;
            // Out-of-range contexts fall back to distribution 0 here, mirroring
            // the lenient lookup used at encode time.
            let dist_idx = code.context_map.get(ctx).copied().unwrap_or(0) as usize;
            let config = &code.uint_configs[dist_idx];
            let (_encoded, sym) = encode_token_value_with_config(token, lz77, config);
            let dist = &code.distributions[dist_idx];
            let tok = sym as usize;
            if tok >= dist.symbols.len() {
                panic!(
                    "ANS validation: token[{}] ctx={} val={} tok={} exceeds distribution alphabet_size={} (dist_idx={})",
                    i,
                    ctx,
                    token.value,
                    tok,
                    dist.symbols.len(),
                    dist_idx
                );
            }
            if dist.symbols[tok].freq == 0 {
                panic!(
                    "ANS validation: token[{}] ctx={} val={} tok={} has zero frequency in distribution (dist_idx={})",
                    i, ctx, token.value, tok, dist_idx
                );
            }
        }
    }

    code
}

/// Optimize HybridUint config per histogram from value frequency maps
/// (matches libjxl's kFast method).
///
/// For each histogram, tries 4 candidate configs and picks the one with the
/// lowest estimated cost (Shannon entropy of re-encoded tokens + extra bits +
/// signaling cost). Iterates (value, count) pairs from the frequency maps
/// instead of individual values, avoiding O(tokens) storage.
///
/// Histograms with no recorded values keep the default {4, 2, 0} config.
fn optimize_uint_configs_fast_from_freqs(
    freqs_per_histo: &[alloc::collections::BTreeMap<u32, u32>],
    lz77: Option<&Lz77Params>,
) -> Vec<HybridUintConfig> {
    use crate::entropy_coding::ans::ANS_MAX_ALPHABET_SIZE;

    // The 4 candidate configs tried by libjxl at this effort level.
    let candidates = [
        HybridUintConfig::new(4, 2, 0),
        HybridUintConfig::new(4, 1, 2),
        HybridUintConfig::new(0, 0, 0),
        HybridUintConfig::new(2, 0, 1),
    ];

    let num_histograms = freqs_per_histo.len();
    let max_alpha = ANS_MAX_ALPHABET_SIZE;

    let mut best_configs = vec![HybridUintConfig::new(4, 2, 0); num_histograms];
    // Scratch buffer reused across histograms/configs to avoid reallocating.
    let mut counts_buf: Vec<u32> = Vec::new();

    for h in 0..num_histograms {
        let freqs = &freqs_per_histo[h];
        if freqs.is_empty() {
            continue;
        }

        // BTreeMap keys are sorted, so max() is the last key; total is the token count.
        let max_value = freqs.keys().copied().max().unwrap_or(0);
        let total: u32 = freqs.values().sum();
        let mut best_cost = f64::MAX;

        for &cfg in &candidates {
            // Worst-case token index: the largest value's token with all LSB
            // bits set. Reject configs whose tokens would overflow the ANS
            // alphabet or collide with the LZ77 symbol range.
            let (max_tok, _, _) = cfg.encode(max_value);
            let max_tok_with_lsb = max_tok | ((1u32 << cfg.lsb_in_token) - 1);
            if max_tok_with_lsb as usize >= max_alpha {
                continue;
            }
            if let Some(lz77_params) = lz77
                && max_tok_with_lsb >= lz77_params.min_symbol
            {
                continue;
            }

            // Re-bucket all values under this config, tallying token counts
            // and the total number of raw extra bits.
            let capacity = max_tok_with_lsb as usize + 1;
            counts_buf.clear();
            counts_buf.resize(capacity, 0);
            let mut extra_bits_total: u64 = 0;
            for (&val, &freq) in freqs {
                let (tok, _, nbits) = cfg.encode(val);
                counts_buf[tok as usize] += freq;
                extra_bits_total += nbits as u64 * freq as u64;
            }

            // Shannon entropy estimate of the token stream, in bits.
            let inv_total = 1.0f32 / total as f32;
            let mut entropy_cost = 0.0f64;
            for &count in &counts_buf[..capacity] {
                if count > 0 {
                    let c = count as f32;
                    entropy_cost -= c as f64 * jxl_simd::fast_log2f(c * inv_total) as f64;
                }
            }

            // Header bits needed to signal this config (split=0 is free).
            let signaling_cost = if cfg.split_exponent == 0 {
                0.0
            } else {
                ceil_log2_nonzero_usize(cfg.split_exponent as usize + 1) as f64
                    + ceil_log2_nonzero_usize((cfg.split_exponent - cfg.msb_in_token) as usize + 1)
                        as f64
            };
            let cost = entropy_cost + extra_bits_total as f64 + signaling_cost;

            if cost < best_cost {
                best_cost = cost;
                best_configs[h] = cfg;
            }
        }
    }

    best_configs
}

/// Optimize HybridUint config per histogram from value frequency maps (kBest method).
///
/// Tries 28 curated configs per histogram (from libjxl enc_ans.cc:747-783).
/// More thorough than kFast (4 configs) but 7x more work. Iterates (value, count)
/// pairs from frequency maps instead of individual values, avoiding O(tokens) storage.
///
/// Cost model and candidate filtering are identical to
/// `optimize_uint_configs_fast_from_freqs`; only the candidate set differs.
fn optimize_uint_configs_best_from_freqs(
    freqs_per_histo: &[alloc::collections::BTreeMap<u32, u32>],
    lz77: Option<&Lz77Params>,
) -> Vec<HybridUintConfig> {
    use crate::entropy_coding::ans::ANS_MAX_ALPHABET_SIZE;

    #[rustfmt::skip]
    let candidates = [
        HybridUintConfig::new(0,0,0),  HybridUintConfig::new(1,0,0),
        HybridUintConfig::new(2,0,0),  HybridUintConfig::new(2,0,1),
        HybridUintConfig::new(3,0,0),  HybridUintConfig::new(3,1,0),
        HybridUintConfig::new(3,0,1),  HybridUintConfig::new(3,1,1),
        HybridUintConfig::new(4,0,0),  HybridUintConfig::new(4,2,0),
        HybridUintConfig::new(4,1,0),  HybridUintConfig::new(4,0,1),
        HybridUintConfig::new(4,2,1),  HybridUintConfig::new(4,1,1),
        HybridUintConfig::new(5,0,0),  HybridUintConfig::new(5,2,0),
        HybridUintConfig::new(5,1,0),  HybridUintConfig::new(5,0,1),
        HybridUintConfig::new(5,2,1),  HybridUintConfig::new(6,0,0),
        HybridUintConfig::new(6,2,0),  HybridUintConfig::new(6,1,0),
        HybridUintConfig::new(7,0,0),  HybridUintConfig::new(7,2,0),
        HybridUintConfig::new(8,0,0),  HybridUintConfig::new(8,2,0),
        HybridUintConfig::new(10,0,0), HybridUintConfig::new(12,0,0),
    ];

    let num_histograms = freqs_per_histo.len();
    let max_alpha = ANS_MAX_ALPHABET_SIZE;

    let mut best_configs = vec![HybridUintConfig::new(4, 2, 0); num_histograms];
    // Scratch buffer reused across histograms/configs to avoid reallocating.
    let mut counts_buf: Vec<u32> = Vec::new();

    for h in 0..num_histograms {
        let freqs = &freqs_per_histo[h];
        if freqs.is_empty() {
            continue;
        }

        let max_value = freqs.keys().copied().max().unwrap_or(0);
        let total: u32 = freqs.values().sum();
        let mut best_cost = f64::MAX;

        for &cfg in &candidates {
            // Reject configs whose worst-case token would overflow the ANS
            // alphabet or collide with the LZ77 symbol range.
            let (max_tok, _, _) = cfg.encode(max_value);
            let max_tok_with_lsb = max_tok | ((1u32 << cfg.lsb_in_token) - 1);
            if max_tok_with_lsb as usize >= max_alpha {
                continue;
            }
            if let Some(lz77_params) = lz77
                && max_tok_with_lsb >= lz77_params.min_symbol
            {
                continue;
            }

            // Re-bucket all values under this config.
            let capacity = max_tok_with_lsb as usize + 1;
            counts_buf.clear();
            counts_buf.resize(capacity, 0);
            let mut extra_bits_total: u64 = 0;
            for (&val, &freq) in freqs {
                let (tok, _, nbits) = cfg.encode(val);
                counts_buf[tok as usize] += freq;
                extra_bits_total += nbits as u64 * freq as u64;
            }

            // Shannon entropy estimate of the token stream, in bits.
            let inv_total = 1.0f32 / total as f32;
            let mut entropy_cost = 0.0f64;
            for &count in &counts_buf[..capacity] {
                if count > 0 {
                    let c = count as f32;
                    entropy_cost -= c as f64 * jxl_simd::fast_log2f(c * inv_total) as f64;
                }
            }

            // Header bits needed to signal this config (split=0 is free).
            let signaling_cost = if cfg.split_exponent == 0 {
                0.0
            } else {
                ceil_log2_nonzero_usize(cfg.split_exponent as usize + 1) as f64
                    + ceil_log2_nonzero_usize((cfg.split_exponent - cfg.msb_in_token) as usize + 1)
                        as f64
            };
            let cost = entropy_cost + extra_bits_total as f64 + signaling_cost;

            if cost < best_cost {
                best_cost = cost;
                best_configs[h] = cfg;
            }
        }
    }

    best_configs
}

/// Write ANS entropy code header (context map + distributions).
///
/// Bitstream layout written here (LZ77 flags are the caller's responsibility):
/// context map, then `use_prefix_code=0`, `log_alpha_size - 5` (2 bits),
/// one HybridUint config per histogram, then each ANS distribution.
///
/// # Errors
/// Propagates any `BitWriter` write failure.
pub fn write_entropy_code_ans(code: &OwnedAnsEntropyCode, writer: &mut BitWriter) -> Result<()> {
    #[cfg(feature = "debug-tokens")]
    {
        eprintln!("write_entropy_code_ans:");
        eprintln!("  num_contexts: {}", code.context_map.len());
        eprintln!("  num_histograms: {}", code.histograms.len());
        eprintln!(
            "  context_map: {:?}",
            &code.context_map[..code.context_map.len().min(20)]
        );
        for (i, h) in code.histograms.iter().enumerate() {
            eprintln!(
                "  histogram[{}]: alphabet_size={}, method={}, counts[..8]={:?}",
                i,
                h.alphabet_size,
                h.method,
                &h.counts[..h.counts.len().min(8)]
            );
        }
    }

    // Write context map (same format as Huffman)
    // Note: LZ77 is already written by the caller (write_dc_global or write_ac_global)
    let _cm_start = writer.bits_written();
    write_context_map_for_ans(code, writer)?;

    #[cfg(feature = "debug-tokens")]
    eprintln!("  context_map: {} bits", writer.bits_written() - _cm_start);

    // Write use_prefix_code = 0 (use ANS, not Huffman)
    writer.write(1, 0)?;

    // Write log_alpha_size - 5 (log_alpha_size is in 5..=8, so this fits 2 bits)
    let las = code.log_alpha_size;
    writer.write(2, (las - 5) as u64)?;

    #[cfg(feature = "debug-tokens")]
    eprintln!("  use_prefix_code=0, log_alpha_size={}", las);

    // Write HybridUint configs for each histogram.
    // Missing configs (uint_configs shorter than histograms) fall back to default.
    let _cfg_start = writer.bits_written();
    for (i, _) in code.histograms.iter().enumerate() {
        let config = code.uint_configs.get(i).copied().unwrap_or_default();
        write_hybrid_uint_config_value(las, &config, writer)?;
    }

    #[cfg(feature = "debug-tokens")]
    eprintln!(
        "  HybridUint configs: {} bits ({} histograms)",
        writer.bits_written() - _cfg_start,
        code.histograms.len()
    );

    // Write ANS distributions
    let _hist_start = writer.bits_written();
    #[allow(clippy::unused_enumerate_index)]
    for (_i, histo) in code.histograms.iter().enumerate() {
        let _h_start = writer.bits_written();
        histo.write(writer)?;
        #[cfg(feature = "debug-tokens")]
        eprintln!(
            "  histogram[{}]: {} bits",
            _i,
            writer.bits_written() - _h_start
        );
    }

    #[cfg(feature = "debug-tokens")]
    eprintln!(
        "  All histograms: {} bits",
        writer.bits_written() - _hist_start
    );

    Ok(())
}

/// Write context map for ANS entropy code.
///
/// Matches libjxl's EncodeContextMap: always compares simple (raw bits) vs
/// non-simple (Huffman+MTF) and picks whichever is smaller. Previous code
/// unconditionally used simple for ≤8 histograms, which wastes bits when the
/// context map is large and repetitive (e.g. 1485 AC contexts with 8 histograms:
/// simple = 4455 bits, Huffman+MTF ≈ 800 bits).
///
/// # Errors
/// Propagates any `BitWriter` write failure.
fn write_context_map_for_ans(code: &OwnedAnsEntropyCode, writer: &mut BitWriter) -> Result<()> {
    let num_histograms = code.histograms.len();

    if num_histograms == 1 {
        // Simple context map: all contexts map to histogram 0
        writer.write(1, 1)?; // simple_context_map = true
        writer.write(2, 0)?; // nbits = 0
        return Ok(());
    }

    // Compute entry_bits for simple encoding: CeilLog2Nonzero(num_histograms)
    let entry_bits = ceil_log2_nonzero_usize(num_histograms);

    // Simple encoding is only possible when entry_bits < 4 (≤8 histograms).
    // When possible, compare simple vs non-simple and pick the cheaper one.
    // This matches libjxl enc_context_map.cc:113.
    if entry_bits < 4 {
        let simple_cost = 3 + entry_bits * code.context_map.len(); // 1 (is_simple) + 2 (nbits) + data

        // Write non-simple to a scratch writer to measure actual cost
        let mut scratch = BitWriter::with_capacity(code.context_map.len());
        write_context_map_nonsimple(&code.context_map, &mut scratch)?;
        let nonsimple_cost = scratch.bits_written();

        if simple_cost <= nonsimple_cost {
            // Simple is cheaper (or equal), use it
            writer.write(1, 1)?; // simple_context_map = true
            writer.write(2, entry_bits as u64)?;
            for &ctx in &code.context_map {
                writer.write(entry_bits, ctx as u64)?;
            }
            return Ok(());
        }
        // Non-simple is cheaper — copy the scratch bits
        let scratch_bytes = scratch.finish_with_padding();
        let bits_to_copy = nonsimple_cost;
        // Copy bit-by-bit from scratch to writer (scratch is byte-aligned but
        // writer may not be). Use the raw bytes and copy the exact bit count.
        copy_bits(&scratch_bytes, bits_to_copy, writer)?;
        return Ok(());
    }

    // > 8 histograms: always use non-simple
    write_context_map_nonsimple(&code.context_map, writer)
}

/// Write a non-simple context map (Huffman-encoded with optional MTF).
///
/// Format: is_simple=0, use_mtf, lz77_enabled=0, then Huffman prefix code + data.
///
/// The map entries are tokenized with a fixed HybridUint config
/// (split=4, msb=2, lsb=0), a length-limited Huffman code is built over the
/// token histogram, and the tokens are emitted with that code. Whether to
/// apply the move-to-front transform first is decided by comparing
/// Shannon-entropy estimates of the raw vs. transformed sequences.
fn write_context_map_nonsimple(context_map: &[u8], writer: &mut BitWriter) -> Result<()> {
    // Try both direct and MTF, pick whichever has lower estimated cost.
    let mtf_tokens = move_to_front_transform(context_map);

    let direct_cost = estimate_context_map_cost(context_map);
    let mtf_cost = estimate_context_map_cost(&mtf_tokens);
    let use_mtf = mtf_cost < direct_cost;
    let tokens: &[u8] = if use_mtf { &mtf_tokens } else { context_map };

    // is_simple=0, use_mtf, lz77_enabled=0 (3 bits packed)
    let header_bits = if use_mtf { 0b010u64 } else { 0b000u64 };
    writer.write(3, header_bits)?;

    // Now write a Huffman-encoded entropy code for the context map values.
    // Since num_contexts=1 for the context map's own entropy code, no inner context map.

    // use_prefix_code = 1 (Huffman)
    writer.write(1, 1)?;

    // HybridUint config: split=4, msb=2, lsb=0 (same as our UintCoder)
    writer.write(4, 4)?; // split_exponent = 4
    writer.write(3, 2)?; // msb_in_token = 2
    writer.write(2, 0)?; // lsb_in_token = 0

    // Build histogram of encoded token symbols
    let mut histogram = [0u32; ALPHABET_SIZE];
    for &t in tokens {
        let encoded = UintCoder::encode(t as u32);
        histogram[encoded.token as usize] += 1;
    }

    // Find alphabet length (trim trailing zeros)
    let mut length = ALPHABET_SIZE;
    while length > 0 && histogram[length - 1] == 0 {
        length -= 1;
    }
    // Keep at least one symbol even for an all-zero histogram.
    length = length.max(1);

    // Create Huffman tree (code lengths capped at 15 bits)
    let mut depths = [0u8; ALPHABET_SIZE];
    create_huffman_tree(&histogram, length, 15, &mut depths);

    let mut bits = [0u16; ALPHABET_SIZE];
    convert_bit_depths_to_symbols(&depths, &mut bits);

    // Write alphabet size
    write_var_len_uint16(length - 1, writer)?;

    // Write prefix code tree. A single-symbol alphabet needs no tree:
    // the lone symbol is implied by length == 1.
    if length > 1 {
        let pc = PrefixCode { depths, bits };
        write_prefix_code(&pc, writer)?;
    }

    // Write encoded context map entries: the Huffman code word occupies the
    // low `depth` bits, with the HybridUint extra bits packed above it.
    for &t in tokens {
        let encoded = UintCoder::encode(t as u32);
        let tok = encoded.token as usize;
        let depth = depths[tok] as usize;
        let b = bits[tok] as u64;
        let data = b | ((encoded.bits as u64) << depth);
        let total_bits = depth + encoded.nbits as usize;
        writer.write(total_bits, data)?;
    }

    Ok(())
}

/// Append the first `num_bits` bits of `src` to `writer`.
///
/// Whole bytes are forwarded unchanged; a trailing partial byte is masked
/// down to its low bits so no padding from `src` leaks into the stream.
fn copy_bits(src: &[u8], num_bits: usize, writer: &mut BitWriter) -> Result<()> {
    let whole = num_bits >> 3;
    let tail = num_bits & 7;

    for &b in src.iter().take(whole) {
        writer.write(8, b as u64)?;
    }
    if tail != 0 {
        // Keep only the `tail` low-order bits of the final byte.
        let masked = (src[whole] as u64) & ((1u64 << tail) - 1);
        writer.write(tail, masked)?;
    }
    Ok(())
}

/// Estimate the Shannon entropy cost in bits of a byte sequence
/// (used to choose between direct and MTF context-map encodings).
fn estimate_context_map_cost(tokens: &[u8]) -> f64 {
    if tokens.is_empty() {
        return 0.0;
    }

    // Histogram the byte values.
    let mut freq = [0u32; 256];
    tokens.iter().for_each(|&t| freq[t as usize] += 1);

    // Accumulate -p * log2(p) over observed symbols (entropy in bits/symbol),
    // then scale by sequence length for the total bit cost. f32 math with the
    // fast log approximation is deliberate: this is only a heuristic estimate.
    let scale = 1.0f32 / tokens.len() as f32;
    let mut entropy = 0.0f32;
    for &n in freq.iter() {
        if n > 0 {
            let p = n as f32 * scale;
            entropy -= p * jxl_simd::fast_log2f(p);
        }
    }
    f64::from(entropy * tokens.len() as f32)
}

/// Write a HybridUint configuration (split/msb/lsb) to the bitstream.
///
/// Field widths follow the JPEG XL convention: each value is written with
/// CeilLog2Nonzero(max + 1) bits, where `max` is the largest value that
/// field can legally take given the preceding fields.
fn write_hybrid_uint_config_value(
    log_alpha_size: usize,
    config: &HybridUintConfig,
    writer: &mut BitWriter,
) -> Result<()> {
    // split_exponent: CeilLog2Nonzero(log_alpha_size + 1) bits.
    writer.write(
        ceil_log2_nonzero_usize(log_alpha_size + 1),
        config.split_exponent as u64,
    )?;

    // When split_exponent == log_alpha_size, every value fits in the token,
    // so msb/lsb are implied and not written at all.
    if config.split_exponent as usize != log_alpha_size {
        // msb_in_token: CeilLog2Nonzero(split_exponent + 1) bits.
        writer.write(
            ceil_log2_nonzero_usize(config.split_exponent as usize + 1),
            config.msb_in_token as u64,
        )?;
        // lsb_in_token: CeilLog2Nonzero(split_exponent - msb_in_token + 1) bits.
        writer.write(
            ceil_log2_nonzero_usize((config.split_exponent - config.msb_in_token) as usize + 1),
            config.lsb_in_token as u64,
        )?;
    }

    Ok(())
}

/// CeilLog2Nonzero for usize, matching libjxl: the smallest `k` with `2^k >= x`.
///
/// Returns 0 for `x == 1`. Operates on the full `usize` width — the previous
/// implementation truncated `x` to `u32`, which silently produced wrong
/// answers for values above `u32::MAX` on 64-bit targets.
///
/// # Panics
/// Debug-asserts that `x > 0` (log2 of zero is undefined).
fn ceil_log2_nonzero_usize(x: usize) -> usize {
    debug_assert!(x > 0);
    if x <= 1 {
        0
    } else {
        // For x >= 2, ceil(log2(x)) equals the bit-width of (x - 1).
        (usize::BITS - (x - 1).leading_zeros()) as usize
    }
}

/// Write tokens using ANS entropy coding.
///
/// Tokens are processed in reverse order (ANS requirement), and the output
/// is written in the correct forward order for the decoder.
///
/// For each token, the token's context selects a distribution index through
/// `code.context_map`; that index picks both the ANS distribution and the
/// per-distribution HybridUint config used to split the value into
/// (symbol, extra bits).
///
/// # Panics
/// Panics if a context maps to a missing distribution, or if an encoded
/// symbol is absent from its distribution — both indicate encoder-side bugs
/// (histograms built from different tokens than those being written).
pub fn write_tokens_ans(
    tokens: &[Token],
    code: &OwnedAnsEntropyCode,
    lz77: Option<&Lz77Params>,
    writer: &mut BitWriter,
) -> Result<()> {
    let mut encoder = AnsEncoder::with_capacity(tokens.len());

    #[cfg(feature = "debug-tokens")]
    {
        eprintln!(
            "write_tokens_ans: {} tokens, {} distributions, context_map len={}",
            tokens.len(),
            code.distributions.len(),
            code.context_map.len()
        );
        eprintln!("  initial state: 0x{:08x}", encoder.state());
    }

    // Process tokens in reverse order
    #[allow(clippy::unused_enumerate_index)]
    for (_i, token) in tokens.iter().rev().enumerate() {
        let ctx = token.context() as usize;
        // Out-of-range contexts fall back to distribution 0.
        let dist_idx = code.context_map.get(ctx).copied().unwrap_or(0) as usize;
        let config = code.uint_configs.get(dist_idx).copied().unwrap_or_default();
        let (encoded, sym) = encode_token_value_with_config(token, lz77, &config);

        // Get the distribution for this context
        let dist = code.distributions.get(dist_idx).unwrap_or_else(|| {
            panic!(
                "ANS: missing distribution at index {} for context {}",
                dist_idx, ctx
            )
        });

        // Push extra bits first (they come after the symbol in forward order);
        // since we emit in reverse, "first here" becomes "after" in the stream.
        encoder.push_bits(encoded.bits, encoded.nbits as u8);

        // Push the ANS symbol
        let info = dist.get(sym as usize).unwrap_or_else(|| {
            panic!(
                "ANS: symbol {} not in distribution (ctx={}, dist_idx={})",
                sym, ctx, dist_idx
            )
        });

        #[cfg(feature = "debug-tokens")]
        if _i < 5 || _i >= tokens.len() - 3 {
            eprintln!(
                "  token[{}]: ctx={}, val={}, tok={}, freq={}, state before=0x{:08x}",
                tokens.len() - 1 - _i,
                ctx,
                token.value,
                sym,
                info.freq,
                encoder.state()
            );
        }

        encoder.put_symbol(info);
    }

    #[cfg(feature = "debug-tokens")]
    eprintln!("  final state: 0x{:08x}", encoder.state());

    // Finalize: writes state + reversed bits
    encoder.finalize(writer)?;

    Ok(())
}

/// Verify that each ANS histogram serializes and deserializes correctly.
///
/// Writes each histogram to bits, decodes it back with our decoder, and compares frequencies.
///
/// `label` names the call site in diagnostics and error messages. Mismatch
/// details are printed under the `debug-rect` feature, success summaries
/// under `debug-tokens`. On any mismatch returns `Error::InvalidHistogram`.
pub fn verify_histogram_serialization(code: &OwnedAnsEntropyCode, label: &str) -> Result<()> {
    use crate::entropy_coding::ans_decode::{AnsHistogram, BitReader};

    for (i, histo) in code.histograms.iter().enumerate() {
        // Write histogram to bits
        let mut writer = BitWriter::new();
        histo.write(&mut writer)?;
        // Add padding bytes so the decoder's peek(7) doesn't read past the end.
        // In a real bitstream, more data follows the histogram. In this isolated
        // test, we add explicit zero padding.
        writer.write(8, 0)?;
        writer.zero_pad_to_byte();
        let bytes = writer.finish();

        // Decode it back
        let mut br = BitReader::new(&bytes);
        let decoded = match AnsHistogram::decode(&mut br, code.log_alpha_size) {
            Ok(d) => d,
            Err(e) => {
                #[cfg(feature = "debug-rect")]
                eprintln!(
                    "{} histo[{}]: DECODE FAILED - {} (method={} alpha={} omit={})",
                    label, i, e, histo.method, histo.alphabet_size, histo.omit_pos
                );
                return Err(e);
            }
        };

        // Compare frequencies symbol by symbol; report every differing entry
        // (not just the first) so the full divergence pattern is visible.
        let mut mismatch = false;
        for j in 0..histo.alphabet_size {
            let expected = histo.counts[j] as u16;
            let got = decoded.frequencies[j];
            if expected != got {
                if !mismatch {
                    #[cfg(feature = "debug-rect")]
                    eprintln!(
                        "{} histo[{}]: FREQ MISMATCH (method={} alpha={})",
                        label, i, histo.method, histo.alphabet_size
                    );
                }
                #[cfg(feature = "debug-rect")]
                eprintln!("sym[{}]: expected={} got={}", j, expected, got);
                mismatch = true;
            }
        }

        if mismatch {
            #[cfg(feature = "debug-rect")]
            {
                eprintln!("counts: {:?}", &histo.counts[..histo.alphabet_size]);
                // Check omit_pos: what would the decoder pick vs what encoder used?
                // The omitted symbol is the one with the largest logcount; when
                // several share that logcount the choice is ambiguous, which is
                // a common cause of roundtrip divergence.
                let mut encoder_omit_logcount = 0u32;
                let mut encoder_omit = 0;
                for (k, &c) in histo.counts.iter().enumerate().take(histo.alphabet_size) {
                    if c > 0 {
                        let lc = crate::entropy_coding::ans::floor_log2_ans(c as u32) + 1;
                        if lc > encoder_omit_logcount {
                            encoder_omit_logcount = lc;
                            encoder_omit = k;
                        }
                    }
                }
                eprintln!(
                    "omit_pos={} (stored: method={} omit={})",
                    encoder_omit, histo.method, histo.omit_pos
                );
                eprintln!(
                    "omit logcount={} count_at_omit={}",
                    encoder_omit_logcount, histo.counts[histo.omit_pos]
                );
                // Check what decoder would see: list symbols tied at the max logcount.
                for k in 0..histo.alphabet_size.min(40) {
                    let c = histo.counts[k];
                    if c > 0 {
                        let lc = crate::entropy_coding::ans::floor_log2_ans(c as u32) + 1;
                        if lc == encoder_omit_logcount {
                            eprintln!("sym[{}]: count={} logcount={} (same as max)", k, c, lc);
                        }
                    }
                }
            }
            return Err(Error::InvalidHistogram(format!(
                "{} histogram[{}] serialization roundtrip failed",
                label, i
            )));
        }

        // Histogram OK - only log when debug-tokens feature is enabled
        #[cfg(feature = "debug-tokens")]
        {
            let method_desc = match histo.method {
                0 => "flat",
                1 => "small",
                _ => "general",
            };
            eprintln!(
                "  {} histogram[{}]: OK ({}, {} symbols, {} bytes)",
                label,
                i,
                method_desc,
                histo.alphabet_size,
                bytes.len()
            );
        }
    }

    Ok(())
}

/// Verify ANS roundtrip: encode tokens, then decode with our local decoder.
///
/// Returns Ok(()) if all decoded symbols match, or Err with details of first mismatch.
/// This is the critical invariant test for ANS encoding correctness.
///
/// The decoder-side histograms are built directly from the encoder's known
/// distributions rather than parsed from the bitstream, so this verifies the
/// token stream itself; see `verify_ans_roundtrip_parsed` for the parsed-header
/// variant that also exercises the header format.
pub fn verify_ans_roundtrip(tokens: &[Token], code: &OwnedAnsEntropyCode) -> Result<()> {
    use crate::entropy_coding::ans_decode::{AnsHistogram, AnsReader, BitReader};

    if tokens.is_empty() {
        return Ok(());
    }

    // Step 1: Write the ANS-encoded histogram header + tokens to a buffer.
    // The header is first written alone to measure its exact bit length.
    let mut header_writer = BitWriter::new();
    write_entropy_code_ans(code, &mut header_writer)?;
    let header_bits = header_writer.bits_written();

    let mut token_writer = BitWriter::new();
    write_tokens_ans(tokens, code, None, &mut token_writer)?;
    let _token_bits = token_writer.bits_written();

    // Combine header + tokens into one buffer for decoding.
    // NOTE(review): header and tokens are re-encoded here rather than reusing
    // the buffers above; deterministic encoding makes both passes identical.
    let mut combined_writer = BitWriter::new();
    write_entropy_code_ans(code, &mut combined_writer)?;
    write_tokens_ans(tokens, code, None, &mut combined_writer)?;
    combined_writer.zero_pad_to_byte();
    let encoded_bytes = combined_writer.finish();

    // Step 2: Decode the histogram header
    let mut br = BitReader::new(&encoded_bytes);

    // Read context map
    let _num_histograms = code.histograms.len();
    let _simple = br.read(1)?; // simple_context_map flag
    // Skip full context map decoding — we'll decode each histogram directly.
    // Instead, just skip to where the histograms start by re-reading the full header.
    let mut br2 = BitReader::new(&encoded_bytes);

    // We need to skip the header and go straight to the token data.
    // The easiest way: just read past header_bits.
    for _ in 0..header_bits {
        br2.read(1)?;
    }

    // Step 3: Decode ANS tokens
    let mut ans_reader = AnsReader::init(&mut br2)?;

    // Decode each histogram from the full header for verification
    let _br_hist = BitReader::new(&encoded_bytes);
    // Skip context map to get to histograms...
    // Actually, let's take a simpler approach: decode histograms independently
    // and build decoder tables from the encoder's known frequencies.

    // Build decoder histograms directly from the encoder's known distributions
    let log_alpha_size = code.log_alpha_size;
    let table_size = 1usize << log_alpha_size;
    let decoder_histograms: Vec<AnsHistogram> = code
        .distributions
        .iter()
        .map(|dist| {
            // Build frequency array padded to alias table size
            let mut freqs = vec![0u16; dist.symbols.len().max(table_size)];
            for (i, sym) in dist.symbols.iter().enumerate() {
                freqs[i] = sym.freq;
            }

            // Build alias map using the decoder's method
            let log_bucket_size = 12 - log_alpha_size; // LOG_SUM_PROBS - log_alpha_size
            let bucket_size = 1u16 << log_bucket_size;
            let bucket_mask = bucket_size as u32 - 1;

            // Check for single-symbol case: a frequency of 4096 (the full
            // probability mass) means every bucket aliases that one symbol.
            if let Some(single_idx) = freqs.iter().position(|&f| f == 4096) {
                let buckets = freqs
                    .iter()
                    .enumerate()
                    .map(|(i, &f)| crate::entropy_coding::ans_decode::Bucket {
                        dist: f,
                        alias_symbol: single_idx as u8,
                        alias_offset: bucket_size * i as u16,
                        alias_cutoff: 0,
                        alias_dist_xor: f ^ 4096,
                    })
                    .collect();
                return AnsHistogram {
                    buckets,
                    log_bucket_size,
                    bucket_mask,
                    single_symbol: Some(single_idx as u32),
                    frequencies: freqs,
                };
            }

            let buckets =
                AnsHistogram::build_alias_map_from_freqs(freqs.len(), log_bucket_size, &freqs);
            AnsHistogram {
                buckets,
                log_bucket_size,
                bucket_mask,
                single_symbol: None,
                frequencies: freqs,
            }
        })
        .collect();

    // Step 4: Decode tokens and compare against what the encoder would have
    // emitted for each token (same context-map lookup and HybridUint split).
    let mut mismatches = 0;
    #[allow(clippy::unused_enumerate_index)] // _i used in #[cfg(feature = "debug-rect")] output
    for (_i, token) in tokens.iter().enumerate() {
        let ctx = token.context() as usize;
        let dist_idx = code.context_map.get(ctx).copied().unwrap_or(0) as usize;
        let decoder_hist = &decoder_histograms[dist_idx];

        // Decode one ANS symbol
        let decoded_symbol = decoder_hist.read(&mut br2, &mut ans_reader.0);

        // Read extra bits (HybridUint) — use per-histogram config
        let config = code.uint_configs.get(dist_idx).copied().unwrap_or_default();
        let (expected_encoded, _expected_sym) =
            encode_token_value_with_config(token, None, &config);
        let decoded_extra = if expected_encoded.nbits > 0 {
            br2.read(expected_encoded.nbits as usize).unwrap_or(0) as u32
        } else {
            0
        };

        // Compare token (ANS symbol)
        if decoded_symbol != expected_encoded.token {
            if mismatches < 5 {
                #[cfg(feature = "debug-rect")]
                eprintln!(
                    "MISMATCH token[{}]: ctx={} val={} exp={} got={} state=0x{:08x}",
                    _i, ctx, token.value, expected_encoded.token, decoded_symbol, ans_reader.0
                );
            }
            mismatches += 1;
        }

        // Compare extra bits
        if decoded_extra != expected_encoded.bits {
            if mismatches < 5 {
                #[cfg(feature = "debug-rect")]
                eprintln!(
                    "BITS MISMATCH token[{}]: exp=0x{:x} got=0x{:x}",
                    _i, expected_encoded.bits, decoded_extra
                );
            }
            mismatches += 1;
        }
    }

    // Step 5: Verify final state — the ANS state must unwind exactly to its
    // initial value; a wrong final state means the streams desynchronized.
    if let Err(e) = ans_reader.check_final_state() {
        return Err(Error::Bitstream(format!(
            "ANS roundtrip final state check failed ({} token mismatches): {}",
            mismatches, e
        )));
    }

    if mismatches > 0 {
        return Err(Error::Bitstream(format!(
            "ANS roundtrip had {} mismatches out of {} tokens",
            mismatches,
            tokens.len()
        )));
    }

    #[cfg(feature = "debug-tokens")]
    eprintln!(
        "ANS roundtrip OK: {} tokens, header={} bits, data={} bits",
        tokens.len(),
        header_bits,
        _token_bits
    );

    Ok(())
}

/// Test ANS roundtrip using PARSED histogram (not known distributions).
///
/// This exercises the exact format that a real decoder uses:
/// write_ans_modular_header → parse histogram from bitstream → decode tokens.
/// Unlike verify_ans_roundtrip which builds decoder histograms from encoder's
/// known distributions, this test catches format mismatches where our encoder
/// and our internal decoder agree but external decoders (jxl-rs, djxl) disagree.
///
/// Only supports a single distribution (asserts `histograms.len() == 1`),
/// which sidesteps context-map parsing. Diagnostics go to stderr
/// unconditionally; the function itself only exists in debug builds.
#[cfg(debug_assertions)]
pub fn verify_ans_roundtrip_parsed(tokens: &[Token], code: &OwnedAnsEntropyCode) -> Result<()> {
    use crate::entropy_coding::ans_decode::{AnsHistogram, AnsReader, BitReader};

    // Local CeilLog2Nonzero on u32: smallest k with 2^k >= x (0 for x <= 1).
    #[inline]
    fn ceil_log2_nonzero(x: u32) -> u32 {
        if x <= 1 {
            0
        } else {
            u32::BITS - (x - 1).leading_zeros()
        }
    }

    if tokens.is_empty() {
        return Ok(());
    }

    assert_eq!(
        code.histograms.len(),
        1,
        "verify_ans_roundtrip_parsed only supports single-distribution"
    );

    // Write exactly what the modular encoder writes: header + tokens
    let mut writer = BitWriter::new();
    // Inline the modular ANS header format:
    // lz77.enabled = 0
    writer.write(1, 0)?;
    // No context map for num_dist=1
    // use_prefix_code = 0 (ANS)
    writer.write(1, 0)?;
    // log_alpha_size - 5 (2 bits)
    let las = code.log_alpha_size;
    writer.write(2, (las - 5) as u64)?;
    // HybridUint config (same field widths as write_hybrid_uint_config_value)
    let config = code
        .uint_configs
        .first()
        .copied()
        .unwrap_or(crate::entropy_coding::hybrid_uint::HybridUintConfig::default_config());
    let se_bits = ceil_log2_nonzero(las as u32 + 1) as usize;
    writer.write(se_bits, config.split_exponent as u64)?;
    if (config.split_exponent as usize) != las {
        let msb_bits = ceil_log2_nonzero(config.split_exponent + 1) as usize;
        writer.write(msb_bits, config.msb_in_token as u64)?;
        let lsb_bits = ceil_log2_nonzero(config.split_exponent - config.msb_in_token + 1) as usize;
        writer.write(lsb_bits, config.lsb_in_token as u64)?;
    }
    // Write the single ANS distribution
    code.histograms[0].write(&mut writer)?;
    let header_bits = writer.bits_written();
    write_tokens_ans(tokens, code, None, &mut writer)?;
    writer.zero_pad_to_byte();
    let encoded_bytes = writer.finish();

    eprintln!(
        "verify_ans_roundtrip_parsed: header={} bits, total={} bytes",
        header_bits,
        encoded_bytes.len()
    );

    // Now parse it back exactly as a decoder would
    let mut br = BitReader::new(&encoded_bytes);

    // 1. lz77.enabled
    let lz77_enabled = br.read(1)?;
    assert_eq!(lz77_enabled, 0, "expected lz77.enabled=0");

    // 2. context_map skipped for num_dist=1

    // 3. use_prefix_code
    let use_prefix_code = br.read(1)?;
    assert_eq!(use_prefix_code, 0, "expected use_prefix_code=0 (ANS)");

    // 4. log_alpha_size
    let las = br.read(2)? as usize + 5;
    eprintln!("  parsed log_alpha_size={}", las);
    assert_eq!(las, code.log_alpha_size, "log_alpha_size mismatch");

    // 5. HybridUint config — parsed with the same width rules used to write it
    let se_bits = ceil_log2_nonzero(las as u32 + 1) as usize;
    let split_exponent = br.read(se_bits)? as u32;
    let (msb_in_token, lsb_in_token) = if split_exponent != las as u32 {
        let msb_bits = ceil_log2_nonzero(split_exponent + 1) as usize;
        let msb = br.read(msb_bits)? as u32;
        let lsb_bits = ceil_log2_nonzero(split_exponent - msb + 1) as usize;
        let lsb = br.read(lsb_bits)? as u32;
        (msb, lsb)
    } else {
        (0, 0)
    };
    let expected_config = code
        .uint_configs
        .first()
        .copied()
        .unwrap_or(crate::entropy_coding::hybrid_uint::HybridUintConfig::default_config());
    eprintln!(
        "  parsed uint_config: se={} msb={} lsb={} (expected se={} msb={} lsb={})",
        split_exponent,
        msb_in_token,
        lsb_in_token,
        expected_config.split_exponent,
        expected_config.msb_in_token,
        expected_config.lsb_in_token
    );
    assert_eq!(
        split_exponent, expected_config.split_exponent,
        "split_exponent mismatch"
    );
    assert_eq!(
        msb_in_token, expected_config.msb_in_token,
        "msb_in_token mismatch"
    );
    assert_eq!(
        lsb_in_token, expected_config.lsb_in_token,
        "lsb_in_token mismatch"
    );

    // 6. Parse the ANS histogram (this is what jxl-rs does)
    let parsed_histo = AnsHistogram::decode(&mut br, las)?;
    let bits_after_histo = br.bits_read();
    eprintln!(
        "  histogram parsed OK at bit {}, freqs: {:?}",
        bits_after_histo,
        &parsed_histo.frequencies[..parsed_histo.frequencies.len().min(10)]
    );

    // Compare parsed frequencies with encoder's distribution — any divergence
    // here means the histogram serialization format itself is wrong.
    for (i, sym) in code.distributions[0].symbols.iter().enumerate() {
        let parsed_freq = parsed_histo.frequencies.get(i).copied().unwrap_or(0);
        if parsed_freq != sym.freq {
            eprintln!(
                "  FREQ MISMATCH at symbol {}: encoder={} parsed={}",
                i, sym.freq, parsed_freq
            );
            return Err(Error::Bitstream(format!(
                "Parsed histogram frequency mismatch at symbol {}: encoder={} parsed={}",
                i, sym.freq, parsed_freq
            )));
        }
    }
    eprintln!("  frequencies match encoder's distribution");

    // 7. Read 32-bit ANS state
    let mut ans_reader = AnsReader::init(&mut br)?;
    eprintln!("  ANS initial state: 0x{:08x}", ans_reader.state());

    // 8. Decode tokens using the PARSED config (not the encoder's copy), so a
    // config serialization bug would surface as token mismatches here.
    let config = crate::entropy_coding::hybrid_uint::HybridUintConfig {
        split_exponent,
        split: 1 << split_exponent,
        msb_in_token,
        lsb_in_token,
    };
    let mut mismatches = 0;
    for (i, token) in tokens.iter().enumerate() {
        let (expected_encoded, _) =
            crate::entropy_coding::encode::encode_token_value_with_config(token, None, &config);

        // Decode ANS symbol
        let decoded_symbol = parsed_histo.read(&mut br, &mut ans_reader.0);

        // Read extra bits
        let decoded_extra = if expected_encoded.nbits > 0 {
            br.read(expected_encoded.nbits as usize).unwrap_or(0) as u32
        } else {
            0
        };

        if decoded_symbol != expected_encoded.token || decoded_extra != expected_encoded.bits {
            if mismatches < 5 {
                eprintln!(
                    "  MISMATCH token[{}]: val={} exp_tok={} got_tok={} exp_bits=0x{:x} got_bits=0x{:x}",
                    i,
                    token.value,
                    expected_encoded.token,
                    decoded_symbol,
                    expected_encoded.bits,
                    decoded_extra
                );
            }
            mismatches += 1;
        }
    }

    // 9. Check final state — must unwind to the ANS initial state.
    if let Err(e) = ans_reader.check_final_state() {
        eprintln!(
            "  FINAL STATE FAILED: {} mismatches, state=0x{:08x}",
            mismatches,
            ans_reader.state()
        );
        return Err(Error::Bitstream(format!(
            "ANS parsed roundtrip final state check failed ({} mismatches): {}",
            mismatches, e
        )));
    }

    if mismatches > 0 {
        return Err(Error::Bitstream(format!(
            "ANS parsed roundtrip had {} mismatches",
            mismatches
        )));
    }

    eprintln!("  ANS parsed roundtrip OK: {} tokens", tokens.len());
    Ok(())
}