// Copyright (c) Imazen LLC and the JPEG XL Project Authors.
// Algorithms and constants derived from libjxl (BSD-3-Clause).
// Licensed under AGPL-3.0-or-later. Commercial licenses at https://www.imazen.io/pricing

//! Custom coefficient ordering for AC coefficients.
//!
//! Reorders AC coefficients per-strategy-type so frequently-zero positions
//! appear last. The encoder stops writing at the last non-zero, so pushing
//! zeros to the end saves bits.
//!
//! The permutation is encoded as Lehmer codes in the HF global section,
//! using a separate 8-context entropy code.
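//!
//! Typical pipeline (a sketch built from this module's public API; the
//! variable names are illustrative, not part of the API):
//!
//! ```ignore
//! let counts = count_zero_coefficients(&quant_ac, &ac_strategy, xsize_blocks, ysize_blocks);
//! let (orders, used_orders) = compute_custom_orders(&counts);
//! let tokens = tokenize_coeff_orders(&orders, used_orders);
//! build_and_write_coeff_orders(&tokens, /* use_ans = */ true, &mut writer)?;
//! ```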

use super::ac_group::ac_strategy_info;
use super::ac_strategy::AcStrategyMap;
use super::common::{DCT_BLOCK_SIZE, ceil_log2_nonzero, floor_log2_nonzero};
use crate::bit_writer::BitWriter;
use crate::entropy_coding::encode::{
    build_entropy_code_ans_with_options, build_entropy_code_with_options, write_entropy_code_ans,
    write_tokens, write_tokens_ans,
};
use crate::entropy_coding::token::Token;
use crate::error::Result;

/// Number of coefficient-order buckets defined by the spec.
/// Our encoder populates bucket 0 (DCT8), 2 (DCT16x16), 3 (DCT32x32),
/// 4 (DCT8x16/DCT16x8), 6 (DCT32x16/DCT16x32), 7 (DCT64x64), and
/// 8 (DCT64x32/DCT32x64); only buckets 0..=6 ever receive custom orders
/// (see `compute_custom_orders` and `bucket_to_cx_cy`).
pub const NUM_ORDER_BUCKETS: usize = 13;

/// Number of contexts for permutation entropy coding.
pub const NUM_PERMUTATION_CONTEXTS: usize = 8;

/// Bitstream strategy code to order bucket mapping.
/// Matches libjxl's `kStrategyOrder`, which is indexed by the bitstream
/// strategy code (not our internal raw strategy; see `STRATEGY_CODE_LUT`).
pub const STRATEGY_TO_BUCKET: [u8; 27] = [
    0, 1, 1, 1, 2, 3, 4, 4, 5, 5, 6, 6, 1, 1, 1, 1, 1, 1, 7, 8, 8, 9, 10, 10, 11, 12, 12,
];

/// Get the order bucket for a bitstream strategy code.
#[inline]
pub fn strategy_bucket(strategy_code: u8) -> u8 {
    STRATEGY_TO_BUCKET[strategy_code as usize]
}

/// Generate the natural (zig-zag) coefficient order for a transform with
/// cx x cy covered blocks. Matches jxl-rs `natural_coeff_order`.
///
/// The natural order is a zig-zag scan of the coefficient grid, where
/// CoefficientLayout ensures cx >= cy (transpose if needed).
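///
/// For cx = cy = 1 this reduces to the standard 8x8 zig-zag scan
/// (see `test_natural_coeff_order_8x8` below).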
pub fn natural_coeff_order(cx: usize, cy: usize) -> Vec<u32> {
    // CoefficientLayout: ensure cx >= cy
    let (cx, cy) = if cx >= cy { (cx, cy) } else { (cy, cx) };

    let xsize = cx * 8;
    let total = cx * cy * DCT_BLOCK_SIZE;

    // xs = cx/cy ratio, for skipping rows in rectangular transforms
    let xs = cx / cy;
    let xsm = xs - 1;
    let xss = if xs <= 1 {
        0
    } else {
        ceil_log2_nonzero(xs) as usize
    };

    let mut out = vec![0u32; total];

    // First half of the block (upper-left triangle + main diagonal)
    let mut cur = cx * cy; // skip LLF positions
    for i in 0..xsize {
        for j in 0..=i {
            let (x, mut y) = if i % 2 != 0 { (i - j, j) } else { (j, i - j) };
            if (y & xsm) != 0 {
                continue;
            }
            y >>= xss;

            let val = if x < cx && y < cy {
                y * cx + x
            } else {
                let v = cur;
                cur += 1;
                v
            };

            if val < total {
                out[val] = (y * xsize + x) as u32;
            }
        }
    }

    // Second half (lower-right triangle)
    for ir in 1..xsize {
        let ip = xsize - ir;
        let i = ip - 1;
        for j in 0..=i {
            let (x, mut y) = if i % 2 != 0 {
                (xsize - 1 - j, xsize - 1 - (i - j))
            } else {
                (xsize - 1 - (i - j), xsize - 1 - j)
            };
            if (y & xsm) != 0 {
                continue;
            }
            y >>= xss;

            if cur < total {
                out[cur] = (y * xsize + x) as u32;
            }
            cur += 1;
        }
    }

    out
}

/// Generate the inverse lookup table for a natural coefficient order.
/// `lut[natural_order[i]] = i` for all i.
pub fn natural_coeff_order_lut(cx: usize, cy: usize) -> Vec<u32> {
    let order = natural_coeff_order(cx, cy);
    let mut lut = vec![0u32; order.len()];
    for (i, &pos) in order.iter().enumerate() {
        lut[pos as usize] = i as u32;
    }
    lut
}

/// Compute the context for a permutation value.
/// Uses HybridUint(0,0,0) semantics: token = 1 + floor_log2(val) for val>=1, else 0.
/// Clamped to [0, 7].
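/// Mapping: 0→0, 1→1, 2..=3→2, 4..=7→3, 8..=15→4, 16..=31→5, 32..=63→6, 64+→7.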
#[inline]
pub fn coeff_order_context(val: u32) -> usize {
    if val == 0 {
        0
    } else {
        ((1 + floor_log2_nonzero(val)) as usize).min(NUM_PERMUTATION_CONTEXTS - 1)
    }
}

/// Compute the Lehmer code of a permutation using a Fenwick tree.
/// O(n log n) time.
///
/// The permutation must contain unique indices in [0..n).
/// The output code[i] represents how many elements smaller than
/// permutation[i] appear after position i.
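///
/// Example: the permutation `[2, 4, 0, 1, 3]` has Lehmer code
/// `[2, 3, 0, 0, 0]`: two elements smaller than 2 follow position 0,
/// three elements smaller than 4 follow position 1, and the remaining
/// entries are already in order.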
pub fn compute_lehmer_code(permutation: &[u32]) -> Vec<u32> {
    let n = permutation.len();
    let mut code = vec![0u32; n];
    let mut temp = vec![0u32; n + 1];

    for idx in 0..n {
        let s = permutation[idx];

        // Compute prefix sum in Fenwick tree (count of used elements <= s)
        let mut penalty = 0u32;
        let mut i = s as usize + 1;
        while i != 0 {
            penalty += temp[i];
            i &= i - 1; // clear lowest bit
        }

        debug_assert!(s >= penalty, "Lehmer code: s={} < penalty={}", s, penalty);
        code[idx] = s - penalty;

        // Update Fenwick tree: mark s as used
        i = s as usize + 1;
        while i < n + 1 {
            temp[i] += 1;
            i += i & i.wrapping_neg(); // add lowest bit
        }
    }

    code
}

/// Count zero coefficients per position for each (bucket, channel) combination.
///
/// Returns a map: `zero_counts[bucket][channel][position] = count`.
/// LLF positions get count = -1 to ensure they sort first.
pub fn count_zero_coefficients(
    quant_ac: &[Vec<Vec<[i32; DCT_BLOCK_SIZE]>>; 3],
    ac_strategy: &AcStrategyMap,
    xsize_blocks: usize,
    ysize_blocks: usize,
) -> Vec<Vec<Vec<i64>>> {
    // Initialize: 13 buckets x 3 channels; per-bucket sizes are allocated below
    let mut counts: Vec<Vec<Vec<i64>>> = (0..NUM_ORDER_BUCKETS)
        .map(|_| vec![Vec::new(); 3])
        .collect();

    // Pre-allocate for buckets we use
    // Bucket 0: DCT8, size 64
    for ch in &mut counts[0] {
        *ch = vec![0i64; 64];
    }
    // Bucket 2: DCT16x16, size 256
    for ch in &mut counts[2] {
        *ch = vec![0i64; 256];
    }
    // Bucket 3: DCT32x32, size 1024
    for ch in &mut counts[3] {
        *ch = vec![0i64; 1024];
    }
    // Bucket 4: DCT8x16/DCT16x8, size 128
    for ch in &mut counts[4] {
        *ch = vec![0i64; 128];
    }
    // Bucket 5: DCT32x8/DCT8x32, size 256 — not implemented, but pre-allocate for safety
    for ch in &mut counts[5] {
        *ch = vec![0i64; 256];
    }
    // Bucket 6: DCT32x16/DCT16x32, size 512
    for ch in &mut counts[6] {
        *ch = vec![0i64; 512];
    }
    // Bucket 7: DCT64x64, size 4096
    for ch in &mut counts[7] {
        *ch = vec![0i64; 4096];
    }
    // Bucket 8: DCT64x32/DCT32x64, size 2048
    for ch in &mut counts[8] {
        *ch = vec![0i64; 2048];
    }

    for by in 0..ysize_blocks {
        for bx in 0..xsize_blocks {
            if !ac_strategy.is_first(bx, by) {
                continue;
            }

            let raw_strategy = ac_strategy.raw_strategy(bx, by);
            let strategy_code = super::ac_strategy::STRATEGY_CODE_LUT[raw_strategy as usize];
            // strategy_bucket expects bitstream strategy code
            let bucket = strategy_bucket(strategy_code) as usize;
            // ac_strategy_info expects raw strategy codes (0-6)
            let (_, _, covered_blocks, _, _) = ac_strategy_info(raw_strategy);
            let size = covered_blocks * DCT_BLOCK_SIZE;

            // Ensure count vector is large enough
            for ch in &mut counts[bucket] {
                if ch.len() < size {
                    ch.resize(size, 0);
                }
            }

            for c in 0..3 {
                // Pre-slice count array to eliminate per-element triple-indexing
                let cnt = &mut counts[bucket][c];
                if covered_blocks == 1 {
                    // DCT8: coefficients are in quant_ac[c][by][bx]
                    let block = &quant_ac[c][by][bx];
                    let cnt_slice = &mut cnt[..DCT_BLOCK_SIZE];
                    for k in 0..DCT_BLOCK_SIZE {
                        if block[k] == 0 {
                            cnt_slice[k] += 1;
                        }
                    }
                } else {
                    // Multi-block: iterate by sub-block with nested loops
                    let covered_x_local = super::ac_strategy::COVERED_X[raw_strategy as usize];
                    let covered_y_local = super::ac_strategy::COVERED_Y[raw_strategy as usize];
                    let mut base_idx = 0;
                    for slot_dy in 0..covered_y_local {
                        for slot_dx in 0..covered_x_local {
                            let block = &quant_ac[c][by + slot_dy][bx + slot_dx];
                            let cnt_slice = &mut cnt[base_idx..base_idx + DCT_BLOCK_SIZE];
                            for k in 0..DCT_BLOCK_SIZE {
                                if block[k] == 0 {
                                    cnt_slice[k] += 1;
                                }
                            }
                            base_idx += DCT_BLOCK_SIZE;
                        }
                    }
                }
            }
        }
    }

    // Mark LLF positions with -1 so they sort first (ascending).
    // Done once per bucket after all blocks are processed, instead of per-block.
    for (bucket, bucket_counts) in counts.iter_mut().enumerate().take(NUM_ORDER_BUCKETS) {
        let size = match bucket {
            0 => 64,   // DCT8
            2 => 256,  // DCT16x16
            3 => 1024, // DCT32x32
            4 => 128,  // DCT8x16/DCT16x8
            5 => 256,  // DCT32x8/DCT8x32 (pre-allocated but not yet produced)
            6 => 512,  // DCT32x16/DCT16x32
            7 => 4096, // DCT64x64
            8 => 2048, // DCT64x32/DCT32x64
            _ => continue,
        };
        if bucket_counts[0].len() < size {
            continue; // Bucket not used
        }
        let (cx, cy) = bucket_to_cx_cy(bucket);
        let (layout_cx, layout_cy) = if cx >= cy { (cx, cy) } else { (cy, cx) };
        for channel_counts in bucket_counts.iter_mut().take(3) {
            for iy in 0..layout_cy {
                for ix in 0..layout_cx {
                    channel_counts[iy * layout_cx * 8 + ix] = -1;
                }
            }
        }
    }

    counts
}

/// Compute custom coefficient orders from zero counts.
///
/// Returns `(orders, used_orders)` where:
/// - `orders[bucket][channel]` is the custom order (positions sorted by ascending zero count)
/// - `used_orders` is a bitmask of buckets that have non-default orders
pub fn compute_custom_orders(zero_counts: &[Vec<Vec<i64>>]) -> (Vec<Vec<Vec<u32>>>, u32) {
    let mut orders: Vec<Vec<Vec<u32>>> = (0..NUM_ORDER_BUCKETS)
        .map(|_| vec![Vec::new(); 3])
        .collect();
    let mut used_orders = 0u32;

    for bucket in 0..NUM_ORDER_BUCKETS {
        // libjxl's ComputeUsedOrders (enc_coeff_order.cc:53-58) skips buckets > 6.
        // Buckets 7+ (DCT64x64, DCT64x32/DCT32x64) are never customized.
        if bucket > 6 {
            continue;
        }

        if zero_counts[bucket][0].is_empty() {
            continue;
        }

        let size = zero_counts[bucket][0].len();
        let inv_sqrt_sz = 1.0 / (size as f64).sqrt();

        // Get natural order for comparison
        let (cx, cy) = bucket_to_cx_cy(bucket);
        if cx == 0 {
            continue;
        }
        let natural = natural_coeff_order(cx, cy);

        let mut bucket_has_custom = false;

        for c in 0..3 {
            let counts = &zero_counts[bucket][c];
            if counts.is_empty() {
                continue;
            }

            // Build (position, sort_key) array.
            // sort_key = (quantized_count << 16) | natural_index, so sorting
            // by the key breaks count ties in natural order.
            let mut pos_and_count: Vec<(u32, u64)> = Vec::with_capacity(size);
            for (i, &pos) in natural.iter().enumerate().take(size) {
                let raw_count = counts[pos as usize];
                // Quantize count to reduce distinct values (matching C++ line 210)
                let quantized = if raw_count < 0 {
                    0u64 // LLF: keep at front
                } else {
                    // raw_count * inv_sqrt_sz + 0.1, truncated
                    (raw_count as f64 * inv_sqrt_sz + 0.1) as u64
                };
                pos_and_count.push((pos, (quantized << 16) | (i as u64)));
            }

            // Stable sort by count (ascending), preserving natural order for ties
            pos_and_count.sort_by_key(|&(_, key)| key);

            // Extract the order
            let order: Vec<u32> = pos_and_count.iter().map(|&(pos, _)| pos).collect();

            // Check if different from natural order
            let is_different = order.iter().zip(natural.iter()).any(|(a, b)| a != b);
            if is_different {
                bucket_has_custom = true;
            }

            orders[bucket][c] = order;
        }

        if bucket_has_custom {
            // Cost-benefit check: estimate whether the Lehmer code encoding
            // overhead is justified by AC savings from reordering.
            //
            // At high distances, zero counts become uniform (most coefficients
            // are zero regardless of position), so reordering provides minimal
            // benefit while the Lehmer code still costs bits to encode.
            //
            // We estimate:
            // - Cost: bits to encode the Lehmer code for all 3 channels
            // - Savings: expected increase in trailing zeros per block × num_blocks
            //   (each extra trailing zero saves ~1 bit in nzeros coding)
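            //
            // For example, at size = 64 (DCT8) the end-marker token alone
            // costs about log2(64) = 6 bits per channel, so the custom order
            // must be expected to add at least that many trailing-zero bits
            // across all blocks before it pays off.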
            let (cx_n, cy_n) = if cx >= cy { (cx, cy) } else { (cy, cx) };
            let llf = cx_n * cy_n;
            let natural_lut = natural_coeff_order_lut(cx, cy);

            let mut total_lehmer_cost = 0.0f64;
            let mut total_savings_bits = 0.0f64;

            for c in 0..3 {
                let order = &orders[bucket][c];
                let counts = &zero_counts[bucket][c];
                if order.is_empty() || counts.len() < size {
                    continue;
                }

                // Compute Lehmer code and estimate its encoding cost
                let order_zigzag: Vec<u32> =
                    order.iter().map(|&pos| natural_lut[pos as usize]).collect();
                let lehmer = compute_lehmer_code(&order_zigzag);

                let end = lehmer[llf..size]
                    .iter()
                    .rposition(|&v| v != 0)
                    .map_or(llf, |p| llf + p + 1);

                // End marker token: ~log2(size) bits
                total_lehmer_cost += (size as f64).log2();
                // Per-entry cost: ~0.5 bits for zero, ~1.5 + log2(1+v) for non-zero
                for &val in &lehmer[llf..end] {
                    if val == 0 {
                        total_lehmer_cost += 0.5;
                    } else {
                        total_lehmer_cost += 1.5 + (val as f64 + 1.0).log2();
                    }
                }

                // Estimate savings from reordering.
                // The savings come from increased trailing zeros in each block.
                // Compute expected trailing zeros under both orders using
                // per-position zero rates (zero_count / max_count).
                let max_count = counts.iter().copied().max().unwrap_or(0);
                if max_count <= 0 {
                    continue;
                }

                let zero_rates: Vec<f64> = counts
                    .iter()
                    .map(|&c| {
                        if c < 0 {
                            0.0
                        } else {
                            c as f64 / max_count as f64
                        }
                    })
                    .collect();

                let nzeros_custom = expected_trailing_zeros(order, &zero_rates, llf, size);
                let nzeros_natural = expected_trailing_zeros(&natural, &zero_rates, llf, size);

                // Each additional trailing zero saves ~1 bit per block
                total_savings_bits += (nzeros_custom - nzeros_natural) * max_count as f64;
            }

            if total_savings_bits > total_lehmer_cost {
                used_orders |= 1 << bucket;
            }
        }
    }

    (orders, used_orders)
}

/// Compute expected number of trailing zeros for a scan order.
///
/// For each position from the end of the scan, accumulates the probability
/// that all positions from there to the end are zero. The sum of these
/// probabilities is the expected number of trailing zeros.
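///
/// Assuming independent per-position zero rates p(j), with n = size:
/// E[T] = sum over k in 1..=(n - llf) of the product of p(order[i])
/// for i in (n - k)..n. The loop truncates once the running product
/// falls below 1e-10, where further terms are negligible.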
fn expected_trailing_zeros(order: &[u32], zero_rates: &[f64], llf: usize, size: usize) -> f64 {
    let mut expected = 0.0;
    let mut prob_all_zero = 1.0;
    for i in (llf..size).rev() {
        let pos = order[i] as usize;
        prob_all_zero *= zero_rates[pos];
        expected += prob_all_zero;
        if prob_all_zero < 1e-10 {
            break; // negligible contribution from here on
        }
    }
    expected
}

/// Convert order bucket index to (cx, cy) for natural order generation.
fn bucket_to_cx_cy(bucket: usize) -> (usize, usize) {
    match bucket {
        0 => (1, 1), // DCT8: 1x1 blocks
        2 => (2, 2), // DCT16x16: 2x2 blocks
        3 => (4, 4), // DCT32x32: 4x4 blocks
        4 => (2, 1), // DCT8x16/DCT16x8: 2x1 blocks (after CoefficientLayout)
        5 => (4, 1), // DCT32x8/DCT8x32: 4x1 blocks (after CoefficientLayout) — not implemented
        6 => (4, 2), // DCT32x16/DCT16x32: 4x2 blocks (after CoefficientLayout)
        7 => (8, 8), // DCT64x64: 8x8 blocks
        8 => (8, 4), // DCT64x32/DCT32x64: 8x4 blocks (after CoefficientLayout)
        _ => (0, 0), // Not supported by our encoder
    }
}

/// Tokenize coefficient orders as Lehmer codes.
///
/// For each (bucket, channel) with its bit set in `used_orders`:
/// 1. Convert the custom order to natural-order-index space (order_zigzag)
/// 2. Compute the Lehmer code
/// 3. Trim trailing zeros
/// 4. Emit tokens with coeff_order_context
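///
/// The stream per (bucket, channel) is thus one length token
/// (`end - llf`, coded in context `coeff_order_context(size)`) followed by
/// the trimmed Lehmer values, each coded in the context of its predecessor.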
pub fn tokenize_coeff_orders(orders: &[Vec<Vec<u32>>], used_orders: u32) -> Vec<Token> {
    let mut tokens = Vec::new();

    for (bucket, bucket_orders) in orders.iter().enumerate().take(NUM_ORDER_BUCKETS) {
        if used_orders & (1 << bucket) == 0 {
            continue;
        }

        let (cx, cy) = bucket_to_cx_cy(bucket);
        if cx == 0 {
            continue;
        }

        let natural_lut = natural_coeff_order_lut(cx, cy);
        // llf (block count) and total size are transpose-invariant
        // (cx * cy == cy * cx), so no CoefficientLayout normalization
        // is needed here.
        let llf = cx * cy;
        let size = llf * DCT_BLOCK_SIZE;

        for order in bucket_orders {
            if order.is_empty() {
                continue;
            }

            // Convert to natural-order-index space: order_zigzag[i] = natural_lut[order[i]]
            let order_zigzag: Vec<u32> =
                order.iter().map(|&pos| natural_lut[pos as usize]).collect();

            // Compute Lehmer code
            let lehmer = compute_lehmer_code(&order_zigzag);

            // Find the last non-zero Lehmer code entry (past the skip region)
            let mut end = size;
            while end > llf && lehmer[end - 1] == 0 {
                end -= 1;
            }

            // First token: end - skip (how many Lehmer codes to read)
            tokens.push(Token::new(
                coeff_order_context(size as u32) as u32,
                (end - llf) as u32,
            ));

            // Remaining tokens: Lehmer codes with context from previous value
            let mut last = 0u32;
            for &val in &lehmer[llf..end] {
                tokens.push(Token::new(coeff_order_context(last) as u32, val));
                last = val;
            }
        }
    }

    tokens
}

/// Build entropy code and write coefficient orders to the bitstream.
///
/// This writes the permutation entropy code header followed by all the
/// permutation tokens, matching the format expected by decoders.
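///
/// Bitstream layout: a 1-bit LZ77-enabled flag (always 0 here), then the
/// entropy-code header (ANS or prefix, per `use_ans`), then the tokens.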
pub fn build_and_write_coeff_orders(
    tokens: &[Token],
    use_ans: bool,
    writer: &mut BitWriter,
) -> Result<()> {
    if tokens.is_empty() {
        return Ok(());
    }

    // LZ77 flag: no LZ77 for permutation data
    writer.write(1, 0)?;

    if use_ans {
        let code = build_entropy_code_ans_with_options(
            tokens,
            NUM_PERMUTATION_CONTEXTS,
            false, // no enhanced clustering for permutation
            true,  // optimize uint configs
            None,  // no LZ77 for permutation data
            None,  // no pixel hint for permutation data
        );
        write_entropy_code_ans(&code, writer)?;
        write_tokens_ans(tokens, &code, None, writer)?;
    } else {
        let code = build_entropy_code_with_options(tokens, NUM_PERMUTATION_CONTEXTS, false, None);
        let ec = code.as_entropy_code();
        crate::entropy_coding::encode::write_entropy_code(&ec, writer)?;
        write_tokens(tokens, &ec, None, writer)?;
    }

    Ok(())
}

/// Get the custom order for a specific (bucket, channel) from the orders map.
/// Returns None if no custom order exists for this combination.
/// `strategy_code` must be the bitstream strategy code (see `strategy_bucket`).
pub fn get_custom_order(
    orders: &[Vec<Vec<u32>>],
    used_orders: u32,
    strategy_code: u8,
    channel: usize,
) -> Option<&[u32]> {
    let bucket = strategy_bucket(strategy_code) as usize;
    if used_orders & (1 << bucket) == 0 {
        return None;
    }
    let order = &orders[bucket][channel];
    if order.is_empty() {
        return None;
    }
    Some(order)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_natural_coeff_order_8x8() {
        // Must match jxl-rs golden data and our COEFF_ORDER_8X8
        let order = natural_coeff_order(1, 1);
        assert_eq!(order.len(), 64);

        let expected: [u32; 64] = [
            0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34,
            27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37,
            44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
        ];
        assert_eq!(order, expected);

        // Must match our existing COEFF_ORDER_8X8
        let existing = super::super::ac_group::COEFF_ORDER_8X8;
        assert_eq!(order, &existing[..]);
    }

    #[test]
    fn test_natural_coeff_order_8x16() {
        // 2x1 covered blocks (after CoefficientLayout normalization)
        let order = natural_coeff_order(2, 1);
        assert_eq!(order.len(), 128);

        let expected: [u32; 128] = [
            0, 1, 16, 2, 3, 17, 32, 18, 4, 5, 19, 33, 48, 34, 20, 6, 7, 21, 35, 49, 64, 50, 36, 22,
            8, 9, 23, 37, 51, 65, 80, 66, 52, 38, 24, 10, 11, 25, 39, 53, 67, 81, 96, 82, 68, 54,
            40, 26, 12, 13, 27, 41, 55, 69, 83, 97, 112, 98, 84, 70, 56, 42, 28, 14, 15, 29, 43,
            57, 71, 85, 99, 113, 114, 100, 86, 72, 58, 44, 30, 31, 45, 59, 73, 87, 101, 115, 116,
            102, 88, 74, 60, 46, 47, 61, 75, 89, 103, 117, 118, 104, 90, 76, 62, 63, 77, 91, 105,
            119, 120, 106, 92, 78, 79, 93, 107, 121, 122, 108, 94, 95, 109, 123, 124, 110, 111,
            125, 126, 127,
        ];
        assert_eq!(order, expected);

        // Must match our existing COEFF_ORDER_8X16
        let existing = super::super::ac_group::COEFF_ORDER_8X16;
        assert_eq!(order, &existing[..]);
    }

    #[test]
    fn test_natural_coeff_order_16x16() {
        // 2x2 covered blocks
        let order = natural_coeff_order(2, 2);
        assert_eq!(order.len(), 256);

        // Must match our existing COEFF_ORDER_16X16
        let existing = super::super::ac_group::COEFF_ORDER_16X16;
        assert_eq!(order, &existing[..]);
    }

    #[test]
    fn test_natural_coeff_order_lut_inverse() {
        // LUT must be inverse of order
        for &(cx, cy) in &[(1, 1), (2, 1), (2, 2)] {
            let order = natural_coeff_order(cx, cy);
            let lut = natural_coeff_order_lut(cx, cy);
            assert_eq!(order.len(), lut.len());

            for (i, &pos) in order.iter().enumerate() {
                assert_eq!(
                    lut[pos as usize], i as u32,
                    "LUT inverse failed at i={}, pos={} for {}x{}",
                    i, pos, cx, cy
                );
            }
        }
    }

    #[test]
    fn test_coeff_order_context() {
        assert_eq!(coeff_order_context(0), 0);
        assert_eq!(coeff_order_context(1), 1); // 1 + floor_log2(1) = 1 + 0 = 1
        assert_eq!(coeff_order_context(2), 2); // 1 + floor_log2(2) = 1 + 1 = 2
        assert_eq!(coeff_order_context(3), 2); // 1 + floor_log2(3) = 1 + 1 = 2
        assert_eq!(coeff_order_context(4), 3); // 1 + floor_log2(4) = 1 + 2 = 3
        assert_eq!(coeff_order_context(7), 3);
        assert_eq!(coeff_order_context(8), 4);
        assert_eq!(coeff_order_context(15), 4);
        assert_eq!(coeff_order_context(16), 5);
        assert_eq!(coeff_order_context(31), 5);
        assert_eq!(coeff_order_context(32), 6);
        assert_eq!(coeff_order_context(63), 6);
        assert_eq!(coeff_order_context(64), 7);
        assert_eq!(coeff_order_context(128), 7); // clamped to 7
        assert_eq!(coeff_order_context(1000), 7);
    }

    #[test]
    fn test_lehmer_code_identity() {
        // Identity permutation [0,1,2,3,4] -> all-zero Lehmer code
        let perm: Vec<u32> = (0..5).collect();
        let code = compute_lehmer_code(&perm);
        assert_eq!(code, vec![0, 0, 0, 0, 0]);
    }

    #[test]
    fn test_lehmer_code_reverse() {
        // Reverse permutation [4,3,2,1,0]
        let perm = vec![4u32, 3, 2, 1, 0];
        let code = compute_lehmer_code(&perm);
        assert_eq!(code, vec![4, 3, 2, 1, 0]);
    }

    #[test]
    fn test_lehmer_code_known() {
        // Known example from jxl-rs tests:
        // Permutation [2, 4, 0, 1, 3] has Lehmer code [2, 3, 0, 0, 0]
        let perm = vec![2u32, 4, 0, 1, 3];
        let code = compute_lehmer_code(&perm);
        assert_eq!(code, vec![2, 3, 0, 0, 0]);
    }

    #[test]
    fn test_lehmer_code_jxl_rs_golden() {
        // From jxl-rs test: permutation [0,1,2,3,5,6,8,10,11,15,4,9,7,12,13,14]
        // with skip=4 has Lehmer code [1,1,2,3,3,6,0,1] past the skip region
        // (trailing zeros trimmed). compute_lehmer_code operates on the full
        // permutation over [0..n), so we feed it the whole permutation and
        // check the slice after position 4.
        let full_perm = vec![0u32, 1, 2, 3, 5, 6, 8, 10, 11, 15, 4, 9, 7, 12, 13, 14];
        let code = compute_lehmer_code(&full_perm);
        // First 4 are identity, so their Lehmer codes are 0
        assert_eq!(code[0], 0);
        assert_eq!(code[1], 0);
        assert_eq!(code[2], 0);
        assert_eq!(code[3], 0);
        // Elements 4-11 should have Lehmer code [1,1,2,3,3,6,0,1]
        assert_eq!(&code[4..12], &[1, 1, 2, 3, 3, 6, 0, 1]);
    }

    #[test]
    fn test_tokenize_natural_order_produces_minimal_tokens() {
        // When order equals natural order, Lehmer code is all-zeros.
        // Tokenization should produce one token (end=0) per channel.
        let natural = natural_coeff_order(1, 1);
        let mut orders = vec![vec![Vec::new(); 3]; NUM_ORDER_BUCKETS];
        for order in &mut orders[0] {
            *order = natural.clone();
        }
        let used_orders = 1u32; // bucket 0

        let tokens = tokenize_coeff_orders(&orders, used_orders);
        // 3 channels, each should have exactly 1 token (end=0)
        assert_eq!(tokens.len(), 3);
        for t in &tokens {
            assert_eq!(t.value, 0, "Natural order should produce end=0");
        }
    }

    #[test]
    fn test_strategy_bucket() {
        assert_eq!(strategy_bucket(0), 0); // DCT8 -> bucket 0
        assert_eq!(strategy_bucket(4), 2); // DCT16X16 -> bucket 2
        assert_eq!(strategy_bucket(6), 4); // DCT16X8 -> bucket 4
        assert_eq!(strategy_bucket(7), 4); // DCT8X16 -> bucket 4
    }

    #[test]
    fn test_compute_custom_orders_all_zero_image() {
        // For an all-zero quantized image, all positions have the same zero count.
        // Stable sort should preserve natural order, so no custom order is needed.
        let xsize_blocks = 8;
        let ysize_blocks = 8;

        // Create all-zero quantized blocks
        let quant_ac: [Vec<Vec<[i32; DCT_BLOCK_SIZE]>>; 3] = [
            vec![vec![[0i32; DCT_BLOCK_SIZE]; xsize_blocks]; ysize_blocks],
            vec![vec![[0i32; DCT_BLOCK_SIZE]; xsize_blocks]; ysize_blocks],
            vec![vec![[0i32; DCT_BLOCK_SIZE]; xsize_blocks]; ysize_blocks],
        ];

        // All DCT8 strategy
        let ac_strategy = AcStrategyMap::new_dct8(xsize_blocks, ysize_blocks);

        let counts = count_zero_coefficients(&quant_ac, &ac_strategy, xsize_blocks, ysize_blocks);
        let (_, used_orders) = compute_custom_orders(&counts);

        // All-zero should produce natural order (no custom needed)
        assert_eq!(
            used_orders, 0,
            "All-zero image should not need custom orders"
        );
    }
}