zenwebp 0.4.2

High-performance WebP encoding and decoding in pure Rust.
Documentation (the numbered lines below are line-number artifacts from extraction, not content).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
//! Meta-Huffman encoding for spatially-varying codes.
//!
//! Builds per-tile histograms from backward references, clusters them
//! using entropy-based greedy merging (matching libwebp), and produces
//! a histogram index map for spatially-varying Huffman codes.

use alloc::vec;
use alloc::vec::Vec;

use super::entropy::{
    HistogramCosts, compute_histogram_cost, costs_from_merge, get_combined_histogram_cost,
    get_combined_histogram_cost_with_detail,
};
use super::histogram::Histogram;
use super::types::{BackwardRefs, PixOrCopy, subsample_size};

// === Instrumentation counters for tracing clustering behavior ===
#[cfg(feature = "std")]
mod cluster_trace {
    //! Process-wide atomic counters recording how much work each clustering
    //! phase (entropy binning, stochastic combine, greedy combine, remap)
    //! performed. All counters accumulate with `Ordering::Relaxed` — they are
    //! statistics only and impose no synchronization — and are dumped to
    //! stderr and zeroed by [`print_and_reset`].
    use core::sync::atomic::{AtomicU64, Ordering};

    static CLUSTER_CALLS: AtomicU64 = AtomicU64::new(0);
    static INITIAL_HISTOGRAMS: AtomicU64 = AtomicU64::new(0);
    static ENTROPY_BIN_MERGES: AtomicU64 = AtomicU64::new(0);
    static STOCHASTIC_OUTER_ITERS: AtomicU64 = AtomicU64::new(0);
    static STOCHASTIC_PAIR_EVALS: AtomicU64 = AtomicU64::new(0);
    static STOCHASTIC_MERGES: AtomicU64 = AtomicU64::new(0);
    static STOCHASTIC_QUEUE_UPDATES: AtomicU64 = AtomicU64::new(0);
    static GREEDY_INITIAL_PAIRS: AtomicU64 = AtomicU64::new(0);
    static GREEDY_MERGES: AtomicU64 = AtomicU64::new(0);
    static GREEDY_NEW_PAIRS: AtomicU64 = AtomicU64::new(0);
    static REMAP_EVALS: AtomicU64 = AtomicU64::new(0);
    static POST_ENTROPY_BIN_COUNT: AtomicU64 = AtomicU64::new(0);
    static POST_STOCHASTIC_COUNT: AtomicU64 = AtomicU64::new(0);
    static POST_GREEDY_COUNT: AtomicU64 = AtomicU64::new(0);

    /// Count one call to `cluster_histograms`.
    #[inline]
    pub fn inc_cluster_calls() {
        CLUSTER_CALLS.fetch_add(1, Ordering::Relaxed);
    }
    /// Add `n` to the total number of initial (pre-merge) histograms seen.
    #[inline]
    pub fn add_initial_histograms(n: u64) {
        INITIAL_HISTOGRAMS.fetch_add(n, Ordering::Relaxed);
    }
    /// Count one merge performed during the entropy-binning phase.
    #[inline]
    pub fn inc_entropy_bin_merges() {
        ENTROPY_BIN_MERGES.fetch_add(1, Ordering::Relaxed);
    }
    /// Count one outer iteration of the stochastic combine loop.
    #[inline]
    pub fn inc_stochastic_outer_iters() {
        STOCHASTIC_OUTER_ITERS.fetch_add(1, Ordering::Relaxed);
    }
    /// Add `n` candidate-pair evaluations (HistoQueue pushes) from one
    /// stochastic iteration.
    #[inline]
    pub fn add_stochastic_pair_evals(n: u64) {
        STOCHASTIC_PAIR_EVALS.fetch_add(n, Ordering::Relaxed);
    }
    /// Count one merge performed during the stochastic phase.
    #[inline]
    pub fn inc_stochastic_merges() {
        STOCHASTIC_MERGES.fetch_add(1, Ordering::Relaxed);
    }
    /// Count one re-evaluation of a queued pair after a stochastic merge.
    #[inline]
    pub fn inc_stochastic_queue_updates() {
        STOCHASTIC_QUEUE_UPDATES.fetch_add(1, Ordering::Relaxed);
    }
    /// Add `n` initial pair evaluations performed by the greedy phase.
    #[inline]
    pub fn add_greedy_initial_pairs(n: u64) {
        GREEDY_INITIAL_PAIRS.fetch_add(n, Ordering::Relaxed);
    }
    /// Count one merge performed during the greedy phase.
    #[inline]
    pub fn inc_greedy_merges() {
        GREEDY_MERGES.fetch_add(1, Ordering::Relaxed);
    }
    /// Add `n` new pair evaluations created after a greedy merge.
    #[inline]
    pub fn add_greedy_new_pairs(n: u64) {
        GREEDY_NEW_PAIRS.fetch_add(n, Ordering::Relaxed);
    }
    /// Add `n` histogram evaluations performed during the remap phase.
    #[inline]
    pub fn add_remap_evals(n: u64) {
        REMAP_EVALS.fetch_add(n, Ordering::Relaxed);
    }
    // NOTE(review): the three `set_post_*` functions below are named `set_`
    // but accumulate via `fetch_add`, so across multiple clustering calls
    // (before a `print_and_reset`) the reported "active" values are sums,
    // not the last snapshot — confirm this is intended.
    #[inline]
    pub fn set_post_entropy_bin_count(n: u64) {
        POST_ENTROPY_BIN_COUNT.fetch_add(n, Ordering::Relaxed);
    }
    #[inline]
    pub fn set_post_stochastic_count(n: u64) {
        POST_STOCHASTIC_COUNT.fetch_add(n, Ordering::Relaxed);
    }
    #[inline]
    pub fn set_post_greedy_count(n: u64) {
        POST_GREEDY_COUNT.fetch_add(n, Ordering::Relaxed);
    }

    /// Print all counters to stderr and atomically reset each to zero
    /// (each read uses `swap(0, ...)`, so stats restart from this point).
    pub fn print_and_reset() {
        eprintln!("=== Clustering Stats ===");
        eprintln!(
            "cluster_histograms calls: {}",
            CLUSTER_CALLS.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "initial histograms (total): {}",
            INITIAL_HISTOGRAMS.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "entropy bin merges: {}",
            ENTROPY_BIN_MERGES.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "post-entropy-bin active: {}",
            POST_ENTROPY_BIN_COUNT.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "stochastic outer iters: {}",
            STOCHASTIC_OUTER_ITERS.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "stochastic pair evals (HistoQueuePush): {}",
            STOCHASTIC_PAIR_EVALS.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "stochastic merges: {}",
            STOCHASTIC_MERGES.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "stochastic queue update re-evals: {}",
            STOCHASTIC_QUEUE_UPDATES.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "post-stochastic active: {}",
            POST_STOCHASTIC_COUNT.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "greedy initial pairs eval: {}",
            GREEDY_INITIAL_PAIRS.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "greedy merges: {}",
            GREEDY_MERGES.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "greedy new pair evals: {}",
            GREEDY_NEW_PAIRS.swap(0, Ordering::Relaxed)
        );
        eprintln!(
            "post-greedy active: {}",
            POST_GREEDY_COUNT.swap(0, Ordering::Relaxed)
        );
        eprintln!("remap evals: {}", REMAP_EVALS.swap(0, Ordering::Relaxed));
    }
}

#[cfg(not(feature = "std"))]
mod cluster_trace {
    //! No-op stand-ins for the instrumentation counters when the `std`
    //! feature is disabled. Signatures mirror the `std` module exactly so
    //! call sites compile unchanged; all bodies are empty.
    #[inline]
    pub fn inc_cluster_calls() {}
    #[inline]
    pub fn add_initial_histograms(_n: u64) {}
    #[inline]
    pub fn inc_entropy_bin_merges() {}
    #[inline]
    pub fn inc_stochastic_outer_iters() {}
    #[inline]
    pub fn add_stochastic_pair_evals(_n: u64) {}
    #[inline]
    pub fn inc_stochastic_merges() {}
    #[inline]
    pub fn inc_stochastic_queue_updates() {}
    #[inline]
    pub fn add_greedy_initial_pairs(_n: u64) {}
    #[inline]
    pub fn inc_greedy_merges() {}
    #[inline]
    pub fn add_greedy_new_pairs(_n: u64) {}
    #[inline]
    pub fn add_remap_evals(_n: u64) {}
    #[inline]
    pub fn set_post_entropy_bin_count(_n: u64) {}
    #[inline]
    pub fn set_post_stochastic_count(_n: u64) {}
    #[inline]
    pub fn set_post_greedy_count(_n: u64) {}
    pub fn print_and_reset() {}
}

/// Print and reset clustering statistics.
///
/// With the `std` feature enabled this dumps the accumulated counters to
/// stderr and zeroes them; without `std` it is a no-op.
pub fn print_clustering_stats() {
    cluster_trace::print_and_reset();
}

/// Maximum Huffman group count for greedy combining.
/// Also the cap used when deriving the stochastic phase's target size.
const MAX_HISTO_GREEDY: usize = 100;

/// Number of entropy bins per axis for initial binning (4×4×4 = 64 for 3D binning).
const NUM_PARTITIONS: usize = 4;
/// Total bin count for 3D (literal × red × blue) entropy binning.
const BIN_SIZE: usize = NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS; // 64

/// Meta-Huffman configuration.
///
/// Output of the clustering pipeline: the final histogram groups plus the
/// per-tile mapping needed to emit spatially-varying Huffman codes.
pub struct MetaHuffmanInfo {
    /// Histogram bits (block size = 2^bits pixels).
    pub histo_bits: u8,
    /// Number of final histogram groups.
    pub num_histograms: usize,
    /// Mapping from tile index to histogram group index.
    /// (Presumably one entry per tile, in raster order — confirm at the
    /// construction site.)
    pub histogram_symbols: Vec<u16>,
    /// Final clustered histograms (one per group).
    pub histograms: Vec<Histogram>,
    /// Cached costs for each final histogram.
    pub costs: Vec<HistogramCosts>,
    /// Image width (for sub-image dimension computation).
    pub image_width: usize,
    /// Image height (for sub-image dimension computation).
    pub image_height: usize,
}

/// Build per-tile histograms from backward references.
///
/// Each tile covers a (2^histo_bits × 2^histo_bits) block of pixels.
/// Tokens are assigned to tiles based on the pixel position they encode;
/// a `Copy` token is attributed entirely to the tile containing its first
/// pixel, even if the run spans tile boundaries.
/// Assumes distances in refs are already plane codes (from apply_2d_locality).
fn build_tile_histograms(
    refs: &BackwardRefs,
    width: usize,
    height: usize,
    histo_bits: u8,
    cache_bits: u8,
) -> Vec<Histogram> {
    // Advance the raster-scan cursor by one pixel, wrapping to the next row
    // at the right image edge.
    #[inline]
    fn advance(x: &mut usize, y: &mut usize, width: usize) {
        *x += 1;
        if *x >= width {
            *x = 0;
            *y += 1;
        }
    }

    let histo_xsize = subsample_size(width as u32, histo_bits) as usize;
    let histo_ysize = subsample_size(height as u32, histo_bits) as usize;
    let num_tiles = histo_xsize * histo_ysize;

    // One empty histogram per tile.
    let mut histos: Vec<Histogram> = (0..num_tiles).map(|_| Histogram::new(cache_bits)).collect();

    // Raster-scan cursor over the image; each token advances it by the
    // number of pixels it encodes (1 for literals/cache hits, `len` for copies).
    let mut x = 0usize;
    let mut y = 0usize;

    for token in refs.iter() {
        // Tile containing the first pixel this token encodes.
        let tile_idx = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
        debug_assert!(tile_idx < num_tiles);

        match *token {
            PixOrCopy::Literal(argb) => {
                histos[tile_idx].add_literal(argb);
                advance(&mut x, &mut y, width);
            }
            PixOrCopy::CacheIdx(idx) => {
                histos[tile_idx].add_cache_idx(idx);
                advance(&mut x, &mut y, width);
            }
            PixOrCopy::Copy { len, dist } => {
                // Distances are already plane codes.
                histos[tile_idx].add_copy(len, dist);

                for _ in 0..len {
                    advance(&mut x, &mut y, width);
                }
            }
        }
    }

    histos
}

/// Get bin ID for entropy-based binning (matching libwebp's GetBinIdForEntropy).
///
/// Maps `val` linearly into one of `NUM_PARTITIONS` bins over `[min, max]`.
/// A degenerate range (`min == max`) always yields bin 0, and the result is
/// clamped so that values above `max` still land in the last bin.
fn get_bin_id_for_entropy(min: u64, max: u64, val: u64) -> usize {
    if max == min {
        return 0;
    }
    let span = max - min;
    // saturating_sub guards against val < min (would underflow otherwise).
    let delta = val.saturating_sub(min);
    let slot = ((NUM_PARTITIONS as u64 - 1) * delta) / span;
    // Clamp for values beyond `max`.
    slot.min(NUM_PARTITIONS as u64 - 1) as usize
}

/// Priority queue entry for histogram pair merging.
/// Matches libwebp's HistogramPair struct.
struct HistogramPair {
    /// Index of the first histogram (kept <= `idx2` by `fix_pair_idx`).
    idx1: usize,
    /// Index of the second histogram.
    idx2: usize,
    /// Combined cost minus the sum of the two individual costs
    /// (negative means merging saves bits; more negative = better).
    cost_diff: i64,
    /// Total cost of the merged histogram.
    cost_combo: u64,
    /// Per-symbol-type costs of the merged histogram. Indices 0..=2 are
    /// literal, red, blue (per the binning code); presumably 3..=4 are
    /// alpha and distance — confirm against `HistogramCosts`.
    per_type_costs: [u64; 5],
}

/// Min-priority-queue for histogram pairs, sorted by cost_diff (most negative first).
/// Uses a simple array with head tracking (matching libwebp's HistoQueue).
/// Invariant: `queue[0]` always holds the pair with the smallest `cost_diff`;
/// the rest of the array is unordered.
struct HistoQueue {
    // Backing storage; only element 0 is kept ordered (see `update_head`).
    queue: Vec<HistogramPair>,
    // Soft capacity: `push` refuses new pairs once `queue.len()` reaches this.
    max_size: usize,
}

impl HistoQueue {
    /// Create an empty queue that will accept at most `max_size` pairs.
    fn new(max_size: usize) -> Self {
        Self {
            // +1 slack matches the allocation pattern of libwebp's HistoQueue.
            queue: Vec::with_capacity(max_size + 1),
            max_size,
        }
    }

    /// Push a pair if it improves on the threshold. Returns cost_diff if pushed, 0 otherwise.
    ///
    /// `threshold` must be <= 0; the combined-cost evaluation is capped at
    /// `cost(idx1) + cost(idx2) + threshold` so clearly-unprofitable merges
    /// can bail out early (mirrors libwebp's HistoQueuePush).
    fn push(
        &mut self,
        histos: &[Histogram],
        costs: &[HistogramCosts],
        idx1: usize,
        idx2: usize,
        threshold: i64,
    ) -> i64 {
        // Queue full: silently reject (caller treats 0 as "not pushed").
        if self.queue.len() >= self.max_size {
            return 0;
        }
        debug_assert!(threshold <= 0);

        let sum_cost = costs[idx1].total as i64 + costs[idx2].total as i64;
        let cost_threshold = sum_cost.saturating_add(threshold);
        // Non-positive budget: no merge could beat the threshold.
        if cost_threshold <= 0 {
            return 0;
        }

        // Returns None when the combined cost exceeds the budget.
        if let Some((cost_combo, per_type)) = get_combined_histogram_cost_with_detail(
            &histos[idx1],
            &costs[idx1],
            &histos[idx2],
            &costs[idx2],
            cost_threshold as u64,
        ) {
            let cost_diff = cost_combo as i64 - sum_cost;
            let pair = HistogramPair {
                idx1,
                idx2,
                cost_diff,
                cost_combo,
                per_type_costs: per_type,
            };
            self.queue.push(pair);
            // Keep the best pair at index 0.
            self.update_head(self.queue.len() - 1);
            return cost_diff;
        }
        0
    }

    /// Push a pair for greedy combining, storing per-type costs.
    ///
    /// Unlike [`push`], there is no capacity check and no external threshold:
    /// the pair is accepted only if merging strictly reduces total cost
    /// (`cost_diff < 0`), mirroring libwebp's HistogramCombineGreedy setup.
    fn push_greedy(
        &mut self,
        histos: &[Histogram],
        costs: &[HistogramCosts],
        idx1: usize,
        idx2: usize,
    ) {
        let sum_cost = costs[idx1].total as i64 + costs[idx2].total as i64;
        // Threshold = sum_cost (only accept if combined < sum)
        if sum_cost <= 0 {
            return;
        }

        if let Some((cost_combo, per_type)) = get_combined_histogram_cost_with_detail(
            &histos[idx1],
            &costs[idx1],
            &histos[idx2],
            &costs[idx2],
            sum_cost as u64,
        ) {
            let cost_diff = cost_combo as i64 - sum_cost;
            // Merging must save bits; ties are rejected.
            if cost_diff >= 0 {
                return;
            }
            let pair = HistogramPair {
                idx1,
                idx2,
                cost_diff,
                cost_combo,
                per_type_costs: per_type,
            };
            self.queue.push(pair);
            self.update_head(self.queue.len() - 1);
        }
    }

    /// Ensure queue[0] has the smallest cost_diff (most negative = best savings).
    /// Only compares `queue[new_idx]` against the current head — the rest of
    /// the array stays unordered by design.
    fn update_head(&mut self, new_idx: usize) {
        if self.queue.is_empty() {
            return;
        }
        if new_idx < self.queue.len() && self.queue[new_idx].cost_diff < self.queue[0].cost_diff {
            self.queue.swap(0, new_idx);
        }
    }

    /// Remove a pair by index, replacing with last element.
    /// Precondition: the queue is non-empty and `idx` is in bounds
    /// (`len() - 1` would underflow on an empty queue).
    fn pop_at(&mut self, idx: usize) {
        let last = self.queue.len() - 1;
        if idx != last {
            self.queue.swap(idx, last);
        }
        self.queue.pop();
    }

    /// Replace bad_id with good_id in a pair's indices, keeping idx1 < idx2.
    fn fix_pair_idx(pair: &mut HistogramPair, bad_id: usize, good_id: usize) {
        if pair.idx1 == bad_id {
            pair.idx1 = good_id;
        }
        if pair.idx2 == bad_id {
            pair.idx2 = good_id;
        }
        // Restore the idx1 <= idx2 ordering invariant after substitution.
        if pair.idx1 > pair.idx2 {
            core::mem::swap(&mut pair.idx1, &mut pair.idx2);
        }
    }

    /// Re-evaluate pair cost. Returns false if pair should be removed.
    ///
    /// Called after one side of the pair was merged with another histogram,
    /// so the cached `cost_diff`/`cost_combo` are stale. The pair survives
    /// only if merging would still save bits.
    fn update_pair(
        histos: &[Histogram],
        costs: &[HistogramCosts],
        pair: &mut HistogramPair,
    ) -> bool {
        let sum_cost = costs[pair.idx1].total as i64 + costs[pair.idx2].total as i64;
        if sum_cost <= 0 {
            return false;
        }
        if let Some((cost_combo, per_type)) = get_combined_histogram_cost_with_detail(
            &histos[pair.idx1],
            &costs[pair.idx1],
            &histos[pair.idx2],
            &costs[pair.idx2],
            sum_cost as u64,
        ) {
            pair.cost_diff = cost_combo as i64 - sum_cost;
            pair.cost_combo = cost_combo;
            pair.per_type_costs = per_type;
            pair.cost_diff < 0
        } else {
            // Combined cost exceeded the budget: drop the pair.
            false
        }
    }
}

/// Cluster histograms using entropy binning + stochastic + greedy combining.
/// Matches libwebp's HistogramCombineEntropyBin + HistogramCombineStochastic
/// + HistogramCombineGreedy + HistogramRemap.
fn cluster_histograms(
    tile_histos: &[Histogram],
    quality: u8,
    cache_bits: u8,
) -> (Vec<Histogram>, Vec<u16>) {
    let n = tile_histos.len();
    cluster_trace::inc_cluster_calls();
    cluster_trace::add_initial_histograms(n as u64);
    #[cfg(feature = "std")]
    if std::env::var("ZENWEBP_TRACE").is_ok() {
        // Count empty histograms (no symbols in any type)
        let empty_count = tile_histos
            .iter()
            .filter(|h| {
                !h.literal.iter().any(|&c| c > 0)
                    && !h.red.iter().any(|&c| c > 0)
                    && !h.blue.iter().any(|&c| c > 0)
                    && !h.alpha.iter().any(|&c| c > 0)
                    && !h.distance.iter().any(|&c| c > 0)
            })
            .count();
        eprintln!(
            "[cluster_histograms] n={} quality={} cache_bits={} empty={}",
            n, quality, cache_bits, empty_count
        );
    }

    if n <= 1 {
        if n == 1 {
            return (vec![tile_histos[0].clone()], vec![0]);
        }
        return (Vec::new(), Vec::new());
    }

    // Compute initial costs for all histograms
    let mut costs: Vec<HistogramCosts> = tile_histos.iter().map(compute_histogram_cost).collect();

    // Track which histograms are active and their mapping
    let mut active: Vec<bool> = vec![true; n];
    let mut mapping: Vec<usize> = (0..n).collect(); // tile -> cluster index
    let mut histos: Vec<Histogram> = tile_histos.to_vec();

    // Phase 1: Entropy binning - assign each histogram to an entropy bin
    // Find min/max costs for literal, red, blue types
    let mut lit_min = u64::MAX;
    let mut lit_max = 0u64;
    let mut red_min = u64::MAX;
    let mut red_max = 0u64;
    let mut blue_min = u64::MAX;
    let mut blue_max = 0u64;

    for (i, cost) in costs.iter().enumerate() {
        if !active[i] {
            continue;
        }
        lit_min = lit_min.min(cost.per_type[0]);
        lit_max = lit_max.max(cost.per_type[0]);
        red_min = red_min.min(cost.per_type[1]);
        red_max = red_max.max(cost.per_type[1]);
        blue_min = blue_min.min(cost.per_type[2]);
        blue_max = blue_max.max(cost.per_type[2]);
    }

    // Assign bin IDs (3D: literal × red × blue)
    let mut bin_ids: Vec<usize> = Vec::with_capacity(n);
    for cost in &costs {
        let lit_bin = get_bin_id_for_entropy(lit_min, lit_max, cost.per_type[0]);
        let red_bin = get_bin_id_for_entropy(red_min, red_max, cost.per_type[1]);
        let blue_bin = get_bin_id_for_entropy(blue_min, blue_max, cost.per_type[2]);
        bin_ids
            .push(lit_bin * NUM_PARTITIONS * NUM_PARTITIONS + red_bin * NUM_PARTITIONS + blue_bin);
    }

    // Phase 2: Merge within entropy bins (HistogramCombineEntropyBin)
    let num_active = active.iter().filter(|&&a| a).count();
    let num_bins = BIN_SIZE.min(num_active);
    let do_entropy_combine = num_active > num_bins * 2 && quality < 100;

    if do_entropy_combine {
        // Track first histogram per bin
        let mut bin_first: Vec<Option<usize>> = vec![None; BIN_SIZE];

        // Combine cost factor (matching libwebp's GetCombineCostFactor)
        let mut combine_cost_factor = 16i64;
        if quality < 90 {
            if num_active > 256 {
                combine_cost_factor /= 2;
            }
            if num_active > 512 {
                combine_cost_factor /= 2;
            }
            if num_active > 1024 {
                combine_cost_factor /= 2;
            }
            if quality <= 50 {
                combine_cost_factor /= 2;
            }
        }

        for i in 0..n {
            if !active[i] {
                continue;
            }

            let bin_id = bin_ids[i];
            if let Some(first) = bin_first[bin_id] {
                // Try to merge with first histogram in bin.
                // libwebp uses the incoming histogram's cost for the threshold
                // (not the accumulator's), matching HistogramCombineEntropyBin.
                let bit_cost_incoming = costs[i].total;
                let threshold = costs[first].total + bit_cost_incoming;
                let cost_thresh_val = threshold.saturating_sub(div_round_i64(
                    bit_cost_incoming as i64 * combine_cost_factor,
                    100,
                ) as u64);

                if let Some((combined_cost, per_type)) = get_combined_histogram_cost_with_detail(
                    &histos[first],
                    &costs[first],
                    &histos[i],
                    &costs[i],
                    cost_thresh_val,
                ) {
                    // Merge i into first — use precomputed costs instead of recomputing
                    cluster_trace::inc_entropy_bin_merges();
                    let first_costs = costs[first].clone();
                    let i_costs = costs[i].clone();
                    let i_histo = histos[i].clone();
                    histos[first].add(&i_histo);
                    costs[first] =
                        costs_from_merge(combined_cost, per_type, &first_costs, &i_costs);
                    active[i] = false;

                    // Remap all tiles pointing to i → first
                    for m in mapping.iter_mut() {
                        if *m == i {
                            *m = first;
                        }
                    }
                }
            } else {
                bin_first[bin_id] = Some(i);
            }
        }
    }

    // Phase 2b: Stochastic combining (matching libwebp's HistogramCombineStochastic).
    // Uses priority queue of size 9 with Lehmer RNG, matching libwebp exactly.
    let target_size = {
        let q3 = (quality as u64) * (quality as u64) * (quality as u64);
        let t = 1 + div_round_u64(q3 * (MAX_HISTO_GREEDY as u64 - 1), 100u64 * 100 * 100);
        t.min(MAX_HISTO_GREEDY as u64) as usize
    };

    let num_active_pre_stochastic = active.iter().filter(|&&a| a).count();
    cluster_trace::set_post_entropy_bin_count(num_active_pre_stochastic as u64);

    // do_greedy tracks whether we should run the greedy phase after stochastic
    let mut do_greedy = num_active_pre_stochastic <= target_size;

    if num_active_pre_stochastic > target_size {
        const HISTO_QUEUE_SIZE: usize = 9;
        let mut histo_queue = HistoQueue::new(HISTO_QUEUE_SIZE);

        // Build a compact index: position -> original index
        // This matches libwebp's approach where histograms are stored in a
        // compact array and HistogramSetRemoveHistogram swaps in the last element.
        let mut compact: Vec<usize> = (0..n).filter(|&i| active[i]).collect();
        let mut compact_size = compact.len();

        let outer_iters = compact_size;
        let num_tries_no_success = outer_iters / 2;
        let mut tries_with_no_success = 0usize;
        let mut seed: u32 = 1;

        for _iter in 0..outer_iters {
            if compact_size < 2 || compact_size <= target_size {
                break;
            }
            tries_with_no_success += 1;
            if tries_with_no_success >= num_tries_no_success {
                break;
            }
            cluster_trace::inc_stochastic_outer_iters();

            let best_cost = if histo_queue.queue.is_empty() {
                0i64
            } else {
                histo_queue.queue[0].cost_diff
            };
            let rand_range = (compact_size as u64) * (compact_size as u64 - 1);
            let num_tries = compact_size / 2;

            // Pick random samples (matching libwebp's inner loop)
            let mut pair_evals_this_iter = 0u64;
            for _ in 0..num_tries {
                if compact_size < 2 {
                    break;
                }
                // Lehmer RNG
                seed = ((seed as u64 * 48271u64) % 2147483647u64) as u32;
                let tmp = (seed as u64) % rand_range;
                let ci1 = (tmp / (compact_size as u64 - 1)) as usize;
                let mut ci2 = (tmp % (compact_size as u64 - 1)) as usize;
                if ci2 >= ci1 {
                    ci2 += 1;
                }

                let idx1 = compact[ci1];
                let idx2 = compact[ci2];

                pair_evals_this_iter += 1;
                let curr_cost = histo_queue.push(&histos, &costs, idx1, idx2, best_cost.min(0));
                if curr_cost < 0 {
                    // Break if queue reached full capacity
                    if histo_queue.queue.len() >= histo_queue.max_size {
                        break;
                    }
                }
            }
            cluster_trace::add_stochastic_pair_evals(pair_evals_this_iter);

            if histo_queue.queue.is_empty() {
                continue;
            }

            // Get the best pair from the queue head
            let best_idx1 = histo_queue.queue[0].idx1;
            let merge_idx2 = histo_queue.queue[0].idx2;
            cluster_trace::inc_stochastic_merges();

            // Merge idx2 into idx1 — use precomputed costs from queue
            let best_pair = &histo_queue.queue[0];
            let combined_cost = best_pair.cost_combo;
            let per_type = best_pair.per_type_costs;
            let c1 = costs[best_idx1].clone();
            let c2 = costs[merge_idx2].clone();
            let j_histo = histos[merge_idx2].clone();
            histos[best_idx1].add(&j_histo);
            costs[best_idx1] = costs_from_merge(combined_cost, per_type, &c1, &c2);

            active[merge_idx2] = false;

            // Remap all tiles pointing to merge_idx2 → best_idx1
            for m in mapping.iter_mut() {
                if *m == merge_idx2 {
                    *m = best_idx1;
                }
            }

            // Remove merge_idx2 from compact array (swap with last, like libwebp)
            if let Some(pos) = compact.iter().position(|&x| x == merge_idx2) {
                compact.swap(pos, compact_size - 1);
                compact_size -= 1;
                compact.truncate(compact_size);
            }

            // Update queue: remove pairs involving the merged indices,
            // re-evaluate pairs that reference either idx
            let mut j = 0usize;
            while j < histo_queue.queue.len() {
                let p = &histo_queue.queue[j];
                let is_idx1_best = p.idx1 == best_idx1 || p.idx1 == merge_idx2;
                let is_idx2_best = p.idx2 == best_idx1 || p.idx2 == merge_idx2;

                if is_idx1_best && is_idx2_best {
                    // This is the pair we just merged (or a duplicate)
                    histo_queue.pop_at(j);
                    continue;
                }

                if is_idx1_best || is_idx2_best {
                    // Fix index references
                    HistoQueue::fix_pair_idx(&mut histo_queue.queue[j], merge_idx2, best_idx1);
                    // Re-evaluate cost
                    cluster_trace::inc_stochastic_queue_updates();
                    if !HistoQueue::update_pair(&histos, &costs, &mut histo_queue.queue[j]) {
                        histo_queue.pop_at(j);
                        continue;
                    }
                }

                // Also fix if either index matches the "swapped-in last" index
                // (In libwebp, HistogramSetRemoveHistogram moves the last histo
                // into the removed slot. We don't compact our histos array, so we
                // only need to fix the merge_idx2 reference.)

                // Update head if this pair is now better
                histo_queue.update_head(j);
                j += 1;
            }

            tries_with_no_success = 0;
        }

        do_greedy = compact_size <= target_size;
    }

    // Phase 3: Greedy combining using priority queue (matching libwebp's
    // HistogramCombineGreedy). O(n^2) initial pair evaluation + O(n) per merge.
    // Only runs when stochastic brought count <= target_size (matching libwebp).
    let num_active = active.iter().filter(|&&a| a).count();
    cluster_trace::set_post_stochastic_count(num_active as u64);

    if do_greedy && num_active > 1 {
        let active_indices: Vec<usize> = (0..n).filter(|&i| active[i]).collect();
        let active_n = active_indices.len();

        // Initialize priority queue with all pairs
        let mut histo_queue = HistoQueue::new(active_n * active_n);
        let greedy_init_pairs = (active_n * (active_n - 1)) / 2;
        cluster_trace::add_greedy_initial_pairs(greedy_init_pairs as u64);

        for ai in 0..active_n {
            for aj in (ai + 1)..active_n {
                histo_queue.push_greedy(&histos, &costs, active_indices[ai], active_indices[aj]);
            }
        }

        // Greedily merge the best pair until no beneficial pair remains
        while !histo_queue.queue.is_empty() {
            let best_idx1 = histo_queue.queue[0].idx1;
            let best_idx2 = histo_queue.queue[0].idx2;
            cluster_trace::inc_greedy_merges();

            // Merge idx2 into idx1 — use precomputed costs from queue
            let combined_cost = histo_queue.queue[0].cost_combo;
            let per_type = histo_queue.queue[0].per_type_costs;
            let c1 = costs[best_idx1].clone();
            let c2 = costs[best_idx2].clone();
            let j_histo = histos[best_idx2].clone();
            histos[best_idx1].add(&j_histo);
            costs[best_idx1] = costs_from_merge(combined_cost, per_type, &c1, &c2);
            active[best_idx2] = false;

            for m in mapping.iter_mut() {
                if *m == best_idx2 {
                    *m = best_idx1;
                }
            }

            // Remove stale pairs and update pairs involving merged indices
            let mut j = 0usize;
            while j < histo_queue.queue.len() {
                let p = &histo_queue.queue[j];
                if p.idx1 == best_idx1
                    || p.idx2 == best_idx1
                    || p.idx1 == best_idx2
                    || p.idx2 == best_idx2
                {
                    histo_queue.pop_at(j);
                } else {
                    // No need to fix indices — we don't compact the array
                    histo_queue.update_head(j);
                    j += 1;
                }
            }

            // Add new pairs involving the merged histogram (best_idx1)
            let mut new_pairs = 0u64;
            for i in 0..n {
                if !active[i] || i == best_idx1 {
                    continue;
                }
                new_pairs += 1;
                histo_queue.push_greedy(&histos, &costs, best_idx1, i);
            }
            cluster_trace::add_greedy_new_pairs(new_pairs);
        }
    }

    // Phase 4: Remap - find best cluster for each original tile histogram
    // Matches libwebp's HistogramRemap with progressive threshold tightening.
    let active_indices: Vec<usize> = (0..n).filter(|&i| active[i]).collect();
    cluster_trace::set_post_greedy_count(active_indices.len() as u64);

    if active_indices.len() > 1 {
        // Cache tile histogram costs to avoid recomputation
        let tile_costs: Vec<HistogramCosts> =
            tile_histos.iter().map(compute_histogram_cost).collect();

        let remap_total = n as u64 * active_indices.len() as u64;
        cluster_trace::add_remap_evals(remap_total);

        for tile_idx in 0..n {
            let mut best_cluster = mapping[tile_idx];
            let mut best_bits = i64::MAX;

            for &cluster_idx in &active_indices {
                // Threshold: cluster.cost + best_bits (matching libwebp's
                // HistogramAddThresh which does SaturateAdd(a->bit_cost, &cost_threshold))
                let cost_threshold = if best_bits == i64::MAX {
                    u64::MAX
                } else {
                    let thresh = costs[cluster_idx].total as i64 + best_bits;
                    if thresh <= 0 {
                        continue;
                    }
                    thresh as u64
                };

                if let Some(combined_cost) = get_combined_histogram_cost(
                    &histos[cluster_idx],
                    &costs[cluster_idx],
                    &tile_histos[tile_idx],
                    &tile_costs[tile_idx],
                    cost_threshold,
                ) {
                    // Cost = C(cluster + tile) - C(cluster), matching HistogramAddThresh
                    let cost = combined_cost as i64 - costs[cluster_idx].total as i64;
                    if cost < best_bits {
                        best_bits = cost;
                        best_cluster = cluster_idx;
                    }
                }
            }

            mapping[tile_idx] = best_cluster;
        }
    } else if active_indices.len() == 1 {
        // Single cluster: all tiles map to it
        for m in mapping.iter_mut() {
            *m = active_indices[0];
        }
    }

    // Phase 5: Rebuild final histograms from remapped tiles
    let mut final_histos: Vec<Histogram> = active_indices
        .iter()
        .map(|_| Histogram::new(cache_bits))
        .collect();

    // Create index mapping from cluster_idx → final histogram index
    let mut cluster_to_final: Vec<Option<u16>> = vec![None; n];
    for (final_idx, &cluster_idx) in active_indices.iter().enumerate() {
        cluster_to_final[cluster_idx] = Some(final_idx as u16);
    }

    // Build final symbols and accumulate histograms
    let mut symbols: Vec<u16> = Vec::with_capacity(n);
    for tile_idx in 0..n {
        let cluster = mapping[tile_idx];
        let final_idx = cluster_to_final[cluster].unwrap_or(0);
        symbols.push(final_idx);
        final_histos[final_idx as usize].add(&tile_histos[tile_idx]);
    }

    (final_histos, symbols)
}

#[inline]
/// Signed division rounding half away from zero.
/// Biases the numerator by half the denominator (toward the sign of `num`)
/// so that truncating integer division lands on the nearest quotient.
fn div_round_i64(num: i64, den: i64) -> i64 {
    let half = den / 2;
    let biased = if num < 0 { num - half } else { num + half };
    biased / den
}

#[inline]
/// Unsigned division rounded to the nearest quotient.
/// Adding half the denominator before truncating division rounds
/// halves upward (e.g. 5/2 → 3).
fn div_round_u64(num: u64, den: u64) -> u64 {
    let half = den / 2;
    (num + half) / den
}

/// Build meta-Huffman info from backward references.
/// Returns info for spatially-varying Huffman codes.
///
/// Pipeline: clamp the subsampling bits, build one histogram per tile,
/// cluster those histograms (quality-dependent effort), then compute the
/// final per-cluster costs for the surviving histograms.
pub fn build_meta_huffman(
    refs: &BackwardRefs,
    width: usize,
    height: usize,
    histo_bits: u8,
    cache_bits: u8,
    quality: u8,
) -> MetaHuffmanInfo {
    // Histogram tile subsampling is only meaningful in the [2, 8] range.
    let histo_bits = histo_bits.clamp(2, 8);

    // Optional debug trace, gated on the ZENWEBP_TRACE env var.
    #[cfg(feature = "std")]
    {
        if std::env::var("ZENWEBP_TRACE").is_ok() {
            let xsize = super::types::subsample_size(width as u32, histo_bits);
            let ysize = super::types::subsample_size(height as u32, histo_bits);
            let tile_count = xsize as usize * ysize as usize;
            eprintln!(
                "[build_meta_huffman] {}x{} histo_bits={} tiles={}x{}={} cache_bits={} quality={}",
                width, height, histo_bits, xsize, ysize, tile_count, cache_bits, quality
            );
        }
    }

    // Per-tile histograms from the backward-reference tokens, then clustering.
    let tile_histos = build_tile_histograms(refs, width, height, histo_bits, cache_bits);
    let (histograms, symbols) = cluster_histograms(&tile_histos, quality, cache_bits);

    // Final entropy costs for each surviving cluster histogram.
    let costs = histograms.iter().map(compute_histogram_cost).collect();

    MetaHuffmanInfo {
        histo_bits,
        num_histograms: histograms.len(),
        histogram_symbols: symbols,
        histograms,
        costs,
        image_width: width,
        image_height: height,
    }
}

/// Build a single-histogram MetaHuffmanInfo (no spatial variation).
/// Assumes distances in refs are already plane codes (from apply_2d_locality).
pub fn build_single_histogram(refs: &BackwardRefs, cache_bits: u8) -> MetaHuffmanInfo {
    // One histogram covering all tokens; its cost is the only entry.
    let histogram = Histogram::from_refs(refs, cache_bits);
    let cost = compute_histogram_cost(&histogram);

    MetaHuffmanInfo {
        // histo_bits == 0 signals "no spatial tiling".
        histo_bits: 0,
        num_histograms: 1,
        histogram_symbols: Vec::new(),
        histograms: vec![histogram],
        costs: vec![cost],
        image_width: 0,
        image_height: 0,
    }
}

#[cfg(test)]
mod tests {
    use super::super::types::make_argb;
    use super::*;

    #[test]
    fn test_single_histogram() {
        // Two literal pixels, no color cache: exactly one histogram and no
        // spatial symbol map.
        let px = make_argb(255, 128, 64, 32);
        let mut refs = BackwardRefs::new();
        for _ in 0..2 {
            refs.push(PixOrCopy::literal(px));
        }
        let info = build_single_histogram(&refs, 0);
        assert_eq!(info.num_histograms, 1);
        assert!(info.histogram_symbols.is_empty());
    }

    #[test]
    fn test_build_tile_histograms() {
        // 4x4 image with bits=2 → tile size 4x4, so a single tile.
        let px = make_argb(255, 128, 64, 32);
        let mut refs = BackwardRefs::new();
        for _ in 0..16 {
            refs.push(PixOrCopy::literal(px));
        }
        let histos = build_tile_histograms(&refs, 4, 4, 2, 0);
        assert_eq!(histos.len(), 1);
    }

    #[test]
    fn test_cluster_identical() {
        // Two identical histograms must collapse into a single cluster,
        // with both tiles mapped to the same symbol.
        let px = make_argb(255, 128, 64, 32);
        let mut a = Histogram::new(0);
        let mut b = Histogram::new(0);
        for _ in 0..100 {
            a.add_literal(px);
            b.add_literal(px);
        }
        let (groups, symbols) = cluster_histograms(&[a, b], 75, 0);
        assert_eq!(groups.len(), 1);
        assert_eq!(symbols.len(), 2);
        assert_eq!(symbols[0], symbols[1]);
    }
}