// jxl_encoder/entropy_coding/cluster.rs
// Copyright (c) Imazen LLC and the JPEG XL Project Authors.
// Algorithms and constants derived from libjxl (BSD-3-Clause).
// Licensed under AGPL-3.0-or-later. Commercial licenses at https://www.imazen.io/pricing

//! Histogram clustering for entropy coding.
//!
//! Ported from libjxl `lib/jxl/enc_cluster.cc`.

use alloc::collections::BinaryHeap;
use core::cmp::Ordering;

use super::histogram::{Histogram, histogram_distance, histogram_kl_divergence};
use crate::error::{Error, Result};
14
/// Minimum pairwise distance below which a candidate histogram is considered
/// close enough to an existing cluster that no new cluster is seeded.
const MIN_DISTANCE_FOR_DISTINCT: f32 = 48.0;

/// Maximum number of histogram clusters.
pub const CLUSTERS_LIMIT: usize = 256;
20
/// Result of clustering histograms.
#[derive(Debug, Clone)]
pub struct ClusterResult {
    /// The clustered histograms, one per output cluster.
    pub histograms: Vec<Histogram>,
    /// Mapping from input histogram index to cluster index in `histograms`.
    pub symbols: Vec<u32>,
}
29
/// Clustering aggressiveness level.
///
/// Controls the speed/compression trade-off of the clustering pipeline.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub enum ClusteringType {
    /// Only 4 clusters maximum (fastest encoding).
    Fastest,
    /// Default clustering.
    #[default]
    Fast,
    /// With pair merge refinement (best compression).
    Best,
}
41
/// Entropy coding method - affects header cost estimation for clustering.
///
/// The merge-cost model differs between the two coders, so clustering
/// decisions are tuned per coder.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub enum EntropyType {
    /// Huffman prefix codes (used by libjxl-tiny, simpler header format).
    #[default]
    Huffman,
    /// ANS (Asymmetric Numeral Systems) - used by full libjxl, larger alphabet support.
    Ans,
}
51
52/// Fast k-means-like clustering.
53///
54/// Algorithm:
55/// 1. Start with largest histogram as first cluster
56/// 2. Repeatedly add most distant histogram as new cluster
57/// 3. Stop when max clusters reached or distance < threshold
58/// 4. Assign remaining histograms to nearest cluster
59///
60/// Matches libjxl's `FastClusterHistograms` function.
61pub fn fast_cluster_histograms(
62    input: &[Histogram],
63    max_histograms: usize,
64) -> Result<ClusterResult> {
65    fast_cluster_histograms_with_prev(input, max_histograms, &[])
66}
67
68/// Fast clustering with support for pre-existing histograms.
69///
70/// This is the full implementation matching libjxl's `FastClusterHistograms`.
71/// The `prev_histograms` are fixed clusters that new histograms can be assigned to,
72/// but won't be merged into.
73pub fn fast_cluster_histograms_with_prev(
74    input: &[Histogram],
75    max_histograms: usize,
76    prev_histograms: &[Histogram],
77) -> Result<ClusterResult> {
78    if input.is_empty() {
79        return Ok(ClusterResult {
80            histograms: prev_histograms.to_vec(),
81            symbols: Vec::new(),
82        });
83    }
84
85    let prev_count = prev_histograms.len();
86    let mut out: Vec<Histogram> = prev_histograms.to_vec();
87    out.reserve(max_histograms);
88
89    // Initialize symbols to "unassigned" marker
90    let unassigned = max_histograms as u32;
91    let mut symbols = vec![unassigned; input.len()];
92
93    // Initialize distances to max (except empty histograms)
94    let mut dists = vec![f32::MAX; input.len()];
95
96    // Find largest histogram and compute entropies
97    let mut largest_idx = 0;
98    for (i, h) in input.iter().enumerate() {
99        if h.total_count == 0 {
100            // Empty histograms get assigned to cluster 0
101            symbols[i] = 0;
102            dists[i] = 0.0;
103            continue;
104        }
105        h.shannon_entropy(); // Compute and cache entropy
106        if h.total_count > input[largest_idx].total_count {
107            largest_idx = i;
108        }
109    }
110
111    // If there are previous histograms, compute their entropies and
112    // update distances using KL divergence
113    if prev_count > 0 {
114        for h in &out {
115            h.shannon_entropy();
116        }
117        for (i, dist) in dists.iter_mut().enumerate() {
118            if *dist == 0.0 {
119                continue;
120            }
121            for out_hist in out.iter().take(prev_count) {
122                let kl = histogram_kl_divergence(&input[i], out_hist);
123                *dist = dist.min(kl);
124            }
125        }
126        // Find the histogram with maximum distance (most different from prev)
127        if let Some((max_idx, &max_dist)) = dists
128            .iter()
129            .enumerate()
130            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(Ordering::Equal))
131            && max_dist > 0.0
132        {
133            largest_idx = max_idx;
134        }
135    }
136
137    // Main clustering loop
138    while out.len() < prev_count + max_histograms {
139        // Add the largest/most distant histogram as a new cluster
140        symbols[largest_idx] = out.len() as u32;
141        out.push(input[largest_idx].clone());
142        dists[largest_idx] = 0.0;
143
144        // Find next candidate: histogram with maximum distance
145        let mut new_largest_idx = 0;
146        for (i, h) in input.iter().enumerate() {
147            if dists[i] == 0.0 {
148                continue;
149            }
150            // Update distance using histogram distance to new cluster
151            let dist = histogram_distance(h, out.last().unwrap());
152            dists[i] = dists[i].min(dist);
153            if dists[i] > dists[new_largest_idx] {
154                new_largest_idx = i;
155            }
156        }
157        largest_idx = new_largest_idx;
158
159        // Stop if distance is below threshold
160        if dists[largest_idx] < MIN_DISTANCE_FOR_DISTINCT {
161            break;
162        }
163    }
164
165    // Assign remaining histograms to nearest cluster
166    for i in 0..input.len() {
167        if symbols[i] != unassigned {
168            continue;
169        }
170
171        // Find best cluster
172        let mut best = 0;
173        let mut best_dist = f32::MAX;
174
175        for (j, out_hist) in out.iter().enumerate() {
176            let dist = if j < prev_count {
177                // Use KL divergence for previous histograms
178                histogram_kl_divergence(&input[i], out_hist)
179            } else {
180                // Use symmetric distance for new histograms
181                histogram_distance(&input[i], out_hist)
182            };
183
184            if dist < best_dist {
185                best = j;
186                best_dist = dist;
187            }
188        }
189
190        if best_dist >= f32::MAX {
191            return Err(Error::InvalidHistogram(format!(
192                "Failed to find cluster for histogram {}",
193                i
194            )));
195        }
196
197        // Merge into best cluster (only for non-previous histograms)
198        if best >= prev_count {
199            out[best].add_histogram(&input[i]);
200            out[best].shannon_entropy(); // Recompute entropy
201        }
202        symbols[i] = best as u32;
203    }
204
205    Ok(ClusterResult {
206        histograms: out,
207        symbols,
208    })
209}
210
/// Histogram pair for merge refinement priority queue.
///
/// Entries are ordered so that the *lowest* `cost` (largest saving) pops
/// first from a `BinaryHeap`, with `(first, second, version)` as tie-breakers.
#[derive(Clone, Copy, Debug)]
struct HistogramPair {
    /// Cost delta of merging `first` and `second`; negative means beneficial.
    cost: f32,
    /// Index of the first histogram in the pair (kept `< second`).
    first: u32,
    /// Index of the second histogram in the pair.
    second: u32,
    /// Version stamp at enqueue time; stale entries are skipped after merges.
    version: u32,
}

impl PartialEq for HistogramPair {
    fn eq(&self, other: &Self) -> bool {
        // Defined via `cmp` so equality always agrees with the total order
        // below. The previous field-wise `f32 ==` disagreed with `cmp` for
        // NaN costs (eq said "not equal", cmp said "Equal"), violating the
        // Eq/Ord consistency contract relied on by ordered collections.
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for HistogramPair {}

impl PartialOrd for HistogramPair {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for HistogramPair {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reverse order: lower cost = higher priority.
        // Use tuple comparison for tie-breaking.
        let self_tuple = (
            ordered_float::OrderedFloat(self.cost),
            self.first,
            self.second,
            self.version,
        );
        let other_tuple = (
            ordered_float::OrderedFloat(other.cost),
            other.first,
            other.second,
            other.version,
        );
        // Reverse because BinaryHeap is a max-heap
        other_tuple.cmp(&self_tuple)
    }
}

/// Wrapper for f32 that implements a *total* order for use in priority queues.
mod ordered_float {
    use core::cmp::Ordering;

    #[derive(Clone, Copy, Debug)]
    pub struct OrderedFloat(pub f32);

    impl PartialEq for OrderedFloat {
        fn eq(&self, other: &Self) -> bool {
            // Derived from `cmp` so `Eq` holds even for NaN. The previous
            // `self.0 == other.0` made NaN != NaN while `cmp` reported
            // `Equal`, which breaks Eq's reflexivity requirement.
            self.cmp(other) == Ordering::Equal
        }
    }

    impl Eq for OrderedFloat {}

    impl PartialOrd for OrderedFloat {
        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
            Some(self.cmp(other))
        }
    }

    impl Ord for OrderedFloat {
        fn cmp(&self, other: &Self) -> Ordering {
            // IEEE 754 totalOrder: a true total order over all f32 values,
            // including NaN, unlike `partial_cmp(..).unwrap_or(Equal)` which
            // collapses every NaN comparison to Equal.
            self.0.total_cmp(&other.0)
        }
    }
}
285
286/// Estimate Huffman population cost for clustering merge decisions.
287///
288/// For Huffman coding, the key insight is that header cost savings from
289/// merging histograms are minimal compared to data cost increases.
290/// The simple/tiny clustering uses data-only cost (sum of count * depth)
291/// and produces good results.
292///
293/// This function computes data cost using actual Huffman code lengths,
294/// plus a small header penalty that scales with alphabet size to
295/// discourage creating very large merged alphabets.
296fn huffman_population_cost(h: &Histogram) -> f32 {
297    if h.total_count == 0 {
298        return 0.0;
299    }
300
301    let alphabet_size = h.alphabet_size();
302    if alphabet_size == 0 {
303        return 0.0;
304    }
305
306    // Compute ACTUAL Huffman data cost using real code lengths
307    let data_cost = compute_huffman_data_cost(h, alphabet_size);
308
309    // For merge decisions, we want to penalize large alphabets slightly
310    // because they require more complex tree serialization.
311    // But don't over-penalize - the data cost is the main factor.
312    //
313    // Count non-zero symbols to estimate header complexity
314    let non_zero_count = h
315        .counts
316        .iter()
317        .take(alphabet_size)
318        .filter(|&&c| c > 0)
319        .count();
320
321    // Small header penalty: 0.1 bits per non-zero symbol
322    // This is much smaller than data cost, so it only tips the balance
323    // when data costs are very close.
324    let header_penalty = (non_zero_count as f32) * 0.1;
325
326    data_cost + header_penalty
327}
328
329/// Compute actual Huffman data cost using real code lengths.
330///
331/// This builds a real Huffman tree and computes sum(count * depth),
332/// which is the exact number of bits needed to encode the data.
333fn compute_huffman_data_cost(h: &Histogram, alphabet_size: usize) -> f32 {
334    use super::huffman_tree::create_huffman_tree;
335
336    if alphabet_size == 0 {
337        return 0.0;
338    }
339
340    // Convert to u32 counts for create_huffman_tree
341    let counts: Vec<u32> = h
342        .counts
343        .iter()
344        .take(alphabet_size)
345        .map(|&c| c.max(0) as u32)
346        .collect();
347
348    // Check for empty or single-symbol histogram
349    let non_zero = counts.iter().filter(|&&c| c > 0).count();
350    if non_zero == 0 {
351        return 0.0;
352    }
353    if non_zero == 1 {
354        // Single symbol needs 1 bit per occurrence
355        return counts.iter().sum::<u32>() as f32;
356    }
357
358    // Build actual Huffman tree with depth limit 15
359    let depths = create_huffman_tree(&counts, 15);
360
361    // Compute data cost: sum(count * depth)
362    let mut cost = 0.0f32;
363    for (i, &count) in counts.iter().enumerate() {
364        if count > 0 && i < depths.len() {
365            cost += count as f32 * depths[i] as f32;
366        }
367    }
368
369    cost
370}
371
372/// Compute cost of encoding histogram A's data using a tree built for histogram B.
373///
374/// This is the key insight for correct merge cost estimation:
375/// When contexts are merged, BOTH original contexts use the merged tree,
376/// which is suboptimal for each individually.
377#[allow(dead_code)]
378fn compute_cross_coding_cost(data: &Histogram, tree: &Histogram, alphabet_size: usize) -> f32 {
379    use super::huffman_tree::create_huffman_tree;
380
381    if alphabet_size == 0 {
382        return 0.0;
383    }
384
385    // Build tree from 'tree' histogram
386    let tree_counts: Vec<u32> = tree
387        .counts
388        .iter()
389        .take(alphabet_size)
390        .map(|&c| c.max(0) as u32)
391        .collect();
392
393    let non_zero = tree_counts.iter().filter(|&&c| c > 0).count();
394    if non_zero == 0 {
395        return 0.0;
396    }
397
398    let depths = if non_zero == 1 {
399        vec![1u8; alphabet_size]
400    } else {
401        create_huffman_tree(&tree_counts, 15)
402    };
403
404    // Encode 'data' using depths from 'tree'
405    let mut cost = 0.0f32;
406    for (i, &count) in data.counts.iter().take(alphabet_size).enumerate() {
407        if count > 0 && i < depths.len() {
408            let depth = if depths[i] == 0 { 15 } else { depths[i] }; // Penalize symbols not in tree
409            cost += count.max(0) as f32 * depth as f32;
410        }
411    }
412
413    cost
414}
415
416/// Estimate ANS population cost (header + data bits).
417///
418/// This is a simplified version of libjxl's `Histogram::ANSPopulationCost()`.
419/// ANS uses a frequency table with log-scale precision, supporting larger alphabets.
420fn ans_population_cost(h: &Histogram) -> f32 {
421    if h.total_count == 0 {
422        return 0.0;
423    }
424
425    let alphabet_size = h.alphabet_size();
426    if alphabet_size <= 1 {
427        // Single symbol or empty: almost no header cost
428        return 0.0;
429    }
430
431    // Data cost (entropy)
432    let data_cost = h.cached_entropy();
433
434    // Header cost estimate: roughly 5 bits per symbol for frequency table
435    // ANS encodes frequencies using variable-length coding based on precision
436    // This is a rough approximation - actual cost depends on the shift parameter
437    let header_cost = (alphabet_size as f32) * 5.0;
438
439    data_cost + header_cost
440}
441
442/// Estimate population cost for a histogram based on entropy type.
443fn population_cost(h: &Histogram, entropy_type: EntropyType) -> f32 {
444    match entropy_type {
445        EntropyType::Huffman => huffman_population_cost(h),
446        EntropyType::Ans => ans_population_cost(h),
447    }
448}
449
/// Refine clusters by merging pairs that reduce total cost.
///
/// This implements the pair merge refinement from libjxl's `ClusterHistograms`
/// when `params.clustering == ClusteringType::Best`.
///
/// Greedy best-first merging: every beneficial pair goes into a priority
/// queue keyed by its cost delta; per-histogram version stamps detect queue
/// entries that became stale after a merge (version 0 marks a histogram that
/// was merged away). After all merges, histograms are compacted in place and
/// `symbols` is rewritten to the surviving indices.
///
/// The `entropy_type` parameter controls the cost model used for merge decisions:
/// - `EntropyType::Huffman`: Uses Huffman tree serialization cost model
/// - `EntropyType::Ans`: Uses ANS frequency table cost model
pub fn refine_clusters_by_merging(
    histograms: &mut Vec<Histogram>,
    symbols: &mut [u32],
    entropy_type: EntropyType,
) -> Result<()> {
    if histograms.is_empty() {
        return Ok(());
    }

    // Compute initial costs (shannon_entropy caches its result on the histogram)
    for h in histograms.iter() {
        h.shannon_entropy();
    }

    // Version tracking for invalidation: bumped on every merge so queued
    // pairs computed against an older histogram state can be skipped.
    let mut version = vec![1u32; histograms.len()];
    let mut next_version = 2u32;

    // Renumbering map (for tracking merges): maps original cluster index to
    // the index of the cluster it was merged into.
    let mut renumbering: Vec<u32> = (0..histograms.len() as u32).collect();

    // Create priority queue of pairs to merge (lowest cost delta pops first;
    // see HistogramPair's reversed Ord impl)
    let mut pairs_to_merge: BinaryHeap<HistogramPair> = BinaryHeap::new();

    for i in 0..histograms.len() as u32 {
        for j in (i + 1)..histograms.len() as u32 {
            // Compute cost of merging: cost(merged) - (cost(i) + cost(j))
            let mut merged = histograms[i as usize].clone();
            merged.add_histogram(&histograms[j as usize]);
            merged.shannon_entropy();

            let merged_cost = population_cost(&merged, entropy_type);
            let individual_cost = population_cost(&histograms[i as usize], entropy_type)
                + population_cost(&histograms[j as usize], entropy_type);

            let cost = merged_cost - individual_cost;

            // Only enqueue if merging is beneficial (negative delta)
            if cost < 0.0 {
                pairs_to_merge.push(HistogramPair {
                    cost,
                    first: i,
                    second: j,
                    version: version[i as usize].max(version[j as usize]),
                });
            }
        }
    }

    // Process merges, cheapest (most beneficial) first
    while let Some(pair) = pairs_to_merge.pop() {
        let first = pair.first as usize;
        let second = pair.second as usize;

        // Check if pair is still valid: either member may have been merged
        // since this entry was enqueued (version changed or marked dead = 0)
        let expected_version = version[first].max(version[second]);
        if pair.version != expected_version || version[first] == 0 || version[second] == 0 {
            continue;
        }

        // Merge second into first (clone to avoid borrow conflict on the Vec)
        let second_histo = histograms[second].clone();
        histograms[first].add_histogram(&second_histo);
        histograms[first].shannon_entropy();

        // Update renumbering so clusters pointing at `second` follow `first`
        for item in renumbering.iter_mut() {
            if *item == pair.second {
                *item = pair.first;
            }
        }

        // Mark second as dead and bump first's version to invalidate any
        // queued pairs that referenced either of them
        version[second] = 0;
        version[first] = next_version;
        next_version += 1;

        // Add new candidate pairs with the merged histogram
        for j in 0..histograms.len() as u32 {
            if j == pair.first || version[j as usize] == 0 {
                continue;
            }

            let mut merged = histograms[first].clone();
            merged.add_histogram(&histograms[j as usize]);
            merged.shannon_entropy();

            let merged_cost = population_cost(&merged, entropy_type);
            let individual_cost = population_cost(&histograms[first], entropy_type)
                + population_cost(&histograms[j as usize], entropy_type);

            let cost = merged_cost - individual_cost;

            if cost < 0.0 {
                pairs_to_merge.push(HistogramPair {
                    cost,
                    // Keep the invariant first < second
                    first: pair.first.min(j),
                    second: pair.first.max(j),
                    version: version[first].max(version[j as usize]),
                });
            }
        }
    }

    // Build reverse renumbering and compact surviving histograms to the front
    let mut reverse_renumbering = vec![u32::MAX; histograms.len()];
    let mut num_alive = 0u32;

    for i in 0..histograms.len() {
        if version[i] == 0 {
            continue;
        }
        if num_alive != i as u32 {
            histograms[num_alive as usize] = histograms[i].clone();
        }
        reverse_renumbering[i] = num_alive;
        num_alive += 1;
    }
    histograms.truncate(num_alive as usize);

    // Update symbols: original cluster -> merged-into cluster -> compacted index
    for symbol in symbols.iter_mut() {
        let renumbered = renumbering[*symbol as usize];
        *symbol = reverse_renumbering[renumbered as usize];
    }

    Ok(())
}
586
587/// Reindex histograms so that symbols appear in increasing order.
588fn histogram_reindex(histograms: &mut Vec<Histogram>, prev_count: usize, symbols: &mut [u32]) {
589    use std::collections::HashMap;
590
591    let tmp = histograms.clone();
592    let mut new_index: HashMap<u32, u32> = HashMap::new();
593
594    // Previous histograms keep their indices
595    for i in 0..prev_count {
596        new_index.insert(i as u32, i as u32);
597    }
598
599    // Assign new indices in order of first appearance
600    let mut next_index = prev_count as u32;
601    for &symbol in symbols.iter() {
602        if let std::collections::hash_map::Entry::Vacant(e) = new_index.entry(symbol) {
603            e.insert(next_index);
604            histograms[next_index as usize] = tmp[symbol as usize].clone();
605            next_index += 1;
606        }
607    }
608
609    histograms.truncate(next_index as usize);
610
611    // Update symbols
612    for symbol in symbols.iter_mut() {
613        *symbol = new_index[symbol];
614    }
615}
616
617/// Full clustering pipeline.
618///
619/// Combines fast clustering with optional pair merge refinement.
620///
621/// # Arguments
622///
623/// * `clustering_type` - Controls clustering aggressiveness (Fastest/Fast/Best)
624/// * `entropy_type` - Controls cost model for merge decisions (Huffman/Ans)
625/// * `input` - Input histograms to cluster
626/// * `max_histograms` - Maximum number of output clusters
627pub fn cluster_histograms(
628    clustering_type: ClusteringType,
629    entropy_type: EntropyType,
630    input: &[Histogram],
631    max_histograms: usize,
632) -> Result<ClusterResult> {
633    let max_histograms = match clustering_type {
634        ClusteringType::Fastest => max_histograms.min(4),
635        _ => max_histograms,
636    };
637
638    let max_histograms = max_histograms.min(input.len()).min(CLUSTERS_LIMIT);
639
640    // Fast clustering
641    let mut result = fast_cluster_histograms(input, max_histograms)?;
642
643    // Pair merge refinement for Best quality
644    if clustering_type == ClusteringType::Best && !result.histograms.is_empty() {
645        refine_clusters_by_merging(&mut result.histograms, &mut result.symbols, entropy_type)?;
646    }
647
648    // Reindex for canonical form
649    histogram_reindex(&mut result.histograms, 0, &mut result.symbols);
650
651    Ok(result)
652}
653
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a histogram from raw counts with its entropy pre-computed,
    /// matching the state the clustering code expects.
    fn make_histogram(counts: &[i32]) -> Histogram {
        let h = Histogram::from_counts(counts);
        h.shannon_entropy(); // Pre-compute entropy
        h
    }

    #[test]
    fn test_fast_cluster_single() {
        let input = vec![make_histogram(&[100, 50, 25])];

        let result = fast_cluster_histograms(&input, 10).unwrap();

        // A lone histogram forms exactly one cluster, mapped to symbol 0.
        assert_eq!(result.histograms.len(), 1);
        assert_eq!(result.symbols.len(), 1);
        assert_eq!(result.symbols[0], 0);
    }

    #[test]
    fn test_fast_cluster_identical() {
        let input = vec![
            make_histogram(&[100, 50, 25]),
            make_histogram(&[100, 50, 25]),
            make_histogram(&[100, 50, 25]),
        ];

        let result = fast_cluster_histograms(&input, 10).unwrap();

        // All identical histograms should cluster together
        assert_eq!(result.histograms.len(), 1);
        assert_eq!(result.symbols, vec![0, 0, 0]);
    }

    #[test]
    fn test_fast_cluster_different() {
        // Disjoint single-symbol histograms: maximally dissimilar inputs.
        let input = vec![
            make_histogram(&[100, 0, 0]),
            make_histogram(&[0, 100, 0]),
            make_histogram(&[0, 0, 100]),
        ];

        let result = fast_cluster_histograms(&input, 10).unwrap();

        // Very different histograms should be in separate clusters
        assert!(result.histograms.len() >= 2);
        // Each histogram should be assigned to some cluster
        assert!(
            result
                .symbols
                .iter()
                .all(|&s| (s as usize) < result.histograms.len())
        );
    }

    #[test]
    fn test_fast_cluster_max_limit() {
        // Ten mutually disjoint histograms, but only 4 clusters allowed.
        let input: Vec<Histogram> = (0..10)
            .map(|i| {
                let mut counts = vec![0i32; 10];
                counts[i] = 100;
                make_histogram(&counts)
            })
            .collect();

        let result = fast_cluster_histograms(&input, 4).unwrap();

        // Should not exceed max limit
        assert!(result.histograms.len() <= 4);
    }

    #[test]
    fn test_fast_cluster_empty() {
        let input: Vec<Histogram> = vec![];
        let result = fast_cluster_histograms(&input, 10).unwrap();

        // Empty input with no prev histograms yields an empty result.
        assert!(result.histograms.is_empty());
        assert!(result.symbols.is_empty());
    }

    #[test]
    fn test_fast_cluster_with_empty_histograms() {
        let input = vec![
            Histogram::new(), // Empty
            make_histogram(&[100, 50]),
            Histogram::new(), // Empty
        ];

        let result = fast_cluster_histograms(&input, 10).unwrap();

        // Empty histograms should be assigned to cluster 0
        assert!(!result.histograms.is_empty());
        assert_eq!(result.symbols[0], 0);
        assert_eq!(result.symbols[2], 0);
    }

    #[test]
    fn test_cluster_histograms_fastest() {
        let input: Vec<Histogram> = (0..10)
            .map(|i| {
                let mut counts = vec![0i32; 10];
                counts[i] = 100;
                make_histogram(&counts)
            })
            .collect();

        let result =
            cluster_histograms(ClusteringType::Fastest, EntropyType::Huffman, &input, 10).unwrap();

        // Fastest should limit to 4 clusters
        assert!(result.histograms.len() <= 4);
    }

    #[test]
    fn test_cluster_histograms_best_merges_huffman() {
        // Create two pairs of similar histograms
        let input = vec![
            make_histogram(&[100, 50, 25, 10]),
            make_histogram(&[105, 52, 23, 11]), // Similar to 0
            make_histogram(&[10, 25, 50, 100]),
            make_histogram(&[11, 23, 52, 105]), // Similar to 2
        ];

        let result =
            cluster_histograms(ClusteringType::Best, EntropyType::Huffman, &input, 10).unwrap();

        // With best quality, similar histograms should be merged
        assert!(result.histograms.len() <= 4);
    }

    #[test]
    fn test_cluster_histograms_best_merges_ans() {
        // Create two pairs of similar histograms
        let input = vec![
            make_histogram(&[100, 50, 25, 10]),
            make_histogram(&[105, 52, 23, 11]), // Similar to 0
            make_histogram(&[10, 25, 50, 100]),
            make_histogram(&[11, 23, 52, 105]), // Similar to 2
        ];

        let result =
            cluster_histograms(ClusteringType::Best, EntropyType::Ans, &input, 10).unwrap();

        // With best quality, similar histograms should be merged
        assert!(result.histograms.len() <= 4);
    }

    #[test]
    fn test_huffman_vs_ans_cost_model() {
        // Histogram with many symbols - ANS and Huffman should have different costs
        let mut counts = vec![0i32; 64];
        for (i, c) in counts.iter_mut().enumerate() {
            *c = (64 - i as i32) * 10; // Decreasing frequencies
        }
        let h = make_histogram(&counts);

        let huffman_cost = huffman_population_cost(&h);
        let ans_cost = ans_population_cost(&h);

        // Both should be positive
        assert!(huffman_cost > 0.0);
        assert!(ans_cost > 0.0);

        // For large alphabets, ANS header cost (5 bits/symbol) should be higher
        // than Huffman's nested tree (~3 bits/symbol + 30 bit overhead)
        // This test just verifies they're different - actual values depend on distribution
        assert!((huffman_cost - ans_cost).abs() > 1.0);
    }

    #[test]
    fn test_histogram_reindex() {
        let mut histograms = vec![
            make_histogram(&[100]),
            make_histogram(&[200]),
            make_histogram(&[300]),
        ];
        let mut symbols = vec![2, 0, 2, 1, 0];

        histogram_reindex(&mut histograms, 0, &mut symbols);

        // Symbols should now be renumbered in order of first appearance
        // 2 -> 0, 0 -> 1, 1 -> 2
        assert_eq!(symbols, vec![0, 1, 0, 2, 1]);
    }
}
841
/// Merging histograms with disjoint symbol support must not look beneficial
/// to the Huffman cost model: the merged tree lengthens every code.
// Gated explicitly with #[cfg(test)] for consistency with the `tests`
// module above (this test sits at file scope, outside that module).
#[cfg(test)]
#[test]
fn test_huffman_cost_disjoint_histograms() {
    // Disjoint histograms - merging should NOT be beneficial
    let a = Histogram::from_counts(&[100, 50, 25, 0, 0, 0, 0, 0]);
    a.shannon_entropy();

    let b = Histogram::from_counts(&[0, 0, 0, 80, 40, 20, 0, 0]);
    b.shannon_entropy();

    let mut merged = a.clone();
    merged.add_histogram(&b);
    merged.shannon_entropy();

    let cost_a = huffman_population_cost(&a);
    let cost_b = huffman_population_cost(&b);
    let cost_merged = huffman_population_cost(&merged);
    let delta = cost_merged - cost_a - cost_b;

    // Disjoint histograms: merging increases data cost significantly
    assert!(delta >= 0.0, "Disjoint merge should not be beneficial");
}
863
/// Merging two identical histograms must be cost-neutral: both use the same
/// Huffman tree, so the merged cost equals the sum of the individual costs.
// Gated explicitly with #[cfg(test)] for consistency with the `tests`
// module above (this test sits at file scope, outside that module).
#[cfg(test)]
#[test]
fn test_huffman_cost_identical_histograms() {
    // Identical histograms - merging should have near-zero delta
    let a = Histogram::from_counts(&[100, 50, 25, 10, 0, 0, 0, 0]);
    a.shannon_entropy();

    let b = Histogram::from_counts(&[100, 50, 25, 10, 0, 0, 0, 0]);
    b.shannon_entropy();

    let mut merged = a.clone();
    merged.add_histogram(&b);
    merged.shannon_entropy();

    let cost_a = huffman_population_cost(&a);
    let cost_b = huffman_population_cost(&b);
    let cost_merged = huffman_population_cost(&merged);
    let delta = cost_merged - cost_a - cost_b;

    // Identical histograms use same Huffman tree, so merged cost = 2x individual
    assert!(
        delta.abs() < 1.0,
        "Identical histograms should have near-zero delta, got {}",
        delta
    );
}