Skip to main content

jxl_encoder/vardct/
encoder.rs

1// Copyright (c) Imazen LLC and the JPEG XL Project Authors.
2// Algorithms and constants derived from libjxl (BSD-3-Clause).
3// Licensed under AGPL-3.0-or-later. Commercial licenses at https://www.imazen.io/pricing
4
5//! Main tiny encoder implementation.
6
7use super::ac_strategy::{
8    AcStrategyMap, adjust_quant_field_float_with_distance, adjust_quant_field_with_distance,
9    compute_ac_strategy,
10};
11use super::adaptive_quant::{compute_mask1x1, compute_quant_field_float, quantize_quant_field};
12use super::chroma_from_luma::{CflMap, compute_cfl_map};
13use super::common::*;
14use super::frame::{DistanceParams, write_toc};
15use super::gaborish::gaborish_inverse;
16use super::noise::{denoise_xyb, estimate_noise_params, noise_quality_coef};
17use super::static_codes::{get_ac_entropy_code, get_dc_entropy_code};
18use crate::bit_writer::BitWriter;
19#[cfg(feature = "debug-tokens")]
20use crate::debug_log;
21use crate::debug_rect;
22use crate::error::Result;
23use crate::headers::frame_header::FrameHeader;
24
25// Re-export types from entropy_code sub-module.
26pub(crate) use super::entropy_code::{BuiltEntropyCode, force_strategy_map};
27
/// Output of a VarDCT encode operation.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so callers can log, copy and
/// compare encode results (for example in regression tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VarDctOutput {
    /// Encoded JXL codestream bytes.
    pub data: Vec<u8>,
    /// Per-strategy first-block counts, indexed by raw strategy code.
    ///
    /// The array has 19 slots, so valid indices are `0..19` (codes 0 through 18).
    pub strategy_counts: [u32; 19],
}
35
/// Tiny JPEG XL encoder.
///
/// This is a simplified VarDCT encoder based on libjxl-tiny that uses:
/// - Only DCT8, DCT8x16, DCT16x8 transforms
/// - Huffman or ANS entropy coding
/// - Default zig-zag coefficient order
/// - Fixed context tree for DC
///
/// NOTE(review): the list above describes the libjxl-tiny baseline. Several
/// fields below (`custom_orders`, `force_strategy`, `dc_tree_learning`) describe
/// behavior beyond it, so the list may be stale — confirm against the encoder.
///
/// All fields are public configuration knobs. `Default::default()` and
/// [`VarDctEncoder::new`] populate them with the stock lossy settings.
pub struct VarDctEncoder {
    /// Target distance (quality). 1.0 = visually lossless.
    pub distance: f32,
    /// Effort level (1–10). Controls AC strategy gating and search depth.
    pub effort: u8,
    /// Centralized effort-derived decisions. All effort-gated constants and
    /// thresholds are read from this profile instead of inline `if effort >= N`.
    pub profile: crate::effort::EffortProfile,
    /// Use dynamic Huffman codes built from actual token frequencies.
    /// When true (default), uses a two-pass mode: collect tokens first, build optimal codes, then write.
    /// When false, uses pre-computed static codes (streaming, single-pass).
    pub optimize_codes: bool,
    /// Use enhanced histogram clustering with pair merge refinement.
    /// Only effective when `optimize_codes` is true.
    ///
    /// Note: The enhanced clustering algorithm was designed for ANS entropy coding
    /// and may not provide benefits (or may slightly increase size) when used with
    /// Huffman coding. This option is experimental.
    pub enhanced_clustering: bool,
    /// Use ANS entropy coding instead of Huffman.
    /// Only effective when `optimize_codes` is true (requires two-pass mode).
    /// ANS typically produces 5-10% smaller files than Huffman.
    pub use_ans: bool,
    /// Enable chroma-from-luma (CfL) optimization.
    /// When true (default), computes per-tile ytox/ytob values via least-squares fitting.
    /// When false, uses ytox=0, ytob=0 (no chroma decorrelation).
    pub cfl_enabled: bool,
    /// Enable adaptive AC strategy selection (DCT8/DCT16x8/DCT8x16).
    /// When true (default), selects the best transform size per 16x16 block region.
    /// When false, uses DCT8 for all blocks.
    pub ac_strategy_enabled: bool,
    /// Enable custom coefficient ordering.
    /// When true (default when optimize_codes is true), reorders AC coefficients
    /// so frequently-zero positions appear last, reducing bitstream size.
    /// Only effective when `optimize_codes` is true (requires two-pass mode).
    pub custom_orders: bool,
    /// Force a specific AC strategy for all blocks (for testing).
    /// When Some(strategy), uses that raw strategy code for all blocks that fit.
    /// None (default) uses normal strategy selection based on `ac_strategy_enabled`.
    pub force_strategy: Option<u8>,
    /// Enable noise synthesis.
    /// When true, estimates noise parameters from the image and encodes them
    /// in the frame header. The decoder regenerates noise during rendering.
    /// Off by default (matching libjxl's default).
    pub enable_noise: bool,
    /// Enable Wiener denoising pre-filter (requires `enable_noise`).
    /// When true, applies a conservative Wiener filter to remove estimated noise
    /// before encoding. The decoder re-adds noise from the encoded parameters.
    /// Provides 1-8% file size savings with near-zero Butteraugli quality impact.
    /// Off by default (libjxl does not have a denoising pre-filter).
    pub enable_denoise: bool,
    /// Enable gaborish inverse pre-filter.
    /// When true (default), applies a 5x5 sharpening kernel to XYB before DCT
    /// and signals gab=1 in the frame header. The decoder applies a 3x3 blur
    /// to compensate, reducing blocking artifacts.
    /// Matches the libjxl VarDCT encoder default.
    pub enable_gaborish: bool,
    /// Enable error diffusion in AC quantization.
    /// When true, spreads quantization error to neighboring coefficients in
    /// zigzag order, helping preserve smooth gradients at high compression.
    /// Off by default (modest quality improvement, slight performance cost).
    pub error_diffusion: bool,
    /// Enable pixel-domain loss calculation in AC strategy selection.
    /// When true (default), uses full libjxl's pixel-domain loss model (IDCT error,
    /// per-pixel masking, 8th power norm). This provides better distance
    /// calibration matching cjxl's output.
    /// When false, uses coefficient-domain loss (libjxl-tiny style).
    /// Both `Default::default()` and `VarDctEncoder::new` set this to `true`.
    /// Note: Requires `ac_strategy_enabled` to have any effect.
    pub pixel_domain_loss: bool,
    /// Enable LZ77 backward references in entropy coding.
    /// When true, compresses token streams using LZ77 length+distance tokens.
    /// Only effective with two-pass mode (optimize_codes=true) and ANS (use_ans=true).
    /// Off by default — works for most cases but has known interactions with certain
    /// forced strategy combinations (DCT2x2, IDENTITY) that cause InvalidAnsStream.
    pub enable_lz77: bool,
    /// LZ77 method to use when enable_lz77 is true.
    ///
    /// - `Rle`: Only matches consecutive identical values (fast, limited on photos)
    /// - `Greedy`: Hash chain backward references (slower, 1-3% better on photos)
    ///
    /// Default: `Greedy` (best compression)
    pub lz77_method: crate::entropy_coding::lz77::Lz77Method,
    /// Enable DC tree learning.
    /// When true, learns an optimal context tree for DC coding from image content
    /// instead of using the fixed GRADIENT_CONTEXT_LUT.
    /// **DISABLED/BROKEN**: The learned tree doesn't correctly route AC metadata
    /// samples to contexts 0-10. Fixing requires parsing the static tree structure
    /// and splicing in the learned DC subtree while preserving AC metadata routing.
    /// Expected gain (~1.2% overall) doesn't justify the complexity. See CLAUDE.md.
    pub dc_tree_learning: bool,
    /// Number of butteraugli quantization loop iterations.
    /// When > 0, iteratively refines the per-block quant field using butteraugli
    /// perceptual distance feedback. Each iteration: encode → reconstruct → measure
    /// → adjust quant_field. AC strategy is kept fixed; only quant_field changes.
    ///
    /// libjxl uses 2 iterations at effort 8, 4 at effort 9.
    /// Requires the `butteraugli-loop` feature.
    ///
    /// Default: 0 (disabled)
    #[cfg(feature = "butteraugli-loop")]
    pub butteraugli_iters: u32,
    /// Number of SSIM2 quantization loop iterations.
    /// Alternative to butteraugli loop: uses per-block linear RGB RMSE + full-image SSIM2.
    /// Requires the `ssim2-loop` feature.
    ///
    /// Default: 0 (disabled)
    #[cfg(feature = "ssim2-loop")]
    pub ssim2_iters: u32,
    /// Number of zensim quantization loop iterations.
    /// Alternative to butteraugli loop: uses zensim's psychovisual metric for both
    /// global quality tracking and per-pixel spatial error map (diffmap in XYB space).
    /// Also refines AC strategy by splitting large transforms with high perceptual error.
    /// Requires the `zensim-loop` feature.
    ///
    /// Default: 0 (disabled)
    #[cfg(feature = "zensim-loop")]
    pub zensim_iters: u32,
    /// Whether the input has 16-bit samples. When true, the file header signals
    /// bit_depth=16 instead of 8. The actual VarDCT encoding is the same (XYB
    /// is always f32 internally), but the decoder uses this to reconstruct at
    /// the correct output bit depth.
    pub bit_depth_16: bool,
    /// ICC profile to embed in the codestream.
    /// When Some, writes has_icc=1 and encodes the profile after the file header.
    pub icc_profile: Option<Vec<u8>>,
    /// Enable patches (dictionary-based repeated pattern detection).
    /// When true, detects repeated rectangular elements (text glyphs, buttons, icons)
    /// and stores unique patterns once in a reference frame. Huge wins on screenshots.
    /// On by default for lossy encoding.
    pub enable_patches: bool,
    /// Encoder mode: Reference (match libjxl) or Experimental (own improvements).
    pub encoder_mode: crate::api::EncoderMode,
    /// Manual splines to overlay on the image (opt-in, None by default).
    pub splines: Option<Vec<crate::vardct::splines::Spline>>,
    /// Whether the input is grayscale. When true, the file header signals
    /// ColorSpace::Gray instead of RGB. VarDCT still operates in XYB (3 channels)
    /// internally — this only affects the output colorspace the decoder targets.
    pub is_grayscale: bool,
    /// Progressive encoding mode (Single, QuantizedAcFullAc, DcVlfLfAc).
    /// When not Single, AC coefficients are split across multiple passes with
    /// shift-based precision reduction for early preview rendering.
    pub progressive: crate::api::ProgressiveMode,
    /// Enable LfFrame (separate DC frame).
    /// When true, DC coefficients are encoded as a separate modular frame
    /// (frame_type=1, dc_level=1) before the main VarDCT frame, with
    /// distance-scaled quantization factors matching libjxl's progressive_dc >= 1.
    pub use_lf_frame: bool,
    /// Custom gamma (encoding exponent) from source image.
    /// When Some, writes have_gamma=true in the JXL header and uses gamma
    /// linearization instead of sRGB TF. Example: 0.45455 for gamma 2.2.
    pub source_gamma: Option<f32>,
    /// Explicit color encoding override for the JXL header.
    /// When Some, this is used instead of deriving from source_gamma / defaults.
    /// Allows signaling HDR (PQ, HLG) or non-sRGB primaries (BT.2020, P3).
    pub color_encoding: Option<crate::headers::color_encoding::ColorEncoding>,
    /// Peak display luminance in nits for ToneMapping. Default 255.0 (SDR).
    pub intensity_target: f32,
    /// Minimum display luminance in nits for ToneMapping. Default 0.0.
    pub min_nits: f32,
    /// Intrinsic display size `(width, height)`, if different from coded dimensions.
    pub intrinsic_size: Option<(u32, u32)>,
}
205
206impl Default for VarDctEncoder {
207    fn default() -> Self {
208        Self {
209            distance: 1.0,
210            effort: 7,
211            profile: crate::effort::EffortProfile::lossy(7, crate::api::EncoderMode::Reference),
212            optimize_codes: true,
213            enhanced_clustering: true, // Profile-driven: e9+ for Best, Fast otherwise
214            use_ans: true,             // ANS produces 4-10% smaller files than Huffman
215            cfl_enabled: true,
216            ac_strategy_enabled: true,
217            custom_orders: true,
218            force_strategy: None,
219            enable_noise: false,
220            enable_denoise: false,
221            enable_gaborish: true,
222            error_diffusion: false, // libjxl accepts param but never uses it in QuantizeBlockAC
223            pixel_domain_loss: true, // Full libjxl pixel-domain loss: +0.2-1.9 SSIM2 at all distances
224            enable_lz77: false,      // LZ77 has known interactions with DCT2x2/IDENTITY strategies
225            lz77_method: crate::entropy_coding::lz77::Lz77Method::Greedy, // Best compression
226            dc_tree_learning: false, // DC tree learning (experimental)
227            #[cfg(feature = "butteraugli-loop")]
228            butteraugli_iters: 0, // Effort-gated: default off (effort 7). Set via LossyConfig.
229            #[cfg(feature = "ssim2-loop")]
230            ssim2_iters: 0, // Off by default. Set via LossyConfig.
231            #[cfg(feature = "zensim-loop")]
232            zensim_iters: 0, // Off by default. Set via LossyConfig.
233            bit_depth_16: false,
234            icc_profile: None,
235            enable_patches: true, // Patches: huge wins on screenshots, zero cost on photos
236            encoder_mode: crate::api::EncoderMode::Reference,
237            splines: None,
238            is_grayscale: false,
239            progressive: crate::api::ProgressiveMode::Single,
240            use_lf_frame: false,
241            source_gamma: None,
242            color_encoding: None,
243            intensity_target: 255.0,
244            min_nits: 0.0,
245            intrinsic_size: None,
246        }
247    }
248}
249
250impl VarDctEncoder {
251    /// Create a new tiny encoder with the given distance.
252    pub fn new(distance: f32) -> Self {
253        Self {
254            distance,
255            effort: 7,
256            profile: crate::effort::EffortProfile::lossy(7, crate::api::EncoderMode::Reference),
257            optimize_codes: true,
258            enhanced_clustering: true, // Profile-driven: e9+ for Best, Fast otherwise
259            use_ans: true,             // ANS produces 4-10% smaller files than Huffman
260            cfl_enabled: true,
261            ac_strategy_enabled: true,
262            custom_orders: true,
263            force_strategy: None,
264            enable_noise: false,
265            enable_denoise: false,
266            enable_gaborish: true,
267            error_diffusion: false, // libjxl accepts param but never uses it in QuantizeBlockAC
268            pixel_domain_loss: true, // Full libjxl pixel-domain loss: +0.2-1.9 SSIM2
269            enable_lz77: false,     // LZ77 has known interactions with DCT2x2/IDENTITY strategies
270            lz77_method: crate::entropy_coding::lz77::Lz77Method::Greedy, // Best compression
271            dc_tree_learning: false, // DC tree learning (experimental)
272            #[cfg(feature = "butteraugli-loop")]
273            butteraugli_iters: 0, // Effort-gated: default off (effort 7). Set via LossyConfig.
274            #[cfg(feature = "ssim2-loop")]
275            ssim2_iters: 0, // Off by default. Set via LossyConfig.
276            #[cfg(feature = "zensim-loop")]
277            zensim_iters: 0, // Off by default. Set via LossyConfig.
278            bit_depth_16: false,
279            icc_profile: None,
280            enable_patches: true, // Patches: huge wins on screenshots, zero cost on photos
281            encoder_mode: crate::api::EncoderMode::Reference,
282            splines: None,
283            is_grayscale: false,
284            progressive: crate::api::ProgressiveMode::Single,
285            use_lf_frame: false,
286            source_gamma: None,
287            color_encoding: None,
288            intensity_target: 255.0,
289            min_nits: 0.0,
290            intrinsic_size: None,
291        }
292    }
293
294    /// Encode an image in linear sRGB format, optionally with an alpha channel.
295    ///
296    /// Input should be 3 channels (RGB) of f32 values in [0, 1] range.
297    /// Values outside [0, 1] are allowed for out-of-gamut colors.
298    ///
299    /// If `alpha` is provided, it must be `width * height` bytes of u8 alpha values.
300    /// Alpha is encoded as a modular extra channel alongside the VarDCT RGB data.
301    pub fn encode(
302        &self,
303        width: usize,
304        height: usize,
305        linear_rgb: &[f32],
306        alpha: Option<&[u8]>,
307    ) -> Result<VarDctOutput> {
308        assert_eq!(linear_rgb.len(), width * height * 3);
309        if let Some(a) = alpha {
310            assert_eq!(a.len(), width * height);
311        }
312
313        crate::debug_rect::clear();
314
315        // Calculate dimensions
316        let xsize_blocks = div_ceil(width, BLOCK_DIM);
317        let ysize_blocks = div_ceil(height, BLOCK_DIM);
318        let xsize_groups = div_ceil(width, GROUP_DIM);
319        let ysize_groups = div_ceil(height, GROUP_DIM);
320        let xsize_dc_groups = div_ceil(width, DC_GROUP_DIM);
321        let ysize_dc_groups = div_ceil(height, DC_GROUP_DIM);
322        let num_groups = xsize_groups * ysize_groups;
323        let num_dc_groups = xsize_dc_groups * ysize_dc_groups;
324
325        // Number of sections: DC global + DC groups + AC global + AC groups
326        let num_sections = 2 + num_dc_groups + num_groups;
327
328        // Pad to block boundary dimensions
329        let padded_width = xsize_blocks * BLOCK_DIM;
330        let padded_height = ysize_blocks * BLOCK_DIM;
331
332        // Convert to XYB with edge-replicated padding to block boundaries.
333        // This allows SIMD to process full blocks without bounds checking.
334        let (mut xyb_x, mut xyb_y, mut xyb_b) =
335            self.convert_to_xyb_padded(width, height, padded_width, padded_height, linear_rgb);
336
337        // Estimate noise parameters (if enabled).
338        // The decoder adds noise during rendering; the encoder just encodes the params.
339        let noise_params = if self.enable_noise {
340            let quality_coef = noise_quality_coef(self.distance);
341            let params = estimate_noise_params(
342                &xyb_x,
343                &xyb_y,
344                &xyb_b,
345                padded_width,
346                padded_height,
347                quality_coef,
348            );
349
350            // Apply denoising pre-filter if enabled and noise was detected.
351            // Removes estimated noise before encoding so the encoder spends fewer
352            // bits on noise; the decoder re-adds it from the encoded parameters.
353            if self.enable_denoise
354                && let Some(ref p) = params
355            {
356                denoise_xyb(
357                    &mut xyb_x,
358                    &mut xyb_y,
359                    &mut xyb_b,
360                    padded_width,
361                    padded_height,
362                    p,
363                    quality_coef,
364                );
365            }
366
367            params
368        } else {
369            None
370        };
371
372        // Detect and subtract patches (before gaborish, after noise).
373        // Patches work in the XYB domain: detect repeated rectangular elements,
374        // store unique patterns in a reference frame, subtract from image.
375        let mut patches_data = if self.enable_patches {
376            super::patches::find_and_build([&xyb_x, &xyb_y, &xyb_b], width, height, padded_width)
377        } else {
378            None
379        };
380        // Cost-benefit gating for experimental mode only.
381        // libjxl uses patches unconditionally when detected (no cost check),
382        // so reference mode skips this to match.
383        if matches!(self.encoder_mode, crate::api::EncoderMode::Experimental)
384            && let Some(ref pd) = patches_data
385            && !pd.is_cost_effective(self.distance, self.use_ans)
386        {
387            patches_data = None;
388        }
389        // Quantize ref_image so subtract/add use the same values the decoder will reconstruct.
390        if let Some(ref mut pd) = patches_data {
391            pd.quantize_ref_image();
392        }
393        if let Some(ref pd) = patches_data {
394            let mut xyb = [
395                core::mem::take(&mut xyb_x),
396                core::mem::take(&mut xyb_y),
397                core::mem::take(&mut xyb_b),
398            ];
399            super::patches::subtract_patches(&mut xyb, padded_width, pd);
400            let [x, y, b] = xyb;
401            xyb_x = x;
402            xyb_y = y;
403            xyb_b = b;
404        }
405
406        // Build and subtract splines (after patches, before gaborish).
407        // Splines are additive overlays: encoder subtracts, decoder adds back.
408        // Uses default DC CfL params (y_to_x=0.0, y_to_b=1.0) since we write default DC cmap.
409        let splines_data = if let Some(ref splines) = self.splines {
410            if !splines.is_empty() {
411                let sd = super::splines::SplinesData::from_splines(
412                    splines.clone(),
413                    0,   // quantization_adjustment
414                    0.0, // y_to_x (default DC CfL)
415                    1.0, // y_to_b (default DC CfL)
416                    width,
417                    height,
418                );
419                {
420                    let mut xyb = [
421                        core::mem::take(&mut xyb_x),
422                        core::mem::take(&mut xyb_y),
423                        core::mem::take(&mut xyb_b),
424                    ];
425                    super::splines::subtract_splines(&mut xyb, padded_width, width, height, &sd);
426                    let [x, y, b] = xyb;
427                    xyb_x = x;
428                    xyb_y = y;
429                    xyb_b = b;
430                }
431                Some(sd)
432            } else {
433                None
434            }
435        } else {
436            None
437        };
438
439        // Compute pixel chromacity stats BEFORE gaborish (matching libjxl pipeline).
440        // Gaborish sharpening inflates gradients, producing overly aggressive adjustment.
441        // Gated at effort >= 7 to skip the full-image gradient scan at low effort.
442        let (chromacity_x, chromacity_b) = if self.profile.chromacity_adjustment {
443            let pixel_stats = super::frame::PixelStatsForChromacityAdjustment::calc(
444                &xyb_x,
445                &xyb_y,
446                &xyb_b,
447                padded_width,
448                padded_height,
449            );
450            (
451                pixel_stats.how_much_is_x_channel_pixelized(),
452                pixel_stats.how_much_is_b_channel_pixelized(),
453            )
454        } else {
455            (0, 0)
456        };
457
458        // Compute adaptive per-block quantization field and masking on ORIGINAL
459        // (pre-gaborish) XYB. libjxl computes InitialQuantField before GaborishInverse
460        // (enc_heuristics.cc:1117-1142, comment: "relies on pre-gaborish values").
461        // When gaborish is off, scale distance by 0.62 for the quant field only
462        // (not global_scale/quant_dc). This matches libjxl enc_heuristics.cc:1119.
463        let distance_for_iqf = if self.enable_gaborish {
464            self.distance
465        } else {
466            self.distance * 0.62
467        };
468
469        // Step 1: Compute float quant field on pre-gaborish XYB.
470        //
471        // libjxl effort gating (enc_heuristics.cc:1097-1128):
472        // - effort < 5 (speed_tier > kHare): flat quant field = q_numerator/distance
473        // - effort >= 5 (speed_tier <= kHare): adaptive via InitialQuantField
474        let (mut quant_field_float, masking) = if self.profile.use_adaptive_quant {
475            compute_quant_field_float(
476                &xyb_x,
477                &xyb_y,
478                &xyb_b,
479                padded_width,
480                padded_height,
481                xsize_blocks,
482                ysize_blocks,
483                distance_for_iqf,
484                self.profile.k_ac_quant,
485            )
486        } else {
487            // Flat quant field for low effort (matches libjxl enc_heuristics.cc:1105-1106)
488            let q = self.profile.initial_q_numerator / self.distance;
489            let flat_qf = vec![q; xsize_blocks * ysize_blocks];
490            let masking_val = 1.0 / (q + 0.001);
491            let flat_masking = vec![masking_val; xsize_blocks * ysize_blocks];
492            (flat_qf, flat_masking)
493        };
494
495        // Step 2: Compute distance params with effort-matched global_scale.
496        //
497        // Uses profile.initial_q_numerator for q = numerator / distance.
498        // The adaptive median/MAD formula is only used inside the butteraugli
499        // loop (effort >= 8).
500        let mut params = DistanceParams::compute_for_profile(self.distance, &self.profile);
501
502        // Apply pixel-level chromacity adjustments using pre-gaborish stats
503        // Gated at effort >= 7 (speed_tier <= kSquirrel) matching libjxl
504        if self.profile.chromacity_adjustment {
505            params.apply_chromacity_adjustment(chromacity_x, chromacity_b);
506        }
507
508        debug_rect!(
509            "enc/params",
510            0,
511            0,
512            width,
513            height,
514            "global_scale={} quant_dc={} scale={:.4} inv_scale={:.4} epf_iters={} chrom_x={:.3} chrom_b={:.3}",
515            params.global_scale,
516            params.quant_dc,
517            params.scale,
518            params.inv_scale,
519            params.epf_iters,
520            chromacity_x,
521            chromacity_b
522        );
523
524        // Step 3: Quantize float quant field to raw u8 with adaptive inv_scale
525        let mut quant_field = quantize_quant_field(&quant_field_float, params.inv_scale);
526
527        // Compute per-pixel mask on PRE-GABORISH image (matches libjxl:
528        // initial_quant_masking1x1 is computed in InitialQuantField before GaborishInverse)
529        let mask1x1 = if self.ac_strategy_enabled && self.pixel_domain_loss {
530            Some(compute_mask1x1(&xyb_y, padded_width, padded_height))
531        } else {
532            None
533        };
534
535        // Apply gaborish inverse (5x5 sharpening) AFTER quant field and mask1x1
536        // but BEFORE CfL and AC strategy. This matches libjxl enc_heuristics.cc:
537        //   line 1124: InitialQuantField (pre-gaborish)
538        //   line 1142: GaborishInverse
539        //   line 1150-1174: CfL (post-gaborish)
540        //   line 1179: AC strategy (post-gaborish)
541        if self.enable_gaborish {
542            gaborish_inverse(
543                &mut xyb_x,
544                &mut xyb_y,
545                &mut xyb_b,
546                padded_width,
547                padded_height,
548            );
549        }
550
551        // Float DC for LfFrame is now extracted from the transform pipeline
552        // (TransformOutput.float_dc) using dc_from_dct_NxN, which produces correct
553        // DC values for multi-block transforms (DCT16+). The old compute_float_dc
554        // used simple 8x8 pixel averages which diverge from dc_from_dct_NxN for
555        // blocks with spatial structure, causing catastrophic LfFrame quality for
556        // DCT16+ (up to 31% error on gradient content, butteraugli 13-20 vs ~2.5).
557
558        // Compute per-tile chroma-from-luma map on GABORISHED XYB
559        // Pass 1 always uses LS (use_newton=false): with distance_mul=1e-9, the
560        // perceptual cost function collapses to LS, so Newton adds no value.
561        // Newton is only useful in pass 2 where actual quant weighting matters.
562        let mut cfl_map = if self.cfl_enabled {
563            compute_cfl_map(
564                &xyb_x,
565                &xyb_y,
566                &xyb_b,
567                padded_width,
568                padded_height,
569                xsize_blocks,
570                ysize_blocks,
571                false,
572                self.profile.cfl_newton_eps,
573                self.profile.cfl_newton_max_iters,
574            )
575        } else {
576            CflMap::zeros(
577                div_ceil(xsize_blocks, TILE_DIM_IN_BLOCKS),
578                div_ceil(ysize_blocks, TILE_DIM_IN_BLOCKS),
579            )
580        };
581
582        debug_rect!(
583            "enc/config",
584            0,
585            0,
586            width,
587            height,
588            "d={:.2} gab={} cfl={} pixel_loss={} patches={} bfly_iters={} noise={} denoise={} ac_strat={} err_diff={}",
589            self.distance,
590            self.enable_gaborish,
591            self.cfl_enabled,
592            self.pixel_domain_loss,
593            self.enable_patches,
594            self.profile.butteraugli_iters,
595            self.enable_noise,
596            self.enable_denoise,
597            self.ac_strategy_enabled,
598            self.error_diffusion
599        );
600
601        // Compute adaptive AC strategy (DCT8/DCT16x8/DCT8x16/DCT16x16/DCT32x32)
602        #[allow(unused_mut)]
603        let mut ac_strategy = if let Some(forced) = self.force_strategy {
604            // Force a specific strategy for all blocks that fit
605            force_strategy_map(xsize_blocks, ysize_blocks, forced)
606        } else if !self.ac_strategy_enabled {
607            AcStrategyMap::new_dct8(xsize_blocks, ysize_blocks)
608        } else {
609            compute_ac_strategy(
610                &xyb_x,
611                &xyb_y,
612                &xyb_b,
613                padded_width,
614                padded_height,
615                xsize_blocks,
616                ysize_blocks,
617                self.distance,
618                &quant_field_float,
619                &masking,
620                &cfl_map,
621                mask1x1.as_deref(),
622                padded_width,
623                &self.profile,
624            )
625        };
626
627        // Debug: print strategy histogram if enabled
628        #[cfg(feature = "debug-ac-strategy")]
629        {
630            eprintln!(
631                "AC strategy mode: {}",
632                if mask1x1.is_some() {
633                    "pixel-domain"
634                } else {
635                    "coefficient-domain"
636                }
637            );
638            ac_strategy.print_histogram();
639        }
640
641        // Log AC strategy distribution
642        {
643            let mut counts = [0u32; 27];
644            for by in 0..ysize_blocks {
645                for bx in 0..xsize_blocks {
646                    if ac_strategy.is_first(bx, by) {
647                        let s = ac_strategy.raw_strategy(bx, by) as usize;
648                        if s < counts.len() {
649                            counts[s] += 1;
650                        }
651                    }
652                }
653            }
654            let total: u32 = counts.iter().sum();
655            // Format top strategies
656            // Names indexed by RAW_STRATEGY_* internal codes (NOT bitstream order)
657            let names = [
658                "DCT8",     // 0 = RAW_STRATEGY_DCT8
659                "DCT16x8",  // 1 = RAW_STRATEGY_DCT16X8
660                "DCT8x16",  // 2 = RAW_STRATEGY_DCT8X16
661                "DCT16x16", // 3 = RAW_STRATEGY_DCT16X16
662                "DCT32x32", // 4 = RAW_STRATEGY_DCT32X32
663                "DCT4x8",   // 5 = RAW_STRATEGY_DCT4X8
664                "DCT8x4",   // 6 = RAW_STRATEGY_DCT8X4
665                "DCT4x4",   // 7 = RAW_STRATEGY_DCT4X4
666                "IDENTITY", // 8 = RAW_STRATEGY_IDENTITY
667                "DCT2x2",   // 9 = RAW_STRATEGY_DCT2X2
668                "DCT32x16", // 10 = RAW_STRATEGY_DCT32X16
669                "DCT16x32", // 11 = RAW_STRATEGY_DCT16X32
670                "AFV0",     // 12 = RAW_STRATEGY_AFV0
671                "AFV1",     // 13 = RAW_STRATEGY_AFV1
672                "AFV2",     // 14 = RAW_STRATEGY_AFV2
673                "AFV3",     // 15 = RAW_STRATEGY_AFV3
674                "DCT64x64", // 16 = RAW_STRATEGY_DCT64X64
675                "DCT64x32", // 17 = RAW_STRATEGY_DCT64X32
676                "DCT32x64", // 18 = RAW_STRATEGY_DCT32X64
677            ];
678            let mut parts = alloc::string::String::new();
679            for (i, &c) in counts.iter().enumerate() {
680                if c > 0 {
681                    if !parts.is_empty() {
682                        parts.push(' ');
683                    }
684                    let name = names.get(i).copied().unwrap_or("?");
685                    let pct = c as f32 / total.max(1) as f32 * 100.0;
686                    parts.push_str(&alloc::format!("{}={:.0}%", name, pct));
687                }
688            }
689            debug_rect!(
690                "enc/ac_strategy",
691                0,
692                0,
693                width,
694                height,
695                "total={} {}",
696                total,
697                parts
698            );
699        }
700
701        // Free masking — no longer needed after AC strategy selection.
702        drop(masking);
703
704        // Adjust quant field for multi-block transforms.
705        // At low distances uses max, at high distances blends toward mean for better quality.
706        // Adjust BOTH u8 and float fields (libjxl adjusts float before SetQuantField).
707        adjust_quant_field_with_distance(&ac_strategy, &mut quant_field, self.distance);
708        adjust_quant_field_float_with_distance(&ac_strategy, &mut quant_field_float, self.distance);
709
710        // Quantization loops: iteratively refine quant_field using perceptual
711        // distance feedback. Butteraugli and zensim loops can stack: butteraugli
712        // handles global convergence, zensim adds SSIM-aware spatial fine-tuning.
713        // Works in float quant field domain with per-iteration global_scale
714        // recomputation (matching libjxl FindBestQuantization).
715        #[cfg(feature = "butteraugli-loop")]
716        if self.butteraugli_iters > 0 {
717            let initial_qf_float = quant_field_float.clone();
718            params = self.butteraugli_refine_quant_field(
719                linear_rgb,
720                width,
721                height,
722                &xyb_x,
723                &xyb_y,
724                &xyb_b,
725                padded_width,
726                padded_height,
727                xsize_blocks,
728                ysize_blocks,
729                &params,
730                &mut quant_field,
731                &mut quant_field_float,
732                &initial_qf_float,
733                &cfl_map,
734                &ac_strategy,
735                patches_data.as_ref(),
736                splines_data.as_ref(),
737            );
738        }
739
740        // SSIM2 quantization loop: alternative to butteraugli using SSIM2 + per-block RMSE.
741        #[cfg(feature = "ssim2-loop")]
742        if self.ssim2_iters > 0 {
743            let initial_qf_float = quant_field_float.clone();
744            params = self.ssim2_refine_quant_field(
745                linear_rgb,
746                width,
747                height,
748                &xyb_x,
749                &xyb_y,
750                &xyb_b,
751                padded_width,
752                padded_height,
753                xsize_blocks,
754                ysize_blocks,
755                &params,
756                &mut quant_field,
757                &mut quant_field_float,
758                &initial_qf_float,
759                &cfl_map,
760                &ac_strategy,
761                patches_data.as_ref(),
762                splines_data.as_ref(),
763            );
764        }
765
766        // Zensim quantization loop: uses zensim psychovisual metric + per-pixel diffmap.
767        // Also refines AC strategy by splitting large transforms with high perceptual error.
768        #[cfg(feature = "zensim-loop")]
769        if self.zensim_iters > 0 {
770            let initial_qf_float = quant_field_float.clone();
771            params = self.zensim_refine_quant_field(
772                linear_rgb,
773                width,
774                height,
775                &xyb_x,
776                &xyb_y,
777                &xyb_b,
778                padded_width,
779                padded_height,
780                xsize_blocks,
781                ysize_blocks,
782                &params,
783                &mut quant_field,
784                &mut quant_field_float,
785                &initial_qf_float,
786                &cfl_map,
787                &mut ac_strategy,
788                patches_data.as_ref(),
789                splines_data.as_ref(),
790            );
791        }
792
793        // Free float quant field — no longer needed after loop refinement.
794        drop(quant_field_float);
795
796        // Log quant field statistics after all adjustments
797        {
798            let qf = &quant_field;
799            let sum: u64 = qf.iter().map(|&v| v as u64).sum();
800            let avg = sum as f32 / qf.len() as f32;
801            let min = qf.iter().copied().min().unwrap_or(0);
802            let max = qf.iter().copied().max().unwrap_or(0);
803            debug_rect!(
804                "enc/quant_field",
805                0,
806                0,
807                width,
808                height,
809                "final avg={:.1} min={} max={} blocks={}",
810                avg,
811                min,
812                max,
813                qf.len()
814            );
815        }
816
817        // Dump AC strategy and quant field maps for comparison with libjxl.
818        // Set JXL_DUMP_MAPS=/tmp/prefix to enable. Maps are written as CSV.
819        #[cfg(feature = "debug-rect")]
820        if let Ok(prefix) = std::env::var("JXL_DUMP_MAPS") {
821            use std::io::Write;
822            // AC strategy map
823            if let Ok(mut f) = std::fs::File::create(format!("{prefix}_acs.csv")) {
824                for by in 0..ysize_blocks {
825                    for bx in 0..xsize_blocks {
826                        if bx > 0 {
827                            let _ = write!(f, ",");
828                        }
829                        let _ = write!(f, "{}", ac_strategy.raw_strategy(bx, by));
830                    }
831                    let _ = writeln!(f);
832                }
833                eprintln!("DIAG: wrote {prefix}_acs.csv ({xsize_blocks}x{ysize_blocks})");
834            }
835            // Quant field map
836            if let Ok(mut f) = std::fs::File::create(format!("{prefix}_qf.csv")) {
837                for by in 0..ysize_blocks {
838                    for bx in 0..xsize_blocks {
839                        if bx > 0 {
840                            let _ = write!(f, ",");
841                        }
842                        let _ = write!(f, "{}", quant_field[by * xsize_blocks + bx]);
843                    }
844                    let _ = writeln!(f);
845                }
846                eprintln!("DIAG: wrote {prefix}_qf.csv ({xsize_blocks}x{ysize_blocks})");
847            }
848        }
849
850        // CfL pass 2: recompute CfL map using actual AC strategies and per-block
851        // quantization weighting. Uses the same FindBestMultiplier as pass 1 but
852        // with strategy-specific DCTs and quant-weighted coefficients.
853        // Gated at effort >= 7 (speed_tier <= kSquirrel) matching libjxl.
854        if self.profile.cfl_two_pass && self.cfl_enabled {
855            super::chroma_from_luma::refine_cfl_map(
856                &mut cfl_map,
857                &xyb_x,
858                &xyb_y,
859                &xyb_b,
860                padded_width,
861                xsize_blocks,
862                ysize_blocks,
863                &ac_strategy,
864                &quant_field,
865                params.scale,
866                self.profile.cfl_newton,
867                self.profile.cfl_newton_eps,
868                self.profile.cfl_newton_max_iters,
869            );
870        }
871
872        // Perform DCT and quantization (XYB data is padded to block boundaries)
873        let transform_out = self.transform_and_quantize(
874            &xyb_x,
875            &xyb_y,
876            &xyb_b,
877            padded_width,
878            xsize_blocks,
879            ysize_blocks,
880            &params,
881            &mut quant_field,
882            &cfl_map,
883            &ac_strategy,
884        );
885        let quant_dc = &transform_out.quant_dc;
886        let quant_ac = &transform_out.quant_ac;
887        let nzeros = &transform_out.nzeros;
888        let raw_nzeros = &transform_out.raw_nzeros;
889
890        // Compute per-block EPF sharpness map when EPF is active
891        // Dynamic sharpness gated at effort >= 6 (speed_tier <= kWombat) matching libjxl
892        let sharpness_map = if params.epf_iters > 0
893            && self.distance >= 0.5
894            && self.profile.epf_dynamic_sharpness
895        {
896            let mask_fallback;
897            let mask: &[f32] = match &mask1x1 {
898                Some(m) => m,
899                None => {
900                    mask_fallback =
901                        super::adaptive_quant::compute_mask1x1(&xyb_y, padded_width, padded_height);
902                    &mask_fallback
903                }
904            };
905            Some(super::epf::compute_epf_sharpness(
906                [&xyb_x, &xyb_y, &xyb_b],
907                quant_dc,
908                quant_ac,
909                &quant_field,
910                mask,
911                &params,
912                &cfl_map,
913                &ac_strategy,
914                self.enable_gaborish,
915                xsize_blocks,
916                ysize_blocks,
917            ))
918        } else {
919            None
920        };
921
922        // Free XYB planes — no longer needed after EPF sharpness computation.
923        // At 4K (6720×4480), this frees ~339 MB (3 channels × padded_pixels × f32).
924        drop(xyb_x);
925        drop(xyb_y);
926        drop(xyb_b);
927        // Free mask1x1 — up to ~115 MB at 4K (padded_pixels × f32).
928        drop(mask1x1);
929
930        // Two-pass mode: collect tokens, build optimal codes, write bitstream
931        if self.optimize_codes {
932            let strategy_counts = ac_strategy.strategy_histogram();
933            let data = self.encode_two_pass(
934                width,
935                height,
936                &params,
937                xsize_blocks,
938                ysize_blocks,
939                xsize_groups,
940                ysize_groups,
941                xsize_dc_groups,
942                ysize_dc_groups,
943                num_groups,
944                num_dc_groups,
945                num_sections,
946                quant_dc,
947                quant_ac,
948                nzeros,
949                raw_nzeros,
950                &quant_field,
951                &cfl_map,
952                &ac_strategy,
953                &noise_params,
954                sharpness_map.as_deref(),
955                alpha,
956                patches_data.as_ref(),
957                splines_data.as_ref(),
958                if self.use_lf_frame {
959                    Some(&transform_out.float_dc)
960                } else {
961                    None
962                },
963            )?;
964            crate::debug_rect::flush("");
965            return Ok(VarDctOutput {
966                data,
967                strategy_counts,
968            });
969        }
970
971        // Get static entropy codes (wrapped in BuiltEntropyCode for uniform handling)
972        let dc_code = BuiltEntropyCode::StaticHuffman(get_dc_entropy_code());
973        let ac_code = BuiltEntropyCode::StaticHuffman(get_ac_entropy_code());
974
975        // Create main writer
976        let mut writer = BitWriter::with_capacity(width * height * 4);
977
978        // Write file header (includes JXL signature, ICC, and byte padding)
979        // Streaming path does not support alpha
980        self.write_file_header_and_pad(width, height, false, &mut writer)?;
981        #[cfg(feature = "debug-tokens")]
982        debug_log!(
983            "After file header: bit {} (byte {})",
984            writer.bits_written(),
985            writer.bits_written() / 8
986        );
987
988        // Write frame header
989        {
990            let mut fh = FrameHeader::lossy();
991            fh.x_qm_scale = params.x_qm_scale;
992            fh.b_qm_scale = params.b_qm_scale;
993            fh.epf_iters = params.epf_iters;
994            fh.gaborish = self.enable_gaborish;
995            if noise_params.is_some() {
996                fh.flags |= 0x01; // ENABLE_NOISE
997            }
998            // streaming path: no extra channels
999            fh.write(&mut writer)?;
1000        }
1001        #[cfg(feature = "debug-tokens")]
1002        debug_log!(
1003            "After frame header: bit {} (byte {})",
1004            writer.bits_written(),
1005            writer.bits_written() / 8
1006        );
1007
1008        // For single-group images, combine all sections at the bit level
1009        // (no byte padding between sections, only at the end)
1010        if num_sections == 4 {
1011            // Write sections to individual BitWriters (no padding)
1012            let block_ctx_map = super::ac_context::BlockCtxMap::default();
1013            let num_blocks = xsize_blocks * ysize_blocks;
1014            let mut dc_global = BitWriter::with_capacity(4096);
1015            self.write_dc_global(
1016                &params,
1017                num_dc_groups,
1018                &dc_code,
1019                &noise_params,
1020                None,
1021                &block_ctx_map,
1022                None, // No learned tree in single-pass mode
1023                None, // No patches in streaming mode
1024                None, // No splines in streaming mode
1025                None, // No custom dc_quant in single-pass mode
1026                &mut dc_global,
1027            )?;
1028
1029            // Get borrowed Huffman codes for streaming token writing
1030            let dc_huffman = dc_code.as_huffman();
1031            let ac_huffman = ac_code.as_huffman();
1032
1033            let mut dc_group = BitWriter::with_capacity(num_blocks * 10);
1034            self.write_dc_group(
1035                0,
1036                quant_dc,
1037                xsize_blocks,
1038                ysize_blocks,
1039                xsize_dc_groups,
1040                &quant_field,
1041                &cfl_map,
1042                &ac_strategy,
1043                None, // no sharpness map in single-pass mode
1044                &dc_huffman,
1045                &mut dc_group,
1046            )?;
1047
1048            let mut ac_global = BitWriter::with_capacity(4096);
1049            self.write_ac_global(
1050                num_groups,
1051                core::slice::from_ref(&ac_code),
1052                0,
1053                None,
1054                &[None],
1055                &mut ac_global,
1056            )?;
1057
1058            let mut ac_group_writer = BitWriter::with_capacity(num_blocks * 100);
1059            self.write_ac_group(
1060                0,
1061                quant_ac,
1062                nzeros,
1063                raw_nzeros,
1064                xsize_blocks,
1065                ysize_blocks,
1066                xsize_groups,
1067                &quant_field,
1068                &ac_strategy,
1069                &block_ctx_map,
1070                &ac_huffman,
1071                &mut ac_group_writer,
1072            )?;
1073
1074            #[cfg(feature = "debug-tokens")]
1075            {
1076                debug_log!(
1077                    "Section bit counts: DC_global={}, DC_group={}, AC_global={}, AC_group={}",
1078                    dc_global.bits_written(),
1079                    dc_group.bits_written(),
1080                    ac_global.bits_written(),
1081                    ac_group_writer.bits_written()
1082                );
1083            }
1084
1085            // Combine at bit level
1086            let mut combined = dc_global;
1087            #[cfg(feature = "debug-tokens")]
1088            debug_log!("After DC_global: {} bits", combined.bits_written());
1089            combined.append_unaligned(&dc_group)?;
1090            #[cfg(feature = "debug-tokens")]
1091            debug_log!("After DC_group: {} bits", combined.bits_written());
1092            combined.append_unaligned(&ac_global)?;
1093            #[cfg(feature = "debug-tokens")]
1094            debug_log!("After AC_global: {} bits", combined.bits_written());
1095            combined.append_unaligned(&ac_group_writer)?;
1096            #[cfg(feature = "debug-tokens")]
1097            debug_log!("After AC_group: {} bits", combined.bits_written());
1098            combined.zero_pad_to_byte();
1099            let combined_bytes = combined.finish();
1100
1101            #[cfg(feature = "debug-tokens")]
1102            {
1103                debug_log!("Combined section size: {} bytes", combined_bytes.len());
1104                debug_log!(
1105                    "Before TOC: bit {} (byte {})",
1106                    writer.bits_written(),
1107                    writer.bits_written() / 8
1108                );
1109            }
1110            write_toc(&[combined_bytes.len()], &mut writer)?;
1111            #[cfg(feature = "debug-tokens")]
1112            debug_log!(
1113                "After TOC: bit {} (byte {})",
1114                writer.bits_written(),
1115                writer.bits_written() / 8
1116            );
1117            writer.append_bytes(&combined_bytes)?;
1118        } else {
1119            // Multi-group: use byte-aligned sections
1120            let mut sections: Vec<Vec<u8>> = Vec::with_capacity(num_sections);
1121            let dc_huffman = dc_code.as_huffman();
1122            let ac_huffman = ac_code.as_huffman();
1123
1124            // DC Global section
1125            let block_ctx_map = super::ac_context::BlockCtxMap::default();
1126            let mut dc_global = BitWriter::with_capacity(4096);
1127            self.write_dc_global(
1128                &params,
1129                num_dc_groups,
1130                &dc_code,
1131                &noise_params,
1132                None,
1133                &block_ctx_map,
1134                None, // No learned tree in single-pass mode
1135                None, // No patches in streaming mode
1136                None, // No splines in streaming mode
1137                None, // No custom dc_quant in single-pass mode
1138                &mut dc_global,
1139            )?;
1140            dc_global.zero_pad_to_byte();
1141            sections.push(dc_global.finish());
1142
1143            // DC group sections
1144            let blocks_per_dc_group = (256 / 8) * (256 / 8); // 1024 blocks per DC group
1145            for dc_group_idx in 0..num_dc_groups {
1146                let mut dc_group = BitWriter::with_capacity(blocks_per_dc_group * 10);
1147                self.write_dc_group(
1148                    dc_group_idx,
1149                    quant_dc,
1150                    xsize_blocks,
1151                    ysize_blocks,
1152                    xsize_dc_groups,
1153                    &quant_field,
1154                    &cfl_map,
1155                    &ac_strategy,
1156                    None, // no sharpness map in single-pass mode
1157                    &dc_huffman,
1158                    &mut dc_group,
1159                )?;
1160                dc_group.zero_pad_to_byte();
1161                sections.push(dc_group.finish());
1162            }
1163
1164            // AC Global section
1165            let mut ac_global = BitWriter::with_capacity(4096);
1166            self.write_ac_global(
1167                num_groups,
1168                core::slice::from_ref(&ac_code),
1169                0,
1170                None,
1171                &[None],
1172                &mut ac_global,
1173            )?;
1174            ac_global.zero_pad_to_byte();
1175            sections.push(ac_global.finish());
1176
1177            // AC group sections
1178            let blocks_per_ac_group = (256 / 8) * (256 / 8); // 1024 blocks per AC group
1179            for group_idx in 0..num_groups {
1180                let mut ac_group_writer = BitWriter::with_capacity(blocks_per_ac_group * 100);
1181                self.write_ac_group(
1182                    group_idx,
1183                    quant_ac,
1184                    nzeros,
1185                    raw_nzeros,
1186                    xsize_blocks,
1187                    ysize_blocks,
1188                    xsize_groups,
1189                    &quant_field,
1190                    &ac_strategy,
1191                    &block_ctx_map,
1192                    &ac_huffman,
1193                    &mut ac_group_writer,
1194                )?;
1195                ac_group_writer.zero_pad_to_byte();
1196                sections.push(ac_group_writer.finish());
1197            }
1198
1199            let section_sizes: Vec<usize> = sections.iter().map(|s| s.len()).collect();
1200            write_toc(&section_sizes, &mut writer)?;
1201            for section in sections {
1202                writer.append_bytes(&section)?;
1203            }
1204        }
1205
1206        let strategy_counts = ac_strategy.strategy_histogram();
1207        crate::debug_rect::flush("");
1208        Ok(VarDctOutput {
1209            data: writer.finish_with_padding(),
1210            strategy_counts,
1211        })
1212    }
1213
1214    /// Encode with iterative rate control for improved distance targeting.
1215    ///
1216    /// This method:
1217    /// 1. Computes precomputed state (XYB, CfL, masking, AC strategy) once
1218    /// 2. Loops: encode → decode → butteraugli → adjust quant field
1219    /// 3. Returns when converged (within 5% of target) or max iterations reached
1220    ///
1221    /// Typically converges in 2-4 iterations. Each iteration costs ~50% of a
1222    /// full encode since XYB conversion, CfL, masking, and AC strategy are reused.
1223    ///
1224    /// Returns the encoded bytes. Use `encode_with_rate_control_config` for
1225    /// iteration count and custom configuration.
1226    ///
1227    /// Requires the `rate-control` feature.
1228    #[cfg(feature = "rate-control")]
1229    pub fn encode_with_rate_control(
1230        &self,
1231        width: usize,
1232        height: usize,
1233        linear_rgb: &[f32],
1234    ) -> Result<Vec<u8>> {
1235        let config = super::rate_control::RateControlConfig::default();
1236        let (encoded, _iters) =
1237            self.encode_with_rate_control_config(width, height, linear_rgb, &config)?;
1238        Ok(encoded)
1239    }
1240
1241    /// Encode with iterative rate control and custom configuration.
1242    ///
1243    /// Returns `(encoded_bytes, iteration_count)`.
1244    ///
1245    /// Requires the `rate-control` feature.
1246    #[cfg(feature = "rate-control")]
1247    pub fn encode_with_rate_control_config(
1248        &self,
1249        width: usize,
1250        height: usize,
1251        linear_rgb: &[f32],
1252        config: &super::rate_control::RateControlConfig,
1253    ) -> Result<(Vec<u8>, usize)> {
1254        // Compute precomputed state
1255        let precomputed = super::precomputed::EncoderPrecomputed::compute(
1256            width,
1257            height,
1258            linear_rgb,
1259            self.distance,
1260            self.cfl_enabled,
1261            self.ac_strategy_enabled,
1262            self.pixel_domain_loss,
1263            self.enable_noise,
1264            self.enable_denoise,
1265            self.enable_gaborish,
1266            self.force_strategy,
1267            &self.profile,
1268        );
1269
1270        // Run rate control loop
1271        super::rate_control::encode_with_rate_control(self, &precomputed, config)
1272    }
1273
1274    /// Encode from precomputed state with a specific quant field.
1275    ///
1276    /// This is the core encoding function used by rate control iterations.
1277    /// It skips XYB conversion, CfL, masking, and AC strategy computation,
1278    /// using the values from `precomputed` instead.
1279    ///
1280    /// Requires the `rate-control` feature.
1281    #[cfg(feature = "rate-control")]
1282    pub fn encode_from_precomputed(
1283        &self,
1284        precomputed: &super::precomputed::EncoderPrecomputed,
1285        quant_field: &[u8],
1286    ) -> Result<Vec<u8>> {
1287        let width = precomputed.width;
1288        let height = precomputed.height;
1289        let xsize_blocks = precomputed.xsize_blocks;
1290        let ysize_blocks = precomputed.ysize_blocks;
1291        let padded_width = precomputed.padded_width;
1292
1293        // Calculate group dimensions
1294        let xsize_groups = div_ceil(width, GROUP_DIM);
1295        let ysize_groups = div_ceil(height, GROUP_DIM);
1296        let xsize_dc_groups = div_ceil(width, DC_GROUP_DIM);
1297        let ysize_dc_groups = div_ceil(height, DC_GROUP_DIM);
1298        let num_groups = xsize_groups * ysize_groups;
1299        let num_dc_groups = xsize_dc_groups * ysize_dc_groups;
1300        let num_sections = 2 + num_dc_groups + num_groups;
1301
1302        // Copy and adjust quant field for multi-block transforms
1303        let mut quant_field = quant_field.to_vec();
1304        adjust_quant_field_with_distance(&precomputed.ac_strategy, &mut quant_field, self.distance);
1305
1306        // Compute distance params from effort profile
1307        let mut params = DistanceParams::compute_for_profile(self.distance, &self.profile);
1308
1309        // Apply pixel-level chromacity adjustments using pre-gaborish stats
1310        if self.profile.chromacity_adjustment {
1311            params.apply_chromacity_adjustment(
1312                precomputed.chromacity_x_pixelized,
1313                precomputed.chromacity_b_pixelized,
1314            );
1315        }
1316
1317        // Perform DCT and quantization using precomputed XYB data
1318        let transform_out = self.transform_and_quantize(
1319            &precomputed.xyb_x,
1320            &precomputed.xyb_y,
1321            &precomputed.xyb_b,
1322            padded_width,
1323            xsize_blocks,
1324            ysize_blocks,
1325            &params,
1326            &mut quant_field,
1327            &precomputed.cfl_map,
1328            &precomputed.ac_strategy,
1329        );
1330        let quant_dc = &transform_out.quant_dc;
1331        let quant_ac = &transform_out.quant_ac;
1332        let nzeros = &transform_out.nzeros;
1333        let raw_nzeros = &transform_out.raw_nzeros;
1334
1335        // Use two-pass mode for rate control (required for ANS)
1336        self.encode_two_pass(
1337            width,
1338            height,
1339            &params,
1340            xsize_blocks,
1341            ysize_blocks,
1342            xsize_groups,
1343            ysize_groups,
1344            xsize_dc_groups,
1345            ysize_dc_groups,
1346            num_groups,
1347            num_dc_groups,
1348            num_sections,
1349            quant_dc,
1350            quant_ac,
1351            nzeros,
1352            raw_nzeros,
1353            &quant_field,
1354            &precomputed.cfl_map,
1355            &precomputed.ac_strategy,
1356            &precomputed.noise_params,
1357            None, // TODO: compute sharpness_map for rate control path
1358            None, // TODO: thread alpha through butteraugli path
1359            None, // patches
1360            None, // splines
1361            None, // float_dc
1362        )
1363    }
1364}
1365
1366#[cfg(test)]
1367mod tests {
1368    use super::*;
1369
1370    #[test]
1371    fn test_encoder_creation() {
1372        let encoder = VarDctEncoder::new(1.0);
1373        assert_eq!(encoder.distance, 1.0);
1374
1375        let encoder_default = VarDctEncoder::default();
1376        assert_eq!(encoder_default.distance, 1.0);
1377    }
1378
1379    #[test]
1380    fn test_encode_small_image() {
1381        let encoder = VarDctEncoder::new(1.0);
1382
1383        // Create a simple 8x8 red image
1384        let width = 8;
1385        let height = 8;
1386        let mut linear_rgb = vec![0.0f32; width * height * 3];
1387        for y in 0..height {
1388            for x in 0..width {
1389                let idx = (y * width + x) * 3;
1390                linear_rgb[idx] = 1.0; // R
1391                linear_rgb[idx + 1] = 0.0; // G
1392                linear_rgb[idx + 2] = 0.0; // B
1393            }
1394        }
1395
1396        // This should at least not panic - full encoding not yet implemented
1397        let result = encoder.encode(width, height, &linear_rgb, None);
1398        // For now, just check it produces some output
1399        assert!(result.is_ok());
1400        let output = result.unwrap();
1401        assert!(output.data.len() > 2);
1402        assert_eq!(output.data[0], 0xFF);
1403        assert_eq!(output.data[1], 0x0A);
1404    }
1405
1406    #[test]
1407    fn test_convert_to_xyb_padded() {
1408        let encoder = VarDctEncoder::new(1.0);
1409
1410        // Gray pixel (1x1 image -> padded to 8x8)
1411        let linear_rgb = vec![0.5, 0.5, 0.5];
1412        let (x, y, b) = encoder.convert_to_xyb_padded(1, 1, 8, 8, &linear_rgb);
1413
1414        // Padded to 8x8 = 64 pixels
1415        assert_eq!(x.len(), 64);
1416        assert_eq!(y.len(), 64);
1417        assert_eq!(b.len(), 64);
1418
1419        // Gray should have X ≈ 0 (equal L and M)
1420        assert!(x[0].abs() < 0.01, "X should be near zero for gray");
1421        assert!(y[0] > 0.0, "Y should be positive");
1422        assert!(b[0] > 0.0, "B should be positive");
1423
1424        // Edge replication: all padded pixels should match the corner
1425        for i in 0..64 {
1426            assert!((x[i] - x[0]).abs() < 1e-6, "All padded X should match");
1427            assert!((y[i] - y[0]).abs() < 1e-6, "All padded Y should match");
1428            assert!((b[i] - b[0]).abs() < 1e-6, "All padded B should match");
1429        }
1430    }
1431
1432    #[test]
1433    fn test_encode_16x16_red_image() {
1434        // Test a 16x16 pixel image (2x2 blocks) to compare with libjxl-tiny
1435        let encoder = VarDctEncoder::new(1.0);
1436
1437        let width = 16;
1438        let height = 16;
1439        let mut linear_rgb = vec![0.0f32; width * height * 3];
1440        for y in 0..height {
1441            for x in 0..width {
1442                let idx = (y * width + x) * 3;
1443                linear_rgb[idx] = 1.0; // R
1444                linear_rgb[idx + 1] = 0.0; // G
1445                linear_rgb[idx + 2] = 0.0; // B
1446            }
1447        }
1448
1449        let result = encoder.encode(width, height, &linear_rgb, None);
1450        assert!(result.is_ok());
1451        let output = result.unwrap();
1452
1453        eprintln!("Output file size: {} bytes", output.data.len());
1454        eprintln!(
1455            "First 32 bytes: {:02x?}",
1456            &output.data[..32.min(output.data.len())]
1457        );
1458
1459        // Write output to file for comparison
1460        std::fs::write(std::env::temp_dir().join("our_16x16.jxl"), &output.data).unwrap();
1461
1462        // libjxl-tiny produces:
1463        // DC_group: 106 bits (14 bytes)
1464        // Total combined: 1086 bytes
1465        // Total file: 1104 bytes
1466        //
1467        // Our encoder should match these sizes
1468
1469        // Check signature
1470        assert_eq!(output.data[0], 0xFF);
1471        assert_eq!(output.data[1], 0x0A);
1472    }
1473
1474    /// Compute a simple hash of a byte slice for output locking.
1475    fn hash_bytes(bytes: &[u8]) -> u64 {
1476        use std::hash::{Hash, Hasher};
1477        let mut hasher = std::collections::hash_map::DefaultHasher::new();
1478        bytes.hash(&mut hasher);
1479        hasher.finish()
1480    }
1481
1482    /// Hash-locked test for 8x8 gradient image.
1483    /// This test ensures the encoder output doesn't change unexpectedly.
1484    /// x86_64 only: FP rounding differs on other architectures and 32-bit.
1485    #[test]
1486    #[cfg(target_arch = "x86_64")]
1487    fn test_hash_lock_8x8_gradient() {
1488        let encoder = VarDctEncoder::new(1.0);
1489        let width = 8;
1490        let height = 8;
1491        let mut linear_rgb = vec![0.0f32; width * height * 3];
1492
1493        // Simple gradient: R increases with x, G with y
1494        for y in 0..height {
1495            for x in 0..width {
1496                let idx = (y * width + x) * 3;
1497                linear_rgb[idx] = x as f32 / 7.0; // R
1498                linear_rgb[idx + 1] = y as f32 / 7.0; // G
1499                linear_rgb[idx + 2] = 0.5; // B
1500            }
1501        }
1502
1503        let bytes = encoder
1504            .encode(width, height, &linear_rgb, None)
1505            .unwrap()
1506            .data;
1507        let hash = hash_bytes(&bytes);
1508
1509        // Lock the hash - if this changes, the encoding has changed
1510        // Updated: error_diffusion default changed from true to false
1511        const EXPECTED_HASH: u64 = 0x311e7f185fbbf3f1;
1512        assert_eq!(
1513            hash,
1514            EXPECTED_HASH,
1515            "8x8 gradient hash mismatch: got {:#x}, expected {:#x}. \
1516             Output size: {} bytes. If intentional, update EXPECTED_HASH.",
1517            hash,
1518            EXPECTED_HASH,
1519            bytes.len()
1520        );
1521    }
1522
1523    /// Hash-locked test for 16x16 solid color image.
1524    /// x86_64 only: FP rounding differs on other architectures and 32-bit.
1525    #[test]
1526    #[cfg(target_arch = "x86_64")]
1527    fn test_hash_lock_16x16_solid() {
1528        let encoder = VarDctEncoder::new(1.0);
1529        let width = 16;
1530        let height = 16;
1531        let linear_rgb = vec![0.3f32; width * height * 3]; // gray
1532
1533        let bytes = encoder
1534            .encode(width, height, &linear_rgb, None)
1535            .unwrap()
1536            .data;
1537        let hash = hash_bytes(&bytes);
1538
1539        // Updated: fix global_scale to use effort-matched fixed q (libjxl parity)
1540        const EXPECTED_HASH: u64 = 0x1fd8e75f15fd418c;
1541        assert_eq!(
1542            hash,
1543            EXPECTED_HASH,
1544            "16x16 solid hash mismatch: got {:#x}, expected {:#x}. \
1545             Output size: {} bytes. If intentional, update EXPECTED_HASH.",
1546            hash,
1547            EXPECTED_HASH,
1548            bytes.len()
1549        );
1550    }
1551
1552    /// Hash-locked test for 64x64 checkerboard pattern.
1553    /// x86_64 only: FP rounding differs on other architectures and 32-bit.
1554    #[test]
1555    #[cfg(target_arch = "x86_64")]
1556    fn test_hash_lock_64x64_checkerboard() {
1557        let encoder = VarDctEncoder::new(1.0);
1558        let width = 64;
1559        let height = 64;
1560        let mut linear_rgb = vec![0.0f32; width * height * 3];
1561
1562        // 8x8 checkerboard pattern
1563        for y in 0..height {
1564            for x in 0..width {
1565                let idx = (y * width + x) * 3;
1566                let checker = ((x / 8) + (y / 8)) % 2 == 0;
1567                let val = if checker { 0.8 } else { 0.2 };
1568                linear_rgb[idx] = val;
1569                linear_rgb[idx + 1] = val;
1570                linear_rgb[idx + 2] = val;
1571            }
1572        }
1573
1574        let bytes = encoder
1575            .encode(width, height, &linear_rgb, None)
1576            .unwrap()
1577            .data;
1578        let hash = hash_bytes(&bytes);
1579
1580        // Updated: fast_log2f replaces glibc log2 in ANS frequency optimization
1581        const EXPECTED_HASH: u64 = 0x777dbc66ef3d69a3;
1582        assert_eq!(
1583            hash,
1584            EXPECTED_HASH,
1585            "64x64 checkerboard hash mismatch: got {:#x}, expected {:#x}. \
1586             Output size: {} bytes. If intentional, update EXPECTED_HASH.",
1587            hash,
1588            EXPECTED_HASH,
1589            bytes.len()
1590        );
1591    }
1592
1593    /// Hash-locked test for non-power-of-two size (tests padding).
1594    /// x86_64 only: FP rounding differs on other architectures and 32-bit.
1595    #[test]
1596    #[cfg(target_arch = "x86_64")]
1597    fn test_hash_lock_13x17_noise() {
1598        let encoder = VarDctEncoder::new(1.0);
1599        let width = 13;
1600        let height = 17;
1601        let mut linear_rgb = vec![0.0f32; width * height * 3];
1602
1603        // Deterministic pseudo-random pattern
1604        let mut seed = 12345u64;
1605        for val in &mut linear_rgb {
1606            seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
1607            *val = ((seed >> 32) as f32) / (u32::MAX as f32);
1608        }
1609
1610        let bytes = encoder
1611            .encode(width, height, &linear_rgb, None)
1612            .unwrap()
1613            .data;
1614        let hash = hash_bytes(&bytes);
1615
1616        // Updated: error_diffusion default changed from true to false
1617        const EXPECTED_HASH: u64 = 0x0c54e44d071039db;
1618        assert_eq!(
1619            hash,
1620            EXPECTED_HASH,
1621            "13x17 noise hash mismatch: got {:#x}, expected {:#x}. \
1622             Output size: {} bytes. If intentional, update EXPECTED_HASH.",
1623            hash,
1624            EXPECTED_HASH,
1625            bytes.len()
1626        );
1627    }
1628
1629    /// Roundtrip quality test for non-8-aligned dimensions.
1630    ///
1631    /// Encodes a 100x75 gradient, decodes with jxl-oxide, and verifies:
1632    /// 1. Dimensions match
1633    /// 2. Output is a valid JXL file (correct signature, decodable)
1634    ///
1635    /// This catches stride mismatch bugs where padded XYB buffers have
1636    /// stride != width, which corrupts adaptive quant, CfL, and AC strategy.
1637    #[test]
1638    fn test_roundtrip_non_8_aligned() {
1639        for &(w, h) in &[(100, 75), (13, 17), (33, 49), (7, 9)] {
1640            let mut linear_rgb = vec![0.0f32; w * h * 3];
1641
1642            // Smooth gradient (linear RGB)
1643            for y in 0..h {
1644                for x in 0..w {
1645                    let idx = (y * w + x) * 3;
1646                    linear_rgb[idx] = x as f32 / w.max(1) as f32;
1647                    linear_rgb[idx + 1] = y as f32 / h.max(1) as f32;
1648                    linear_rgb[idx + 2] = 0.3;
1649                }
1650            }
1651
1652            let encoder = VarDctEncoder::new(1.0);
1653            let bytes = encoder
1654                .encode(w, h, &linear_rgb, None)
1655                .unwrap_or_else(|e| panic!("encode {}x{} failed: {}", w, h, e))
1656                .data;
1657
1658            // Verify JXL signature
1659            assert_eq!(bytes[0], 0xFF, "{}x{}: bad signature byte 0", w, h);
1660            assert_eq!(bytes[1], 0x0A, "{}x{}: bad signature byte 1", w, h);
1661
1662            // Decode with jxl-oxide and verify dimensions
1663            let image = jxl_oxide::JxlImage::builder()
1664                .read(std::io::Cursor::new(&bytes))
1665                .unwrap_or_else(|e| panic!("jxl-oxide decode {}x{} failed: {}", w, h, e));
1666            assert_eq!(
1667                image.width(),
1668                w as u32,
1669                "{}x{}: decoded width mismatch",
1670                w,
1671                h
1672            );
1673            assert_eq!(
1674                image.height(),
1675                h as u32,
1676                "{}x{}: decoded height mismatch",
1677                w,
1678                h
1679            );
1680
1681            // Render to verify pixel data is valid
1682            let render = image
1683                .render_frame(0)
1684                .unwrap_or_else(|e| panic!("jxl-oxide render {}x{} failed: {}", w, h, e));
1685            let _pixels = render.image_all_channels();
1686        }
1687    }
1688
1689    /// Test DC tree learning produces valid output.
1690    #[test]
1691    fn test_dc_tree_learning() {
1692        let width = 64;
1693        let height = 64;
1694
1695        // Create a gradient image
1696        let mut linear_rgb = vec![0.0f32; width * height * 3];
1697        for y in 0..height {
1698            for x in 0..width {
1699                let idx = (y * width + x) * 3;
1700                linear_rgb[idx] = x as f32 / width as f32;
1701                linear_rgb[idx + 1] = y as f32 / height as f32;
1702                linear_rgb[idx + 2] = 0.5;
1703            }
1704        }
1705
1706        // Encode WITHOUT DC tree learning (baseline) — use ANS
1707        let mut encoder_baseline = VarDctEncoder::new(1.0);
1708        encoder_baseline.dc_tree_learning = false;
1709        let bytes_baseline = encoder_baseline
1710            .encode(width, height, &linear_rgb, None)
1711            .expect("baseline encode failed")
1712            .data;
1713
1714        // Encode WITH DC tree learning — also use ANS
1715        let mut encoder_learned = VarDctEncoder::new(1.0);
1716        encoder_learned.dc_tree_learning = true;
1717        std::fs::write(
1718            std::env::temp_dir().join("dc_baseline_test.jxl"),
1719            &bytes_baseline,
1720        )
1721        .unwrap();
1722        let bytes_learned = encoder_learned
1723            .encode(width, height, &linear_rgb, None)
1724            .expect("learned encode failed")
1725            .data;
1726        std::fs::write(
1727            std::env::temp_dir().join("dc_learned_test.jxl"),
1728            &bytes_learned,
1729        )
1730        .unwrap();
1731
1732        eprintln!(
1733            "DC tree learning: baseline={} bytes, learned={} bytes (delta={:.2}%)",
1734            bytes_baseline.len(),
1735            bytes_learned.len(),
1736            (bytes_learned.len() as f64 / bytes_baseline.len() as f64 - 1.0) * 100.0
1737        );
1738
1739        // Verify both produce valid JXL signature
1740        assert_eq!(bytes_baseline[0], 0xFF);
1741        assert_eq!(bytes_baseline[1], 0x0A);
1742        assert_eq!(bytes_learned[0], 0xFF);
1743        assert_eq!(bytes_learned[1], 0x0A);
1744
1745        // Verify baseline decodes (sanity check)
1746        {
1747            let image = jxl_oxide::JxlImage::builder()
1748                .read(std::io::Cursor::new(&bytes_baseline))
1749                .expect("jxl-oxide parse of baseline failed");
1750            let render = image
1751                .render_frame(0)
1752                .expect("jxl-oxide render of baseline failed");
1753            let _pixels = render.image_all_channels();
1754            eprintln!("Baseline ANS decodes OK ({} bytes)", bytes_baseline.len());
1755        }
1756
1757        // Decode the learned version with jxl-oxide to verify it's valid
1758        let image = jxl_oxide::JxlImage::builder()
1759            .read(std::io::Cursor::new(&bytes_learned))
1760            .expect("jxl-oxide decode of learned version failed");
1761        assert_eq!(image.width(), width as u32);
1762        assert_eq!(image.height(), height as u32);
1763
1764        // Render to verify pixel data is valid
1765        let render = image
1766            .render_frame(0)
1767            .expect("jxl-oxide render of learned version failed");
1768        let _pixels = render.image_all_channels();
1769        eprintln!("Learned ANS decodes OK ({} bytes)", bytes_learned.len());
1770
1771        // Also verify with djxl
1772        std::fs::write(
1773            std::env::temp_dir().join("dc_learned_test.jxl"),
1774            &bytes_learned,
1775        )
1776        .unwrap();
1777    }
1778
1779    /// Test that the butteraugli quantization loop produces valid output.
1780    #[cfg(feature = "butteraugli-loop")]
1781    #[test]
1782    fn test_butteraugli_loop_basic() {
1783        // Create a 64x64 test image with some variation
1784        let width = 64;
1785        let height = 64;
1786        let mut linear_rgb = vec![0.0f32; width * height * 3];
1787        for y in 0..height {
1788            for x in 0..width {
1789                let idx = (y * width + x) * 3;
1790                let fx = x as f32 / width as f32;
1791                let fy = y as f32 / height as f32;
1792                linear_rgb[idx] = fx * 0.8; // R
1793                linear_rgb[idx + 1] = fy * 0.6; // G
1794                linear_rgb[idx + 2] = (1.0 - fx) * 0.4; // B
1795            }
1796        }
1797
1798        // Encode without butteraugli loop
1799        let mut encoder_baseline = VarDctEncoder::new(2.0);
1800        encoder_baseline.butteraugli_iters = 0;
1801        let bytes_baseline = encoder_baseline
1802            .encode(width, height, &linear_rgb, None)
1803            .expect("baseline encode failed")
1804            .data;
1805
1806        // Encode with 2 butteraugli loop iterations
1807        let mut encoder_loop = VarDctEncoder::new(2.0);
1808        encoder_loop.butteraugli_iters = 2;
1809        let bytes_loop = encoder_loop
1810            .encode(width, height, &linear_rgb, None)
1811            .expect("butteraugli loop encode failed")
1812            .data;
1813
1814        // Both should produce valid JXL
1815        assert_eq!(bytes_baseline[0], 0xFF);
1816        assert_eq!(bytes_baseline[1], 0x0A);
1817        assert_eq!(bytes_loop[0], 0xFF);
1818        assert_eq!(bytes_loop[1], 0x0A);
1819
1820        // File sizes should differ (butteraugli loop changes quant field)
1821        eprintln!(
1822            "Baseline: {} bytes, Butteraugli loop (2 iters): {} bytes",
1823            bytes_baseline.len(),
1824            bytes_loop.len()
1825        );
1826
1827        // Verify the butteraugli-loop output decodes correctly
1828        let image = jxl_oxide::JxlImage::builder()
1829            .read(std::io::Cursor::new(&bytes_loop))
1830            .expect("jxl-oxide decode of butteraugli loop output failed");
1831        assert_eq!(image.width(), width as u32);
1832        assert_eq!(image.height(), height as u32);
1833
1834        let render = image
1835            .render_frame(0)
1836            .expect("jxl-oxide render of butteraugli loop output failed");
1837        let _pixels = render.image_all_channels();
1838        eprintln!("Butteraugli loop output decodes OK");
1839    }
1840}