jxl_encoder/vardct/encoder.rs
1// Copyright (c) Imazen LLC and the JPEG XL Project Authors.
2// Algorithms and constants derived from libjxl (BSD-3-Clause).
3// Licensed under AGPL-3.0-or-later. Commercial licenses at https://www.imazen.io/pricing
4
5//! Main tiny encoder implementation.
6
7use super::ac_strategy::{
8 AcStrategyMap, adjust_quant_field_float_with_distance, adjust_quant_field_with_distance,
9 compute_ac_strategy,
10};
11use super::adaptive_quant::{compute_mask1x1, compute_quant_field_float, quantize_quant_field};
12use super::chroma_from_luma::{CflMap, compute_cfl_map};
13use super::common::*;
14use super::frame::{DistanceParams, write_toc};
15use super::gaborish::gaborish_inverse;
16use super::noise::{denoise_xyb, estimate_noise_params, noise_quality_coef};
17use super::static_codes::{get_ac_entropy_code, get_dc_entropy_code};
18use crate::bit_writer::BitWriter;
19#[cfg(feature = "debug-tokens")]
20use crate::debug_log;
21use crate::debug_rect;
22use crate::error::Result;
23use crate::headers::frame_header::FrameHeader;
24
25// Re-export types from entropy_code sub-module.
26pub(crate) use super::entropy_code::{BuiltEntropyCode, force_strategy_map};
27
/// Output of a VarDCT encode operation.
///
/// Derives `Debug` and `Clone` so results can be logged and duplicated in
/// tests; both fields are plain standard-library types.
#[derive(Debug, Clone)]
pub struct VarDctOutput {
    /// Encoded JXL codestream bytes.
    pub data: Vec<u8>,
    /// Per-strategy first-block counts, indexed by raw strategy code
    /// (valid indices are `0..19`, i.e. 19 slots).
    pub strategy_counts: [u32; 19],
}
35
36/// Tiny JPEG XL encoder.
37///
38/// This is a simplified VarDCT encoder based on libjxl-tiny that uses:
39/// - Only DCT8, DCT8x16, DCT16x8 transforms
40/// - Huffman or ANS entropy coding
41/// - Default zig-zag coefficient order
42/// - Fixed context tree for DC
43pub struct VarDctEncoder {
44 /// Target distance (quality). 1.0 = visually lossless.
45 pub distance: f32,
46 /// Effort level (1–10). Controls AC strategy gating and search depth.
47 pub effort: u8,
48 /// Centralized effort-derived decisions. All effort-gated constants and
49 /// thresholds are read from this profile instead of inline `if effort >= N`.
50 pub profile: crate::effort::EffortProfile,
51 /// Use dynamic Huffman codes built from actual token frequencies.
52 /// When true (default), uses a two-pass mode: collect tokens first, build optimal codes, then write.
53 /// When false, uses pre-computed static codes (streaming, single-pass).
54 pub optimize_codes: bool,
55 /// Use enhanced histogram clustering with pair merge refinement.
56 /// Only effective when `optimize_codes` is true.
57 ///
58 /// Note: The enhanced clustering algorithm was designed for ANS entropy coding
59 /// and may not provide benefits (or may slightly increase size) when used with
60 /// Huffman coding. This option is experimental.
61 pub enhanced_clustering: bool,
62 /// Use ANS entropy coding instead of Huffman.
63 /// Only effective when `optimize_codes` is true (requires two-pass mode).
64 /// ANS typically produces 5-10% smaller files than Huffman.
65 pub use_ans: bool,
66 /// Enable chroma-from-luma (CfL) optimization.
67 /// When true (default), computes per-tile ytox/ytob values via least-squares fitting.
68 /// When false, uses ytox=0, ytob=0 (no chroma decorrelation).
69 pub cfl_enabled: bool,
70 /// Enable adaptive AC strategy selection (DCT8/DCT16x8/DCT8x16).
71 /// When true (default), selects the best transform size per 16x16 block region.
72 /// When false, uses DCT8 for all blocks.
73 pub ac_strategy_enabled: bool,
74 /// Enable custom coefficient ordering.
75 /// When true (default when optimize_codes is true), reorders AC coefficients
76 /// so frequently-zero positions appear last, reducing bitstream size.
77 /// Only effective when `optimize_codes` is true (requires two-pass mode).
78 pub custom_orders: bool,
79 /// Force a specific AC strategy for all blocks (for testing).
80 /// When Some(strategy), uses that raw strategy code for all blocks that fit.
81 /// None (default) uses normal strategy selection based on `ac_strategy_enabled`.
82 pub force_strategy: Option<u8>,
83 /// Enable noise synthesis.
84 /// When true, estimates noise parameters from the image and encodes them
85 /// in the frame header. The decoder regenerates noise during rendering.
86 /// Off by default (matching libjxl's default).
87 pub enable_noise: bool,
88 /// Enable Wiener denoising pre-filter (requires `enable_noise`).
89 /// When true, applies a conservative Wiener filter to remove estimated noise
90 /// before encoding. The decoder re-adds noise from the encoded parameters.
91 /// Provides 1-8% file size savings with near-zero Butteraugli quality impact.
92 /// Off by default (libjxl does not have a denoising pre-filter).
93 pub enable_denoise: bool,
94 /// Enable gaborish inverse pre-filter.
95 /// When true (default), applies a 5x5 sharpening kernel to XYB before DCT
96 /// and signals gab=1 in the frame header. The decoder applies a 3x3 blur
97 /// to compensate, reducing blocking artifacts.
98 /// Matches the libjxl VarDCT encoder default.
99 pub enable_gaborish: bool,
100 /// Enable error diffusion in AC quantization.
101 /// When true, spreads quantization error to neighboring coefficients in
102 /// zigzag order, helping preserve smooth gradients at high compression.
103 /// Off by default (modest quality improvement, slight performance cost).
104 pub error_diffusion: bool,
105 /// Enable pixel-domain loss calculation in AC strategy selection.
106 /// When true, uses full libjxl's pixel-domain loss model (IDCT error,
107 /// per-pixel masking, 8th power norm). This provides better distance
108 /// calibration matching cjxl's output.
109 /// When false (default), uses coefficient-domain loss (libjxl-tiny style).
110 /// Note: Requires `ac_strategy_enabled` to have any effect.
111 pub pixel_domain_loss: bool,
112 /// Enable LZ77 backward references in entropy coding.
113 /// When true, compresses token streams using LZ77 length+distance tokens.
114 /// Only effective with two-pass mode (optimize_codes=true) and ANS (use_ans=true).
115 /// Off by default — works for most cases but has known interactions with certain
116 /// forced strategy combinations (DCT2x2, IDENTITY) that cause InvalidAnsStream.
117 pub enable_lz77: bool,
118 /// LZ77 method to use when enable_lz77 is true.
119 ///
120 /// - `Rle`: Only matches consecutive identical values (fast, limited on photos)
121 /// - `Greedy`: Hash chain backward references (slower, 1-3% better on photos)
122 ///
123 /// Default: `Greedy` (best compression)
124 pub lz77_method: crate::entropy_coding::lz77::Lz77Method,
125 /// Enable DC tree learning.
126 /// When true, learns an optimal context tree for DC coding from image content
127 /// instead of using the fixed GRADIENT_CONTEXT_LUT.
128 /// **DISABLED/BROKEN**: The learned tree doesn't correctly route AC metadata
129 /// samples to contexts 0-10. Fixing requires parsing the static tree structure
130 /// and splicing in the learned DC subtree while preserving AC metadata routing.
131 /// Expected gain (~1.2% overall) doesn't justify the complexity. See CLAUDE.md.
132 pub dc_tree_learning: bool,
133 /// Number of butteraugli quantization loop iterations.
134 /// When > 0, iteratively refines the per-block quant field using butteraugli
135 /// perceptual distance feedback. Each iteration: encode → reconstruct → measure
136 /// → adjust quant_field. AC strategy is kept fixed; only quant_field changes.
137 ///
138 /// libjxl uses 2 iterations at effort 8, 4 at effort 9.
139 /// Requires the `butteraugli-loop` feature.
140 ///
141 /// Default: 0 (disabled)
142 #[cfg(feature = "butteraugli-loop")]
143 pub butteraugli_iters: u32,
144 /// Number of SSIM2 quantization loop iterations.
145 /// Alternative to butteraugli loop: uses per-block linear RGB RMSE + full-image SSIM2.
146 /// Requires the `ssim2-loop` feature.
147 ///
148 /// Default: 0 (disabled)
149 #[cfg(feature = "ssim2-loop")]
150 pub ssim2_iters: u32,
151 /// Number of zensim quantization loop iterations.
152 /// Alternative to butteraugli loop: uses zensim's psychovisual metric for both
153 /// global quality tracking and per-pixel spatial error map (diffmap in XYB space).
154 /// Also refines AC strategy by splitting large transforms with high perceptual error.
155 /// Requires the `zensim-loop` feature.
156 ///
157 /// Default: 0 (disabled)
158 #[cfg(feature = "zensim-loop")]
159 pub zensim_iters: u32,
160 /// Whether the input has 16-bit samples. When true, the file header signals
161 /// bit_depth=16 instead of 8. The actual VarDCT encoding is the same (XYB
162 /// is always f32 internally), but the decoder uses this to reconstruct at
163 /// the correct output bit depth.
164 pub bit_depth_16: bool,
165 /// ICC profile to embed in the codestream.
166 /// When Some, writes has_icc=1 and encodes the profile after the file header.
167 pub icc_profile: Option<Vec<u8>>,
168 /// Enable patches (dictionary-based repeated pattern detection).
169 /// When true, detects repeated rectangular elements (text glyphs, buttons, icons)
170 /// and stores unique patterns once in a reference frame. Huge wins on screenshots.
171 /// On by default for lossy encoding.
172 pub enable_patches: bool,
173 /// Encoder mode: Reference (match libjxl) or Experimental (own improvements).
174 pub encoder_mode: crate::api::EncoderMode,
175 /// Manual splines to overlay on the image (opt-in, None by default).
176 pub splines: Option<Vec<crate::vardct::splines::Spline>>,
177 /// Whether the input is grayscale. When true, the file header signals
178 /// ColorSpace::Gray instead of RGB. VarDCT still operates in XYB (3 channels)
179 /// internally — this only affects the output colorspace the decoder targets.
180 pub is_grayscale: bool,
181 /// Progressive encoding mode (Single, QuantizedAcFullAc, DcVlfLfAc).
182 /// When not Single, AC coefficients are split across multiple passes with
183 /// shift-based precision reduction for early preview rendering.
184 pub progressive: crate::api::ProgressiveMode,
185 /// Enable LfFrame (separate DC frame).
186 /// When true, DC coefficients are encoded as a separate modular frame
187 /// (frame_type=1, dc_level=1) before the main VarDCT frame, with
188 /// distance-scaled quantization factors matching libjxl's progressive_dc >= 1.
189 pub use_lf_frame: bool,
190 /// Custom gamma (encoding exponent) from source image.
191 /// When Some, writes have_gamma=true in the JXL header and uses gamma
192 /// linearization instead of sRGB TF. Example: 0.45455 for gamma 2.2.
193 pub source_gamma: Option<f32>,
194 /// Explicit color encoding override for the JXL header.
195 /// When Some, this is used instead of deriving from source_gamma / defaults.
196 /// Allows signaling HDR (PQ, HLG) or non-sRGB primaries (BT.2020, P3).
197 pub color_encoding: Option<crate::headers::color_encoding::ColorEncoding>,
198 /// Peak display luminance in nits for ToneMapping. Default 255.0 (SDR).
199 pub intensity_target: f32,
200 /// Minimum display luminance in nits for ToneMapping. Default 0.0.
201 pub min_nits: f32,
202 /// Intrinsic display size `(width, height)`, if different from coded dimensions.
203 pub intrinsic_size: Option<(u32, u32)>,
204}
205
206impl Default for VarDctEncoder {
207 fn default() -> Self {
208 Self {
209 distance: 1.0,
210 effort: 7,
211 profile: crate::effort::EffortProfile::lossy(7, crate::api::EncoderMode::Reference),
212 optimize_codes: true,
213 enhanced_clustering: true, // Profile-driven: e9+ for Best, Fast otherwise
214 use_ans: true, // ANS produces 4-10% smaller files than Huffman
215 cfl_enabled: true,
216 ac_strategy_enabled: true,
217 custom_orders: true,
218 force_strategy: None,
219 enable_noise: false,
220 enable_denoise: false,
221 enable_gaborish: true,
222 error_diffusion: false, // libjxl accepts param but never uses it in QuantizeBlockAC
223 pixel_domain_loss: true, // Full libjxl pixel-domain loss: +0.2-1.9 SSIM2 at all distances
224 enable_lz77: false, // LZ77 has known interactions with DCT2x2/IDENTITY strategies
225 lz77_method: crate::entropy_coding::lz77::Lz77Method::Greedy, // Best compression
226 dc_tree_learning: false, // DC tree learning (experimental)
227 #[cfg(feature = "butteraugli-loop")]
228 butteraugli_iters: 0, // Effort-gated: default off (effort 7). Set via LossyConfig.
229 #[cfg(feature = "ssim2-loop")]
230 ssim2_iters: 0, // Off by default. Set via LossyConfig.
231 #[cfg(feature = "zensim-loop")]
232 zensim_iters: 0, // Off by default. Set via LossyConfig.
233 bit_depth_16: false,
234 icc_profile: None,
235 enable_patches: true, // Patches: huge wins on screenshots, zero cost on photos
236 encoder_mode: crate::api::EncoderMode::Reference,
237 splines: None,
238 is_grayscale: false,
239 progressive: crate::api::ProgressiveMode::Single,
240 use_lf_frame: false,
241 source_gamma: None,
242 color_encoding: None,
243 intensity_target: 255.0,
244 min_nits: 0.0,
245 intrinsic_size: None,
246 }
247 }
248}
249
250impl VarDctEncoder {
251 /// Create a new tiny encoder with the given distance.
252 pub fn new(distance: f32) -> Self {
253 Self {
254 distance,
255 effort: 7,
256 profile: crate::effort::EffortProfile::lossy(7, crate::api::EncoderMode::Reference),
257 optimize_codes: true,
258 enhanced_clustering: true, // Profile-driven: e9+ for Best, Fast otherwise
259 use_ans: true, // ANS produces 4-10% smaller files than Huffman
260 cfl_enabled: true,
261 ac_strategy_enabled: true,
262 custom_orders: true,
263 force_strategy: None,
264 enable_noise: false,
265 enable_denoise: false,
266 enable_gaborish: true,
267 error_diffusion: false, // libjxl accepts param but never uses it in QuantizeBlockAC
268 pixel_domain_loss: true, // Full libjxl pixel-domain loss: +0.2-1.9 SSIM2
269 enable_lz77: false, // LZ77 has known interactions with DCT2x2/IDENTITY strategies
270 lz77_method: crate::entropy_coding::lz77::Lz77Method::Greedy, // Best compression
271 dc_tree_learning: false, // DC tree learning (experimental)
272 #[cfg(feature = "butteraugli-loop")]
273 butteraugli_iters: 0, // Effort-gated: default off (effort 7). Set via LossyConfig.
274 #[cfg(feature = "ssim2-loop")]
275 ssim2_iters: 0, // Off by default. Set via LossyConfig.
276 #[cfg(feature = "zensim-loop")]
277 zensim_iters: 0, // Off by default. Set via LossyConfig.
278 bit_depth_16: false,
279 icc_profile: None,
280 enable_patches: true, // Patches: huge wins on screenshots, zero cost on photos
281 encoder_mode: crate::api::EncoderMode::Reference,
282 splines: None,
283 is_grayscale: false,
284 progressive: crate::api::ProgressiveMode::Single,
285 use_lf_frame: false,
286 source_gamma: None,
287 color_encoding: None,
288 intensity_target: 255.0,
289 min_nits: 0.0,
290 intrinsic_size: None,
291 }
292 }
293
294 /// Encode an image in linear sRGB format, optionally with an alpha channel.
295 ///
296 /// Input should be 3 channels (RGB) of f32 values in [0, 1] range.
297 /// Values outside [0, 1] are allowed for out-of-gamut colors.
298 ///
299 /// If `alpha` is provided, it must be `width * height` bytes of u8 alpha values.
300 /// Alpha is encoded as a modular extra channel alongside the VarDCT RGB data.
301 pub fn encode(
302 &self,
303 width: usize,
304 height: usize,
305 linear_rgb: &[f32],
306 alpha: Option<&[u8]>,
307 ) -> Result<VarDctOutput> {
308 assert_eq!(linear_rgb.len(), width * height * 3);
309 if let Some(a) = alpha {
310 assert_eq!(a.len(), width * height);
311 }
312
313 crate::debug_rect::clear();
314
315 // Calculate dimensions
316 let xsize_blocks = div_ceil(width, BLOCK_DIM);
317 let ysize_blocks = div_ceil(height, BLOCK_DIM);
318 let xsize_groups = div_ceil(width, GROUP_DIM);
319 let ysize_groups = div_ceil(height, GROUP_DIM);
320 let xsize_dc_groups = div_ceil(width, DC_GROUP_DIM);
321 let ysize_dc_groups = div_ceil(height, DC_GROUP_DIM);
322 let num_groups = xsize_groups * ysize_groups;
323 let num_dc_groups = xsize_dc_groups * ysize_dc_groups;
324
325 // Number of sections: DC global + DC groups + AC global + AC groups
326 let num_sections = 2 + num_dc_groups + num_groups;
327
328 // Pad to block boundary dimensions
329 let padded_width = xsize_blocks * BLOCK_DIM;
330 let padded_height = ysize_blocks * BLOCK_DIM;
331
332 // Convert to XYB with edge-replicated padding to block boundaries.
333 // This allows SIMD to process full blocks without bounds checking.
334 let (mut xyb_x, mut xyb_y, mut xyb_b) =
335 self.convert_to_xyb_padded(width, height, padded_width, padded_height, linear_rgb);
336
337 // Estimate noise parameters (if enabled).
338 // The decoder adds noise during rendering; the encoder just encodes the params.
339 let noise_params = if self.enable_noise {
340 let quality_coef = noise_quality_coef(self.distance);
341 let params = estimate_noise_params(
342 &xyb_x,
343 &xyb_y,
344 &xyb_b,
345 padded_width,
346 padded_height,
347 quality_coef,
348 );
349
350 // Apply denoising pre-filter if enabled and noise was detected.
351 // Removes estimated noise before encoding so the encoder spends fewer
352 // bits on noise; the decoder re-adds it from the encoded parameters.
353 if self.enable_denoise
354 && let Some(ref p) = params
355 {
356 denoise_xyb(
357 &mut xyb_x,
358 &mut xyb_y,
359 &mut xyb_b,
360 padded_width,
361 padded_height,
362 p,
363 quality_coef,
364 );
365 }
366
367 params
368 } else {
369 None
370 };
371
372 // Detect and subtract patches (before gaborish, after noise).
373 // Patches work in the XYB domain: detect repeated rectangular elements,
374 // store unique patterns in a reference frame, subtract from image.
375 let mut patches_data = if self.enable_patches {
376 super::patches::find_and_build([&xyb_x, &xyb_y, &xyb_b], width, height, padded_width)
377 } else {
378 None
379 };
380 // Cost-benefit gating for experimental mode only.
381 // libjxl uses patches unconditionally when detected (no cost check),
382 // so reference mode skips this to match.
383 if matches!(self.encoder_mode, crate::api::EncoderMode::Experimental)
384 && let Some(ref pd) = patches_data
385 && !pd.is_cost_effective(self.distance, self.use_ans)
386 {
387 patches_data = None;
388 }
389 // Quantize ref_image so subtract/add use the same values the decoder will reconstruct.
390 if let Some(ref mut pd) = patches_data {
391 pd.quantize_ref_image();
392 }
393 if let Some(ref pd) = patches_data {
394 let mut xyb = [
395 core::mem::take(&mut xyb_x),
396 core::mem::take(&mut xyb_y),
397 core::mem::take(&mut xyb_b),
398 ];
399 super::patches::subtract_patches(&mut xyb, padded_width, pd);
400 let [x, y, b] = xyb;
401 xyb_x = x;
402 xyb_y = y;
403 xyb_b = b;
404 }
405
406 // Build and subtract splines (after patches, before gaborish).
407 // Splines are additive overlays: encoder subtracts, decoder adds back.
408 // Uses default DC CfL params (y_to_x=0.0, y_to_b=1.0) since we write default DC cmap.
409 let splines_data = if let Some(ref splines) = self.splines {
410 if !splines.is_empty() {
411 let sd = super::splines::SplinesData::from_splines(
412 splines.clone(),
413 0, // quantization_adjustment
414 0.0, // y_to_x (default DC CfL)
415 1.0, // y_to_b (default DC CfL)
416 width,
417 height,
418 );
419 {
420 let mut xyb = [
421 core::mem::take(&mut xyb_x),
422 core::mem::take(&mut xyb_y),
423 core::mem::take(&mut xyb_b),
424 ];
425 super::splines::subtract_splines(&mut xyb, padded_width, width, height, &sd);
426 let [x, y, b] = xyb;
427 xyb_x = x;
428 xyb_y = y;
429 xyb_b = b;
430 }
431 Some(sd)
432 } else {
433 None
434 }
435 } else {
436 None
437 };
438
439 // Compute pixel chromacity stats BEFORE gaborish (matching libjxl pipeline).
440 // Gaborish sharpening inflates gradients, producing overly aggressive adjustment.
441 // Gated at effort >= 7 to skip the full-image gradient scan at low effort.
442 let (chromacity_x, chromacity_b) = if self.profile.chromacity_adjustment {
443 let pixel_stats = super::frame::PixelStatsForChromacityAdjustment::calc(
444 &xyb_x,
445 &xyb_y,
446 &xyb_b,
447 padded_width,
448 padded_height,
449 );
450 (
451 pixel_stats.how_much_is_x_channel_pixelized(),
452 pixel_stats.how_much_is_b_channel_pixelized(),
453 )
454 } else {
455 (0, 0)
456 };
457
458 // Compute adaptive per-block quantization field and masking on ORIGINAL
459 // (pre-gaborish) XYB. libjxl computes InitialQuantField before GaborishInverse
460 // (enc_heuristics.cc:1117-1142, comment: "relies on pre-gaborish values").
461 // When gaborish is off, scale distance by 0.62 for the quant field only
462 // (not global_scale/quant_dc). This matches libjxl enc_heuristics.cc:1119.
463 let distance_for_iqf = if self.enable_gaborish {
464 self.distance
465 } else {
466 self.distance * 0.62
467 };
468
469 // Step 1: Compute float quant field on pre-gaborish XYB.
470 //
471 // libjxl effort gating (enc_heuristics.cc:1097-1128):
472 // - effort < 5 (speed_tier > kHare): flat quant field = q_numerator/distance
473 // - effort >= 5 (speed_tier <= kHare): adaptive via InitialQuantField
474 let (mut quant_field_float, masking) = if self.profile.use_adaptive_quant {
475 compute_quant_field_float(
476 &xyb_x,
477 &xyb_y,
478 &xyb_b,
479 padded_width,
480 padded_height,
481 xsize_blocks,
482 ysize_blocks,
483 distance_for_iqf,
484 self.profile.k_ac_quant,
485 )
486 } else {
487 // Flat quant field for low effort (matches libjxl enc_heuristics.cc:1105-1106)
488 let q = self.profile.initial_q_numerator / self.distance;
489 let flat_qf = vec![q; xsize_blocks * ysize_blocks];
490 let masking_val = 1.0 / (q + 0.001);
491 let flat_masking = vec![masking_val; xsize_blocks * ysize_blocks];
492 (flat_qf, flat_masking)
493 };
494
495 // Step 2: Compute distance params with effort-matched global_scale.
496 //
497 // Uses profile.initial_q_numerator for q = numerator / distance.
498 // The adaptive median/MAD formula is only used inside the butteraugli
499 // loop (effort >= 8).
500 let mut params = DistanceParams::compute_for_profile(self.distance, &self.profile);
501
502 // Apply pixel-level chromacity adjustments using pre-gaborish stats
503 // Gated at effort >= 7 (speed_tier <= kSquirrel) matching libjxl
504 if self.profile.chromacity_adjustment {
505 params.apply_chromacity_adjustment(chromacity_x, chromacity_b);
506 }
507
508 debug_rect!(
509 "enc/params",
510 0,
511 0,
512 width,
513 height,
514 "global_scale={} quant_dc={} scale={:.4} inv_scale={:.4} epf_iters={} chrom_x={:.3} chrom_b={:.3}",
515 params.global_scale,
516 params.quant_dc,
517 params.scale,
518 params.inv_scale,
519 params.epf_iters,
520 chromacity_x,
521 chromacity_b
522 );
523
524 // Step 3: Quantize float quant field to raw u8 with adaptive inv_scale
525 let mut quant_field = quantize_quant_field(&quant_field_float, params.inv_scale);
526
527 // Compute per-pixel mask on PRE-GABORISH image (matches libjxl:
528 // initial_quant_masking1x1 is computed in InitialQuantField before GaborishInverse)
529 let mask1x1 = if self.ac_strategy_enabled && self.pixel_domain_loss {
530 Some(compute_mask1x1(&xyb_y, padded_width, padded_height))
531 } else {
532 None
533 };
534
535 // Apply gaborish inverse (5x5 sharpening) AFTER quant field and mask1x1
536 // but BEFORE CfL and AC strategy. This matches libjxl enc_heuristics.cc:
537 // line 1124: InitialQuantField (pre-gaborish)
538 // line 1142: GaborishInverse
539 // line 1150-1174: CfL (post-gaborish)
540 // line 1179: AC strategy (post-gaborish)
541 if self.enable_gaborish {
542 gaborish_inverse(
543 &mut xyb_x,
544 &mut xyb_y,
545 &mut xyb_b,
546 padded_width,
547 padded_height,
548 );
549 }
550
551 // Float DC for LfFrame is now extracted from the transform pipeline
552 // (TransformOutput.float_dc) using dc_from_dct_NxN, which produces correct
553 // DC values for multi-block transforms (DCT16+). The old compute_float_dc
554 // used simple 8x8 pixel averages which diverge from dc_from_dct_NxN for
555 // blocks with spatial structure, causing catastrophic LfFrame quality for
556 // DCT16+ (up to 31% error on gradient content, butteraugli 13-20 vs ~2.5).
557
558 // Compute per-tile chroma-from-luma map on GABORISHED XYB
559 // Pass 1 always uses LS (use_newton=false): with distance_mul=1e-9, the
560 // perceptual cost function collapses to LS, so Newton adds no value.
561 // Newton is only useful in pass 2 where actual quant weighting matters.
562 let mut cfl_map = if self.cfl_enabled {
563 compute_cfl_map(
564 &xyb_x,
565 &xyb_y,
566 &xyb_b,
567 padded_width,
568 padded_height,
569 xsize_blocks,
570 ysize_blocks,
571 false,
572 self.profile.cfl_newton_eps,
573 self.profile.cfl_newton_max_iters,
574 )
575 } else {
576 CflMap::zeros(
577 div_ceil(xsize_blocks, TILE_DIM_IN_BLOCKS),
578 div_ceil(ysize_blocks, TILE_DIM_IN_BLOCKS),
579 )
580 };
581
582 debug_rect!(
583 "enc/config",
584 0,
585 0,
586 width,
587 height,
588 "d={:.2} gab={} cfl={} pixel_loss={} patches={} bfly_iters={} noise={} denoise={} ac_strat={} err_diff={}",
589 self.distance,
590 self.enable_gaborish,
591 self.cfl_enabled,
592 self.pixel_domain_loss,
593 self.enable_patches,
594 self.profile.butteraugli_iters,
595 self.enable_noise,
596 self.enable_denoise,
597 self.ac_strategy_enabled,
598 self.error_diffusion
599 );
600
601 // Compute adaptive AC strategy (DCT8/DCT16x8/DCT8x16/DCT16x16/DCT32x32)
602 #[allow(unused_mut)]
603 let mut ac_strategy = if let Some(forced) = self.force_strategy {
604 // Force a specific strategy for all blocks that fit
605 force_strategy_map(xsize_blocks, ysize_blocks, forced)
606 } else if !self.ac_strategy_enabled {
607 AcStrategyMap::new_dct8(xsize_blocks, ysize_blocks)
608 } else {
609 compute_ac_strategy(
610 &xyb_x,
611 &xyb_y,
612 &xyb_b,
613 padded_width,
614 padded_height,
615 xsize_blocks,
616 ysize_blocks,
617 self.distance,
618 &quant_field_float,
619 &masking,
620 &cfl_map,
621 mask1x1.as_deref(),
622 padded_width,
623 &self.profile,
624 )
625 };
626
627 // Debug: print strategy histogram if enabled
628 #[cfg(feature = "debug-ac-strategy")]
629 {
630 eprintln!(
631 "AC strategy mode: {}",
632 if mask1x1.is_some() {
633 "pixel-domain"
634 } else {
635 "coefficient-domain"
636 }
637 );
638 ac_strategy.print_histogram();
639 }
640
641 // Log AC strategy distribution
642 {
643 let mut counts = [0u32; 27];
644 for by in 0..ysize_blocks {
645 for bx in 0..xsize_blocks {
646 if ac_strategy.is_first(bx, by) {
647 let s = ac_strategy.raw_strategy(bx, by) as usize;
648 if s < counts.len() {
649 counts[s] += 1;
650 }
651 }
652 }
653 }
654 let total: u32 = counts.iter().sum();
655 // Format top strategies
656 // Names indexed by RAW_STRATEGY_* internal codes (NOT bitstream order)
657 let names = [
658 "DCT8", // 0 = RAW_STRATEGY_DCT8
659 "DCT16x8", // 1 = RAW_STRATEGY_DCT16X8
660 "DCT8x16", // 2 = RAW_STRATEGY_DCT8X16
661 "DCT16x16", // 3 = RAW_STRATEGY_DCT16X16
662 "DCT32x32", // 4 = RAW_STRATEGY_DCT32X32
663 "DCT4x8", // 5 = RAW_STRATEGY_DCT4X8
664 "DCT8x4", // 6 = RAW_STRATEGY_DCT8X4
665 "DCT4x4", // 7 = RAW_STRATEGY_DCT4X4
666 "IDENTITY", // 8 = RAW_STRATEGY_IDENTITY
667 "DCT2x2", // 9 = RAW_STRATEGY_DCT2X2
668 "DCT32x16", // 10 = RAW_STRATEGY_DCT32X16
669 "DCT16x32", // 11 = RAW_STRATEGY_DCT16X32
670 "AFV0", // 12 = RAW_STRATEGY_AFV0
671 "AFV1", // 13 = RAW_STRATEGY_AFV1
672 "AFV2", // 14 = RAW_STRATEGY_AFV2
673 "AFV3", // 15 = RAW_STRATEGY_AFV3
674 "DCT64x64", // 16 = RAW_STRATEGY_DCT64X64
675 "DCT64x32", // 17 = RAW_STRATEGY_DCT64X32
676 "DCT32x64", // 18 = RAW_STRATEGY_DCT32X64
677 ];
678 let mut parts = alloc::string::String::new();
679 for (i, &c) in counts.iter().enumerate() {
680 if c > 0 {
681 if !parts.is_empty() {
682 parts.push(' ');
683 }
684 let name = names.get(i).copied().unwrap_or("?");
685 let pct = c as f32 / total.max(1) as f32 * 100.0;
686 parts.push_str(&alloc::format!("{}={:.0}%", name, pct));
687 }
688 }
689 debug_rect!(
690 "enc/ac_strategy",
691 0,
692 0,
693 width,
694 height,
695 "total={} {}",
696 total,
697 parts
698 );
699 }
700
701 // Free masking — no longer needed after AC strategy selection.
702 drop(masking);
703
704 // Adjust quant field for multi-block transforms.
705 // At low distances uses max, at high distances blends toward mean for better quality.
706 // Adjust BOTH u8 and float fields (libjxl adjusts float before SetQuantField).
707 adjust_quant_field_with_distance(&ac_strategy, &mut quant_field, self.distance);
708 adjust_quant_field_float_with_distance(&ac_strategy, &mut quant_field_float, self.distance);
709
710 // Quantization loops: iteratively refine quant_field using perceptual
711 // distance feedback. Butteraugli and zensim loops can stack: butteraugli
712 // handles global convergence, zensim adds SSIM-aware spatial fine-tuning.
713 // Works in float quant field domain with per-iteration global_scale
714 // recomputation (matching libjxl FindBestQuantization).
715 #[cfg(feature = "butteraugli-loop")]
716 if self.butteraugli_iters > 0 {
717 let initial_qf_float = quant_field_float.clone();
718 params = self.butteraugli_refine_quant_field(
719 linear_rgb,
720 width,
721 height,
722 &xyb_x,
723 &xyb_y,
724 &xyb_b,
725 padded_width,
726 padded_height,
727 xsize_blocks,
728 ysize_blocks,
729 ¶ms,
730 &mut quant_field,
731 &mut quant_field_float,
732 &initial_qf_float,
733 &cfl_map,
734 &ac_strategy,
735 patches_data.as_ref(),
736 splines_data.as_ref(),
737 );
738 }
739
740 // SSIM2 quantization loop: alternative to butteraugli using SSIM2 + per-block RMSE.
741 #[cfg(feature = "ssim2-loop")]
742 if self.ssim2_iters > 0 {
743 let initial_qf_float = quant_field_float.clone();
744 params = self.ssim2_refine_quant_field(
745 linear_rgb,
746 width,
747 height,
748 &xyb_x,
749 &xyb_y,
750 &xyb_b,
751 padded_width,
752 padded_height,
753 xsize_blocks,
754 ysize_blocks,
755 ¶ms,
756 &mut quant_field,
757 &mut quant_field_float,
758 &initial_qf_float,
759 &cfl_map,
760 &ac_strategy,
761 patches_data.as_ref(),
762 splines_data.as_ref(),
763 );
764 }
765
766 // Zensim quantization loop: uses zensim psychovisual metric + per-pixel diffmap.
767 // Also refines AC strategy by splitting large transforms with high perceptual error.
768 #[cfg(feature = "zensim-loop")]
769 if self.zensim_iters > 0 {
770 let initial_qf_float = quant_field_float.clone();
771 params = self.zensim_refine_quant_field(
772 linear_rgb,
773 width,
774 height,
775 &xyb_x,
776 &xyb_y,
777 &xyb_b,
778 padded_width,
779 padded_height,
780 xsize_blocks,
781 ysize_blocks,
782 ¶ms,
783 &mut quant_field,
784 &mut quant_field_float,
785 &initial_qf_float,
786 &cfl_map,
787 &mut ac_strategy,
788 patches_data.as_ref(),
789 splines_data.as_ref(),
790 );
791 }
792
793 // Free float quant field — no longer needed after loop refinement.
794 drop(quant_field_float);
795
796 // Log quant field statistics after all adjustments
797 {
798 let qf = &quant_field;
799 let sum: u64 = qf.iter().map(|&v| v as u64).sum();
800 let avg = sum as f32 / qf.len() as f32;
801 let min = qf.iter().copied().min().unwrap_or(0);
802 let max = qf.iter().copied().max().unwrap_or(0);
803 debug_rect!(
804 "enc/quant_field",
805 0,
806 0,
807 width,
808 height,
809 "final avg={:.1} min={} max={} blocks={}",
810 avg,
811 min,
812 max,
813 qf.len()
814 );
815 }
816
817 // Dump AC strategy and quant field maps for comparison with libjxl.
818 // Set JXL_DUMP_MAPS=/tmp/prefix to enable. Maps are written as CSV.
819 #[cfg(feature = "debug-rect")]
820 if let Ok(prefix) = std::env::var("JXL_DUMP_MAPS") {
821 use std::io::Write;
822 // AC strategy map
823 if let Ok(mut f) = std::fs::File::create(format!("{prefix}_acs.csv")) {
824 for by in 0..ysize_blocks {
825 for bx in 0..xsize_blocks {
826 if bx > 0 {
827 let _ = write!(f, ",");
828 }
829 let _ = write!(f, "{}", ac_strategy.raw_strategy(bx, by));
830 }
831 let _ = writeln!(f);
832 }
833 eprintln!("DIAG: wrote {prefix}_acs.csv ({xsize_blocks}x{ysize_blocks})");
834 }
835 // Quant field map
836 if let Ok(mut f) = std::fs::File::create(format!("{prefix}_qf.csv")) {
837 for by in 0..ysize_blocks {
838 for bx in 0..xsize_blocks {
839 if bx > 0 {
840 let _ = write!(f, ",");
841 }
842 let _ = write!(f, "{}", quant_field[by * xsize_blocks + bx]);
843 }
844 let _ = writeln!(f);
845 }
846 eprintln!("DIAG: wrote {prefix}_qf.csv ({xsize_blocks}x{ysize_blocks})");
847 }
848 }
849
850 // CfL pass 2: recompute CfL map using actual AC strategies and per-block
851 // quantization weighting. Uses the same FindBestMultiplier as pass 1 but
852 // with strategy-specific DCTs and quant-weighted coefficients.
853 // Gated at effort >= 7 (speed_tier <= kSquirrel) matching libjxl.
854 if self.profile.cfl_two_pass && self.cfl_enabled {
855 super::chroma_from_luma::refine_cfl_map(
856 &mut cfl_map,
857 &xyb_x,
858 &xyb_y,
859 &xyb_b,
860 padded_width,
861 xsize_blocks,
862 ysize_blocks,
863 &ac_strategy,
864 &quant_field,
865 params.scale,
866 self.profile.cfl_newton,
867 self.profile.cfl_newton_eps,
868 self.profile.cfl_newton_max_iters,
869 );
870 }
871
872 // Perform DCT and quantization (XYB data is padded to block boundaries)
873 let transform_out = self.transform_and_quantize(
874 &xyb_x,
875 &xyb_y,
876 &xyb_b,
877 padded_width,
878 xsize_blocks,
879 ysize_blocks,
880 ¶ms,
881 &mut quant_field,
882 &cfl_map,
883 &ac_strategy,
884 );
885 let quant_dc = &transform_out.quant_dc;
886 let quant_ac = &transform_out.quant_ac;
887 let nzeros = &transform_out.nzeros;
888 let raw_nzeros = &transform_out.raw_nzeros;
889
890 // Compute per-block EPF sharpness map when EPF is active
891 // Dynamic sharpness gated at effort >= 6 (speed_tier <= kWombat) matching libjxl
892 let sharpness_map = if params.epf_iters > 0
893 && self.distance >= 0.5
894 && self.profile.epf_dynamic_sharpness
895 {
896 let mask_fallback;
897 let mask: &[f32] = match &mask1x1 {
898 Some(m) => m,
899 None => {
900 mask_fallback =
901 super::adaptive_quant::compute_mask1x1(&xyb_y, padded_width, padded_height);
902 &mask_fallback
903 }
904 };
905 Some(super::epf::compute_epf_sharpness(
906 [&xyb_x, &xyb_y, &xyb_b],
907 quant_dc,
908 quant_ac,
909 &quant_field,
910 mask,
911 ¶ms,
912 &cfl_map,
913 &ac_strategy,
914 self.enable_gaborish,
915 xsize_blocks,
916 ysize_blocks,
917 ))
918 } else {
919 None
920 };
921
922 // Free XYB planes — no longer needed after EPF sharpness computation.
923 // At 4K (6720×4480), this frees ~339 MB (3 channels × padded_pixels × f32).
924 drop(xyb_x);
925 drop(xyb_y);
926 drop(xyb_b);
927 // Free mask1x1 — up to ~115 MB at 4K (padded_pixels × f32).
928 drop(mask1x1);
929
930 // Two-pass mode: collect tokens, build optimal codes, write bitstream
931 if self.optimize_codes {
932 let strategy_counts = ac_strategy.strategy_histogram();
933 let data = self.encode_two_pass(
934 width,
935 height,
936 ¶ms,
937 xsize_blocks,
938 ysize_blocks,
939 xsize_groups,
940 ysize_groups,
941 xsize_dc_groups,
942 ysize_dc_groups,
943 num_groups,
944 num_dc_groups,
945 num_sections,
946 quant_dc,
947 quant_ac,
948 nzeros,
949 raw_nzeros,
950 &quant_field,
951 &cfl_map,
952 &ac_strategy,
953 &noise_params,
954 sharpness_map.as_deref(),
955 alpha,
956 patches_data.as_ref(),
957 splines_data.as_ref(),
958 if self.use_lf_frame {
959 Some(&transform_out.float_dc)
960 } else {
961 None
962 },
963 )?;
964 crate::debug_rect::flush("");
965 return Ok(VarDctOutput {
966 data,
967 strategy_counts,
968 });
969 }
970
971 // Get static entropy codes (wrapped in BuiltEntropyCode for uniform handling)
972 let dc_code = BuiltEntropyCode::StaticHuffman(get_dc_entropy_code());
973 let ac_code = BuiltEntropyCode::StaticHuffman(get_ac_entropy_code());
974
975 // Create main writer
976 let mut writer = BitWriter::with_capacity(width * height * 4);
977
978 // Write file header (includes JXL signature, ICC, and byte padding)
979 // Streaming path does not support alpha
980 self.write_file_header_and_pad(width, height, false, &mut writer)?;
981 #[cfg(feature = "debug-tokens")]
982 debug_log!(
983 "After file header: bit {} (byte {})",
984 writer.bits_written(),
985 writer.bits_written() / 8
986 );
987
988 // Write frame header
989 {
990 let mut fh = FrameHeader::lossy();
991 fh.x_qm_scale = params.x_qm_scale;
992 fh.b_qm_scale = params.b_qm_scale;
993 fh.epf_iters = params.epf_iters;
994 fh.gaborish = self.enable_gaborish;
995 if noise_params.is_some() {
996 fh.flags |= 0x01; // ENABLE_NOISE
997 }
998 // streaming path: no extra channels
999 fh.write(&mut writer)?;
1000 }
1001 #[cfg(feature = "debug-tokens")]
1002 debug_log!(
1003 "After frame header: bit {} (byte {})",
1004 writer.bits_written(),
1005 writer.bits_written() / 8
1006 );
1007
1008 // For single-group images, combine all sections at the bit level
1009 // (no byte padding between sections, only at the end)
1010 if num_sections == 4 {
1011 // Write sections to individual BitWriters (no padding)
1012 let block_ctx_map = super::ac_context::BlockCtxMap::default();
1013 let num_blocks = xsize_blocks * ysize_blocks;
1014 let mut dc_global = BitWriter::with_capacity(4096);
1015 self.write_dc_global(
1016 ¶ms,
1017 num_dc_groups,
1018 &dc_code,
1019 &noise_params,
1020 None,
1021 &block_ctx_map,
1022 None, // No learned tree in single-pass mode
1023 None, // No patches in streaming mode
1024 None, // No splines in streaming mode
1025 None, // No custom dc_quant in single-pass mode
1026 &mut dc_global,
1027 )?;
1028
1029 // Get borrowed Huffman codes for streaming token writing
1030 let dc_huffman = dc_code.as_huffman();
1031 let ac_huffman = ac_code.as_huffman();
1032
1033 let mut dc_group = BitWriter::with_capacity(num_blocks * 10);
1034 self.write_dc_group(
1035 0,
1036 quant_dc,
1037 xsize_blocks,
1038 ysize_blocks,
1039 xsize_dc_groups,
1040 &quant_field,
1041 &cfl_map,
1042 &ac_strategy,
1043 None, // no sharpness map in single-pass mode
1044 &dc_huffman,
1045 &mut dc_group,
1046 )?;
1047
1048 let mut ac_global = BitWriter::with_capacity(4096);
1049 self.write_ac_global(
1050 num_groups,
1051 core::slice::from_ref(&ac_code),
1052 0,
1053 None,
1054 &[None],
1055 &mut ac_global,
1056 )?;
1057
1058 let mut ac_group_writer = BitWriter::with_capacity(num_blocks * 100);
1059 self.write_ac_group(
1060 0,
1061 quant_ac,
1062 nzeros,
1063 raw_nzeros,
1064 xsize_blocks,
1065 ysize_blocks,
1066 xsize_groups,
1067 &quant_field,
1068 &ac_strategy,
1069 &block_ctx_map,
1070 &ac_huffman,
1071 &mut ac_group_writer,
1072 )?;
1073
1074 #[cfg(feature = "debug-tokens")]
1075 {
1076 debug_log!(
1077 "Section bit counts: DC_global={}, DC_group={}, AC_global={}, AC_group={}",
1078 dc_global.bits_written(),
1079 dc_group.bits_written(),
1080 ac_global.bits_written(),
1081 ac_group_writer.bits_written()
1082 );
1083 }
1084
1085 // Combine at bit level
1086 let mut combined = dc_global;
1087 #[cfg(feature = "debug-tokens")]
1088 debug_log!("After DC_global: {} bits", combined.bits_written());
1089 combined.append_unaligned(&dc_group)?;
1090 #[cfg(feature = "debug-tokens")]
1091 debug_log!("After DC_group: {} bits", combined.bits_written());
1092 combined.append_unaligned(&ac_global)?;
1093 #[cfg(feature = "debug-tokens")]
1094 debug_log!("After AC_global: {} bits", combined.bits_written());
1095 combined.append_unaligned(&ac_group_writer)?;
1096 #[cfg(feature = "debug-tokens")]
1097 debug_log!("After AC_group: {} bits", combined.bits_written());
1098 combined.zero_pad_to_byte();
1099 let combined_bytes = combined.finish();
1100
1101 #[cfg(feature = "debug-tokens")]
1102 {
1103 debug_log!("Combined section size: {} bytes", combined_bytes.len());
1104 debug_log!(
1105 "Before TOC: bit {} (byte {})",
1106 writer.bits_written(),
1107 writer.bits_written() / 8
1108 );
1109 }
1110 write_toc(&[combined_bytes.len()], &mut writer)?;
1111 #[cfg(feature = "debug-tokens")]
1112 debug_log!(
1113 "After TOC: bit {} (byte {})",
1114 writer.bits_written(),
1115 writer.bits_written() / 8
1116 );
1117 writer.append_bytes(&combined_bytes)?;
1118 } else {
1119 // Multi-group: use byte-aligned sections
1120 let mut sections: Vec<Vec<u8>> = Vec::with_capacity(num_sections);
1121 let dc_huffman = dc_code.as_huffman();
1122 let ac_huffman = ac_code.as_huffman();
1123
1124 // DC Global section
1125 let block_ctx_map = super::ac_context::BlockCtxMap::default();
1126 let mut dc_global = BitWriter::with_capacity(4096);
1127 self.write_dc_global(
1128 ¶ms,
1129 num_dc_groups,
1130 &dc_code,
1131 &noise_params,
1132 None,
1133 &block_ctx_map,
1134 None, // No learned tree in single-pass mode
1135 None, // No patches in streaming mode
1136 None, // No splines in streaming mode
1137 None, // No custom dc_quant in single-pass mode
1138 &mut dc_global,
1139 )?;
1140 dc_global.zero_pad_to_byte();
1141 sections.push(dc_global.finish());
1142
1143 // DC group sections
1144 let blocks_per_dc_group = (256 / 8) * (256 / 8); // 1024 blocks per DC group
1145 for dc_group_idx in 0..num_dc_groups {
1146 let mut dc_group = BitWriter::with_capacity(blocks_per_dc_group * 10);
1147 self.write_dc_group(
1148 dc_group_idx,
1149 quant_dc,
1150 xsize_blocks,
1151 ysize_blocks,
1152 xsize_dc_groups,
1153 &quant_field,
1154 &cfl_map,
1155 &ac_strategy,
1156 None, // no sharpness map in single-pass mode
1157 &dc_huffman,
1158 &mut dc_group,
1159 )?;
1160 dc_group.zero_pad_to_byte();
1161 sections.push(dc_group.finish());
1162 }
1163
1164 // AC Global section
1165 let mut ac_global = BitWriter::with_capacity(4096);
1166 self.write_ac_global(
1167 num_groups,
1168 core::slice::from_ref(&ac_code),
1169 0,
1170 None,
1171 &[None],
1172 &mut ac_global,
1173 )?;
1174 ac_global.zero_pad_to_byte();
1175 sections.push(ac_global.finish());
1176
1177 // AC group sections
1178 let blocks_per_ac_group = (256 / 8) * (256 / 8); // 1024 blocks per AC group
1179 for group_idx in 0..num_groups {
1180 let mut ac_group_writer = BitWriter::with_capacity(blocks_per_ac_group * 100);
1181 self.write_ac_group(
1182 group_idx,
1183 quant_ac,
1184 nzeros,
1185 raw_nzeros,
1186 xsize_blocks,
1187 ysize_blocks,
1188 xsize_groups,
1189 &quant_field,
1190 &ac_strategy,
1191 &block_ctx_map,
1192 &ac_huffman,
1193 &mut ac_group_writer,
1194 )?;
1195 ac_group_writer.zero_pad_to_byte();
1196 sections.push(ac_group_writer.finish());
1197 }
1198
1199 let section_sizes: Vec<usize> = sections.iter().map(|s| s.len()).collect();
1200 write_toc(§ion_sizes, &mut writer)?;
1201 for section in sections {
1202 writer.append_bytes(§ion)?;
1203 }
1204 }
1205
1206 let strategy_counts = ac_strategy.strategy_histogram();
1207 crate::debug_rect::flush("");
1208 Ok(VarDctOutput {
1209 data: writer.finish_with_padding(),
1210 strategy_counts,
1211 })
1212 }
1213
1214 /// Encode with iterative rate control for improved distance targeting.
1215 ///
1216 /// This method:
1217 /// 1. Computes precomputed state (XYB, CfL, masking, AC strategy) once
1218 /// 2. Loops: encode → decode → butteraugli → adjust quant field
1219 /// 3. Returns when converged (within 5% of target) or max iterations reached
1220 ///
1221 /// Typically converges in 2-4 iterations. Each iteration costs ~50% of a
1222 /// full encode since XYB conversion, CfL, masking, and AC strategy are reused.
1223 ///
1224 /// Returns the encoded bytes. Use `encode_with_rate_control_config` for
1225 /// iteration count and custom configuration.
1226 ///
1227 /// Requires the `rate-control` feature.
1228 #[cfg(feature = "rate-control")]
1229 pub fn encode_with_rate_control(
1230 &self,
1231 width: usize,
1232 height: usize,
1233 linear_rgb: &[f32],
1234 ) -> Result<Vec<u8>> {
1235 let config = super::rate_control::RateControlConfig::default();
1236 let (encoded, _iters) =
1237 self.encode_with_rate_control_config(width, height, linear_rgb, &config)?;
1238 Ok(encoded)
1239 }
1240
1241 /// Encode with iterative rate control and custom configuration.
1242 ///
1243 /// Returns `(encoded_bytes, iteration_count)`.
1244 ///
1245 /// Requires the `rate-control` feature.
1246 #[cfg(feature = "rate-control")]
1247 pub fn encode_with_rate_control_config(
1248 &self,
1249 width: usize,
1250 height: usize,
1251 linear_rgb: &[f32],
1252 config: &super::rate_control::RateControlConfig,
1253 ) -> Result<(Vec<u8>, usize)> {
1254 // Compute precomputed state
1255 let precomputed = super::precomputed::EncoderPrecomputed::compute(
1256 width,
1257 height,
1258 linear_rgb,
1259 self.distance,
1260 self.cfl_enabled,
1261 self.ac_strategy_enabled,
1262 self.pixel_domain_loss,
1263 self.enable_noise,
1264 self.enable_denoise,
1265 self.enable_gaborish,
1266 self.force_strategy,
1267 &self.profile,
1268 );
1269
1270 // Run rate control loop
1271 super::rate_control::encode_with_rate_control(self, &precomputed, config)
1272 }
1273
1274 /// Encode from precomputed state with a specific quant field.
1275 ///
1276 /// This is the core encoding function used by rate control iterations.
1277 /// It skips XYB conversion, CfL, masking, and AC strategy computation,
1278 /// using the values from `precomputed` instead.
1279 ///
1280 /// Requires the `rate-control` feature.
1281 #[cfg(feature = "rate-control")]
1282 pub fn encode_from_precomputed(
1283 &self,
1284 precomputed: &super::precomputed::EncoderPrecomputed,
1285 quant_field: &[u8],
1286 ) -> Result<Vec<u8>> {
1287 let width = precomputed.width;
1288 let height = precomputed.height;
1289 let xsize_blocks = precomputed.xsize_blocks;
1290 let ysize_blocks = precomputed.ysize_blocks;
1291 let padded_width = precomputed.padded_width;
1292
1293 // Calculate group dimensions
1294 let xsize_groups = div_ceil(width, GROUP_DIM);
1295 let ysize_groups = div_ceil(height, GROUP_DIM);
1296 let xsize_dc_groups = div_ceil(width, DC_GROUP_DIM);
1297 let ysize_dc_groups = div_ceil(height, DC_GROUP_DIM);
1298 let num_groups = xsize_groups * ysize_groups;
1299 let num_dc_groups = xsize_dc_groups * ysize_dc_groups;
1300 let num_sections = 2 + num_dc_groups + num_groups;
1301
1302 // Copy and adjust quant field for multi-block transforms
1303 let mut quant_field = quant_field.to_vec();
1304 adjust_quant_field_with_distance(&precomputed.ac_strategy, &mut quant_field, self.distance);
1305
1306 // Compute distance params from effort profile
1307 let mut params = DistanceParams::compute_for_profile(self.distance, &self.profile);
1308
1309 // Apply pixel-level chromacity adjustments using pre-gaborish stats
1310 if self.profile.chromacity_adjustment {
1311 params.apply_chromacity_adjustment(
1312 precomputed.chromacity_x_pixelized,
1313 precomputed.chromacity_b_pixelized,
1314 );
1315 }
1316
1317 // Perform DCT and quantization using precomputed XYB data
1318 let transform_out = self.transform_and_quantize(
1319 &precomputed.xyb_x,
1320 &precomputed.xyb_y,
1321 &precomputed.xyb_b,
1322 padded_width,
1323 xsize_blocks,
1324 ysize_blocks,
1325 ¶ms,
1326 &mut quant_field,
1327 &precomputed.cfl_map,
1328 &precomputed.ac_strategy,
1329 );
1330 let quant_dc = &transform_out.quant_dc;
1331 let quant_ac = &transform_out.quant_ac;
1332 let nzeros = &transform_out.nzeros;
1333 let raw_nzeros = &transform_out.raw_nzeros;
1334
1335 // Use two-pass mode for rate control (required for ANS)
1336 self.encode_two_pass(
1337 width,
1338 height,
1339 ¶ms,
1340 xsize_blocks,
1341 ysize_blocks,
1342 xsize_groups,
1343 ysize_groups,
1344 xsize_dc_groups,
1345 ysize_dc_groups,
1346 num_groups,
1347 num_dc_groups,
1348 num_sections,
1349 quant_dc,
1350 quant_ac,
1351 nzeros,
1352 raw_nzeros,
1353 &quant_field,
1354 &precomputed.cfl_map,
1355 &precomputed.ac_strategy,
1356 &precomputed.noise_params,
1357 None, // TODO: compute sharpness_map for rate control path
1358 None, // TODO: thread alpha through butteraugli path
1359 None, // patches
1360 None, // splines
1361 None, // float_dc
1362 )
1363 }
1364}
1365
1366#[cfg(test)]
1367mod tests {
1368 use super::*;
1369
1370 #[test]
1371 fn test_encoder_creation() {
1372 let encoder = VarDctEncoder::new(1.0);
1373 assert_eq!(encoder.distance, 1.0);
1374
1375 let encoder_default = VarDctEncoder::default();
1376 assert_eq!(encoder_default.distance, 1.0);
1377 }
1378
1379 #[test]
1380 fn test_encode_small_image() {
1381 let encoder = VarDctEncoder::new(1.0);
1382
1383 // Create a simple 8x8 red image
1384 let width = 8;
1385 let height = 8;
1386 let mut linear_rgb = vec![0.0f32; width * height * 3];
1387 for y in 0..height {
1388 for x in 0..width {
1389 let idx = (y * width + x) * 3;
1390 linear_rgb[idx] = 1.0; // R
1391 linear_rgb[idx + 1] = 0.0; // G
1392 linear_rgb[idx + 2] = 0.0; // B
1393 }
1394 }
1395
1396 // This should at least not panic - full encoding not yet implemented
1397 let result = encoder.encode(width, height, &linear_rgb, None);
1398 // For now, just check it produces some output
1399 assert!(result.is_ok());
1400 let output = result.unwrap();
1401 assert!(output.data.len() > 2);
1402 assert_eq!(output.data[0], 0xFF);
1403 assert_eq!(output.data[1], 0x0A);
1404 }
1405
1406 #[test]
1407 fn test_convert_to_xyb_padded() {
1408 let encoder = VarDctEncoder::new(1.0);
1409
1410 // Gray pixel (1x1 image -> padded to 8x8)
1411 let linear_rgb = vec![0.5, 0.5, 0.5];
1412 let (x, y, b) = encoder.convert_to_xyb_padded(1, 1, 8, 8, &linear_rgb);
1413
1414 // Padded to 8x8 = 64 pixels
1415 assert_eq!(x.len(), 64);
1416 assert_eq!(y.len(), 64);
1417 assert_eq!(b.len(), 64);
1418
1419 // Gray should have X ≈ 0 (equal L and M)
1420 assert!(x[0].abs() < 0.01, "X should be near zero for gray");
1421 assert!(y[0] > 0.0, "Y should be positive");
1422 assert!(b[0] > 0.0, "B should be positive");
1423
1424 // Edge replication: all padded pixels should match the corner
1425 for i in 0..64 {
1426 assert!((x[i] - x[0]).abs() < 1e-6, "All padded X should match");
1427 assert!((y[i] - y[0]).abs() < 1e-6, "All padded Y should match");
1428 assert!((b[i] - b[0]).abs() < 1e-6, "All padded B should match");
1429 }
1430 }
1431
1432 #[test]
1433 fn test_encode_16x16_red_image() {
1434 // Test a 16x16 pixel image (2x2 blocks) to compare with libjxl-tiny
1435 let encoder = VarDctEncoder::new(1.0);
1436
1437 let width = 16;
1438 let height = 16;
1439 let mut linear_rgb = vec![0.0f32; width * height * 3];
1440 for y in 0..height {
1441 for x in 0..width {
1442 let idx = (y * width + x) * 3;
1443 linear_rgb[idx] = 1.0; // R
1444 linear_rgb[idx + 1] = 0.0; // G
1445 linear_rgb[idx + 2] = 0.0; // B
1446 }
1447 }
1448
1449 let result = encoder.encode(width, height, &linear_rgb, None);
1450 assert!(result.is_ok());
1451 let output = result.unwrap();
1452
1453 eprintln!("Output file size: {} bytes", output.data.len());
1454 eprintln!(
1455 "First 32 bytes: {:02x?}",
1456 &output.data[..32.min(output.data.len())]
1457 );
1458
1459 // Write output to file for comparison
1460 std::fs::write(std::env::temp_dir().join("our_16x16.jxl"), &output.data).unwrap();
1461
1462 // libjxl-tiny produces:
1463 // DC_group: 106 bits (14 bytes)
1464 // Total combined: 1086 bytes
1465 // Total file: 1104 bytes
1466 //
1467 // Our encoder should match these sizes
1468
1469 // Check signature
1470 assert_eq!(output.data[0], 0xFF);
1471 assert_eq!(output.data[1], 0x0A);
1472 }
1473
1474 /// Compute a simple hash of a byte slice for output locking.
1475 fn hash_bytes(bytes: &[u8]) -> u64 {
1476 use std::hash::{Hash, Hasher};
1477 let mut hasher = std::collections::hash_map::DefaultHasher::new();
1478 bytes.hash(&mut hasher);
1479 hasher.finish()
1480 }
1481
1482 /// Hash-locked test for 8x8 gradient image.
1483 /// This test ensures the encoder output doesn't change unexpectedly.
1484 /// x86_64 only: FP rounding differs on other architectures and 32-bit.
1485 #[test]
1486 #[cfg(target_arch = "x86_64")]
1487 fn test_hash_lock_8x8_gradient() {
1488 let encoder = VarDctEncoder::new(1.0);
1489 let width = 8;
1490 let height = 8;
1491 let mut linear_rgb = vec![0.0f32; width * height * 3];
1492
1493 // Simple gradient: R increases with x, G with y
1494 for y in 0..height {
1495 for x in 0..width {
1496 let idx = (y * width + x) * 3;
1497 linear_rgb[idx] = x as f32 / 7.0; // R
1498 linear_rgb[idx + 1] = y as f32 / 7.0; // G
1499 linear_rgb[idx + 2] = 0.5; // B
1500 }
1501 }
1502
1503 let bytes = encoder
1504 .encode(width, height, &linear_rgb, None)
1505 .unwrap()
1506 .data;
1507 let hash = hash_bytes(&bytes);
1508
1509 // Lock the hash - if this changes, the encoding has changed
1510 // Updated: error_diffusion default changed from true to false
1511 const EXPECTED_HASH: u64 = 0x311e7f185fbbf3f1;
1512 assert_eq!(
1513 hash,
1514 EXPECTED_HASH,
1515 "8x8 gradient hash mismatch: got {:#x}, expected {:#x}. \
1516 Output size: {} bytes. If intentional, update EXPECTED_HASH.",
1517 hash,
1518 EXPECTED_HASH,
1519 bytes.len()
1520 );
1521 }
1522
1523 /// Hash-locked test for 16x16 solid color image.
1524 /// x86_64 only: FP rounding differs on other architectures and 32-bit.
1525 #[test]
1526 #[cfg(target_arch = "x86_64")]
1527 fn test_hash_lock_16x16_solid() {
1528 let encoder = VarDctEncoder::new(1.0);
1529 let width = 16;
1530 let height = 16;
1531 let linear_rgb = vec![0.3f32; width * height * 3]; // gray
1532
1533 let bytes = encoder
1534 .encode(width, height, &linear_rgb, None)
1535 .unwrap()
1536 .data;
1537 let hash = hash_bytes(&bytes);
1538
1539 // Updated: fix global_scale to use effort-matched fixed q (libjxl parity)
1540 const EXPECTED_HASH: u64 = 0x1fd8e75f15fd418c;
1541 assert_eq!(
1542 hash,
1543 EXPECTED_HASH,
1544 "16x16 solid hash mismatch: got {:#x}, expected {:#x}. \
1545 Output size: {} bytes. If intentional, update EXPECTED_HASH.",
1546 hash,
1547 EXPECTED_HASH,
1548 bytes.len()
1549 );
1550 }
1551
1552 /// Hash-locked test for 64x64 checkerboard pattern.
1553 /// x86_64 only: FP rounding differs on other architectures and 32-bit.
1554 #[test]
1555 #[cfg(target_arch = "x86_64")]
1556 fn test_hash_lock_64x64_checkerboard() {
1557 let encoder = VarDctEncoder::new(1.0);
1558 let width = 64;
1559 let height = 64;
1560 let mut linear_rgb = vec![0.0f32; width * height * 3];
1561
1562 // 8x8 checkerboard pattern
1563 for y in 0..height {
1564 for x in 0..width {
1565 let idx = (y * width + x) * 3;
1566 let checker = ((x / 8) + (y / 8)) % 2 == 0;
1567 let val = if checker { 0.8 } else { 0.2 };
1568 linear_rgb[idx] = val;
1569 linear_rgb[idx + 1] = val;
1570 linear_rgb[idx + 2] = val;
1571 }
1572 }
1573
1574 let bytes = encoder
1575 .encode(width, height, &linear_rgb, None)
1576 .unwrap()
1577 .data;
1578 let hash = hash_bytes(&bytes);
1579
1580 // Updated: fast_log2f replaces glibc log2 in ANS frequency optimization
1581 const EXPECTED_HASH: u64 = 0x777dbc66ef3d69a3;
1582 assert_eq!(
1583 hash,
1584 EXPECTED_HASH,
1585 "64x64 checkerboard hash mismatch: got {:#x}, expected {:#x}. \
1586 Output size: {} bytes. If intentional, update EXPECTED_HASH.",
1587 hash,
1588 EXPECTED_HASH,
1589 bytes.len()
1590 );
1591 }
1592
1593 /// Hash-locked test for non-power-of-two size (tests padding).
1594 /// x86_64 only: FP rounding differs on other architectures and 32-bit.
1595 #[test]
1596 #[cfg(target_arch = "x86_64")]
1597 fn test_hash_lock_13x17_noise() {
1598 let encoder = VarDctEncoder::new(1.0);
1599 let width = 13;
1600 let height = 17;
1601 let mut linear_rgb = vec![0.0f32; width * height * 3];
1602
1603 // Deterministic pseudo-random pattern
1604 let mut seed = 12345u64;
1605 for val in &mut linear_rgb {
1606 seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
1607 *val = ((seed >> 32) as f32) / (u32::MAX as f32);
1608 }
1609
1610 let bytes = encoder
1611 .encode(width, height, &linear_rgb, None)
1612 .unwrap()
1613 .data;
1614 let hash = hash_bytes(&bytes);
1615
1616 // Updated: error_diffusion default changed from true to false
1617 const EXPECTED_HASH: u64 = 0x0c54e44d071039db;
1618 assert_eq!(
1619 hash,
1620 EXPECTED_HASH,
1621 "13x17 noise hash mismatch: got {:#x}, expected {:#x}. \
1622 Output size: {} bytes. If intentional, update EXPECTED_HASH.",
1623 hash,
1624 EXPECTED_HASH,
1625 bytes.len()
1626 );
1627 }
1628
1629 /// Roundtrip quality test for non-8-aligned dimensions.
1630 ///
1631 /// Encodes a 100x75 gradient, decodes with jxl-oxide, and verifies:
1632 /// 1. Dimensions match
1633 /// 2. Output is a valid JXL file (correct signature, decodable)
1634 ///
1635 /// This catches stride mismatch bugs where padded XYB buffers have
1636 /// stride != width, which corrupts adaptive quant, CfL, and AC strategy.
1637 #[test]
1638 fn test_roundtrip_non_8_aligned() {
1639 for &(w, h) in &[(100, 75), (13, 17), (33, 49), (7, 9)] {
1640 let mut linear_rgb = vec![0.0f32; w * h * 3];
1641
1642 // Smooth gradient (linear RGB)
1643 for y in 0..h {
1644 for x in 0..w {
1645 let idx = (y * w + x) * 3;
1646 linear_rgb[idx] = x as f32 / w.max(1) as f32;
1647 linear_rgb[idx + 1] = y as f32 / h.max(1) as f32;
1648 linear_rgb[idx + 2] = 0.3;
1649 }
1650 }
1651
1652 let encoder = VarDctEncoder::new(1.0);
1653 let bytes = encoder
1654 .encode(w, h, &linear_rgb, None)
1655 .unwrap_or_else(|e| panic!("encode {}x{} failed: {}", w, h, e))
1656 .data;
1657
1658 // Verify JXL signature
1659 assert_eq!(bytes[0], 0xFF, "{}x{}: bad signature byte 0", w, h);
1660 assert_eq!(bytes[1], 0x0A, "{}x{}: bad signature byte 1", w, h);
1661
1662 // Decode with jxl-oxide and verify dimensions
1663 let image = jxl_oxide::JxlImage::builder()
1664 .read(std::io::Cursor::new(&bytes))
1665 .unwrap_or_else(|e| panic!("jxl-oxide decode {}x{} failed: {}", w, h, e));
1666 assert_eq!(
1667 image.width(),
1668 w as u32,
1669 "{}x{}: decoded width mismatch",
1670 w,
1671 h
1672 );
1673 assert_eq!(
1674 image.height(),
1675 h as u32,
1676 "{}x{}: decoded height mismatch",
1677 w,
1678 h
1679 );
1680
1681 // Render to verify pixel data is valid
1682 let render = image
1683 .render_frame(0)
1684 .unwrap_or_else(|e| panic!("jxl-oxide render {}x{} failed: {}", w, h, e));
1685 let _pixels = render.image_all_channels();
1686 }
1687 }
1688
1689 /// Test DC tree learning produces valid output.
1690 #[test]
1691 fn test_dc_tree_learning() {
1692 let width = 64;
1693 let height = 64;
1694
1695 // Create a gradient image
1696 let mut linear_rgb = vec![0.0f32; width * height * 3];
1697 for y in 0..height {
1698 for x in 0..width {
1699 let idx = (y * width + x) * 3;
1700 linear_rgb[idx] = x as f32 / width as f32;
1701 linear_rgb[idx + 1] = y as f32 / height as f32;
1702 linear_rgb[idx + 2] = 0.5;
1703 }
1704 }
1705
1706 // Encode WITHOUT DC tree learning (baseline) — use ANS
1707 let mut encoder_baseline = VarDctEncoder::new(1.0);
1708 encoder_baseline.dc_tree_learning = false;
1709 let bytes_baseline = encoder_baseline
1710 .encode(width, height, &linear_rgb, None)
1711 .expect("baseline encode failed")
1712 .data;
1713
1714 // Encode WITH DC tree learning — also use ANS
1715 let mut encoder_learned = VarDctEncoder::new(1.0);
1716 encoder_learned.dc_tree_learning = true;
1717 std::fs::write(
1718 std::env::temp_dir().join("dc_baseline_test.jxl"),
1719 &bytes_baseline,
1720 )
1721 .unwrap();
1722 let bytes_learned = encoder_learned
1723 .encode(width, height, &linear_rgb, None)
1724 .expect("learned encode failed")
1725 .data;
1726 std::fs::write(
1727 std::env::temp_dir().join("dc_learned_test.jxl"),
1728 &bytes_learned,
1729 )
1730 .unwrap();
1731
1732 eprintln!(
1733 "DC tree learning: baseline={} bytes, learned={} bytes (delta={:.2}%)",
1734 bytes_baseline.len(),
1735 bytes_learned.len(),
1736 (bytes_learned.len() as f64 / bytes_baseline.len() as f64 - 1.0) * 100.0
1737 );
1738
1739 // Verify both produce valid JXL signature
1740 assert_eq!(bytes_baseline[0], 0xFF);
1741 assert_eq!(bytes_baseline[1], 0x0A);
1742 assert_eq!(bytes_learned[0], 0xFF);
1743 assert_eq!(bytes_learned[1], 0x0A);
1744
1745 // Verify baseline decodes (sanity check)
1746 {
1747 let image = jxl_oxide::JxlImage::builder()
1748 .read(std::io::Cursor::new(&bytes_baseline))
1749 .expect("jxl-oxide parse of baseline failed");
1750 let render = image
1751 .render_frame(0)
1752 .expect("jxl-oxide render of baseline failed");
1753 let _pixels = render.image_all_channels();
1754 eprintln!("Baseline ANS decodes OK ({} bytes)", bytes_baseline.len());
1755 }
1756
1757 // Decode the learned version with jxl-oxide to verify it's valid
1758 let image = jxl_oxide::JxlImage::builder()
1759 .read(std::io::Cursor::new(&bytes_learned))
1760 .expect("jxl-oxide decode of learned version failed");
1761 assert_eq!(image.width(), width as u32);
1762 assert_eq!(image.height(), height as u32);
1763
1764 // Render to verify pixel data is valid
1765 let render = image
1766 .render_frame(0)
1767 .expect("jxl-oxide render of learned version failed");
1768 let _pixels = render.image_all_channels();
1769 eprintln!("Learned ANS decodes OK ({} bytes)", bytes_learned.len());
1770
1771 // Also verify with djxl
1772 std::fs::write(
1773 std::env::temp_dir().join("dc_learned_test.jxl"),
1774 &bytes_learned,
1775 )
1776 .unwrap();
1777 }
1778
1779 /// Test that the butteraugli quantization loop produces valid output.
1780 #[cfg(feature = "butteraugli-loop")]
1781 #[test]
1782 fn test_butteraugli_loop_basic() {
1783 // Create a 64x64 test image with some variation
1784 let width = 64;
1785 let height = 64;
1786 let mut linear_rgb = vec![0.0f32; width * height * 3];
1787 for y in 0..height {
1788 for x in 0..width {
1789 let idx = (y * width + x) * 3;
1790 let fx = x as f32 / width as f32;
1791 let fy = y as f32 / height as f32;
1792 linear_rgb[idx] = fx * 0.8; // R
1793 linear_rgb[idx + 1] = fy * 0.6; // G
1794 linear_rgb[idx + 2] = (1.0 - fx) * 0.4; // B
1795 }
1796 }
1797
1798 // Encode without butteraugli loop
1799 let mut encoder_baseline = VarDctEncoder::new(2.0);
1800 encoder_baseline.butteraugli_iters = 0;
1801 let bytes_baseline = encoder_baseline
1802 .encode(width, height, &linear_rgb, None)
1803 .expect("baseline encode failed")
1804 .data;
1805
1806 // Encode with 2 butteraugli loop iterations
1807 let mut encoder_loop = VarDctEncoder::new(2.0);
1808 encoder_loop.butteraugli_iters = 2;
1809 let bytes_loop = encoder_loop
1810 .encode(width, height, &linear_rgb, None)
1811 .expect("butteraugli loop encode failed")
1812 .data;
1813
1814 // Both should produce valid JXL
1815 assert_eq!(bytes_baseline[0], 0xFF);
1816 assert_eq!(bytes_baseline[1], 0x0A);
1817 assert_eq!(bytes_loop[0], 0xFF);
1818 assert_eq!(bytes_loop[1], 0x0A);
1819
1820 // File sizes should differ (butteraugli loop changes quant field)
1821 eprintln!(
1822 "Baseline: {} bytes, Butteraugli loop (2 iters): {} bytes",
1823 bytes_baseline.len(),
1824 bytes_loop.len()
1825 );
1826
1827 // Verify the butteraugli-loop output decodes correctly
1828 let image = jxl_oxide::JxlImage::builder()
1829 .read(std::io::Cursor::new(&bytes_loop))
1830 .expect("jxl-oxide decode of butteraugli loop output failed");
1831 assert_eq!(image.width(), width as u32);
1832 assert_eq!(image.height(), height as u32);
1833
1834 let render = image
1835 .render_frame(0)
1836 .expect("jxl-oxide render of butteraugli loop output failed");
1837 let _pixels = render.image_all_channels();
1838 eprintln!("Butteraugli loop output decodes OK");
1839 }
1840}