jpegli/
encode.rs

1//! JPEG encoder implementation.
2//!
3//! This module provides the main encoder interface for creating JPEG images.
4
5use crate::adaptive_quant::compute_aq_strength_map;
6use crate::alloc::{
7    checked_size_2d, try_alloc_filled, try_alloc_zeroed_f32, validate_dimensions,
8    DEFAULT_MAX_PIXELS,
9};
10use crate::color;
11use crate::consts::{
12    DCT_BLOCK_SIZE, DCT_SIZE, ICC_PROFILE_SIGNATURE, JPEG_NATURAL_ORDER, JPEG_ZIGZAG_ORDER,
13    MARKER_APP14, MARKER_APP2, MARKER_DHT, MARKER_DQT, MARKER_DRI, MARKER_EOI, MARKER_SOF0,
14    MARKER_SOF2, MARKER_SOI, MARKER_SOS, MAX_ICC_BYTES_PER_MARKER, XYB_ICC_PROFILE,
15};
16use crate::dct::forward_dct_8x8;
17use crate::entropy::{self, EntropyEncoder};
18use crate::error::{Error, Result};
19use crate::huffman::HuffmanEncodeTable;
20use crate::huffman_opt::{
21    FrequencyCounter, OptimizedHuffmanTables, OptimizedTable, ProgressiveTokenBuffer,
22};
23use crate::quant::{self, Quality, QuantTable, ZeroBiasParams};
24use crate::types::{ColorSpace, JpegMode, PixelFormat, Subsampling};
25use crate::xyb::srgb_to_scaled_xyb;
26
27/// Progressive scan parameters.
28#[derive(Debug, Clone)]
29struct ProgressiveScan {
30    /// Component indices in this scan (0=Y, 1=Cb, 2=Cr)
31    components: Vec<u8>,
32    /// Spectral selection start (0=DC, 1-63=AC)
33    ss: u8,
34    /// Spectral selection end (0-63)
35    se: u8,
36    /// Successive approximation high bit (previous pass)
37    ah: u8,
38    /// Successive approximation low bit (current pass)
39    al: u8,
40}
41
42/// Encoder configuration.
43#[derive(Debug, Clone)]
44pub struct EncoderConfig {
45    /// Image width
46    pub width: u32,
47    /// Image height
48    pub height: u32,
49    /// Input pixel format
50    pub pixel_format: PixelFormat,
51    /// Quality setting
52    pub quality: Quality,
53    /// Encoding mode
54    pub mode: JpegMode,
55    /// Chroma subsampling
56    pub subsampling: Subsampling,
57    /// Use XYB color space
58    pub use_xyb: bool,
59    /// Restart interval (0 = disabled)
60    pub restart_interval: u16,
61    /// Use optimized Huffman tables
62    pub optimize_huffman: bool,
63}
64
65impl Default for EncoderConfig {
66    fn default() -> Self {
67        Self {
68            width: 0,
69            height: 0,
70            pixel_format: PixelFormat::Rgb,
71            quality: Quality::default(),
72            mode: JpegMode::Baseline,
73            // Use 4:4:4 - this is what the encoder actually supports currently
74            subsampling: Subsampling::S444,
75            use_xyb: false,
76            restart_interval: 0,
77            // Match C++ jpegli default: optimize_coding = true
78            optimize_huffman: true,
79        }
80    }
81}
82
83/// JPEG encoder.
84pub struct Encoder {
85    config: EncoderConfig,
86}
87
88impl Encoder {
89    /// Creates a new encoder with default settings.
90    #[must_use]
91    pub fn new() -> Self {
92        Self {
93            config: EncoderConfig::default(),
94        }
95    }
96
97    /// Creates an encoder from configuration.
98    #[must_use]
99    pub fn from_config(config: EncoderConfig) -> Self {
100        Self { config }
101    }
102
103    /// Sets the image width.
104    #[must_use]
105    pub fn width(mut self, width: u32) -> Self {
106        self.config.width = width;
107        self
108    }
109
110    /// Sets the image height.
111    #[must_use]
112    pub fn height(mut self, height: u32) -> Self {
113        self.config.height = height;
114        self
115    }
116
117    /// Sets the pixel format.
118    #[must_use]
119    pub fn pixel_format(mut self, format: PixelFormat) -> Self {
120        self.config.pixel_format = format;
121        self
122    }
123
124    /// Sets the quality.
125    #[must_use]
126    pub fn quality(mut self, quality: Quality) -> Self {
127        self.config.quality = quality;
128        self
129    }
130
131    /// Sets the encoding mode.
132    #[must_use]
133    pub fn mode(mut self, mode: JpegMode) -> Self {
134        self.config.mode = mode;
135        self
136    }
137
138    /// Sets chroma subsampling.
139    #[must_use]
140    pub fn subsampling(mut self, subsampling: Subsampling) -> Self {
141        self.config.subsampling = subsampling;
142        self
143    }
144
145    /// Enables XYB-optimized encoding mode.
146    ///
147    /// XYB mode encodes images using the perceptually-optimized XYB color space
148    /// from JPEG XL. This provides better quality at the same file size compared
149    /// to standard YCbCr encoding.
150    ///
151    /// The implementation includes:
152    /// 1. Full sRGB → linear RGB → XYB color space conversion
153    /// 2. XYB value scaling for optimal quantization
154    /// 3. Embedded ICC profile for decoder color interpretation
155    /// 4. Blue channel subsampling (R:2×2, G:2×2, B:1×1)
156    /// 5. Separate XYB-optimized quant tables per component
157    ///
158    /// The ICC profile allows any ICC-aware decoder (including djpegli, ImageMagick,
159    /// and most image viewers) to correctly interpret the XYB values back to sRGB.
160    ///
161    /// Note: Without ICC profile support in the decoder, images will display with
162    /// incorrect colors. Use standard YCbCr mode for maximum compatibility.
163    #[must_use]
164    pub fn use_xyb(mut self, enable: bool) -> Self {
165        self.config.use_xyb = enable;
166        self
167    }
168
169    /// Sets the restart interval.
170    #[must_use]
171    pub fn restart_interval(mut self, interval: u16) -> Self {
172        self.config.restart_interval = interval;
173        self
174    }
175
176    /// Enables optimized Huffman tables.
177    #[must_use]
178    pub fn optimize_huffman(mut self, enable: bool) -> Self {
179        self.config.optimize_huffman = enable;
180        self
181    }
182
183    /// Validates the configuration.
184    fn validate(&self) -> Result<()> {
185        // Use validate_dimensions for comprehensive checks (zero, max dimension, max pixels)
186        validate_dimensions(self.config.width, self.config.height, DEFAULT_MAX_PIXELS)?;
187        Ok(())
188    }
189
190    /// Encodes the image data.
191    pub fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
192        self.validate()?;
193
194        // Calculate expected size with overflow checking
195        let expected_size =
196            checked_size_2d(self.config.width as usize, self.config.height as usize)?;
197        let expected_size =
198            checked_size_2d(expected_size, self.config.pixel_format.bytes_per_pixel())?;
199
200        if data.len() != expected_size {
201            return Err(Error::InvalidBufferSize {
202                expected: expected_size,
203                actual: data.len(),
204            });
205        }
206
207        // For now, implement baseline encoding only
208        match self.config.mode {
209            JpegMode::Baseline => self.encode_baseline(data),
210            JpegMode::Progressive => self.encode_progressive(data),
211            _ => Err(Error::UnsupportedFeature {
212                feature: "extended/lossless encoding",
213            }),
214        }
215    }
216
217    /// Encodes as baseline JPEG.
218    fn encode_baseline(&self, data: &[u8]) -> Result<Vec<u8>> {
219        let mut output = Vec::with_capacity(data.len() / 4);
220
221        if self.config.use_xyb {
222            self.encode_baseline_xyb(data, &mut output)
223        } else {
224            self.encode_baseline_ycbcr(data, &mut output)
225        }
226    }
227
228    /// Encodes using standard YCbCr color space.
229    fn encode_baseline_ycbcr(&self, data: &[u8], output: &mut Vec<u8>) -> Result<Vec<u8>> {
230        // Convert to YCbCr using f32 precision throughout (matches C++ jpegli)
231        let (y_plane, cb_plane, cr_plane) = self.convert_to_ycbcr_f32(data)?;
232
233        let width = self.config.width as usize;
234        let height = self.config.height as usize;
235
236        // Handle chroma subsampling
237        let (cb_plane_final, cr_plane_final, c_width, c_height) = match self.config.subsampling {
238            Subsampling::S420 => {
239                // 4:2:0: Downsample both Cb and Cr by 2x2
240                let cb_down = self.downsample_2x2_f32(&cb_plane, width, height)?;
241                let cr_down = self.downsample_2x2_f32(&cr_plane, width, height)?;
242                let c_w = (width + 1) / 2;
243                let c_h = (height + 1) / 2;
244                (cb_down, cr_down, c_w, c_h)
245            }
246            Subsampling::S422 => {
247                // 4:2:2: Downsample horizontally only
248                let cb_down = self.downsample_2x1_f32(&cb_plane, width, height)?;
249                let cr_down = self.downsample_2x1_f32(&cr_plane, width, height)?;
250                let c_w = (width + 1) / 2;
251                (cb_down, cr_down, c_w, height)
252            }
253            Subsampling::S440 => {
254                // 4:4:0: Downsample vertically only
255                let cb_down = self.downsample_1x2_f32(&cb_plane, width, height)?;
256                let cr_down = self.downsample_1x2_f32(&cr_plane, width, height)?;
257                let c_h = (height + 1) / 2;
258                (cb_down, cr_down, width, c_h)
259            }
260            Subsampling::S444 => {
261                // 4:4:4: No subsampling
262                (cb_plane, cr_plane, width, height)
263            }
264        };
265
266        // Generate quantization tables (3 separate tables like C++ cjpegli)
267        // Apply 4:2:0 quality compensation if using 4:2:0 subsampling
268        let is_420 = self.config.subsampling == Subsampling::S420;
269        let y_quant =
270            quant::generate_quant_table(self.config.quality, 0, ColorSpace::YCbCr, false, is_420);
271        let cb_quant =
272            quant::generate_quant_table(self.config.quality, 1, ColorSpace::YCbCr, false, is_420);
273        let cr_quant =
274            quant::generate_quant_table(self.config.quality, 2, ColorSpace::YCbCr, false, is_420);
275
276        // Quantize all blocks first (needed for both standard and optimized encoding)
277        let (y_blocks, cb_blocks, cr_blocks) = self.quantize_all_blocks_subsampled(
278            &y_plane,
279            width,
280            height,
281            &cb_plane_final,
282            &cr_plane_final,
283            c_width,
284            c_height,
285            &y_quant,
286            &cb_quant,
287            &cr_quant,
288        )?;
289        let is_color = self.config.pixel_format != PixelFormat::Gray;
290
291        // Write JPEG structure
292        self.write_header(output)?;
293        self.write_quant_tables(output, &y_quant, &cb_quant, &cr_quant)?;
294        self.write_frame_header(output)?;
295
296        // For optimized Huffman, build tables from block frequencies before writing DHT
297        let scan_data = if self.config.optimize_huffman {
298            let tables =
299                self.build_optimized_tables(&y_blocks, &cb_blocks, &cr_blocks, is_color)?;
300            self.write_huffman_tables_optimized(output, &tables)?;
301
302            if self.config.restart_interval > 0 {
303                self.write_restart_interval(output)?;
304            }
305            self.write_scan_header(output)?;
306
307            // Encode with optimized tables
308            self.encode_with_tables(&y_blocks, &cb_blocks, &cr_blocks, is_color, &tables)?
309        } else {
310            self.write_huffman_tables(output)?;
311
312            if self.config.restart_interval > 0 {
313                self.write_restart_interval(output)?;
314            }
315            self.write_scan_header(output)?;
316
317            // Encode with standard tables
318            self.encode_blocks_standard(&y_blocks, &cb_blocks, &cr_blocks, is_color)?
319        };
320
321        output.extend_from_slice(&scan_data);
322
323        // Write EOI
324        output.push(0xFF);
325        output.push(MARKER_EOI);
326
327        Ok(std::mem::take(output))
328    }
329
330    /// Encodes using XYB mode (perceptually optimized color space).
331    ///
332    /// XYB encoding pipeline:
333    /// 1. sRGB → linear RGB → XYB → scaled XYB (values in [0, 1])
334    /// 2. Multiply by 255 for JPEG sample range
335    /// 3. Level shift by subtracting 128 for DCT
336    fn encode_baseline_xyb(&self, data: &[u8], output: &mut Vec<u8>) -> Result<Vec<u8>> {
337        let width = self.config.width as usize;
338        let height = self.config.height as usize;
339
340        // Convert sRGB to scaled XYB (full color conversion pipeline)
341        let (x_plane, y_plane, b_plane) = self.convert_to_scaled_xyb(data)?;
342
343        // Downsample B channel (XYB subsamples B to 1/4 resolution)
344        let b_downsampled = self.downsample_2x2_f32(&b_plane, width, height)?;
345        let b_width = (width + 1) / 2;
346        let b_height = (height + 1) / 2;
347
348        // Generate XYB quantization tables (one per component)
349        // XYB mode doesn't use 4:2:0 quality compensation
350        let x_quant = quant::generate_quant_table(
351            self.config.quality,
352            0, // X component
353            ColorSpace::Rgb,
354            true,
355            false, // is_420
356        );
357        let y_quant = quant::generate_quant_table(
358            self.config.quality,
359            1, // Y component (luma-like)
360            ColorSpace::Rgb,
361            true,
362            false, // is_420
363        );
364        let b_quant = quant::generate_quant_table(
365            self.config.quality,
366            2, // B component
367            ColorSpace::Rgb,
368            true,
369            false, // is_420
370        );
371
372        // Write JPEG structure for XYB mode (no JFIF, just ICC profile)
373        self.write_header_xyb(output)?;
374        // Write APP14 Adobe marker for RGB colorspace (required by some decoders)
375        // See: https://github.com/google/jpegli/pull/135
376        self.write_app14_adobe(output, 0)?; // 0 = RGB (no transform)
377                                            // Write XYB ICC profile so decoders can interpret the colors correctly
378        self.write_icc_profile(output, &XYB_ICC_PROFILE)?;
379        self.write_quant_tables_xyb(output, &x_quant, &y_quant, &b_quant)?;
380        self.write_frame_header_xyb(output)?;
381
382        // For optimized Huffman, quantize all blocks first to collect frequencies
383        let scan_data = if self.config.optimize_huffman {
384            let (x_blocks, y_blocks, b_blocks) = self.quantize_all_blocks_xyb(
385                &x_plane,
386                &y_plane,
387                &b_downsampled,
388                width,
389                height,
390                b_width,
391                b_height,
392                &x_quant,
393                &y_quant,
394                &b_quant,
395            );
396            let (dc_table, ac_table) =
397                self.build_optimized_tables_xyb(&x_blocks, &y_blocks, &b_blocks)?;
398            self.write_huffman_tables_xyb_optimized(output, &dc_table, &ac_table);
399
400            if self.config.restart_interval > 0 {
401                self.write_restart_interval(output)?;
402            }
403            self.write_scan_header_xyb(output)?;
404
405            // Encode with optimized tables
406            self.encode_with_tables_xyb(&x_blocks, &y_blocks, &b_blocks, &dc_table, &ac_table)?
407        } else {
408            self.write_huffman_tables(output)?;
409
410            if self.config.restart_interval > 0 {
411                self.write_restart_interval(output)?;
412            }
413            self.write_scan_header_xyb(output)?;
414
415            // Encode with standard tables
416            self.encode_scan_xyb_float(
417                &x_plane,
418                &y_plane,
419                &b_downsampled,
420                width,
421                height,
422                b_width,
423                b_height,
424                &x_quant,
425                &y_quant,
426                &b_quant,
427            )?
428        };
429
430        output.extend_from_slice(&scan_data);
431
432        // Write EOI
433        output.push(0xFF);
434        output.push(MARKER_EOI);
435
436        Ok(std::mem::take(output))
437    }
438
439    /// Converts input data to scaled XYB planes.
440    ///
441    /// Performs the full conversion: sRGB u8 → linear RGB → XYB → scaled XYB
442    /// Output values are in [0, 1] range, ready to be scaled to [0, 255] for JPEG.
443    fn convert_to_scaled_xyb(&self, data: &[u8]) -> Result<(Vec<f32>, Vec<f32>, Vec<f32>)> {
444        let width = self.config.width as usize;
445        let height = self.config.height as usize;
446        let num_pixels = checked_size_2d(width, height)?;
447
448        let mut x_plane = try_alloc_zeroed_f32(num_pixels, "allocating XYB X plane")?;
449        let mut y_plane = try_alloc_zeroed_f32(num_pixels, "allocating XYB Y plane")?;
450        let mut b_plane = try_alloc_zeroed_f32(num_pixels, "allocating XYB B plane")?;
451
452        match self.config.pixel_format {
453            PixelFormat::Rgb => {
454                for i in 0..num_pixels {
455                    let (x, y, b) =
456                        srgb_to_scaled_xyb(data[i * 3], data[i * 3 + 1], data[i * 3 + 2]);
457                    x_plane[i] = x;
458                    y_plane[i] = y;
459                    b_plane[i] = b;
460                }
461            }
462            PixelFormat::Rgba => {
463                for i in 0..num_pixels {
464                    let (x, y, b) =
465                        srgb_to_scaled_xyb(data[i * 4], data[i * 4 + 1], data[i * 4 + 2]);
466                    x_plane[i] = x;
467                    y_plane[i] = y;
468                    b_plane[i] = b;
469                }
470            }
471            PixelFormat::Gray => {
472                // Grayscale: R=G=B
473                for i in 0..num_pixels {
474                    let (x, y, b) = srgb_to_scaled_xyb(data[i], data[i], data[i]);
475                    x_plane[i] = x;
476                    y_plane[i] = y;
477                    b_plane[i] = b;
478                }
479            }
480            PixelFormat::Bgr => {
481                for i in 0..num_pixels {
482                    let (x, y, b) =
483                        srgb_to_scaled_xyb(data[i * 3 + 2], data[i * 3 + 1], data[i * 3]);
484                    x_plane[i] = x;
485                    y_plane[i] = y;
486                    b_plane[i] = b;
487                }
488            }
489            PixelFormat::Bgra => {
490                for i in 0..num_pixels {
491                    let (x, y, b) =
492                        srgb_to_scaled_xyb(data[i * 4 + 2], data[i * 4 + 1], data[i * 4]);
493                    x_plane[i] = x;
494                    y_plane[i] = y;
495                    b_plane[i] = b;
496                }
497            }
498            PixelFormat::Cmyk => {
499                return Err(Error::UnsupportedFeature {
500                    feature: "CMYK with XYB mode",
501                });
502            }
503        }
504
505        Ok((x_plane, y_plane, b_plane))
506    }
507
508    /// Downsamples a float plane by 2x2 (box filter averaging).
509    fn downsample_2x2_f32(&self, plane: &[f32], width: usize, height: usize) -> Result<Vec<f32>> {
510        let new_width = (width + 1) / 2;
511        let new_height = (height + 1) / 2;
512        let result_size = checked_size_2d(new_width, new_height)?;
513        let mut result = try_alloc_zeroed_f32(result_size, "allocating downsampled plane")?;
514
515        for y in 0..new_height {
516            for x in 0..new_width {
517                let x0 = x * 2;
518                let y0 = y * 2;
519                let x1 = (x0 + 1).min(width - 1);
520                let y1 = (y0 + 1).min(height - 1);
521
522                let p00 = plane[y0 * width + x0];
523                let p10 = plane[y0 * width + x1];
524                let p01 = plane[y1 * width + x0];
525                let p11 = plane[y1 * width + x1];
526
527                result[y * new_width + x] = (p00 + p10 + p01 + p11) * 0.25;
528            }
529        }
530
531        Ok(result)
532    }
533
534    /// Downsamples a float plane by 2x1 (horizontal only, box filter averaging).
535    fn downsample_2x1_f32(&self, plane: &[f32], width: usize, height: usize) -> Result<Vec<f32>> {
536        let new_width = (width + 1) / 2;
537        let result_size = checked_size_2d(new_width, height)?;
538        let mut result = try_alloc_zeroed_f32(result_size, "allocating downsampled plane")?;
539
540        for y in 0..height {
541            for x in 0..new_width {
542                let x0 = x * 2;
543                let x1 = (x0 + 1).min(width - 1);
544
545                let p0 = plane[y * width + x0];
546                let p1 = plane[y * width + x1];
547
548                result[y * new_width + x] = (p0 + p1) * 0.5;
549            }
550        }
551
552        Ok(result)
553    }
554
555    /// Downsamples a float plane by 1x2 (vertical only, box filter averaging).
556    fn downsample_1x2_f32(&self, plane: &[f32], width: usize, height: usize) -> Result<Vec<f32>> {
557        let new_height = (height + 1) / 2;
558        let result_size = checked_size_2d(width, new_height)?;
559        let mut result = try_alloc_zeroed_f32(result_size, "allocating downsampled plane")?;
560
561        for y in 0..new_height {
562            for x in 0..width {
563                let y0 = y * 2;
564                let y1 = (y0 + 1).min(height - 1);
565
566                let p0 = plane[y0 * width + x];
567                let p1 = plane[y1 * width + x];
568
569                result[y * width + x] = (p0 + p1) * 0.5;
570            }
571        }
572
573        Ok(result)
574    }
575
576    /// Encodes as progressive JPEG (level 2, matching cjpegli default).
577    ///
578    /// Progressive level 2 uses the following scan script:
579    /// 1. DC first: Ss=0, Se=0, Ah=0, Al=0 (DC only, full precision)
580    /// 2. AC 1-2: Ss=1, Se=2, Ah=0, Al=0 (low AC, full precision)
581    /// 3. AC 3-63 first: Ss=3, Se=63, Ah=0, Al=2 (high AC, top bits)
582    /// 4. AC 3-63 refine: Ss=3, Se=63, Ah=2, Al=1 (bit 1 refinement)
583    /// 5. AC 3-63 refine: Ss=3, Se=63, Ah=1, Al=0 (bit 0 refinement)
584    fn encode_progressive(&self, data: &[u8]) -> Result<Vec<u8>> {
585        // Use tokenization-based approach when optimizing Huffman tables
586        if self.config.optimize_huffman {
587            return self.encode_progressive_optimized(data);
588        }
589
590        let mut output = Vec::with_capacity(data.len() / 4);
591
592        // Convert to YCbCr using f32 precision
593        let (y_plane, cb_plane, cr_plane) = self.convert_to_ycbcr_f32(data)?;
594
595        // Generate quantization tables (3 separate tables like C++ cjpegli)
596        // Progressive mode uses 4:4:4, so is_420 = false
597        let y_quant =
598            quant::generate_quant_table(self.config.quality, 0, ColorSpace::YCbCr, false, false);
599        let cb_quant =
600            quant::generate_quant_table(self.config.quality, 1, ColorSpace::YCbCr, false, false);
601        let cr_quant =
602            quant::generate_quant_table(self.config.quality, 2, ColorSpace::YCbCr, false, false);
603
604        // Quantize all blocks to get full-precision coefficients
605        let (y_blocks, cb_blocks, cr_blocks) = self.quantize_all_blocks(
606            &y_plane, &cb_plane, &cr_plane, &y_quant, &cb_quant, &cr_quant,
607        )?;
608        let is_color = self.config.pixel_format != PixelFormat::Gray;
609
610        // Write JPEG structure
611        self.write_header(&mut output)?;
612        self.write_quant_tables(&mut output, &y_quant, &cb_quant, &cr_quant)?;
613        self.write_frame_header(&mut output)?; // Uses SOF2 for progressive
614
615        // For non-optimized progressive, use standard Huffman tables
616        self.write_huffman_tables(&mut output)?;
617        let tables: Option<OptimizedHuffmanTables> = None;
618
619        if self.config.restart_interval > 0 {
620            self.write_restart_interval(&mut output)?;
621        }
622
623        // Define progressive scan script (level 2)
624        // For 4:4:4 (no subsampling), DC can be interleaved
625        let scans = self.get_progressive_scan_script(is_color);
626
627        // Encode each scan
628        for scan in &scans {
629            // Write SOS header for this scan
630            self.write_progressive_scan_header(&mut output, scan, is_color)?;
631
632            // Encode the scan data
633            let scan_data = self.encode_progressive_scan(
634                &y_blocks, &cb_blocks, &cr_blocks, scan, is_color, &tables,
635            )?;
636            output.extend_from_slice(&scan_data);
637        }
638
639        // Write EOI
640        output.push(0xFF);
641        output.push(MARKER_EOI);
642
643        Ok(output)
644    }
645
646    /// Encodes progressive JPEG with optimized Huffman tables using two-pass tokenization.
647    ///
648    /// This approach:
649    /// 1. Tokenizes all scans first to collect actual symbol usage
650    /// 2. Builds histograms from actual tokens (not estimated baseline statistics)
651    /// 3. Clusters similar histograms to minimize table overhead
652    /// 4. Generates optimal Huffman tables from clustered histograms
653    /// 5. Replays tokens with optimized tables
654    fn encode_progressive_optimized(&self, data: &[u8]) -> Result<Vec<u8>> {
655        let mut output = Vec::with_capacity(data.len() / 4);
656
657        // Convert to YCbCr using f32 precision
658        let (y_plane, cb_plane, cr_plane) = self.convert_to_ycbcr_f32(data)?;
659
660        // Generate quantization tables (3 separate tables like C++ cjpegli)
661        // Progressive mode uses 4:4:4, so is_420 = false
662        let y_quant =
663            quant::generate_quant_table(self.config.quality, 0, ColorSpace::YCbCr, false, false);
664        let cb_quant =
665            quant::generate_quant_table(self.config.quality, 1, ColorSpace::YCbCr, false, false);
666        let cr_quant =
667            quant::generate_quant_table(self.config.quality, 2, ColorSpace::YCbCr, false, false);
668
669        // Quantize all blocks to get full-precision coefficients
670        let (y_blocks, cb_blocks, cr_blocks) = self.quantize_all_blocks(
671            &y_plane, &cb_plane, &cr_plane, &y_quant, &cb_quant, &cr_quant,
672        )?;
673        let is_color = self.config.pixel_format != PixelFormat::Gray;
674        let num_components = if is_color { 3 } else { 1 };
675
676        // Define progressive scan script
677        let scans = self.get_progressive_scan_script(is_color);
678
679        // ========== PASS 1: TOKENIZATION ==========
680        // Tokenize all scans to collect symbol statistics
681        let mut token_buffer = ProgressiveTokenBuffer::new(num_components, scans.len());
682
683        for scan in scans.iter() {
684            // Calculate context for this scan
685            // Context determines which Huffman table histogram to use
686            let context = if scan.ss == 0 && scan.se == 0 {
687                // DC scan: use component index as context (0=Y, 1=Cb, 2=Cr)
688                scan.components[0]
689            } else {
690                // AC scan: use num_components + component_index as context
691                // This ensures Y always uses luma table, Cb/Cr use chroma table
692                // regardless of scan order (which varies with subsampling mode)
693                (num_components as u8) + scan.components[0]
694            };
695
696            if scan.ss == 0 && scan.se == 0 {
697                // DC scan
698                let blocks: Vec<&[[i16; DCT_BLOCK_SIZE]]> = scan
699                    .components
700                    .iter()
701                    .map(|&c| match c {
702                        0 => y_blocks.as_slice(),
703                        1 => cb_blocks.as_slice(),
704                        2 => cr_blocks.as_slice(),
705                        _ => &[][..],
706                    })
707                    .collect();
708                let component_indices: Vec<usize> =
709                    scan.components.iter().map(|&c| c as usize).collect();
710                token_buffer.tokenize_dc_scan(&blocks, &component_indices, scan.al, scan.ah);
711            } else if scan.ah == 0 {
712                // AC first scan
713                let blocks: &[[i16; DCT_BLOCK_SIZE]] = match scan.components[0] {
714                    0 => &y_blocks,
715                    1 => &cb_blocks,
716                    2 => &cr_blocks,
717                    _ => {
718                        return Err(Error::InternalError {
719                            reason: "Invalid component",
720                        })
721                    }
722                };
723                token_buffer.tokenize_ac_first_scan(blocks, context, scan.ss, scan.se, scan.al);
724            } else {
725                // AC refinement scan
726                let blocks: &[[i16; DCT_BLOCK_SIZE]] = match scan.components[0] {
727                    0 => &y_blocks,
728                    1 => &cb_blocks,
729                    2 => &cr_blocks,
730                    _ => {
731                        return Err(Error::InternalError {
732                            reason: "Invalid component",
733                        })
734                    }
735                };
736                token_buffer.tokenize_ac_refinement_scan(
737                    blocks, context, scan.ss, scan.se, scan.ah, scan.al,
738                );
739            }
740        }
741
742        // ========== GENERATE OPTIMIZED TABLES ==========
743        // Use explicit luma/chroma grouping to ensure table assignment matches
744        // what the replay code expects (luma=0, chroma=1)
745        let (num_dc_tables, tables) = token_buffer.generate_luma_chroma_tables(num_components)?;
746
747        // Convert to OptimizedHuffmanTables format for compatibility
748        let opt_tables =
749            self.build_progressive_huffman_tables(&tables, num_components, num_dc_tables)?;
750
751        // ========== WRITE JPEG STRUCTURE ==========
752        self.write_header(&mut output)?;
753        self.write_quant_tables(&mut output, &y_quant, &cb_quant, &cr_quant)?;
754        self.write_frame_header(&mut output)?; // Uses SOF2 for progressive
755
756        // Write optimized Huffman tables
757        self.write_huffman_tables_optimized(&mut output, &opt_tables)?;
758
759        if self.config.restart_interval > 0 {
760            self.write_restart_interval(&mut output)?;
761        }
762
763        // ========== PASS 2: REPLAY TOKENS ==========
764        // Encode each scan by replaying tokens with optimized tables
765        for (scan_idx, scan) in scans.iter().enumerate() {
766            // Write SOS header
767            self.write_progressive_scan_header(&mut output, scan, is_color)?;
768
769            // Replay tokens for this scan
770            let scan_data =
771                self.replay_progressive_scan(&token_buffer, scan_idx, scan, is_color, &opt_tables)?;
772            output.extend_from_slice(&scan_data);
773        }
774
775        // Write EOI
776        output.push(0xFF);
777        output.push(MARKER_EOI);
778
779        Ok(output)
780    }
781
782    /// Builds OptimizedHuffmanTables from the clustered tables.
783    fn build_progressive_huffman_tables(
784        &self,
785        tables: &[OptimizedTable],
786        num_components: usize,
787        num_dc_tables: usize,
788    ) -> Result<OptimizedHuffmanTables> {
789        // Tables are arranged: DC clusters first, then AC clusters
790        // num_dc_tables tells us where DC ends and AC begins
791
792        let dc_luma = tables.first().cloned().unwrap_or_else(|| {
793            // Create a minimal default table
794            let mut counter = FrequencyCounter::new();
795            counter.count(0);
796            counter.generate_table_with_dht().unwrap()
797        });
798
799        // DC chroma is the second DC table if it exists
800        let dc_chroma = if num_components > 1 && num_dc_tables > 1 {
801            tables.get(1).cloned().unwrap_or_else(|| dc_luma.clone())
802        } else {
803            dc_luma.clone()
804        };
805
806        // AC tables start after DC tables
807        let ac_luma = tables.get(num_dc_tables).cloned().unwrap_or_else(|| {
808            let mut counter = FrequencyCounter::new();
809            counter.count(0);
810            counter.generate_table_with_dht().unwrap()
811        });
812
813        // AC chroma is the second AC table if it exists
814        let ac_chroma = if num_components > 1 && tables.len() > num_dc_tables + 1 {
815            tables
816                .get(num_dc_tables + 1)
817                .cloned()
818                .unwrap_or_else(|| ac_luma.clone())
819        } else {
820            ac_luma.clone()
821        };
822
823        Ok(OptimizedHuffmanTables {
824            dc_luma,
825            ac_luma,
826            dc_chroma,
827            ac_chroma,
828        })
829    }
830
831    /// Replays tokens for a progressive scan with optimized tables.
832    fn replay_progressive_scan(
833        &self,
834        token_buffer: &ProgressiveTokenBuffer,
835        scan_idx: usize,
836        scan: &ProgressiveScan,
837        is_color: bool,
838        tables: &OptimizedHuffmanTables,
839    ) -> Result<Vec<u8>> {
840        let mut encoder = EntropyEncoder::new();
841
842        // Set up Huffman tables
843        encoder.set_dc_table(0, tables.dc_luma.table.clone());
844        encoder.set_ac_table(0, tables.ac_luma.table.clone());
845        if is_color {
846            encoder.set_dc_table(1, tables.dc_chroma.table.clone());
847            encoder.set_ac_table(1, tables.ac_chroma.table.clone());
848        }
849
850        if self.config.restart_interval > 0 {
851            encoder.set_restart_interval(self.config.restart_interval);
852        }
853
854        // Get scan info
855        let scan_info = token_buffer
856            .scan_info
857            .get(scan_idx)
858            .ok_or(Error::InternalError {
859                reason: "Scan info not found",
860            })?;
861
862        if scan.ss == 0 && scan.se == 0 {
863            // DC scan: replay DC tokens
864            let tokens = token_buffer.scan_tokens(scan_idx);
865            // Create context map for DC (component index -> table index)
866            let context_to_table: Vec<usize> = (0..4)
867                .map(|c| if is_color && c > 0 { 1 } else { 0 })
868                .collect();
869            encoder.write_dc_tokens(tokens, &context_to_table)?;
870        } else if scan.ah == 0 {
871            // AC first scan: replay AC tokens
872            let tokens = token_buffer.scan_tokens(scan_idx);
873            let table_idx = if is_color && scan.components[0] > 0 {
874                1
875            } else {
876                0
877            };
878            encoder.write_ac_first_tokens(tokens, table_idx)?;
879        } else {
880            // AC refinement scan: replay refinement tokens
881            let table_idx = if is_color && scan.components[0] > 0 {
882                1
883            } else {
884                0
885            };
886            encoder.write_ac_refinement_tokens(scan_info, table_idx)?;
887        }
888
889        Ok(encoder.finish())
890    }
891
892    /// Returns the progressive scan script for level 2.
893    fn get_progressive_scan_script(&self, is_color: bool) -> Vec<ProgressiveScan> {
894        let num_components = if is_color { 3 } else { 1 };
895        let mut scans = Vec::new();
896
897        // For 4:4:4 subsampling, DC can be interleaved
898        let dc_interleaved = matches!(self.config.subsampling, Subsampling::S444);
899
900        // DC first scan
901        if dc_interleaved && is_color {
902            // Interleaved DC for all components
903            scans.push(ProgressiveScan {
904                components: vec![0, 1, 2],
905                ss: 0,
906                se: 0,
907                ah: 0,
908                al: 0,
909            });
910        } else {
911            // Non-interleaved DC
912            for c in 0..num_components {
913                scans.push(ProgressiveScan {
914                    components: vec![c],
915                    ss: 0,
916                    se: 0,
917                    ah: 0,
918                    al: 0,
919                });
920            }
921        }
922
923        // AC scans are always non-interleaved
924        // Progressive Level 2 with successive approximation (matches C++ jpegli)
925        let use_refinement = true;
926
927        for c in 0..num_components {
928            if use_refinement {
929                // Level 2: with successive approximation
930                // AC 1-2: full precision (low frequency, most visible)
931                scans.push(ProgressiveScan {
932                    components: vec![c],
933                    ss: 1,
934                    se: 2,
935                    ah: 0,
936                    al: 0,
937                });
938
939                // AC 3-63 first pass: top bits only (Al=2 means bits 2+)
940                scans.push(ProgressiveScan {
941                    components: vec![c],
942                    ss: 3,
943                    se: 63,
944                    ah: 0,
945                    al: 2,
946                });
947
948                // AC 3-63 refinement: bit 1 (Ah=2, Al=1)
949                scans.push(ProgressiveScan {
950                    components: vec![c],
951                    ss: 3,
952                    se: 63,
953                    ah: 2,
954                    al: 1,
955                });
956
957                // AC 3-63 refinement: bit 0 (Ah=1, Al=0)
958                // TEMP: Skipping second refinement to test first refinement
959                // scans.push(ProgressiveScan {
960                //     components: vec![c],
961                //     ss: 3,
962                //     se: 63,
963                //     ah: 1,
964                //     al: 0,
965                // });
966            } else {
967                // Level 0: no successive approximation (simpler, works)
968                scans.push(ProgressiveScan {
969                    components: vec![c],
970                    ss: 1,
971                    se: 63,
972                    ah: 0,
973                    al: 0,
974                });
975            }
976        }
977
978        scans
979    }
980
981    /// Writes SOS header for a progressive scan.
982    fn write_progressive_scan_header(
983        &self,
984        output: &mut Vec<u8>,
985        scan: &ProgressiveScan,
986        is_color: bool,
987    ) -> Result<()> {
988        output.push(0xFF);
989        output.push(MARKER_SOS);
990
991        let num_components = scan.components.len() as u8;
992        let length = 6u16 + num_components as u16 * 2;
993        output.push((length >> 8) as u8);
994        output.push(length as u8);
995
996        output.push(num_components);
997
998        for &comp_idx in &scan.components {
999            // Component ID (1-based for YCbCr)
1000            let comp_id = comp_idx + 1;
1001            output.push(comp_id);
1002
1003            // DC/AC table selectors
1004            // For DC scans (ss=0): use DC table for the component
1005            // For AC scans (ss>0): use AC table for the component
1006            let table_selector = if is_color && comp_idx > 0 {
1007                0x11 // DC table 1, AC table 1 for chroma
1008            } else {
1009                0x00 // DC table 0, AC table 0 for luma
1010            };
1011            output.push(table_selector);
1012        }
1013
1014        output.push(scan.ss); // Spectral selection start
1015        output.push(scan.se); // Spectral selection end
1016        output.push((scan.ah << 4) | scan.al); // Successive approximation
1017
1018        Ok(())
1019    }
1020
1021    /// Encodes a single progressive scan.
1022    fn encode_progressive_scan(
1023        &self,
1024        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
1025        cb_blocks: &[[i16; DCT_BLOCK_SIZE]],
1026        cr_blocks: &[[i16; DCT_BLOCK_SIZE]],
1027        scan: &ProgressiveScan,
1028        is_color: bool,
1029        tables: &Option<OptimizedHuffmanTables>,
1030    ) -> Result<Vec<u8>> {
1031        let mut encoder = EntropyEncoder::new();
1032
1033        // Set up Huffman tables
1034        if let Some(ref opt_tables) = tables {
1035            encoder.set_dc_table(0, opt_tables.dc_luma.table.clone());
1036            encoder.set_ac_table(0, opt_tables.ac_luma.table.clone());
1037            if is_color {
1038                encoder.set_dc_table(1, opt_tables.dc_chroma.table.clone());
1039                encoder.set_ac_table(1, opt_tables.ac_chroma.table.clone());
1040            }
1041        } else {
1042            encoder.set_dc_table(0, HuffmanEncodeTable::std_dc_luminance());
1043            encoder.set_ac_table(0, HuffmanEncodeTable::std_ac_luminance());
1044            if is_color {
1045                encoder.set_dc_table(1, HuffmanEncodeTable::std_dc_chrominance());
1046                encoder.set_ac_table(1, HuffmanEncodeTable::std_ac_chrominance());
1047            }
1048        }
1049
1050        if self.config.restart_interval > 0 {
1051            encoder.set_restart_interval(self.config.restart_interval);
1052        }
1053
1054        let width = self.config.width as usize;
1055        let height = self.config.height as usize;
1056        let blocks_h = (width + DCT_SIZE - 1) / DCT_SIZE;
1057        let blocks_v = (height + DCT_SIZE - 1) / DCT_SIZE;
1058
1059        // Determine scan type and encode accordingly
1060        if scan.ss == 0 && scan.se == 0 {
1061            // DC scan (first or refinement)
1062            self.encode_dc_scan(
1063                &mut encoder,
1064                y_blocks,
1065                cb_blocks,
1066                cr_blocks,
1067                scan,
1068                blocks_h,
1069                blocks_v,
1070                is_color,
1071            )?;
1072        } else if scan.ah == 0 {
1073            // AC first scan
1074            self.encode_ac_first_scan(
1075                &mut encoder,
1076                y_blocks,
1077                cb_blocks,
1078                cr_blocks,
1079                scan,
1080                blocks_h,
1081                blocks_v,
1082                is_color,
1083            )?;
1084        } else {
1085            // AC refinement scan
1086            self.encode_ac_refine_scan(
1087                &mut encoder,
1088                y_blocks,
1089                cb_blocks,
1090                cr_blocks,
1091                scan,
1092                blocks_h,
1093                blocks_v,
1094                is_color,
1095            )?;
1096        }
1097
1098        Ok(encoder.finish())
1099    }
1100
1101    /// Encodes DC scan (first or refinement).
1102    fn encode_dc_scan(
1103        &self,
1104        encoder: &mut EntropyEncoder,
1105        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
1106        cb_blocks: &[[i16; DCT_BLOCK_SIZE]],
1107        cr_blocks: &[[i16; DCT_BLOCK_SIZE]],
1108        scan: &ProgressiveScan,
1109        blocks_h: usize,
1110        blocks_v: usize,
1111        is_color: bool,
1112    ) -> Result<()> {
1113        for by in 0..blocks_v {
1114            for bx in 0..blocks_h {
1115                let block_idx = by * blocks_h + bx;
1116
1117                for (comp_num, &comp_idx) in scan.components.iter().enumerate() {
1118                    let blocks: &[[i16; DCT_BLOCK_SIZE]] = match comp_idx {
1119                        0 => y_blocks,
1120                        1 => cb_blocks,
1121                        2 => cr_blocks,
1122                        _ => {
1123                            return Err(Error::InternalError {
1124                                reason: "Invalid component index",
1125                            })
1126                        }
1127                    };
1128
1129                    if block_idx >= blocks.len() {
1130                        continue;
1131                    }
1132
1133                    let dc = blocks[block_idx][0];
1134                    let table = if is_color && comp_idx > 0 { 1 } else { 0 };
1135
1136                    encoder.encode_dc_progressive(dc, comp_num, table, scan.al, scan.ah)?;
1137                }
1138            }
1139        }
1140
1141        Ok(())
1142    }
1143
1144    /// Encodes AC first scan (Ah=0, ss>0).
1145    fn encode_ac_first_scan(
1146        &self,
1147        encoder: &mut EntropyEncoder,
1148        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
1149        cb_blocks: &[[i16; DCT_BLOCK_SIZE]],
1150        cr_blocks: &[[i16; DCT_BLOCK_SIZE]],
1151        scan: &ProgressiveScan,
1152        blocks_h: usize,
1153        blocks_v: usize,
1154        is_color: bool,
1155    ) -> Result<()> {
1156        // AC first scan is always non-interleaved (single component)
1157        assert_eq!(scan.components.len(), 1);
1158        let comp_idx = scan.components[0];
1159
1160        let blocks: &[[i16; DCT_BLOCK_SIZE]] = match comp_idx {
1161            0 => y_blocks,
1162            1 => cb_blocks,
1163            2 => cr_blocks,
1164            _ => {
1165                return Err(Error::InternalError {
1166                    reason: "Invalid component index",
1167                })
1168            }
1169        };
1170
1171        let table_idx = if is_color && comp_idx > 0 { 1 } else { 0 };
1172
1173        let mut eob_run = 0u16;
1174
1175        for by in 0..blocks_v {
1176            for bx in 0..blocks_h {
1177                let block_idx = by * blocks_h + bx;
1178
1179                if block_idx >= blocks.len() {
1180                    continue;
1181                }
1182
1183                encoder.encode_ac_progressive_first(
1184                    &blocks[block_idx],
1185                    table_idx,
1186                    scan.ss,
1187                    scan.se,
1188                    scan.al,
1189                    &mut eob_run,
1190                )?;
1191            }
1192        }
1193
1194        // Flush remaining EOB run
1195        encoder.flush_eob_run(table_idx, eob_run)?;
1196
1197        Ok(())
1198    }
1199
1200    /// Encodes AC refinement scan (Ah>0, ss>0).
1201    fn encode_ac_refine_scan(
1202        &self,
1203        encoder: &mut EntropyEncoder,
1204        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
1205        cb_blocks: &[[i16; DCT_BLOCK_SIZE]],
1206        cr_blocks: &[[i16; DCT_BLOCK_SIZE]],
1207        scan: &ProgressiveScan,
1208        blocks_h: usize,
1209        blocks_v: usize,
1210        is_color: bool,
1211    ) -> Result<()> {
1212        // AC refinement scan is always non-interleaved
1213        assert_eq!(scan.components.len(), 1);
1214        let comp_idx = scan.components[0];
1215
1216        let blocks: &[[i16; DCT_BLOCK_SIZE]] = match comp_idx {
1217            0 => y_blocks,
1218            1 => cb_blocks,
1219            2 => cr_blocks,
1220            _ => {
1221                return Err(Error::InternalError {
1222                    reason: "Invalid component index",
1223                })
1224            }
1225        };
1226
1227        let table_idx = if is_color && comp_idx > 0 { 1 } else { 0 };
1228
1229        let mut eob_run = 0u16;
1230
1231        for by in 0..blocks_v {
1232            for bx in 0..blocks_h {
1233                let block_idx = by * blocks_h + bx;
1234
1235                if block_idx >= blocks.len() {
1236                    continue;
1237                }
1238
1239                encoder.encode_ac_progressive_refine(
1240                    &blocks[block_idx],
1241                    table_idx,
1242                    scan.ss,
1243                    scan.se,
1244                    scan.ah,
1245                    scan.al,
1246                    &mut eob_run,
1247                )?;
1248            }
1249        }
1250
1251        // Flush remaining EOB run
1252        encoder.flush_refine_eob(table_idx, eob_run)?;
1253
1254        Ok(())
1255    }
1256
1257    /// Converts input data to YCbCr planes (u8 version - legacy).
1258    #[allow(dead_code)]
1259    fn convert_to_ycbcr(&self, data: &[u8]) -> Result<(Vec<u8>, Vec<u8>, Vec<u8>)> {
1260        let width = self.config.width as usize;
1261        let height = self.config.height as usize;
1262        let num_pixels = checked_size_2d(width, height)?;
1263
1264        match self.config.pixel_format {
1265            PixelFormat::Gray => {
1266                let y = data.to_vec();
1267                let cb = try_alloc_filled(num_pixels, 128u8, "YCbCr Cb plane")?;
1268                let cr = try_alloc_filled(num_pixels, 128u8, "YCbCr Cr plane")?;
1269                Ok((y, cb, cr))
1270            }
1271            PixelFormat::Rgb => color::rgb_to_ycbcr_planes(data, width, height),
1272            PixelFormat::Rgba => {
1273                // Strip alpha and convert
1274                let rgb: Vec<u8> = data
1275                    .chunks(4)
1276                    .flat_map(|chunk| [chunk[0], chunk[1], chunk[2]])
1277                    .collect();
1278                color::rgb_to_ycbcr_planes(&rgb, width, height)
1279            }
1280            PixelFormat::Bgr => {
1281                let rgb: Vec<u8> = data
1282                    .chunks(3)
1283                    .flat_map(|chunk| [chunk[2], chunk[1], chunk[0]])
1284                    .collect();
1285                color::rgb_to_ycbcr_planes(&rgb, width, height)
1286            }
1287            PixelFormat::Bgra => {
1288                let rgb: Vec<u8> = data
1289                    .chunks(4)
1290                    .flat_map(|chunk| [chunk[2], chunk[1], chunk[0]])
1291                    .collect();
1292                color::rgb_to_ycbcr_planes(&rgb, width, height)
1293            }
1294            PixelFormat::Cmyk => Err(Error::UnsupportedFeature {
1295                feature: "CMYK encoding",
1296            }),
1297        }
1298    }
1299
1300    /// Converts input data to YCbCr planes using full f32 precision.
1301    /// This matches C++ jpegli which uses float throughout the pipeline.
1302    /// Output values are in [0, 255] range (not level-shifted).
1303    fn convert_to_ycbcr_f32(&self, data: &[u8]) -> Result<(Vec<f32>, Vec<f32>, Vec<f32>)> {
1304        let width = self.config.width as usize;
1305        let height = self.config.height as usize;
1306        let num_pixels = checked_size_2d(width, height)?;
1307
1308        let mut y_plane = try_alloc_zeroed_f32(num_pixels, "YCbCr Y plane f32")?;
1309        let mut cb_plane = try_alloc_zeroed_f32(num_pixels, "YCbCr Cb plane f32")?;
1310        let mut cr_plane = try_alloc_zeroed_f32(num_pixels, "YCbCr Cr plane f32")?;
1311
1312        match self.config.pixel_format {
1313            PixelFormat::Gray => {
1314                for i in 0..num_pixels {
1315                    y_plane[i] = data[i] as f32;
1316                    cb_plane[i] = 128.0;
1317                    cr_plane[i] = 128.0;
1318                }
1319            }
1320            PixelFormat::Rgb => {
1321                for i in 0..num_pixels {
1322                    let (y, cb, cr) = color::rgb_to_ycbcr_f32(
1323                        data[i * 3] as f32,
1324                        data[i * 3 + 1] as f32,
1325                        data[i * 3 + 2] as f32,
1326                    );
1327                    y_plane[i] = y;
1328                    cb_plane[i] = cb;
1329                    cr_plane[i] = cr;
1330                }
1331            }
1332            PixelFormat::Rgba => {
1333                for i in 0..num_pixels {
1334                    let (y, cb, cr) = color::rgb_to_ycbcr_f32(
1335                        data[i * 4] as f32,
1336                        data[i * 4 + 1] as f32,
1337                        data[i * 4 + 2] as f32,
1338                    );
1339                    y_plane[i] = y;
1340                    cb_plane[i] = cb;
1341                    cr_plane[i] = cr;
1342                }
1343            }
1344            PixelFormat::Bgr => {
1345                for i in 0..num_pixels {
1346                    let (y, cb, cr) = color::rgb_to_ycbcr_f32(
1347                        data[i * 3 + 2] as f32,
1348                        data[i * 3 + 1] as f32,
1349                        data[i * 3] as f32,
1350                    );
1351                    y_plane[i] = y;
1352                    cb_plane[i] = cb;
1353                    cr_plane[i] = cr;
1354                }
1355            }
1356            PixelFormat::Bgra => {
1357                for i in 0..num_pixels {
1358                    let (y, cb, cr) = color::rgb_to_ycbcr_f32(
1359                        data[i * 4 + 2] as f32,
1360                        data[i * 4 + 1] as f32,
1361                        data[i * 4] as f32,
1362                    );
1363                    y_plane[i] = y;
1364                    cb_plane[i] = cb;
1365                    cr_plane[i] = cr;
1366                }
1367            }
1368            PixelFormat::Cmyk => {
1369                return Err(Error::UnsupportedFeature {
1370                    feature: "CMYK encoding",
1371                });
1372            }
1373        }
1374
1375        Ok((y_plane, cb_plane, cr_plane))
1376    }
1377
1378    /// Writes the JPEG header (SOI only, no JFIF APP0).
1379    ///
1380    /// Note: C++ jpegli does not write JFIF APP0, so we skip it for parity.
1381    /// The JFIF marker is optional and many modern decoders don't require it.
1382    fn write_header(&self, output: &mut Vec<u8>) -> Result<()> {
1383        // SOI only - no JFIF marker for C++ parity
1384        output.push(0xFF);
1385        output.push(MARKER_SOI);
1386        Ok(())
1387    }
1388
1389    /// Writes the JPEG header for XYB mode (SOI only, no JFIF).
1390    ///
1391    /// XYB mode uses RGB component IDs and an ICC profile for color interpretation.
1392    /// JFIF APP0 is not appropriate because it implies YCbCr colorspace.
1393    fn write_header_xyb(&self, output: &mut Vec<u8>) -> Result<()> {
1394        // SOI only - no JFIF marker for XYB mode
1395        output.push(0xFF);
1396        output.push(MARKER_SOI);
1397        Ok(())
1398    }
1399
1400    /// Writes an APP14 Adobe marker for RGB/CMYK/YCCK colorspaces.
1401    ///
1402    /// The APP14 marker is required by some decoders to properly interpret
1403    /// RGB (including XYB), CMYK, and YCCK colorspaces.
1404    ///
1405    /// See: https://github.com/google/jpegli/pull/135
1406    ///
1407    /// # Arguments
1408    /// * `transform` - Color transform type:
1409    ///   - 0 = RGB or CMYK (no transform)
1410    ///   - 1 = YCbCr
1411    ///   - 2 = YCCK
1412    fn write_app14_adobe(&self, output: &mut Vec<u8>, transform: u8) -> Result<()> {
1413        output.push(0xFF);
1414        output.push(MARKER_APP14);
1415        output.extend_from_slice(&[
1416            0x00, 0x0E, // Length: 14 bytes (includes length field)
1417            b'A', b'd', b'o', b'b', b'e', // Signature
1418            0x00, 0x64, // DCTEncodeVersion (100)
1419            0x00, 0x00, // APP14Flags0
1420            0x00, 0x00,      // APP14Flags1
1421            transform, // Color transform
1422        ]);
1423        Ok(())
1424    }
1425
1426    /// Writes an ICC profile to the JPEG output.
1427    ///
1428    /// ICC profiles are stored in APP2 marker segments with the signature "ICC_PROFILE\0".
1429    /// Large profiles are split into multiple segments (max ~65519 bytes per segment).
1430    fn write_icc_profile(&self, output: &mut Vec<u8>, icc_data: &[u8]) -> Result<()> {
1431        if icc_data.is_empty() {
1432            return Ok(());
1433        }
1434
1435        // Calculate number of chunks needed
1436        let num_chunks = (icc_data.len() + MAX_ICC_BYTES_PER_MARKER - 1) / MAX_ICC_BYTES_PER_MARKER;
1437
1438        let mut offset = 0;
1439        for chunk_num in 0..num_chunks {
1440            let chunk_size = (icc_data.len() - offset).min(MAX_ICC_BYTES_PER_MARKER);
1441
1442            // APP2 marker
1443            output.push(0xFF);
1444            output.push(MARKER_APP2);
1445
1446            // Length: 2 (length field) + 12 (signature) + 2 (chunk info) + data
1447            let segment_length = 2 + 12 + 2 + chunk_size;
1448            output.push((segment_length >> 8) as u8);
1449            output.push(segment_length as u8);
1450
1451            // ICC_PROFILE signature
1452            output.extend_from_slice(&ICC_PROFILE_SIGNATURE);
1453
1454            // Chunk number (1-based) and total chunks
1455            output.push((chunk_num + 1) as u8);
1456            output.push(num_chunks as u8);
1457
1458            // ICC data chunk
1459            output.extend_from_slice(&icc_data[offset..offset + chunk_size]);
1460
1461            offset += chunk_size;
1462        }
1463
1464        Ok(())
1465    }
1466
1467    /// Writes quantization tables (3 separate tables for Y, Cb, Cr).
1468    /// This matches C++ jpegli behavior with add_two_chroma_tables=true.
1469    fn write_quant_tables(
1470        &self,
1471        output: &mut Vec<u8>,
1472        y_quant: &QuantTable,
1473        cb_quant: &QuantTable,
1474        cr_quant: &QuantTable,
1475    ) -> Result<()> {
1476        // Write all 3 tables in one DQT segment
1477        // Length = 2 + 3 * (1 + 64) = 197 bytes
1478        output.push(0xFF);
1479        output.push(MARKER_DQT);
1480        output.push(0x00);
1481        output.push(0xC5); // Length: 197 bytes
1482
1483        // Table 0 (Y) - values must be written in zigzag order
1484        output.push(0x00); // 8-bit precision, table 0
1485        for i in 0..DCT_BLOCK_SIZE {
1486            output.push(y_quant.values[JPEG_NATURAL_ORDER[i] as usize] as u8);
1487        }
1488
1489        // Table 1 (Cb)
1490        output.push(0x01); // 8-bit precision, table 1
1491        for i in 0..DCT_BLOCK_SIZE {
1492            output.push(cb_quant.values[JPEG_NATURAL_ORDER[i] as usize] as u8);
1493        }
1494
1495        // Table 2 (Cr)
1496        output.push(0x02); // 8-bit precision, table 2
1497        for i in 0..DCT_BLOCK_SIZE {
1498            output.push(cr_quant.values[JPEG_NATURAL_ORDER[i] as usize] as u8);
1499        }
1500
1501        Ok(())
1502    }
1503
1504    /// Writes quantization tables for XYB mode (3 separate tables).
1505    fn write_quant_tables_xyb(
1506        &self,
1507        output: &mut Vec<u8>,
1508        r_quant: &QuantTable,
1509        g_quant: &QuantTable,
1510        b_quant: &QuantTable,
1511    ) -> Result<()> {
1512        // Write all 3 tables in one DQT segment
1513        // Length = 2 + 3 * (1 + 64) = 197 bytes
1514        output.push(0xFF);
1515        output.push(MARKER_DQT);
1516        output.push(0x00);
1517        output.push(0xC5); // Length: 197 bytes
1518
1519        // Table 0 (Red)
1520        output.push(0x00); // 8-bit precision, table 0
1521        for i in 0..DCT_BLOCK_SIZE {
1522            output.push(r_quant.values[JPEG_NATURAL_ORDER[i] as usize] as u8);
1523        }
1524
1525        // Table 1 (Green)
1526        output.push(0x01); // 8-bit precision, table 1
1527        for i in 0..DCT_BLOCK_SIZE {
1528            output.push(g_quant.values[JPEG_NATURAL_ORDER[i] as usize] as u8);
1529        }
1530
1531        // Table 2 (Blue)
1532        output.push(0x02); // 8-bit precision, table 2
1533        for i in 0..DCT_BLOCK_SIZE {
1534            output.push(b_quant.values[JPEG_NATURAL_ORDER[i] as usize] as u8);
1535        }
1536
1537        Ok(())
1538    }
1539
1540    /// Writes the frame header (SOF0).
1541    fn write_frame_header(&self, output: &mut Vec<u8>) -> Result<()> {
1542        let marker = if self.config.mode == JpegMode::Progressive {
1543            MARKER_SOF2
1544        } else {
1545            MARKER_SOF0
1546        };
1547
1548        output.push(0xFF);
1549        output.push(marker);
1550
1551        let num_components = if self.config.pixel_format == PixelFormat::Gray {
1552            1u8
1553        } else {
1554            3u8
1555        };
1556
1557        let length = 8u16 + num_components as u16 * 3;
1558        output.push((length >> 8) as u8);
1559        output.push(length as u8);
1560
1561        output.push(8); // Sample precision
1562        output.push((self.config.height >> 8) as u8);
1563        output.push(self.config.height as u8);
1564        output.push((self.config.width >> 8) as u8);
1565        output.push(self.config.width as u8);
1566        output.push(num_components);
1567
1568        if num_components == 1 {
1569            // Grayscale
1570            output.push(1); // Component ID
1571            output.push(0x11); // 1x1 sampling
1572            output.push(0); // Quant table 0
1573        } else {
1574            // Y component
1575            let (h_samp, v_samp) = match self.config.subsampling {
1576                Subsampling::S444 => (1, 1),
1577                Subsampling::S422 => (2, 1),
1578                Subsampling::S420 => (2, 2),
1579                Subsampling::S440 => (1, 2),
1580            };
1581
1582            output.push(1); // Component ID = 1 (Y)
1583            output.push((h_samp << 4) | v_samp);
1584            output.push(0); // Quant table 0
1585
1586            output.push(2); // Component ID = 2 (Cb)
1587            output.push(0x11); // 1x1 sampling
1588            output.push(1); // Quant table 1
1589
1590            output.push(3); // Component ID = 3 (Cr)
1591            output.push(0x11); // 1x1 sampling
1592            output.push(2); // Quant table 2 (separate Cr table like C++ cjpegli)
1593        }
1594
1595        Ok(())
1596    }
1597
1598    /// Writes the frame header for XYB mode (RGB with B subsampling).
1599    fn write_frame_header_xyb(&self, output: &mut Vec<u8>) -> Result<()> {
1600        output.push(0xFF);
1601        output.push(MARKER_SOF0); // Baseline DCT
1602
1603        // 3 components: R, G, B
1604        let length = 8u16 + 3 * 3; // 17 bytes
1605        output.push((length >> 8) as u8);
1606        output.push(length as u8);
1607
1608        output.push(8); // Sample precision
1609        output.push((self.config.height >> 8) as u8);
1610        output.push(self.config.height as u8);
1611        output.push((self.config.width >> 8) as u8);
1612        output.push(self.config.width as u8);
1613        output.push(3); // Number of components
1614
1615        // XYB sampling: R:2×2, G:2×2, B:1×1
1616        // This means R and G are full resolution, B is 1/4 resolution
1617        output.push(b'R'); // Component ID = 'R' (82)
1618        output.push(0x22); // 2x2 sampling
1619        output.push(0); // Quant table 0
1620
1621        output.push(b'G'); // Component ID = 'G' (71)
1622        output.push(0x22); // 2x2 sampling
1623        output.push(1); // Quant table 1
1624
1625        output.push(b'B'); // Component ID = 'B' (66)
1626        output.push(0x11); // 1x1 sampling (subsampled)
1627        output.push(2); // Quant table 2
1628
1629        Ok(())
1630    }
1631
1632    /// Writes standard Huffman tables in a single DHT segment.
1633    fn write_huffman_tables(&self, output: &mut Vec<u8>) -> Result<()> {
1634        use crate::huffman::{
1635            STD_AC_CHROMINANCE_BITS, STD_AC_CHROMINANCE_VALUES, STD_AC_LUMINANCE_BITS,
1636            STD_AC_LUMINANCE_VALUES, STD_DC_CHROMINANCE_BITS, STD_DC_CHROMINANCE_VALUES,
1637            STD_DC_LUMINANCE_BITS, STD_DC_LUMINANCE_VALUES,
1638        };
1639
1640        // Write all 4 Huffman tables in a single DHT segment (like C++ jpegli)
1641        output.push(0xFF);
1642        output.push(MARKER_DHT);
1643
1644        // Calculate total length
1645        let total_len = 2
1646            + (1 + 16 + STD_DC_LUMINANCE_VALUES.len())
1647            + (1 + 16 + STD_AC_LUMINANCE_VALUES.len())
1648            + (1 + 16 + STD_DC_CHROMINANCE_VALUES.len())
1649            + (1 + 16 + STD_AC_CHROMINANCE_VALUES.len());
1650
1651        output.push((total_len >> 8) as u8);
1652        output.push(total_len as u8);
1653
1654        // DC luminance (class 0, id 0)
1655        output.push(0x00);
1656        output.extend_from_slice(&STD_DC_LUMINANCE_BITS);
1657        output.extend_from_slice(&STD_DC_LUMINANCE_VALUES);
1658
1659        // AC luminance (class 1, id 0)
1660        output.push(0x10);
1661        output.extend_from_slice(&STD_AC_LUMINANCE_BITS);
1662        output.extend_from_slice(&STD_AC_LUMINANCE_VALUES);
1663
1664        // DC chrominance (class 0, id 1)
1665        output.push(0x01);
1666        output.extend_from_slice(&STD_DC_CHROMINANCE_BITS);
1667        output.extend_from_slice(&STD_DC_CHROMINANCE_VALUES);
1668
1669        // AC chrominance (class 1, id 1)
1670        output.push(0x11);
1671        output.extend_from_slice(&STD_AC_CHROMINANCE_BITS);
1672        output.extend_from_slice(&STD_AC_CHROMINANCE_VALUES);
1673
1674        Ok(())
1675    }
1676
1677    /// Writes optimized Huffman tables.
1678    ///
1679    /// This is used when `optimize_huffman` is enabled to write the
1680    /// image-specific optimized tables to the DHT markers.
1681    fn write_huffman_tables_optimized(
1682        &self,
1683        output: &mut Vec<u8>,
1684        tables: &OptimizedHuffmanTables,
1685    ) -> Result<()> {
1686        // Write all 4 Huffman tables in a single DHT segment (like C++ jpegli)
1687        // This saves 12 bytes compared to 4 separate segments
1688        output.push(0xFF);
1689        output.push(MARKER_DHT);
1690
1691        // Calculate total length: 2 (length field) + 4 tables × (1 + 16 + values.len())
1692        let total_len = 2
1693            + (1 + 16 + tables.dc_luma.values.len())
1694            + (1 + 16 + tables.ac_luma.values.len())
1695            + (1 + 16 + tables.dc_chroma.values.len())
1696            + (1 + 16 + tables.ac_chroma.values.len());
1697
1698        output.push((total_len >> 8) as u8);
1699        output.push(total_len as u8);
1700
1701        // DC luminance (class 0, id 0)
1702        output.push(0x00);
1703        output.extend_from_slice(&tables.dc_luma.bits);
1704        output.extend_from_slice(&tables.dc_luma.values);
1705
1706        // AC luminance (class 1, id 0)
1707        output.push(0x10);
1708        output.extend_from_slice(&tables.ac_luma.bits);
1709        output.extend_from_slice(&tables.ac_luma.values);
1710
1711        // DC chrominance (class 0, id 1)
1712        output.push(0x01);
1713        output.extend_from_slice(&tables.dc_chroma.bits);
1714        output.extend_from_slice(&tables.dc_chroma.values);
1715
1716        // AC chrominance (class 1, id 1)
1717        output.push(0x11);
1718        output.extend_from_slice(&tables.ac_chroma.bits);
1719        output.extend_from_slice(&tables.ac_chroma.values);
1720
1721        Ok(())
1722    }
1723
1724    /// Writes restart interval.
1725    fn write_restart_interval(&self, output: &mut Vec<u8>) -> Result<()> {
1726        output.push(0xFF);
1727        output.push(MARKER_DRI);
1728        output.push(0x00);
1729        output.push(0x04); // Length
1730        output.push((self.config.restart_interval >> 8) as u8);
1731        output.push(self.config.restart_interval as u8);
1732        Ok(())
1733    }
1734
1735    /// Writes scan header.
1736    fn write_scan_header(&self, output: &mut Vec<u8>) -> Result<()> {
1737        output.push(0xFF);
1738        output.push(MARKER_SOS);
1739
1740        let num_components = if self.config.pixel_format == PixelFormat::Gray {
1741            1u8
1742        } else {
1743            3u8
1744        };
1745
1746        let length = 6u16 + num_components as u16 * 2;
1747        output.push((length >> 8) as u8);
1748        output.push(length as u8);
1749
1750        output.push(num_components);
1751
1752        if num_components == 1 {
1753            output.push(1); // Component selector
1754            output.push(0x00); // DC/AC table selectors
1755        } else {
1756            output.push(1); // Y component
1757            output.push(0x00); // DC table 0, AC table 0
1758
1759            output.push(2); // Cb component
1760            output.push(0x11); // DC table 1, AC table 1
1761
1762            output.push(3); // Cr component
1763            output.push(0x11); // DC table 1, AC table 1
1764        }
1765
1766        output.push(0x00); // Ss (spectral selection start)
1767        output.push(0x3F); // Se (spectral selection end = 63)
1768        output.push(0x00); // Ah/Al (successive approximation)
1769
1770        Ok(())
1771    }
1772
1773    /// Writes scan header for XYB mode.
1774    fn write_scan_header_xyb(&self, output: &mut Vec<u8>) -> Result<()> {
1775        output.push(0xFF);
1776        output.push(MARKER_SOS);
1777
1778        // 3 components: R, G, B
1779        let length = 6u16 + 3 * 2; // 12 bytes
1780        output.push((length >> 8) as u8);
1781        output.push(length as u8);
1782
1783        output.push(3); // Number of components
1784
1785        // R component: DC table 0, AC table 0
1786        output.push(b'R');
1787        output.push(0x00);
1788
1789        // G component: DC table 0, AC table 0
1790        output.push(b'G');
1791        output.push(0x00);
1792
1793        // B component: DC table 0, AC table 0
1794        output.push(b'B');
1795        output.push(0x00);
1796
1797        output.push(0x00); // Ss (spectral selection start)
1798        output.push(0x3F); // Se (spectral selection end = 63)
1799        output.push(0x00); // Ah/Al (successive approximation)
1800
1801        Ok(())
1802    }
1803
1804    /// Encodes the scan data (u8 version - legacy).
1805    #[allow(dead_code)]
1806    fn encode_scan(
1807        &self,
1808        y_plane: &[u8],
1809        cb_plane: &[u8],
1810        cr_plane: &[u8],
1811        y_quant: &QuantTable,
1812        c_quant: &QuantTable,
1813    ) -> Result<Vec<u8>> {
1814        let mut encoder = EntropyEncoder::new();
1815
1816        // Set up Huffman tables
1817        encoder.set_dc_table(0, HuffmanEncodeTable::std_dc_luminance());
1818        encoder.set_ac_table(0, HuffmanEncodeTable::std_ac_luminance());
1819        encoder.set_dc_table(1, HuffmanEncodeTable::std_dc_chrominance());
1820        encoder.set_ac_table(1, HuffmanEncodeTable::std_ac_chrominance());
1821
1822        if self.config.restart_interval > 0 {
1823            encoder.set_restart_interval(self.config.restart_interval);
1824        }
1825
1826        let width = self.config.width as usize;
1827        let height = self.config.height as usize;
1828
1829        // For 4:2:0, process MCUs
1830        let _mcu_width = ((width + 15) / 16) * 16;
1831        let _mcu_height = ((height + 15) / 16) * 16;
1832
1833        // TODO: Implement full MCU processing with subsampling
1834        // For now, simplified 4:4:4 encoding
1835        let blocks_h = (width + 7) / 8;
1836        let blocks_v = (height + 7) / 8;
1837
1838        // Zero-bias parameters for each component
1839        // Use effective distance inferred from quant tables (like C++ QuantValsToDistance)
1840        // For YCbCr mode, Cb and Cr share the same quant table (c_quant)
1841        let _input_distance = self.config.quality.to_distance();
1842        let effective_distance = quant::quant_vals_to_distance(y_quant, c_quant, c_quant);
1843        let y_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 0);
1844        let cb_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 1);
1845        let cr_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 2);
1846
1847        // Convert Y plane to f32 for AQ computation
1848        let y_plane_f32: Vec<f32> = y_plane.iter().map(|&v| v as f32).collect();
1849
1850        // Compute per-block adaptive quantization strength from Y plane
1851        // C++ uses y_quant_01 = quant_table[1] for dampen calculation
1852        let y_quant_01 = y_quant.values[1];
1853        let aq_map = compute_aq_strength_map(&y_plane_f32, width, height, y_quant_01);
1854
1855        for by in 0..blocks_v {
1856            for bx in 0..blocks_h {
1857                // Get per-block aq_strength (C++ AQ produces 0.0-0.2, mean ~0.08)
1858                let aq_strength = aq_map.get(bx, by);
1859
1860                // Extract and encode Y block
1861                let y_block = self.extract_block(y_plane, width, height, bx, by);
1862                let y_dct = forward_dct_8x8(&y_block);
1863                let y_quant_coeffs = quant::quantize_block_with_zero_bias(
1864                    &y_dct,
1865                    &y_quant.values,
1866                    &y_zero_bias,
1867                    aq_strength,
1868                );
1869                let y_zigzag = natural_to_zigzag(&y_quant_coeffs);
1870                encoder.encode_block(&y_zigzag, 0, 0, 0)?;
1871
1872                if self.config.pixel_format != PixelFormat::Gray {
1873                    // Cb block
1874                    let cb_block = self.extract_block(cb_plane, width, height, bx, by);
1875                    let cb_dct = forward_dct_8x8(&cb_block);
1876                    let cb_quant_coeffs = quant::quantize_block_with_zero_bias(
1877                        &cb_dct,
1878                        &c_quant.values,
1879                        &cb_zero_bias,
1880                        aq_strength,
1881                    );
1882                    let cb_zigzag = natural_to_zigzag(&cb_quant_coeffs);
1883                    encoder.encode_block(&cb_zigzag, 1, 1, 1)?;
1884
1885                    // Cr block
1886                    let cr_block = self.extract_block(cr_plane, width, height, bx, by);
1887                    let cr_dct = forward_dct_8x8(&cr_block);
1888                    let cr_quant_coeffs = quant::quantize_block_with_zero_bias(
1889                        &cr_dct,
1890                        &c_quant.values,
1891                        &cr_zero_bias,
1892                        aq_strength,
1893                    );
1894                    let cr_zigzag = natural_to_zigzag(&cr_quant_coeffs);
1895                    encoder.encode_block(&cr_zigzag, 2, 1, 1)?;
1896                }
1897
1898                encoder.check_restart();
1899            }
1900        }
1901
1902        Ok(encoder.finish())
1903    }
1904
1905    /// Quantizes all blocks in the image.
1906    ///
1907    /// This is separated from encoding to allow Huffman optimization:
1908    /// 1. Quantize all blocks
1909    /// 2. Collect frequencies to build optimal tables
1910    /// 3. Encode with optimal tables
1911    fn quantize_all_blocks(
1912        &self,
1913        y_plane: &[f32],
1914        cb_plane: &[f32],
1915        cr_plane: &[f32],
1916        y_quant: &QuantTable,
1917        cb_quant: &QuantTable,
1918        cr_quant: &QuantTable,
1919    ) -> Result<(
1920        Vec<[i16; DCT_BLOCK_SIZE]>,
1921        Vec<[i16; DCT_BLOCK_SIZE]>,
1922        Vec<[i16; DCT_BLOCK_SIZE]>,
1923    )> {
1924        let width = self.config.width as usize;
1925        let height = self.config.height as usize;
1926        let blocks_h = (width + 7) / 8;
1927        let blocks_v = (height + 7) / 8;
1928        let is_color = self.config.pixel_format != PixelFormat::Gray;
1929
1930        // Zero-bias parameters for each component
1931        // Use effective distance inferred from quant tables (like C++ QuantValsToDistance)
1932        // This is important at Q100 where quant values are all 1s but input distance is 0.01
1933        let _input_distance = self.config.quality.to_distance();
1934        let effective_distance = quant::quant_vals_to_distance(y_quant, cb_quant, cr_quant);
1935        let y_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 0);
1936        let cb_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 1);
1937        let cr_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 2);
1938
1939        // Compute per-block adaptive quantization strength from Y plane
1940        // C++ uses y_quant_01 = quant_table[1] for dampen calculation
1941        let y_quant_01 = y_quant.values[1];
1942        let aq_map = compute_aq_strength_map(y_plane, width, height, y_quant_01);
1943
1944        let mut y_blocks = Vec::with_capacity(blocks_h * blocks_v);
1945        let mut cb_blocks = Vec::with_capacity(if is_color { blocks_h * blocks_v } else { 0 });
1946        let mut cr_blocks = Vec::with_capacity(if is_color { blocks_h * blocks_v } else { 0 });
1947
1948        for by in 0..blocks_v {
1949            for bx in 0..blocks_h {
1950                // Get per-block aq_strength
1951                let aq_strength = aq_map.get(bx, by);
1952
1953                let y_block = self.extract_block_ycbcr_f32(y_plane, width, height, bx, by);
1954                let y_dct = forward_dct_8x8(&y_block);
1955                let y_quant_coeffs = quant::quantize_block_with_zero_bias(
1956                    &y_dct,
1957                    &y_quant.values,
1958                    &y_zero_bias,
1959                    aq_strength,
1960                );
1961                y_blocks.push(natural_to_zigzag(&y_quant_coeffs));
1962
1963                if is_color {
1964                    let cb_block = self.extract_block_ycbcr_f32(cb_plane, width, height, bx, by);
1965                    let cb_dct = forward_dct_8x8(&cb_block);
1966                    let cb_quant_coeffs = quant::quantize_block_with_zero_bias(
1967                        &cb_dct,
1968                        &cb_quant.values,
1969                        &cb_zero_bias,
1970                        aq_strength,
1971                    );
1972                    cb_blocks.push(natural_to_zigzag(&cb_quant_coeffs));
1973
1974                    let cr_block = self.extract_block_ycbcr_f32(cr_plane, width, height, bx, by);
1975                    let cr_dct = forward_dct_8x8(&cr_block);
1976                    let cr_quant_coeffs = quant::quantize_block_with_zero_bias(
1977                        &cr_dct,
1978                        &cr_quant.values,
1979                        &cr_zero_bias,
1980                        aq_strength,
1981                    );
1982                    cr_blocks.push(natural_to_zigzag(&cr_quant_coeffs));
1983                }
1984            }
1985        }
1986
1987        Ok((y_blocks, cb_blocks, cr_blocks))
1988    }
1989
1990    /// Quantizes all blocks with subsampling support.
1991    ///
1992    /// Unlike `quantize_all_blocks`, this version handles different dimensions
1993    /// for Y and chroma planes (needed for 4:2:0, 4:2:2, 4:4:0 subsampling).
1994    #[allow(clippy::too_many_arguments)]
1995    fn quantize_all_blocks_subsampled(
1996        &self,
1997        y_plane: &[f32],
1998        y_width: usize,
1999        y_height: usize,
2000        cb_plane: &[f32],
2001        cr_plane: &[f32],
2002        c_width: usize,
2003        c_height: usize,
2004        y_quant: &QuantTable,
2005        cb_quant: &QuantTable,
2006        cr_quant: &QuantTable,
2007    ) -> Result<(
2008        Vec<[i16; DCT_BLOCK_SIZE]>,
2009        Vec<[i16; DCT_BLOCK_SIZE]>,
2010        Vec<[i16; DCT_BLOCK_SIZE]>,
2011    )> {
2012        let y_blocks_h = (y_width + 7) / 8;
2013        let y_blocks_v = (y_height + 7) / 8;
2014        let c_blocks_h = (c_width + 7) / 8;
2015        let c_blocks_v = (c_height + 7) / 8;
2016        let is_color = self.config.pixel_format != PixelFormat::Gray;
2017
2018        // Zero-bias parameters for each component
2019        let effective_distance = quant::quant_vals_to_distance(y_quant, cb_quant, cr_quant);
2020        let y_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 0);
2021        let cb_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 1);
2022        let cr_zero_bias = ZeroBiasParams::for_ycbcr(effective_distance, 2);
2023
2024        // Compute per-block adaptive quantization strength from Y plane
2025        let y_quant_01 = y_quant.values[1];
2026        let aq_map = compute_aq_strength_map(y_plane, y_width, y_height, y_quant_01);
2027
2028        let mut y_blocks = Vec::with_capacity(y_blocks_h * y_blocks_v);
2029        let mut cb_blocks = Vec::with_capacity(if is_color { c_blocks_h * c_blocks_v } else { 0 });
2030        let mut cr_blocks = Vec::with_capacity(if is_color { c_blocks_h * c_blocks_v } else { 0 });
2031
2032        // Quantize Y blocks
2033        for by in 0..y_blocks_v {
2034            for bx in 0..y_blocks_h {
2035                let aq_strength = aq_map.get(bx, by);
2036                let y_block = self.extract_block_ycbcr_f32(y_plane, y_width, y_height, bx, by);
2037                let y_dct = forward_dct_8x8(&y_block);
2038                let y_quant_coeffs = quant::quantize_block_with_zero_bias(
2039                    &y_dct,
2040                    &y_quant.values,
2041                    &y_zero_bias,
2042                    aq_strength,
2043                );
2044                y_blocks.push(natural_to_zigzag(&y_quant_coeffs));
2045            }
2046        }
2047
2048        // Quantize chroma blocks (from possibly downsampled planes)
2049        if is_color {
2050            for by in 0..c_blocks_v {
2051                for bx in 0..c_blocks_h {
2052                    // For chroma, use average AQ strength from corresponding Y region
2053                    // For 4:2:0, each chroma block corresponds to 2x2 Y blocks
2054                    let y_bx = (bx * y_blocks_h) / c_blocks_h;
2055                    let y_by = (by * y_blocks_v) / c_blocks_v;
2056                    let aq_strength =
2057                        aq_map.get(y_bx.min(y_blocks_h - 1), y_by.min(y_blocks_v - 1));
2058
2059                    let cb_block =
2060                        self.extract_block_ycbcr_f32(cb_plane, c_width, c_height, bx, by);
2061                    let cb_dct = forward_dct_8x8(&cb_block);
2062                    let cb_quant_coeffs = quant::quantize_block_with_zero_bias(
2063                        &cb_dct,
2064                        &cb_quant.values,
2065                        &cb_zero_bias,
2066                        aq_strength,
2067                    );
2068                    cb_blocks.push(natural_to_zigzag(&cb_quant_coeffs));
2069
2070                    let cr_block =
2071                        self.extract_block_ycbcr_f32(cr_plane, c_width, c_height, bx, by);
2072                    let cr_dct = forward_dct_8x8(&cr_block);
2073                    let cr_quant_coeffs = quant::quantize_block_with_zero_bias(
2074                        &cr_dct,
2075                        &cr_quant.values,
2076                        &cr_zero_bias,
2077                        aq_strength,
2078                    );
2079                    cr_blocks.push(natural_to_zigzag(&cr_quant_coeffs));
2080                }
2081            }
2082        }
2083
2084        Ok((y_blocks, cb_blocks, cr_blocks))
2085    }
2086
2087    /// Builds optimized Huffman tables from quantized blocks.
2088    ///
2089    /// Collects symbol frequencies from all blocks and generates optimal
2090    /// Huffman tables with their DHT marker representations.
2091    ///
2092    /// For subsampled modes, this iterates blocks in MCU order to correctly
2093    /// account for padding blocks.
2094    fn build_optimized_tables(
2095        &self,
2096        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
2097        cb_blocks: &[[i16; DCT_BLOCK_SIZE]],
2098        cr_blocks: &[[i16; DCT_BLOCK_SIZE]],
2099        is_color: bool,
2100    ) -> Result<OptimizedHuffmanTables> {
2101        let mut dc_luma_freq = FrequencyCounter::new();
2102        let mut dc_chroma_freq = FrequencyCounter::new();
2103        let mut ac_luma_freq = FrequencyCounter::new();
2104        let mut ac_chroma_freq = FrequencyCounter::new();
2105
2106        let width = self.config.width as usize;
2107        let height = self.config.height as usize;
2108        let (h_samp, v_samp) = match self.config.subsampling {
2109            Subsampling::S444 => (1, 1),
2110            Subsampling::S422 => (2, 1),
2111            Subsampling::S420 => (2, 2),
2112            Subsampling::S440 => (1, 2),
2113        };
2114
2115        // Zero block for padding
2116        const ZERO_BLOCK: [i16; DCT_BLOCK_SIZE] = [0i16; DCT_BLOCK_SIZE];
2117
2118        if h_samp == 1 && v_samp == 1 {
2119            // 4:4:4 mode - simple iteration, no padding needed
2120            let mut prev_y_dc: i16 = 0;
2121            let mut prev_cb_dc: i16 = 0;
2122            let mut prev_cr_dc: i16 = 0;
2123
2124            for (i, y_block) in y_blocks.iter().enumerate() {
2125                Self::collect_block_frequencies(
2126                    y_block,
2127                    prev_y_dc,
2128                    &mut dc_luma_freq,
2129                    &mut ac_luma_freq,
2130                );
2131                prev_y_dc = y_block[0];
2132
2133                if is_color {
2134                    Self::collect_block_frequencies(
2135                        &cb_blocks[i],
2136                        prev_cb_dc,
2137                        &mut dc_chroma_freq,
2138                        &mut ac_chroma_freq,
2139                    );
2140                    prev_cb_dc = cb_blocks[i][0];
2141
2142                    Self::collect_block_frequencies(
2143                        &cr_blocks[i],
2144                        prev_cr_dc,
2145                        &mut dc_chroma_freq,
2146                        &mut ac_chroma_freq,
2147                    );
2148                    prev_cr_dc = cr_blocks[i][0];
2149                }
2150            }
2151        } else {
2152            // Subsampled mode - iterate in MCU order with padding
2153            let y_blocks_h = (width + 7) / 8;
2154            let y_blocks_v = (height + 7) / 8;
2155            // Use ceiling division for chroma dimensions: (n + d - 1) / d
2156            let c_width = (width + h_samp - 1) / h_samp;
2157            let c_height = (height + v_samp - 1) / v_samp;
2158            let c_blocks_h = (c_width + 7) / 8;
2159            let c_blocks_v = (c_height + 7) / 8;
2160            let mcu_h = (y_blocks_h + h_samp - 1) / h_samp;
2161            let mcu_v = (y_blocks_v + v_samp - 1) / v_samp;
2162
2163            let mut prev_y_dc: i16 = 0;
2164            let mut prev_cb_dc: i16 = 0;
2165            let mut prev_cr_dc: i16 = 0;
2166
2167            for mcu_y in 0..mcu_v {
2168                for mcu_x in 0..mcu_h {
2169                    // Y blocks in this MCU
2170                    for dy in 0..v_samp {
2171                        for dx in 0..h_samp {
2172                            let y_bx = mcu_x * h_samp + dx;
2173                            let y_by = mcu_y * v_samp + dy;
2174                            let block = if y_bx < y_blocks_h && y_by < y_blocks_v {
2175                                let y_idx = y_by * y_blocks_h + y_bx;
2176                                &y_blocks[y_idx]
2177                            } else {
2178                                &ZERO_BLOCK
2179                            };
2180                            Self::collect_block_frequencies(
2181                                block,
2182                                prev_y_dc,
2183                                &mut dc_luma_freq,
2184                                &mut ac_luma_freq,
2185                            );
2186                            prev_y_dc = block[0];
2187                        }
2188                    }
2189
2190                    // Chroma blocks
2191                    if is_color {
2192                        let (cb_block, cr_block) = if mcu_x < c_blocks_h && mcu_y < c_blocks_v {
2193                            let c_idx = mcu_y * c_blocks_h + mcu_x;
2194                            (&cb_blocks[c_idx], &cr_blocks[c_idx])
2195                        } else {
2196                            (&ZERO_BLOCK, &ZERO_BLOCK)
2197                        };
2198
2199                        Self::collect_block_frequencies(
2200                            cb_block,
2201                            prev_cb_dc,
2202                            &mut dc_chroma_freq,
2203                            &mut ac_chroma_freq,
2204                        );
2205                        prev_cb_dc = cb_block[0];
2206
2207                        Self::collect_block_frequencies(
2208                            cr_block,
2209                            prev_cr_dc,
2210                            &mut dc_chroma_freq,
2211                            &mut ac_chroma_freq,
2212                        );
2213                        prev_cr_dc = cr_block[0];
2214                    }
2215                }
2216            }
2217        }
2218
2219        // Build optimized tables with DHT data
2220        let dc_luma = dc_luma_freq.generate_table_with_dht()?;
2221        let ac_luma = ac_luma_freq.generate_table_with_dht()?;
2222
2223        let (dc_chroma, ac_chroma) = if is_color {
2224            (
2225                dc_chroma_freq.generate_table_with_dht()?,
2226                ac_chroma_freq.generate_table_with_dht()?,
2227            )
2228        } else {
2229            // Use standard tables for grayscale (won't be used but needed for structure)
2230            use crate::huffman::{
2231                STD_AC_CHROMINANCE_BITS, STD_AC_CHROMINANCE_VALUES, STD_DC_CHROMINANCE_BITS,
2232                STD_DC_CHROMINANCE_VALUES,
2233            };
2234            use crate::huffman_opt::OptimizedTable;
2235
2236            (
2237                OptimizedTable {
2238                    table: HuffmanEncodeTable::std_dc_chrominance(),
2239                    bits: STD_DC_CHROMINANCE_BITS,
2240                    values: STD_DC_CHROMINANCE_VALUES.to_vec(),
2241                },
2242                OptimizedTable {
2243                    table: HuffmanEncodeTable::std_ac_chrominance(),
2244                    bits: STD_AC_CHROMINANCE_BITS,
2245                    values: STD_AC_CHROMINANCE_VALUES.to_vec(),
2246                },
2247            )
2248        };
2249
2250        Ok(OptimizedHuffmanTables {
2251            dc_luma,
2252            ac_luma,
2253            dc_chroma,
2254            ac_chroma,
2255        })
2256    }
2257
2258    /// Encodes blocks using optimized Huffman tables.
2259    ///
2260    /// Handles MCU interleaving for subsampled modes (4:2:0, 4:2:2, 4:4:0).
2261    fn encode_with_tables(
2262        &self,
2263        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
2264        cb_blocks: &[[i16; DCT_BLOCK_SIZE]],
2265        cr_blocks: &[[i16; DCT_BLOCK_SIZE]],
2266        is_color: bool,
2267        tables: &OptimizedHuffmanTables,
2268    ) -> Result<Vec<u8>> {
2269        let mut encoder = EntropyEncoder::new();
2270
2271        encoder.set_dc_table(0, tables.dc_luma.table.clone());
2272        encoder.set_ac_table(0, tables.ac_luma.table.clone());
2273        encoder.set_dc_table(1, tables.dc_chroma.table.clone());
2274        encoder.set_ac_table(1, tables.ac_chroma.table.clone());
2275
2276        if self.config.restart_interval > 0 {
2277            encoder.set_restart_interval(self.config.restart_interval);
2278        }
2279
2280        let width = self.config.width as usize;
2281        let height = self.config.height as usize;
2282        let (h_samp, v_samp) = match self.config.subsampling {
2283            Subsampling::S444 => (1, 1),
2284            Subsampling::S422 => (2, 1),
2285            Subsampling::S420 => (2, 2),
2286            Subsampling::S440 => (1, 2),
2287        };
2288
2289        if h_samp == 1 && v_samp == 1 {
2290            // 4:4:4 mode - simple 1:1 interleaving
2291            for (i, y_block) in y_blocks.iter().enumerate() {
2292                encoder.encode_block(y_block, 0, 0, 0)?;
2293
2294                if is_color {
2295                    encoder.encode_block(&cb_blocks[i], 1, 1, 1)?;
2296                    encoder.encode_block(&cr_blocks[i], 2, 1, 1)?;
2297                }
2298
2299                encoder.check_restart();
2300            }
2301        } else {
2302            // Subsampled mode - MCU interleaving
2303            let y_blocks_h = (width + 7) / 8;
2304            let y_blocks_v = (height + 7) / 8;
2305            // Use ceiling division for chroma dimensions: (n + d - 1) / d
2306            let c_width = (width + h_samp - 1) / h_samp;
2307            let c_height = (height + v_samp - 1) / v_samp;
2308            let c_blocks_h = (c_width + 7) / 8;
2309            let c_blocks_v = (c_height + 7) / 8;
2310
2311            let mcu_h = (y_blocks_h + h_samp - 1) / h_samp;
2312            let mcu_v = (y_blocks_v + v_samp - 1) / v_samp;
2313
2314            // Zero block for padding out-of-bounds MCU positions
2315            const ZERO_BLOCK: [i16; DCT_BLOCK_SIZE] = [0i16; DCT_BLOCK_SIZE];
2316
2317            for mcu_y in 0..mcu_v {
2318                for mcu_x in 0..mcu_h {
2319                    // Encode Y blocks in this MCU (must encode all 4 even if out of bounds)
2320                    for dy in 0..v_samp {
2321                        for dx in 0..h_samp {
2322                            let y_bx = mcu_x * h_samp + dx;
2323                            let y_by = mcu_y * v_samp + dy;
2324                            if y_bx < y_blocks_h && y_by < y_blocks_v {
2325                                let y_idx = y_by * y_blocks_h + y_bx;
2326                                encoder.encode_block(&y_blocks[y_idx], 0, 0, 0)?;
2327                            } else {
2328                                // Out of bounds - encode zero block (padding)
2329                                encoder.encode_block(&ZERO_BLOCK, 0, 0, 0)?;
2330                            }
2331                        }
2332                    }
2333
2334                    // Encode Cb and Cr blocks (always, even if out of bounds)
2335                    if is_color {
2336                        if mcu_x < c_blocks_h && mcu_y < c_blocks_v {
2337                            let c_idx = mcu_y * c_blocks_h + mcu_x;
2338                            encoder.encode_block(&cb_blocks[c_idx], 1, 1, 1)?;
2339                            encoder.encode_block(&cr_blocks[c_idx], 2, 1, 1)?;
2340                        } else {
2341                            // Out of bounds - encode zero blocks (padding)
2342                            encoder.encode_block(&ZERO_BLOCK, 1, 1, 1)?;
2343                            encoder.encode_block(&ZERO_BLOCK, 2, 1, 1)?;
2344                        }
2345                    }
2346
2347                    encoder.check_restart();
2348                }
2349            }
2350        }
2351
2352        Ok(encoder.finish())
2353    }
2354
2355    /// Encodes blocks using standard (fixed) Huffman tables - single pass.
2356    ///
2357    /// Handles MCU interleaving for subsampled modes (4:2:0, 4:2:2, 4:4:0).
2358    fn encode_blocks_standard(
2359        &self,
2360        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
2361        cb_blocks: &[[i16; DCT_BLOCK_SIZE]],
2362        cr_blocks: &[[i16; DCT_BLOCK_SIZE]],
2363        is_color: bool,
2364    ) -> Result<Vec<u8>> {
2365        let mut encoder = EntropyEncoder::new();
2366
2367        encoder.set_dc_table(0, HuffmanEncodeTable::std_dc_luminance());
2368        encoder.set_ac_table(0, HuffmanEncodeTable::std_ac_luminance());
2369        encoder.set_dc_table(1, HuffmanEncodeTable::std_dc_chrominance());
2370        encoder.set_ac_table(1, HuffmanEncodeTable::std_ac_chrominance());
2371
2372        if self.config.restart_interval > 0 {
2373            encoder.set_restart_interval(self.config.restart_interval);
2374        }
2375
2376        let width = self.config.width as usize;
2377        let height = self.config.height as usize;
2378        let (h_samp, v_samp) = match self.config.subsampling {
2379            Subsampling::S444 => (1, 1),
2380            Subsampling::S422 => (2, 1),
2381            Subsampling::S420 => (2, 2),
2382            Subsampling::S440 => (1, 2),
2383        };
2384
2385        if h_samp == 1 && v_samp == 1 {
2386            // 4:4:4 mode - simple 1:1 interleaving
2387            for (i, y_block) in y_blocks.iter().enumerate() {
2388                encoder.encode_block(y_block, 0, 0, 0)?;
2389
2390                if is_color {
2391                    encoder.encode_block(&cb_blocks[i], 1, 1, 1)?;
2392                    encoder.encode_block(&cr_blocks[i], 2, 1, 1)?;
2393                }
2394
2395                encoder.check_restart();
2396            }
2397        } else {
2398            // Subsampled mode - MCU interleaving
2399            let y_blocks_h = (width + 7) / 8;
2400            let y_blocks_v = (height + 7) / 8;
2401            // Use ceiling division for chroma dimensions: (n + d - 1) / d
2402            let c_width = (width + h_samp - 1) / h_samp;
2403            let c_height = (height + v_samp - 1) / v_samp;
2404            let c_blocks_h = (c_width + 7) / 8;
2405            let c_blocks_v = (c_height + 7) / 8;
2406
2407            // MCU dimensions in terms of Y blocks
2408            let mcu_h = (y_blocks_h + h_samp - 1) / h_samp;
2409            let mcu_v = (y_blocks_v + v_samp - 1) / v_samp;
2410
2411            // Zero block for padding out-of-bounds MCU positions
2412            const ZERO_BLOCK: [i16; DCT_BLOCK_SIZE] = [0i16; DCT_BLOCK_SIZE];
2413
2414            for mcu_y in 0..mcu_v {
2415                for mcu_x in 0..mcu_h {
2416                    // Encode Y blocks in this MCU (must encode all even if out of bounds)
2417                    for dy in 0..v_samp {
2418                        for dx in 0..h_samp {
2419                            let y_bx = mcu_x * h_samp + dx;
2420                            let y_by = mcu_y * v_samp + dy;
2421                            if y_bx < y_blocks_h && y_by < y_blocks_v {
2422                                let y_idx = y_by * y_blocks_h + y_bx;
2423                                encoder.encode_block(&y_blocks[y_idx], 0, 0, 0)?;
2424                            } else {
2425                                // Out of bounds - encode zero block (padding)
2426                                encoder.encode_block(&ZERO_BLOCK, 0, 0, 0)?;
2427                            }
2428                        }
2429                    }
2430
2431                    // Encode Cb and Cr blocks (always, even if out of bounds)
2432                    if is_color {
2433                        if mcu_x < c_blocks_h && mcu_y < c_blocks_v {
2434                            let c_idx = mcu_y * c_blocks_h + mcu_x;
2435                            encoder.encode_block(&cb_blocks[c_idx], 1, 1, 1)?;
2436                            encoder.encode_block(&cr_blocks[c_idx], 2, 1, 1)?;
2437                        } else {
2438                            // Out of bounds - encode zero blocks (padding)
2439                            encoder.encode_block(&ZERO_BLOCK, 1, 1, 1)?;
2440                            encoder.encode_block(&ZERO_BLOCK, 2, 1, 1)?;
2441                        }
2442                    }
2443
2444                    encoder.check_restart();
2445                }
2446            }
2447        }
2448
2449        Ok(encoder.finish())
2450    }
2451
2452    /// Collects symbol frequencies from a block for Huffman optimization.
2453    fn collect_block_frequencies(
2454        coeffs: &[i16; DCT_BLOCK_SIZE],
2455        prev_dc: i16,
2456        dc_freq: &mut FrequencyCounter,
2457        ac_freq: &mut FrequencyCounter,
2458    ) {
2459        // DC coefficient - limit category to 11 for 8-bit JPEG compatibility
2460        let dc_diff = coeffs[0] - prev_dc;
2461        let dc_category = entropy::category(dc_diff).min(11);
2462        dc_freq.count(dc_category);
2463
2464        // AC coefficients
2465        let mut run = 0u8;
2466        for i in 1..DCT_BLOCK_SIZE {
2467            let ac = coeffs[i];
2468
2469            if ac == 0 {
2470                run += 1;
2471            } else {
2472                // Encode runs of 16 zeros (ZRL)
2473                while run >= 16 {
2474                    ac_freq.count(0xF0);
2475                    run -= 16;
2476                }
2477
2478                // Encode run/size symbol
2479                let ac_category = entropy::category(ac);
2480                let symbol = (run << 4) | ac_category;
2481                ac_freq.count(symbol);
2482                run = 0;
2483            }
2484        }
2485
2486        // EOB if trailing zeros
2487        if run > 0 {
2488            ac_freq.count(0x00);
2489        }
2490    }
2491
2492    /// Quantizes all XYB blocks for Huffman optimization.
2493    ///
2494    /// Returns quantized blocks for X, Y, and B components.
2495    /// B component is already downsampled (half resolution).
2496    #[allow(clippy::too_many_arguments)]
2497    fn quantize_all_blocks_xyb(
2498        &self,
2499        x_plane: &[f32],
2500        y_plane: &[f32],
2501        b_plane: &[f32], // Already downsampled
2502        width: usize,
2503        height: usize,
2504        b_width: usize,
2505        b_height: usize,
2506        x_quant: &QuantTable,
2507        y_quant: &QuantTable,
2508        b_quant: &QuantTable,
2509    ) -> (
2510        Vec<[i16; DCT_BLOCK_SIZE]>,
2511        Vec<[i16; DCT_BLOCK_SIZE]>,
2512        Vec<[i16; DCT_BLOCK_SIZE]>,
2513    ) {
2514        // MCU size for 2×2, 2×2, 1×1 sampling: 16×16 pixels
2515        let mcu_cols = (width + 15) / 16;
2516        let mcu_rows = (height + 15) / 16;
2517        let num_xy_blocks = mcu_cols * mcu_rows * 4; // 4 blocks per MCU for X and Y
2518        let num_b_blocks = mcu_cols * mcu_rows; // 1 block per MCU for B
2519
2520        let mut x_blocks = Vec::with_capacity(num_xy_blocks);
2521        let mut y_blocks = Vec::with_capacity(num_xy_blocks);
2522        let mut b_blocks = Vec::with_capacity(num_b_blocks);
2523
2524        for mcu_y in 0..mcu_rows {
2525            for mcu_x in 0..mcu_cols {
2526                // Process 4 X blocks (2×2 arrangement within 16×16 MCU)
2527                for block_y in 0..2 {
2528                    for block_x in 0..2 {
2529                        let bx = mcu_x * 2 + block_x;
2530                        let by = mcu_y * 2 + block_y;
2531                        let x_block = self.extract_block_f32(x_plane, width, height, bx, by);
2532                        let x_dct = forward_dct_8x8(&x_block);
2533                        let x_quant_coeffs = quant::quantize_block(&x_dct, &x_quant.values);
2534                        x_blocks.push(natural_to_zigzag(&x_quant_coeffs));
2535                    }
2536                }
2537
2538                // Process 4 Y blocks (2×2 arrangement within 16×16 MCU)
2539                for block_y in 0..2 {
2540                    for block_x in 0..2 {
2541                        let bx = mcu_x * 2 + block_x;
2542                        let by = mcu_y * 2 + block_y;
2543                        let y_block = self.extract_block_f32(y_plane, width, height, bx, by);
2544                        let y_dct = forward_dct_8x8(&y_block);
2545                        let y_quant_coeffs = quant::quantize_block(&y_dct, &y_quant.values);
2546                        y_blocks.push(natural_to_zigzag(&y_quant_coeffs));
2547                    }
2548                }
2549
2550                // Process 1 B block (from downsampled plane)
2551                let b_block = self.extract_block_f32(b_plane, b_width, b_height, mcu_x, mcu_y);
2552                let b_dct = forward_dct_8x8(&b_block);
2553                let b_quant_coeffs = quant::quantize_block(&b_dct, &b_quant.values);
2554                b_blocks.push(natural_to_zigzag(&b_quant_coeffs));
2555            }
2556        }
2557
2558        (x_blocks, y_blocks, b_blocks)
2559    }
2560
2561    /// Builds optimized Huffman tables for XYB mode.
2562    ///
2563    /// XYB uses a single shared table for all components (luminance tables).
2564    /// Returns the optimized DC and AC tables.
2565    fn build_optimized_tables_xyb(
2566        &self,
2567        x_blocks: &[[i16; DCT_BLOCK_SIZE]],
2568        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
2569        b_blocks: &[[i16; DCT_BLOCK_SIZE]],
2570    ) -> Result<(
2571        crate::huffman_opt::OptimizedTable,
2572        crate::huffman_opt::OptimizedTable,
2573    )> {
2574        let mut dc_freq = FrequencyCounter::new();
2575        let mut ac_freq = FrequencyCounter::new();
2576
2577        // Collect frequencies from all components
2578        // Note: XYB MCU order is 4 X blocks, 4 Y blocks, 1 B block per MCU
2579        // But since all share the same table, we just iterate through them
2580
2581        // In XYB mode, we have interleaved blocks per MCU:
2582        // [X0, X1, X2, X3, Y0, Y1, Y2, Y3, B0] per MCU
2583        // DC prediction carries across MCUs for each component (standard JPEG behavior)
2584
2585        let mcu_count = b_blocks.len();
2586
2587        // Each component maintains its own DC prediction across all MCUs
2588        let mut prev_dc_x: i16 = 0;
2589        let mut prev_dc_y: i16 = 0;
2590        let mut prev_dc_b: i16 = 0;
2591
2592        for mcu_idx in 0..mcu_count {
2593            // X blocks (4 per MCU)
2594            let x_start = mcu_idx * 4;
2595            for i in 0..4 {
2596                let block = &x_blocks[x_start + i];
2597                Self::collect_block_frequencies(block, prev_dc_x, &mut dc_freq, &mut ac_freq);
2598                prev_dc_x = block[0];
2599            }
2600
2601            // Y blocks (4 per MCU)
2602            let y_start = mcu_idx * 4;
2603            for i in 0..4 {
2604                let block = &y_blocks[y_start + i];
2605                Self::collect_block_frequencies(block, prev_dc_y, &mut dc_freq, &mut ac_freq);
2606                prev_dc_y = block[0];
2607            }
2608
2609            // B block (1 per MCU)
2610            Self::collect_block_frequencies(
2611                &b_blocks[mcu_idx],
2612                prev_dc_b,
2613                &mut dc_freq,
2614                &mut ac_freq,
2615            );
2616            prev_dc_b = b_blocks[mcu_idx][0];
2617        }
2618
2619        // Generate optimized tables
2620        let dc_table = dc_freq.generate_table_with_dht()?;
2621        let ac_table = ac_freq.generate_table_with_dht()?;
2622
2623        Ok((dc_table, ac_table))
2624    }
2625
2626    /// Encodes XYB blocks using optimized Huffman tables.
2627    #[allow(clippy::too_many_arguments)]
2628    fn encode_with_tables_xyb(
2629        &self,
2630        x_blocks: &[[i16; DCT_BLOCK_SIZE]],
2631        y_blocks: &[[i16; DCT_BLOCK_SIZE]],
2632        b_blocks: &[[i16; DCT_BLOCK_SIZE]],
2633        dc_table: &crate::huffman_opt::OptimizedTable,
2634        ac_table: &crate::huffman_opt::OptimizedTable,
2635    ) -> Result<Vec<u8>> {
2636        let mut encoder = EntropyEncoder::new();
2637
2638        // Use the same optimized table for all components
2639        encoder.set_dc_table(0, dc_table.table.clone());
2640        encoder.set_ac_table(0, ac_table.table.clone());
2641
2642        if self.config.restart_interval > 0 {
2643            encoder.set_restart_interval(self.config.restart_interval);
2644        }
2645
2646        let mcu_count = b_blocks.len();
2647        for mcu_idx in 0..mcu_count {
2648            // X blocks (4 per MCU)
2649            let x_start = mcu_idx * 4;
2650            for i in 0..4 {
2651                encoder.encode_block(&x_blocks[x_start + i], 0, 0, 0)?;
2652            }
2653
2654            // Y blocks (4 per MCU)
2655            let y_start = mcu_idx * 4;
2656            for i in 0..4 {
2657                encoder.encode_block(&y_blocks[y_start + i], 1, 0, 0)?;
2658            }
2659
2660            // B block (1 per MCU)
2661            encoder.encode_block(&b_blocks[mcu_idx], 2, 0, 0)?;
2662
2663            encoder.check_restart();
2664        }
2665
2666        Ok(encoder.finish())
2667    }
2668
2669    /// Writes DHT markers for XYB optimized tables.
2670    fn write_huffman_tables_xyb_optimized(
2671        &self,
2672        output: &mut Vec<u8>,
2673        dc_table: &crate::huffman_opt::OptimizedTable,
2674        ac_table: &crate::huffman_opt::OptimizedTable,
2675    ) {
2676        let write_table = |out: &mut Vec<u8>, class: u8, id: u8, bits: &[u8; 16], values: &[u8]| {
2677            out.push(0xFF);
2678            out.push(MARKER_DHT);
2679            let length = 2 + 1 + 16 + values.len();
2680            out.push((length >> 8) as u8);
2681            out.push(length as u8);
2682            out.push((class << 4) | id);
2683            out.extend_from_slice(bits);
2684            out.extend_from_slice(values);
2685        };
2686
2687        // DC table (class=0, id=0)
2688        write_table(output, 0, 0, &dc_table.bits, &dc_table.values);
2689        // AC table (class=1, id=0)
2690        write_table(output, 1, 0, &ac_table.bits, &ac_table.values);
2691    }
2692
2693    /// Encodes scan data for XYB mode with float planes.
2694    ///
2695    /// Uses scaled XYB values (in [0, 1] range), converts to [0, 255],
2696    /// then level shifts by subtracting 128 before DCT.
2697    #[allow(clippy::too_many_arguments)]
2698    fn encode_scan_xyb_float(
2699        &self,
2700        x_plane: &[f32],
2701        y_plane: &[f32],
2702        b_plane: &[f32], // Already downsampled
2703        width: usize,
2704        height: usize,
2705        b_width: usize,
2706        b_height: usize,
2707        x_quant: &QuantTable,
2708        y_quant: &QuantTable,
2709        b_quant: &QuantTable,
2710    ) -> Result<Vec<u8>> {
2711        let mut encoder = EntropyEncoder::new();
2712
2713        // Set up Huffman tables - use luminance tables for all components in XYB mode
2714        encoder.set_dc_table(0, HuffmanEncodeTable::std_dc_luminance());
2715        encoder.set_ac_table(0, HuffmanEncodeTable::std_ac_luminance());
2716
2717        if self.config.restart_interval > 0 {
2718            encoder.set_restart_interval(self.config.restart_interval);
2719        }
2720
2721        // MCU size for 2×2, 2×2, 1×1 sampling: 16×16 pixels
2722        // Each MCU contains: 4 X blocks + 4 Y blocks + 1 B block = 9 blocks
2723        let mcu_cols = (width + 15) / 16;
2724        let mcu_rows = (height + 15) / 16;
2725
2726        for mcu_y in 0..mcu_rows {
2727            for mcu_x in 0..mcu_cols {
2728                // Process 4 X blocks (2×2 arrangement within 16×16 MCU)
2729                for block_y in 0..2 {
2730                    for block_x in 0..2 {
2731                        let bx = mcu_x * 2 + block_x;
2732                        let by = mcu_y * 2 + block_y;
2733                        let x_block = self.extract_block_f32(x_plane, width, height, bx, by);
2734                        let x_dct = forward_dct_8x8(&x_block);
2735                        let x_quant_coeffs = quant::quantize_block(&x_dct, &x_quant.values);
2736                        let x_zigzag = natural_to_zigzag(&x_quant_coeffs);
2737                        encoder.encode_block(&x_zigzag, 0, 0, 0)?;
2738                    }
2739                }
2740
2741                // Process 4 Y blocks (2×2 arrangement within 16×16 MCU)
2742                for block_y in 0..2 {
2743                    for block_x in 0..2 {
2744                        let bx = mcu_x * 2 + block_x;
2745                        let by = mcu_y * 2 + block_y;
2746                        let y_block = self.extract_block_f32(y_plane, width, height, bx, by);
2747                        let y_dct = forward_dct_8x8(&y_block);
2748                        let y_quant_coeffs = quant::quantize_block(&y_dct, &y_quant.values);
2749                        let y_zigzag = natural_to_zigzag(&y_quant_coeffs);
2750                        encoder.encode_block(&y_zigzag, 1, 0, 0)?;
2751                    }
2752                }
2753
2754                // Process 1 B block (from downsampled plane)
2755                let b_block = self.extract_block_f32(b_plane, b_width, b_height, mcu_x, mcu_y);
2756                let b_dct = forward_dct_8x8(&b_block);
2757                let b_quant_coeffs = quant::quantize_block(&b_dct, &b_quant.values);
2758                let b_zigzag = natural_to_zigzag(&b_quant_coeffs);
2759                encoder.encode_block(&b_zigzag, 2, 0, 0)?;
2760
2761                encoder.check_restart();
2762            }
2763        }
2764
2765        Ok(encoder.finish())
2766    }
2767
2768    /// Extracts an 8x8 block from a float plane (scaled XYB values).
2769    ///
2770    /// Scaled XYB values are in [0, 1] range. This method:
2771    /// 1. Multiplies by 255 to get to [0, 255] range
2772    /// 2. Subtracts 128 for level shifting (DCT input is [-128, 127])
2773    fn extract_block_f32(
2774        &self,
2775        plane: &[f32],
2776        width: usize,
2777        height: usize,
2778        bx: usize,
2779        by: usize,
2780    ) -> [f32; DCT_BLOCK_SIZE] {
2781        let mut block = [0.0f32; DCT_BLOCK_SIZE];
2782
2783        for y in 0..DCT_SIZE {
2784            for x in 0..DCT_SIZE {
2785                let px = (bx * DCT_SIZE + x).min(width - 1);
2786                let py = (by * DCT_SIZE + y).min(height - 1);
2787                let idx = py * width + px;
2788                // Scale from [0, 1] to [0, 255], then level shift by -128
2789                block[y * DCT_SIZE + x] = plane[idx] * 255.0 - 128.0;
2790            }
2791        }
2792
2793        block
2794    }
2795
2796    /// Extracts an 8x8 block from a u8 plane with level shift.
2797    #[allow(dead_code)]
2798    fn extract_block(
2799        &self,
2800        plane: &[u8],
2801        width: usize,
2802        height: usize,
2803        bx: usize,
2804        by: usize,
2805    ) -> [f32; DCT_BLOCK_SIZE] {
2806        let mut block = [0.0f32; DCT_BLOCK_SIZE];
2807
2808        for y in 0..DCT_SIZE {
2809            for x in 0..DCT_SIZE {
2810                let px = (bx * DCT_SIZE + x).min(width - 1);
2811                let py = (by * DCT_SIZE + y).min(height - 1);
2812                let idx = py * width + px;
2813                // Level shift: subtract 128
2814                block[y * DCT_SIZE + x] = plane[idx] as f32 - 128.0;
2815            }
2816        }
2817
2818        block
2819    }
2820
2821    /// Extracts an 8x8 block from a YCbCr f32 plane with level shift.
2822    /// Input values are in [0, 255] range, output is level-shifted by -128.
2823    fn extract_block_ycbcr_f32(
2824        &self,
2825        plane: &[f32],
2826        width: usize,
2827        height: usize,
2828        bx: usize,
2829        by: usize,
2830    ) -> [f32; DCT_BLOCK_SIZE] {
2831        let mut block = [0.0f32; DCT_BLOCK_SIZE];
2832
2833        for y in 0..DCT_SIZE {
2834            for x in 0..DCT_SIZE {
2835                let px = (bx * DCT_SIZE + x).min(width - 1);
2836                let py = (by * DCT_SIZE + y).min(height - 1);
2837                let idx = py * width + px;
2838                // Level shift: subtract 128 (values are already in [0, 255])
2839                block[y * DCT_SIZE + x] = plane[idx] - 128.0;
2840            }
2841        }
2842
2843        block
2844    }
2845}
2846
2847impl Default for Encoder {
2848    fn default() -> Self {
2849        Self::new()
2850    }
2851}
2852
2853/// Converts coefficients from natural order to zigzag order for JPEG encoding.
2854fn natural_to_zigzag(natural: &[i16; DCT_BLOCK_SIZE]) -> [i16; DCT_BLOCK_SIZE] {
2855    let mut zigzag = [0i16; DCT_BLOCK_SIZE];
2856    for i in 0..DCT_BLOCK_SIZE {
2857        zigzag[JPEG_ZIGZAG_ORDER[i] as usize] = natural[i];
2858    }
2859    zigzag
2860}
2861
2862#[cfg(test)]
2863mod tests {
2864    use super::*;
2865
2866    #[test]
2867    fn test_encoder_creation() {
2868        let encoder = Encoder::new()
2869            .width(640)
2870            .height(480)
2871            .quality(Quality::from_quality(90.0));
2872
2873        assert_eq!(encoder.config.width, 640);
2874        assert_eq!(encoder.config.height, 480);
2875    }
2876
2877    #[test]
2878    fn test_encoder_validation() {
2879        let encoder = Encoder::new();
2880        assert!(encoder.validate().is_err());
2881
2882        let encoder = Encoder::new().width(100).height(100);
2883        assert!(encoder.validate().is_ok());
2884    }
2885
2886    #[test]
2887    fn test_encode_small_gray() {
2888        let encoder = Encoder::new()
2889            .width(8)
2890            .height(8)
2891            .pixel_format(PixelFormat::Gray)
2892            .quality(Quality::from_quality(90.0));
2893
2894        let data = vec![128u8; 64];
2895        let result = encoder.encode(&data);
2896        assert!(result.is_ok());
2897
2898        let jpeg = result.unwrap();
2899        // Should start with SOI
2900        assert_eq!(jpeg[0], 0xFF);
2901        assert_eq!(jpeg[1], MARKER_SOI);
2902        // Should end with EOI
2903        assert_eq!(jpeg[jpeg.len() - 2], 0xFF);
2904        assert_eq!(jpeg[jpeg.len() - 1], MARKER_EOI);
2905    }
2906
2907    #[test]
2908    fn test_encode_rgb_xyb_mode() {
2909        // Test XYB mode encoding with a 16x16 RGB image
2910        let encoder = Encoder::new()
2911            .width(16)
2912            .height(16)
2913            .pixel_format(PixelFormat::Rgb)
2914            .quality(Quality::from_quality(90.0))
2915            .use_xyb(true);
2916
2917        // Create a simple gradient test image
2918        let mut data = vec![0u8; 16 * 16 * 3];
2919        for y in 0..16 {
2920            for x in 0..16 {
2921                let idx = (y * 16 + x) * 3;
2922                data[idx] = (x * 16) as u8; // Red gradient
2923                data[idx + 1] = (y * 16) as u8; // Green gradient
2924                data[idx + 2] = 128; // Constant blue
2925            }
2926        }
2927
2928        let result = encoder.encode(&data);
2929        assert!(result.is_ok(), "XYB encoding failed: {:?}", result.err());
2930
2931        let jpeg = result.unwrap();
2932        // Should start with SOI
2933        assert_eq!(jpeg[0], 0xFF);
2934        assert_eq!(jpeg[1], MARKER_SOI);
2935        // Should end with EOI
2936        assert_eq!(jpeg[jpeg.len() - 2], 0xFF);
2937        assert_eq!(jpeg[jpeg.len() - 1], MARKER_EOI);
2938
2939        // Should be a valid size (not too small)
2940        assert!(jpeg.len() > 100, "JPEG too small: {} bytes", jpeg.len());
2941        println!("XYB encoded JPEG size: {} bytes", jpeg.len());
2942    }
2943
2944    #[test]
2945    fn test_encode_rgb_xyb_larger() {
2946        // Test XYB mode with a larger image (32x32)
2947        let encoder = Encoder::new()
2948            .width(32)
2949            .height(32)
2950            .pixel_format(PixelFormat::Rgb)
2951            .quality(Quality::from_quality(75.0))
2952            .use_xyb(true);
2953
2954        // Create a test pattern
2955        let mut data = vec![0u8; 32 * 32 * 3];
2956        for y in 0..32 {
2957            for x in 0..32 {
2958                let idx = (y * 32 + x) * 3;
2959                // Checkerboard pattern
2960                let checker = ((x / 4) + (y / 4)) % 2 == 0;
2961                data[idx] = if checker { 255 } else { 0 }; // Red
2962                data[idx + 1] = if checker { 0 } else { 255 }; // Green
2963                data[idx + 2] = 128; // Blue
2964            }
2965        }
2966
2967        let result = encoder.encode(&data);
2968        assert!(result.is_ok(), "XYB encoding failed: {:?}", result.err());
2969
2970        let jpeg = result.unwrap();
2971        assert_eq!(jpeg[0], 0xFF);
2972        assert_eq!(jpeg[1], MARKER_SOI);
2973        assert_eq!(jpeg[jpeg.len() - 2], 0xFF);
2974        assert_eq!(jpeg[jpeg.len() - 1], MARKER_EOI);
2975        println!("XYB encoded 32x32 JPEG size: {} bytes", jpeg.len());
2976    }
2977
2978    #[test]
2979    fn test_huffman_optimization_produces_valid_jpeg() {
2980        // Create a gradient test image
2981        let width = 64u32;
2982        let height = 64u32;
2983        let mut data = vec![0u8; (width * height * 3) as usize];
2984
2985        for y in 0..height as usize {
2986            for x in 0..width as usize {
2987                let idx = (y * width as usize + x) * 3;
2988                data[idx] = (x * 4) as u8; // R
2989                data[idx + 1] = (y * 4) as u8; // G
2990                data[idx + 2] = ((x + y) * 2) as u8; // B
2991            }
2992        }
2993
2994        let encoder = Encoder::new()
2995            .width(width)
2996            .height(height)
2997            .quality(Quality::from_quality(75.0))
2998            .optimize_huffman(true);
2999
3000        let result = encoder.encode(&data);
3001        assert!(
3002            result.is_ok(),
3003            "Optimized Huffman encoding failed: {:?}",
3004            result.err()
3005        );
3006
3007        let jpeg = result.unwrap();
3008        assert_eq!(jpeg[0], 0xFF);
3009        assert_eq!(jpeg[1], MARKER_SOI);
3010        assert_eq!(jpeg[jpeg.len() - 2], 0xFF);
3011        assert_eq!(jpeg[jpeg.len() - 1], MARKER_EOI);
3012
3013        // Verify it's decodable
3014        let decoded = jpeg_decoder::Decoder::new(&jpeg[..]).decode();
3015        assert!(
3016            decoded.is_ok(),
3017            "Optimized JPEG not decodable: {:?}",
3018            decoded.err()
3019        );
3020    }
3021
3022    #[test]
3023    fn test_huffman_optimization_reduces_file_size() {
3024        // Create a more complex test image that benefits from optimization
3025        let width = 128u32;
3026        let height = 128u32;
3027        let mut data = vec![0u8; (width * height * 3) as usize];
3028
3029        // Create a pattern that will have non-uniform symbol frequencies
3030        for y in 0..height as usize {
3031            for x in 0..width as usize {
3032                let idx = (y * width as usize + x) * 3;
3033                // Create blocks with varying content
3034                let block_type = ((x / 16) + (y / 16)) % 4;
3035                match block_type {
3036                    0 => {
3037                        // Solid color
3038                        data[idx] = 180;
3039                        data[idx + 1] = 180;
3040                        data[idx + 2] = 180;
3041                    }
3042                    1 => {
3043                        // Gradient
3044                        data[idx] = (x * 2) as u8;
3045                        data[idx + 1] = (y * 2) as u8;
3046                        data[idx + 2] = 100;
3047                    }
3048                    2 => {
3049                        // Checkerboard
3050                        let checker = ((x + y) % 2) as u8 * 255;
3051                        data[idx] = checker;
3052                        data[idx + 1] = checker;
3053                        data[idx + 2] = checker;
3054                    }
3055                    _ => {
3056                        // Texture
3057                        data[idx] = ((x * 5 + y * 3) % 256) as u8;
3058                        data[idx + 1] = ((x * 3 + y * 7) % 256) as u8;
3059                        data[idx + 2] = ((x * 2 + y * 2) % 256) as u8;
3060                    }
3061                }
3062            }
3063        }
3064
3065        // Encode without optimization
3066        let jpeg_standard = Encoder::new()
3067            .width(width)
3068            .height(height)
3069            .quality(Quality::from_quality(75.0))
3070            .optimize_huffman(false)
3071            .encode(&data)
3072            .expect("Standard encoding failed");
3073
3074        // Encode with optimization
3075        let jpeg_optimized = Encoder::new()
3076            .width(width)
3077            .height(height)
3078            .quality(Quality::from_quality(75.0))
3079            .optimize_huffman(true)
3080            .encode(&data)
3081            .expect("Optimized encoding failed");
3082
3083        println!(
3084            "Standard size: {} bytes, Optimized size: {} bytes, Savings: {:.1}%",
3085            jpeg_standard.len(),
3086            jpeg_optimized.len(),
3087            (1.0 - jpeg_optimized.len() as f64 / jpeg_standard.len() as f64) * 100.0
3088        );
3089
3090        // Optimized should be smaller or equal (never larger)
3091        assert!(
3092            jpeg_optimized.len() <= jpeg_standard.len(),
3093            "Optimized ({}) should not be larger than standard ({})",
3094            jpeg_optimized.len(),
3095            jpeg_standard.len()
3096        );
3097
3098        // Verify both are decodable
3099        let decoded_std = jpeg_decoder::Decoder::new(&jpeg_standard[..]).decode();
3100        let decoded_opt = jpeg_decoder::Decoder::new(&jpeg_optimized[..]).decode();
3101        assert!(decoded_std.is_ok(), "Standard JPEG not decodable");
3102        assert!(decoded_opt.is_ok(), "Optimized JPEG not decodable");
3103    }
3104
3105    #[test]
3106    fn test_xyb_huffman_optimization() {
3107        // Create test image for XYB mode
3108        let width = 64u32;
3109        let height = 64u32;
3110        let mut data = vec![0u8; (width * height * 3) as usize];
3111
3112        for y in 0..height as usize {
3113            for x in 0..width as usize {
3114                let idx = (y * width as usize + x) * 3;
3115                data[idx] = (x * 4) as u8;
3116                data[idx + 1] = (y * 4) as u8;
3117                data[idx + 2] = ((x + y) * 2) as u8;
3118            }
3119        }
3120
3121        // Encode XYB without optimization
3122        let jpeg_standard = Encoder::new()
3123            .width(width)
3124            .height(height)
3125            .quality(Quality::from_quality(75.0))
3126            .use_xyb(true)
3127            .optimize_huffman(false)
3128            .encode(&data)
3129            .expect("Standard XYB encoding failed");
3130
3131        // Encode XYB with optimization
3132        let jpeg_optimized = Encoder::new()
3133            .width(width)
3134            .height(height)
3135            .quality(Quality::from_quality(75.0))
3136            .use_xyb(true)
3137            .optimize_huffman(true)
3138            .encode(&data)
3139            .expect("Optimized XYB encoding failed");
3140
3141        println!(
3142            "XYB Standard: {} bytes, Optimized: {} bytes, Savings: {:.1}%",
3143            jpeg_standard.len(),
3144            jpeg_optimized.len(),
3145            (1.0 - jpeg_optimized.len() as f64 / jpeg_standard.len() as f64) * 100.0
3146        );
3147
3148        // Verify both have valid JPEG structure
3149        assert_eq!(jpeg_standard[0], 0xFF);
3150        assert_eq!(jpeg_standard[1], MARKER_SOI);
3151        assert_eq!(jpeg_optimized[0], 0xFF);
3152        assert_eq!(jpeg_optimized[1], MARKER_SOI);
3153
3154        // Optimized should be smaller or equal
3155        assert!(
3156            jpeg_optimized.len() <= jpeg_standard.len(),
3157            "XYB Optimized ({}) should not be larger than standard ({})",
3158            jpeg_optimized.len(),
3159            jpeg_standard.len()
3160        );
3161    }
3162}