mozjpeg_rs/
encode.rs

1//! JPEG encoder pipeline.
2//!
3//! This module provides two encoder types:
4//!
5//! - [`Encoder`]: Full-featured encoder with trellis quantization, progressive mode,
6//!   and Huffman optimization. Batch encoding only.
7//! - [`StreamingEncoder`]: Streaming-capable encoder without optimizations.
8//!   Supports both batch and scanline-by-scanline encoding.
9//!
10//! Both implement the [`Encode`] trait for batch encoding.
11//!
12//! # Examples
13//!
14//! ```ignore
15//! use mozjpeg_rs::Encoder;
16//!
17//! // Full-featured batch encoding
18//! let jpeg = Encoder::new()
19//!     .quality(85)
20//!     .progressive(true)
21//!     .encode_rgb(&pixels, width, height)?;
22//!
23//! // Streaming encoding (memory-efficient for large images)
24//! let mut stream = Encoder::streaming()
25//!     .quality(85)
26//!     .start(width, height, file)?;
27//! for row in scanlines.chunks(16) {
28//!     stream.write_scanlines(row)?;
29//! }
30//! stream.finish()?;
31//! ```
32
33use std::io::Write;
34
35use crate::bitstream::BitWriter;
36use crate::consts::{QuantTableIdx, DCTSIZE, DCTSIZE2};
37use crate::deringing::preprocess_deringing;
38use crate::entropy::{EntropyEncoder, ProgressiveEncoder, ProgressiveSymbolCounter, SymbolCounter};
39use crate::error::{Error, Result};
40use crate::huffman::DerivedTable;
41use crate::huffman::FrequencyCounter;
42use crate::marker::MarkerWriter;
43use crate::progressive::{generate_baseline_scan, generate_mozjpeg_max_compression_scans};
44use crate::quant::{create_quant_tables, quantize_block_raw};
45use crate::sample;
46use crate::scan_optimize::{generate_search_scans, ScanSearchConfig, ScanSelector};
47use crate::scan_trial::ScanTrialEncoder;
48use crate::simd::SimdOps;
49use crate::trellis::trellis_quantize_block;
50use crate::types::{PixelDensity, Subsampling, TrellisConfig};
51
52mod helpers;
53mod streaming;
54
55pub(crate) use helpers::{
56    create_components, create_std_ac_chroma_table, create_std_ac_luma_table,
57    create_std_dc_chroma_table, create_std_dc_luma_table, create_ycbcr_components,
58    natural_to_zigzag, run_dc_trellis_by_row, try_alloc_vec, try_alloc_vec_array, write_dht_marker,
59    write_sos_marker,
60};
61pub use streaming::{EncodingStream, StreamingEncoder};
62
63// ============================================================================
64// Encode Trait (internal, for potential future streaming API)
65// ============================================================================
66
67/// Trait for JPEG encoding (batch mode).
68///
69/// Implemented by both [`Encoder`] and [`StreamingEncoder`].
70#[allow(dead_code)]
71pub trait Encode {
72    /// Encode RGB image data to JPEG.
73    ///
74    /// # Arguments
75    /// * `rgb_data` - RGB pixel data (3 bytes per pixel, row-major order)
76    /// * `width` - Image width in pixels
77    /// * `height` - Image height in pixels
78    fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>>;
79
80    /// Encode grayscale image data to JPEG.
81    ///
82    /// # Arguments
83    /// * `gray_data` - Grayscale pixel data (1 byte per pixel, row-major order)
84    /// * `width` - Image width in pixels
85    /// * `height` - Image height in pixels
86    fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>>;
87}
88
89/// JPEG encoder with configurable quality and features.
90#[derive(Debug, Clone)]
91pub struct Encoder {
92    /// Quality level (1-100)
93    quality: u8,
94    /// Enable progressive mode
95    progressive: bool,
96    /// Chroma subsampling mode
97    subsampling: Subsampling,
98    /// Quantization table variant
99    quant_table_idx: QuantTableIdx,
100    /// Custom luminance quantization table (overrides quant_table_idx if set)
101    custom_luma_qtable: Option<[u16; DCTSIZE2]>,
102    /// Custom chrominance quantization table (overrides quant_table_idx if set)
103    custom_chroma_qtable: Option<[u16; DCTSIZE2]>,
104    /// Trellis quantization configuration
105    trellis: TrellisConfig,
106    /// Force baseline-compatible output
107    force_baseline: bool,
108    /// Optimize Huffman tables (requires 2-pass)
109    optimize_huffman: bool,
110    /// Enable overshoot deringing (reduces ringing on white backgrounds)
111    overshoot_deringing: bool,
112    /// Optimize progressive scan configuration (tries multiple configs, picks smallest)
113    optimize_scans: bool,
114    /// Restart interval in MCUs (0 = disabled)
115    restart_interval: u16,
116    /// Pixel density for JFIF APP0 marker
117    pixel_density: PixelDensity,
118    /// EXIF data to embed (raw TIFF structure, without "Exif\0\0" header)
119    exif_data: Option<Vec<u8>>,
120    /// ICC color profile to embed (will be chunked into APP2 markers)
121    icc_profile: Option<Vec<u8>>,
122    /// Custom APP markers to embed (marker number 0-15, data)
123    custom_markers: Vec<(u8, Vec<u8>)>,
124    /// SIMD operations dispatch (detected once at construction)
125    simd: SimdOps,
126}
127
128impl Default for Encoder {
129    fn default() -> Self {
130        Self::new()
131    }
132}
133
134impl Encoder {
135    /// Create a new encoder with default settings (mozjpeg defaults).
136    ///
137    /// Default configuration:
138    /// - Quality: 75
139    /// - Progressive: false
140    /// - Subsampling: 4:2:0
141    /// - Quant tables: ImageMagick (mozjpeg default)
142    /// - Trellis: enabled (core mozjpeg optimization)
143    /// - Huffman optimization: enabled (2-pass for optimal tables)
144    /// - Overshoot deringing: enabled (reduces ringing on edges)
145    pub fn new() -> Self {
146        Self {
147            quality: 75,
148            progressive: false,
149            subsampling: Subsampling::S420,
150            quant_table_idx: QuantTableIdx::ImageMagick,
151            custom_luma_qtable: None,
152            custom_chroma_qtable: None,
153            trellis: TrellisConfig::default(),
154            force_baseline: false,
155            optimize_huffman: true,
156            overshoot_deringing: true,
157            optimize_scans: false,
158            restart_interval: 0,
159            pixel_density: PixelDensity::default(),
160            exif_data: None,
161            icc_profile: None,
162            custom_markers: Vec::new(),
163            simd: SimdOps::detect(),
164        }
165    }
166
167    /// Create encoder with max compression settings (mozjpeg defaults).
168    ///
169    /// Enables progressive mode, trellis quantization, Huffman optimization,
170    /// and overshoot deringing.
171    ///
172    /// Note: optimize_scans tries multiple scan configurations to find the smallest.
173    /// Results may differ from C mozjpeg's default 9-scan successive approximation script.
174    pub fn max_compression() -> Self {
175        Self {
176            quality: 75,
177            progressive: true,
178            subsampling: Subsampling::S420,
179            quant_table_idx: QuantTableIdx::ImageMagick,
180            custom_luma_qtable: None,
181            custom_chroma_qtable: None,
182            trellis: TrellisConfig::default(),
183            force_baseline: false,
184            optimize_huffman: true,
185            overshoot_deringing: true,
186            optimize_scans: true,
187            restart_interval: 0,
188            pixel_density: PixelDensity::default(),
189            exif_data: None,
190            icc_profile: None,
191            custom_markers: Vec::new(),
192            simd: SimdOps::detect(),
193        }
194    }
195
196    /// Create encoder with fastest settings (libjpeg-turbo compatible).
197    ///
198    /// Disables all mozjpeg optimizations (trellis, Huffman optimization, deringing).
199    /// Uses ImageMagick quant tables (same as C mozjpeg defaults from jpeg_set_defaults).
200    pub fn fastest() -> Self {
201        Self {
202            quality: 75,
203            progressive: false,
204            subsampling: Subsampling::S420,
205            quant_table_idx: QuantTableIdx::ImageMagick,
206            custom_luma_qtable: None,
207            custom_chroma_qtable: None,
208            trellis: TrellisConfig::disabled(),
209            force_baseline: true,
210            optimize_huffman: false,
211            overshoot_deringing: false,
212            optimize_scans: false,
213            restart_interval: 0,
214            pixel_density: PixelDensity::default(),
215            exif_data: None,
216            icc_profile: None,
217            custom_markers: Vec::new(),
218            simd: SimdOps::detect(),
219        }
220    }
221
222    /// Set quality level (1-100).
223    ///
224    /// Higher values produce larger, higher-quality images.
225    pub fn quality(mut self, quality: u8) -> Self {
226        self.quality = quality.clamp(1, 100);
227        self
228    }
229
230    /// Enable or disable progressive mode.
231    pub fn progressive(mut self, enable: bool) -> Self {
232        self.progressive = enable;
233        self
234    }
235
236    /// Set chroma subsampling mode.
237    pub fn subsampling(mut self, mode: Subsampling) -> Self {
238        self.subsampling = mode;
239        self
240    }
241
242    /// Set quantization table variant.
243    pub fn quant_tables(mut self, idx: QuantTableIdx) -> Self {
244        self.quant_table_idx = idx;
245        self
246    }
247
248    /// Configure trellis quantization.
249    pub fn trellis(mut self, config: TrellisConfig) -> Self {
250        self.trellis = config;
251        self
252    }
253
254    /// Force baseline-compatible output.
255    pub fn force_baseline(mut self, enable: bool) -> Self {
256        self.force_baseline = enable;
257        self
258    }
259
260    /// Enable Huffman table optimization.
261    pub fn optimize_huffman(mut self, enable: bool) -> Self {
262        self.optimize_huffman = enable;
263        self
264    }
265
266    /// Enable overshoot deringing.
267    ///
268    /// Reduces visible ringing artifacts near hard edges, especially on white
269    /// backgrounds. Works by allowing encoded values to "overshoot" above 255
270    /// (which will clamp back to 255 when decoded) to create smoother waveforms.
271    ///
272    /// This is a mozjpeg-specific feature that can improve visual quality at
273    /// minimal file size cost. Enabled by default.
274    pub fn overshoot_deringing(mut self, enable: bool) -> Self {
275        self.overshoot_deringing = enable;
276        self
277    }
278
279    /// Enable or disable scan optimization for progressive mode.
280    ///
281    /// When enabled, the encoder tries multiple scan configurations and
282    /// picks the one that produces the smallest output. This can improve
283    /// compression by 1-3% but increases encoding time.
284    ///
285    /// Only has effect when progressive mode is enabled.
286    pub fn optimize_scans(mut self, enable: bool) -> Self {
287        self.optimize_scans = enable;
288        self
289    }
290
291    /// Set restart interval in MCUs.
292    ///
293    /// Restart markers are inserted every N MCUs, which can help with
294    /// error recovery and parallel decoding. Set to 0 to disable (default).
295    ///
296    /// Common values: 0 (disabled), or image width in MCUs for row-by-row restarts.
297    pub fn restart_interval(mut self, interval: u16) -> Self {
298        self.restart_interval = interval;
299        self
300    }
301
302    /// Set EXIF data to embed in the JPEG.
303    ///
304    /// # Arguments
305    /// * `data` - Raw EXIF data (TIFF structure). The "Exif\0\0" header
306    ///   will be added automatically.
307    ///
308    /// Pass empty or call without this method to omit EXIF data.
309    pub fn exif_data(mut self, data: Vec<u8>) -> Self {
310        self.exif_data = if data.is_empty() { None } else { Some(data) };
311        self
312    }
313
314    /// Set pixel density for the JFIF APP0 marker.
315    ///
316    /// This specifies the physical pixel density (DPI/DPC) or aspect ratio.
317    /// Note that most software ignores JFIF density in favor of EXIF metadata.
318    ///
319    /// # Example
320    /// ```
321    /// use mozjpeg_rs::{Encoder, PixelDensity};
322    ///
323    /// let encoder = Encoder::new()
324    ///     .pixel_density(PixelDensity::dpi(300, 300)); // 300 DPI
325    /// ```
326    pub fn pixel_density(mut self, density: PixelDensity) -> Self {
327        self.pixel_density = density;
328        self
329    }
330
331    /// Set ICC color profile to embed.
332    ///
333    /// The profile will be embedded in APP2 markers with the standard
334    /// "ICC_PROFILE" identifier. Large profiles are automatically chunked.
335    ///
336    /// # Arguments
337    /// * `profile` - Raw ICC profile data
338    pub fn icc_profile(mut self, profile: Vec<u8>) -> Self {
339        self.icc_profile = if profile.is_empty() {
340            None
341        } else {
342            Some(profile)
343        };
344        self
345    }
346
347    /// Add a custom APP marker.
348    ///
349    /// # Arguments
350    /// * `app_num` - APP marker number (0-15, e.g., 1 for EXIF, 2 for ICC)
351    /// * `data` - Raw marker data (including any identifier prefix)
352    ///
353    /// Multiple markers with the same number are allowed.
354    /// Markers are written in the order they are added.
355    pub fn add_marker(mut self, app_num: u8, data: Vec<u8>) -> Self {
356        if app_num <= 15 && !data.is_empty() {
357            self.custom_markers.push((app_num, data));
358        }
359        self
360    }
361
362    /// Set custom luminance quantization table.
363    ///
364    /// This overrides the table selected by `quant_tables()`.
365    /// Values should be in natural (row-major) order, not zigzag.
366    ///
367    /// # Arguments
368    /// * `table` - 64 quantization values (quality scaling still applies)
369    pub fn custom_luma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
370        self.custom_luma_qtable = Some(table);
371        self
372    }
373
374    /// Set custom chrominance quantization table.
375    ///
376    /// This overrides the table selected by `quant_tables()`.
377    /// Values should be in natural (row-major) order, not zigzag.
378    ///
379    /// # Arguments
380    /// * `table` - 64 quantization values (quality scaling still applies)
381    pub fn custom_chroma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
382        self.custom_chroma_qtable = Some(table);
383        self
384    }
385
386    /// Encode RGB image data to JPEG.
387    ///
388    /// # Arguments
389    /// * `rgb_data` - RGB pixel data (3 bytes per pixel, row-major)
390    /// * `width` - Image width in pixels
391    /// * `height` - Image height in pixels
392    ///
393    /// # Returns
394    /// JPEG-encoded data as a `Vec<u8>`.
395    pub fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
396        // Validate dimensions: must be non-zero
397        if width == 0 || height == 0 {
398            return Err(Error::InvalidDimensions { width, height });
399        }
400
401        // Use checked arithmetic to prevent overflow
402        let expected_len = (width as usize)
403            .checked_mul(height as usize)
404            .and_then(|n| n.checked_mul(3))
405            .ok_or(Error::InvalidDimensions { width, height })?;
406
407        if rgb_data.len() != expected_len {
408            return Err(Error::BufferSizeMismatch {
409                expected: expected_len,
410                actual: rgb_data.len(),
411            });
412        }
413
414        let mut output = Vec::new();
415        self.encode_rgb_to_writer(rgb_data, width, height, &mut output)?;
416        Ok(output)
417    }
418
419    /// Encode grayscale image data to JPEG.
420    ///
421    /// # Arguments
422    /// * `gray_data` - Grayscale pixel data (1 byte per pixel, row-major)
423    /// * `width` - Image width in pixels
424    /// * `height` - Image height in pixels
425    ///
426    /// # Returns
427    /// JPEG-encoded data as a `Vec<u8>`.
428    pub fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
429        // Validate dimensions: must be non-zero
430        if width == 0 || height == 0 {
431            return Err(Error::InvalidDimensions { width, height });
432        }
433
434        // Use checked arithmetic to prevent overflow
435        let expected_len = (width as usize)
436            .checked_mul(height as usize)
437            .ok_or(Error::InvalidDimensions { width, height })?;
438
439        if gray_data.len() != expected_len {
440            return Err(Error::BufferSizeMismatch {
441                expected: expected_len,
442                actual: gray_data.len(),
443            });
444        }
445
446        let mut output = Vec::new();
447        self.encode_gray_to_writer(gray_data, width, height, &mut output)?;
448        Ok(output)
449    }
450
451    /// Encode grayscale image data to a writer.
452    pub fn encode_gray_to_writer<W: Write>(
453        &self,
454        gray_data: &[u8],
455        width: u32,
456        height: u32,
457        output: W,
458    ) -> Result<()> {
459        let width = width as usize;
460        let height = height as usize;
461
462        // For grayscale, Y plane is the input directly (no conversion needed)
463        let y_plane = gray_data;
464
465        // Grayscale uses 1x1 sampling
466        let (mcu_width, mcu_height) = sample::mcu_aligned_dimensions(width, height, 1, 1);
467
468        let mcu_y_size = mcu_width
469            .checked_mul(mcu_height)
470            .ok_or(Error::AllocationFailed)?;
471        let mut y_mcu = try_alloc_vec(0u8, mcu_y_size)?;
472        sample::expand_to_mcu(y_plane, width, height, &mut y_mcu, mcu_width, mcu_height);
473
474        // Create quantization table (only luma needed)
475        let luma_qtable = if let Some(ref custom) = self.custom_luma_qtable {
476            crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
477        } else {
478            let (luma, _) =
479                create_quant_tables(self.quality, self.quant_table_idx, self.force_baseline);
480            luma
481        };
482
483        // Create Huffman tables (only luma needed)
484        let dc_luma_huff = create_std_dc_luma_table();
485        let ac_luma_huff = create_std_ac_luma_table();
486        let dc_luma_derived = DerivedTable::from_huff_table(&dc_luma_huff, true)?;
487        let ac_luma_derived = DerivedTable::from_huff_table(&ac_luma_huff, false)?;
488
489        // Single component for grayscale
490        let components = create_components(Subsampling::Gray);
491
492        // Write JPEG file
493        let mut marker_writer = MarkerWriter::new(output);
494
495        // SOI
496        marker_writer.write_soi()?;
497
498        // APP0 (JFIF) with pixel density
499        marker_writer.write_jfif_app0(
500            self.pixel_density.unit as u8,
501            self.pixel_density.x,
502            self.pixel_density.y,
503        )?;
504
505        // EXIF (if present)
506        if let Some(ref exif) = self.exif_data {
507            marker_writer.write_app1_exif(exif)?;
508        }
509
510        // ICC profile (if present)
511        if let Some(ref icc) = self.icc_profile {
512            marker_writer.write_icc_profile(icc)?;
513        }
514
515        // Custom APP markers
516        for (app_num, data) in &self.custom_markers {
517            marker_writer.write_app(*app_num, data)?;
518        }
519
520        // DQT (only luma table for grayscale)
521        let luma_qtable_zz = natural_to_zigzag(&luma_qtable.values);
522        marker_writer.write_dqt(0, &luma_qtable_zz, false)?;
523
524        // SOF (baseline for grayscale - progressive would need multi-scan support)
525        marker_writer.write_sof(
526            false, // Use baseline for grayscale (simpler)
527            8,
528            height as u16,
529            width as u16,
530            &components,
531        )?;
532
533        // DRI (restart interval)
534        if self.restart_interval > 0 {
535            marker_writer.write_dri(self.restart_interval)?;
536        }
537
538        // DHT (only luma tables for grayscale)
539        if !self.optimize_huffman {
540            marker_writer
541                .write_dht_multiple(&[(0, false, &dc_luma_huff), (0, true, &ac_luma_huff)])?;
542        }
543
544        // Grayscale uses baseline encoding
545        let mcu_rows = mcu_height / DCTSIZE;
546        let mcu_cols = mcu_width / DCTSIZE;
547        let num_blocks = mcu_rows
548            .checked_mul(mcu_cols)
549            .ok_or(Error::AllocationFailed)?;
550
551        if self.optimize_huffman {
552            // 2-pass: collect blocks, count frequencies, then encode
553            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_blocks)?;
554            let mut dct_block = [0i16; DCTSIZE2];
555
556            // Collect all blocks using the same process as RGB encoding
557            for mcu_row in 0..mcu_rows {
558                for mcu_col in 0..mcu_cols {
559                    let block_idx = mcu_row * mcu_cols + mcu_col;
560                    self.process_block_to_storage_with_raw(
561                        &y_mcu,
562                        mcu_width,
563                        mcu_row,
564                        mcu_col,
565                        &luma_qtable.values,
566                        &ac_luma_derived,
567                        &mut y_blocks[block_idx],
568                        &mut dct_block,
569                        None, // No raw DCT storage needed for grayscale
570                    )?;
571                }
572            }
573
574            // Count frequencies using SymbolCounter
575            let mut dc_freq = FrequencyCounter::new();
576            let mut ac_freq = FrequencyCounter::new();
577            let mut counter = SymbolCounter::new();
578            for block in &y_blocks {
579                counter.count_block(block, 0, &mut dc_freq, &mut ac_freq);
580            }
581
582            // Generate optimized tables
583            let opt_dc_huff = dc_freq.generate_table()?;
584            let opt_ac_huff = ac_freq.generate_table()?;
585            let opt_dc_derived = DerivedTable::from_huff_table(&opt_dc_huff, true)?;
586            let opt_ac_derived = DerivedTable::from_huff_table(&opt_ac_huff, false)?;
587
588            // Write optimized Huffman tables
589            marker_writer
590                .write_dht_multiple(&[(0, false, &opt_dc_huff), (0, true, &opt_ac_huff)])?;
591
592            // Write SOS and encode
593            let scans = generate_baseline_scan(1);
594            marker_writer.write_sos(&scans[0], &components)?;
595
596            let output = marker_writer.into_inner();
597            let mut bit_writer = BitWriter::new(output);
598            let mut encoder = EntropyEncoder::new(&mut bit_writer);
599
600            // Restart marker support for grayscale (each block = 1 MCU)
601            let restart_interval = self.restart_interval as usize;
602            let mut restart_num = 0u8;
603
604            for (mcu_count, block) in y_blocks.iter().enumerate() {
605                // Emit restart marker if needed
606                if restart_interval > 0
607                    && mcu_count > 0
608                    && mcu_count.is_multiple_of(restart_interval)
609                {
610                    encoder.emit_restart(restart_num)?;
611                    restart_num = restart_num.wrapping_add(1) & 0x07;
612                }
613                encoder.encode_block(block, 0, &opt_dc_derived, &opt_ac_derived)?;
614            }
615
616            bit_writer.flush()?;
617            let mut output = bit_writer.into_inner();
618            output.write_all(&[0xFF, 0xD9])?; // EOI
619        } else {
620            // Single-pass encoding
621            let scans = generate_baseline_scan(1);
622            marker_writer.write_sos(&scans[0], &components)?;
623
624            let output = marker_writer.into_inner();
625            let mut bit_writer = BitWriter::new(output);
626            let mut encoder = EntropyEncoder::new(&mut bit_writer);
627            let mut dct_block = [0i16; DCTSIZE2];
628            let mut quant_block = [0i16; DCTSIZE2];
629
630            // Restart marker support
631            let restart_interval = self.restart_interval as usize;
632            let mut mcu_count = 0usize;
633            let mut restart_num = 0u8;
634
635            for mcu_row in 0..mcu_rows {
636                for mcu_col in 0..mcu_cols {
637                    // Emit restart marker if needed
638                    if restart_interval > 0
639                        && mcu_count > 0
640                        && mcu_count.is_multiple_of(restart_interval)
641                    {
642                        encoder.emit_restart(restart_num)?;
643                        restart_num = restart_num.wrapping_add(1) & 0x07;
644                    }
645
646                    // Process block directly to quant_block
647                    self.process_block_to_storage_with_raw(
648                        &y_mcu,
649                        mcu_width,
650                        mcu_row,
651                        mcu_col,
652                        &luma_qtable.values,
653                        &ac_luma_derived,
654                        &mut quant_block,
655                        &mut dct_block,
656                        None,
657                    )?;
658                    encoder.encode_block(&quant_block, 0, &dc_luma_derived, &ac_luma_derived)?;
659                    mcu_count += 1;
660                }
661            }
662
663            bit_writer.flush()?;
664            let mut output = bit_writer.into_inner();
665            output.write_all(&[0xFF, 0xD9])?; // EOI
666        }
667
668        Ok(())
669    }
670
671    /// Encode RGB image data to a writer.
672    pub fn encode_rgb_to_writer<W: Write>(
673        &self,
674        rgb_data: &[u8],
675        width: u32,
676        height: u32,
677        output: W,
678    ) -> Result<()> {
679        let width = width as usize;
680        let height = height as usize;
681
682        // Step 1: Convert RGB to YCbCr
683        // Use checked arithmetic for num_pixels calculation
684        let num_pixels = width.checked_mul(height).ok_or(Error::InvalidDimensions {
685            width: width as u32,
686            height: height as u32,
687        })?;
688
689        let mut y_plane = try_alloc_vec(0u8, num_pixels)?;
690        let mut cb_plane = try_alloc_vec(0u8, num_pixels)?;
691        let mut cr_plane = try_alloc_vec(0u8, num_pixels)?;
692
693        (self.simd.color_convert_rgb_to_ycbcr)(
694            rgb_data,
695            &mut y_plane,
696            &mut cb_plane,
697            &mut cr_plane,
698            num_pixels,
699        );
700
701        // Step 2: Downsample chroma if needed
702        let (luma_h, luma_v) = self.subsampling.luma_factors();
703        let (chroma_width, chroma_height) =
704            sample::subsampled_dimensions(width, height, luma_h as usize, luma_v as usize);
705
706        let chroma_size = chroma_width
707            .checked_mul(chroma_height)
708            .ok_or(Error::AllocationFailed)?;
709        let mut cb_subsampled = try_alloc_vec(0u8, chroma_size)?;
710        let mut cr_subsampled = try_alloc_vec(0u8, chroma_size)?;
711
712        sample::downsample_plane(
713            &cb_plane,
714            width,
715            height,
716            luma_h as usize,
717            luma_v as usize,
718            &mut cb_subsampled,
719        );
720        sample::downsample_plane(
721            &cr_plane,
722            width,
723            height,
724            luma_h as usize,
725            luma_v as usize,
726            &mut cr_subsampled,
727        );
728
729        // Step 3: Expand planes to MCU-aligned dimensions
730        let (mcu_width, mcu_height) =
731            sample::mcu_aligned_dimensions(width, height, luma_h as usize, luma_v as usize);
732        let (mcu_chroma_w, mcu_chroma_h) =
733            (mcu_width / luma_h as usize, mcu_height / luma_v as usize);
734
735        let mcu_y_size = mcu_width
736            .checked_mul(mcu_height)
737            .ok_or(Error::AllocationFailed)?;
738        let mcu_chroma_size = mcu_chroma_w
739            .checked_mul(mcu_chroma_h)
740            .ok_or(Error::AllocationFailed)?;
741        let mut y_mcu = try_alloc_vec(0u8, mcu_y_size)?;
742        let mut cb_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
743        let mut cr_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
744
745        sample::expand_to_mcu(&y_plane, width, height, &mut y_mcu, mcu_width, mcu_height);
746        sample::expand_to_mcu(
747            &cb_subsampled,
748            chroma_width,
749            chroma_height,
750            &mut cb_mcu,
751            mcu_chroma_w,
752            mcu_chroma_h,
753        );
754        sample::expand_to_mcu(
755            &cr_subsampled,
756            chroma_width,
757            chroma_height,
758            &mut cr_mcu,
759            mcu_chroma_w,
760            mcu_chroma_h,
761        );
762
763        // Step 4: Create quantization tables
764        let (luma_qtable, chroma_qtable) = {
765            let (default_luma, default_chroma) =
766                create_quant_tables(self.quality, self.quant_table_idx, self.force_baseline);
767            let luma = if let Some(ref custom) = self.custom_luma_qtable {
768                crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
769            } else {
770                default_luma
771            };
772            let chroma = if let Some(ref custom) = self.custom_chroma_qtable {
773                crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
774            } else {
775                default_chroma
776            };
777            (luma, chroma)
778        };
779
780        // Step 5: Create Huffman tables (standard tables)
781        let dc_luma_huff = create_std_dc_luma_table();
782        let dc_chroma_huff = create_std_dc_chroma_table();
783        let ac_luma_huff = create_std_ac_luma_table();
784        let ac_chroma_huff = create_std_ac_chroma_table();
785
786        let dc_luma_derived = DerivedTable::from_huff_table(&dc_luma_huff, true)?;
787        let dc_chroma_derived = DerivedTable::from_huff_table(&dc_chroma_huff, true)?;
788        let ac_luma_derived = DerivedTable::from_huff_table(&ac_luma_huff, false)?;
789        let ac_chroma_derived = DerivedTable::from_huff_table(&ac_chroma_huff, false)?;
790
791        // Step 6: Set up components
792        let components = create_ycbcr_components(self.subsampling);
793
794        // Step 7: Write JPEG file
795        let mut marker_writer = MarkerWriter::new(output);
796
797        // SOI
798        marker_writer.write_soi()?;
799
800        // APP0 (JFIF) with pixel density
801        marker_writer.write_jfif_app0(
802            self.pixel_density.unit as u8,
803            self.pixel_density.x,
804            self.pixel_density.y,
805        )?;
806
807        // APP1 (EXIF) - if present
808        if let Some(ref exif) = self.exif_data {
809            marker_writer.write_app1_exif(exif)?;
810        }
811
812        // ICC profile (if present)
813        if let Some(ref icc) = self.icc_profile {
814            marker_writer.write_icc_profile(icc)?;
815        }
816
817        // Custom APP markers
818        for (app_num, data) in &self.custom_markers {
819            marker_writer.write_app(*app_num, data)?;
820        }
821
822        // DQT (quantization tables in zigzag order) - combined into single marker
823        let luma_qtable_zz = natural_to_zigzag(&luma_qtable.values);
824        let chroma_qtable_zz = natural_to_zigzag(&chroma_qtable.values);
825        marker_writer
826            .write_dqt_multiple(&[(0, &luma_qtable_zz, false), (1, &chroma_qtable_zz, false)])?;
827
828        // SOF
829        marker_writer.write_sof(
830            self.progressive,
831            8,
832            height as u16,
833            width as u16,
834            &components,
835        )?;
836
837        // DRI (restart interval) - if enabled
838        if self.restart_interval > 0 {
839            marker_writer.write_dri(self.restart_interval)?;
840        }
841
842        // DHT (Huffman tables) - written here for non-optimized modes,
843        // or later after frequency counting for optimized modes
844        if !self.optimize_huffman {
845            // Combine all tables into single DHT marker for smaller file size
846            marker_writer.write_dht_multiple(&[
847                (0, false, &dc_luma_huff),
848                (1, false, &dc_chroma_huff),
849                (0, true, &ac_luma_huff),
850                (1, true, &ac_chroma_huff),
851            ])?;
852        }
853
854        if self.progressive {
855            // Progressive mode: Store all blocks, then encode multiple scans
856            let mcu_rows = mcu_height / (DCTSIZE * luma_v as usize);
857            let mcu_cols = mcu_width / (DCTSIZE * luma_h as usize);
858            let num_y_blocks = mcu_rows
859                .checked_mul(mcu_cols)
860                .and_then(|n| n.checked_mul(luma_h as usize))
861                .and_then(|n| n.checked_mul(luma_v as usize))
862                .ok_or(Error::AllocationFailed)?;
863            let num_chroma_blocks = mcu_rows
864                .checked_mul(mcu_cols)
865                .ok_or(Error::AllocationFailed)?;
866
867            // Collect all quantized blocks
868            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_y_blocks)?;
869            let mut cb_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
870            let mut cr_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
871
872            // Optionally collect raw DCT for DC trellis
873            let dc_trellis_enabled = self.trellis.enabled && self.trellis.dc_enabled;
874            let mut y_raw_dct = if dc_trellis_enabled {
875                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_y_blocks)?)
876            } else {
877                None
878            };
879            let mut cb_raw_dct = if dc_trellis_enabled {
880                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
881            } else {
882                None
883            };
884            let mut cr_raw_dct = if dc_trellis_enabled {
885                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
886            } else {
887                None
888            };
889
890            self.collect_blocks(
891                &y_mcu,
892                mcu_width,
893                mcu_height,
894                &cb_mcu,
895                &cr_mcu,
896                mcu_chroma_w,
897                mcu_chroma_h,
898                &luma_qtable.values,
899                &chroma_qtable.values,
900                &ac_luma_derived,
901                &ac_chroma_derived,
902                &mut y_blocks,
903                &mut cb_blocks,
904                &mut cr_blocks,
905                y_raw_dct.as_deref_mut(),
906                cb_raw_dct.as_deref_mut(),
907                cr_raw_dct.as_deref_mut(),
908                luma_h,
909                luma_v,
910            )?;
911
912            // Run DC trellis optimization if enabled
913            // C mozjpeg processes DC trellis row by row (each row is an independent chain)
914            if dc_trellis_enabled {
915                let h = luma_h as usize;
916                let v = luma_v as usize;
917                let y_block_cols = mcu_cols * h;
918                let y_block_rows = mcu_rows * v;
919
920                if let Some(ref y_raw) = y_raw_dct {
921                    run_dc_trellis_by_row(
922                        y_raw,
923                        &mut y_blocks,
924                        luma_qtable.values[0],
925                        &dc_luma_derived,
926                        self.trellis.lambda_log_scale1,
927                        self.trellis.lambda_log_scale2,
928                        y_block_rows,
929                        y_block_cols,
930                        mcu_cols,
931                        h,
932                        v,
933                    );
934                }
935                // Chroma has 1x1 per MCU, so MCU order = row order
936                if let Some(ref cb_raw) = cb_raw_dct {
937                    run_dc_trellis_by_row(
938                        cb_raw,
939                        &mut cb_blocks,
940                        chroma_qtable.values[0],
941                        &dc_chroma_derived,
942                        self.trellis.lambda_log_scale1,
943                        self.trellis.lambda_log_scale2,
944                        mcu_rows,
945                        mcu_cols,
946                        mcu_cols,
947                        1,
948                        1,
949                    );
950                }
951                if let Some(ref cr_raw) = cr_raw_dct {
952                    run_dc_trellis_by_row(
953                        cr_raw,
954                        &mut cr_blocks,
955                        chroma_qtable.values[0],
956                        &dc_chroma_derived,
957                        self.trellis.lambda_log_scale1,
958                        self.trellis.lambda_log_scale2,
959                        mcu_rows,
960                        mcu_cols,
961                        mcu_cols,
962                        1,
963                        1,
964                    );
965                }
966            }
967
968            // Generate progressive scan script
969            //
970            // TEMPORARY: Always use 4-scan minimal script to avoid refinement scan bugs.
971            // Our AC refinement encoding has bugs causing "failed to decode huffman code".
972            // TODO: Fix AC refinement encoding and re-enable optimize_scans.
973            let scans = if self.optimize_scans {
974                // When optimize_scans is enabled, use the scan optimizer to find
975                // the best frequency split and Al levels. However, SA refinement
976                // (Ah > 0) is currently disabled due to encoding bugs.
977                self.optimize_progressive_scans(
978                    3, // num_components
979                    &y_blocks,
980                    &cb_blocks,
981                    &cr_blocks,
982                    mcu_rows,
983                    mcu_cols,
984                    luma_h,
985                    luma_v,
986                    width,
987                    height,
988                    chroma_width,
989                    chroma_height,
990                    &dc_luma_derived,
991                    &dc_chroma_derived,
992                    &ac_luma_derived,
993                    &ac_chroma_derived,
994                )?
995            } else {
996                // Use C mozjpeg's 9-scan JCP_MAX_COMPRESSION script.
997                // This matches jcparam.c lines 932-947 (the JCP_MAX_COMPRESSION branch).
998                // mozjpeg-sys defaults to JCP_MAX_COMPRESSION profile, which uses:
999                // - DC with no successive approximation (Al=0)
1000                // - 8/9 frequency split for luma with successive approximation
1001                // - No successive approximation for chroma
1002                generate_mozjpeg_max_compression_scans(3)
1003            };
1004
1005            // Build Huffman tables and encode scans
1006            //
1007            // When optimize_scans=true, each AC scan gets its own optimal Huffman table
1008            // written immediately before the scan. This matches C mozjpeg behavior and
1009            // ensures the trial encoder's size estimates match actual encoded sizes.
1010            //
1011            // When optimize_huffman=true, use per-scan AC tables (matching C mozjpeg).
1012            // C automatically enables optimize_coding for progressive mode and does
1013            // 2 passes per scan: gather statistics, then output with optimal tables.
1014
1015            if self.optimize_huffman {
1016                // Per-scan AC tables mode: DC tables global, AC tables per-scan
1017                // This matches C mozjpeg's progressive behavior
1018
1019                // Count DC frequencies for first-pass DC scans only (Ah == 0)
1020                // DC refinement scans (Ah > 0) don't use Huffman coding - they output raw bits
1021                let mut dc_luma_freq = FrequencyCounter::new();
1022                let mut dc_chroma_freq = FrequencyCounter::new();
1023
1024                for scan in &scans {
1025                    let is_dc_first_scan = scan.ss == 0 && scan.se == 0 && scan.ah == 0;
1026                    if is_dc_first_scan {
1027                        self.count_dc_scan_symbols(
1028                            scan,
1029                            &y_blocks,
1030                            &cb_blocks,
1031                            &cr_blocks,
1032                            mcu_rows,
1033                            mcu_cols,
1034                            luma_h,
1035                            luma_v,
1036                            &mut dc_luma_freq,
1037                            &mut dc_chroma_freq,
1038                        );
1039                    }
1040                }
1041
1042                // Generate and write DC tables upfront
1043                let opt_dc_luma_huff = dc_luma_freq.generate_table()?;
1044                let opt_dc_chroma_huff = dc_chroma_freq.generate_table()?;
1045                marker_writer.write_dht_multiple(&[
1046                    (0, false, &opt_dc_luma_huff),
1047                    (1, false, &opt_dc_chroma_huff),
1048                ])?;
1049
1050                let opt_dc_luma = DerivedTable::from_huff_table(&opt_dc_luma_huff, true)?;
1051                let opt_dc_chroma = DerivedTable::from_huff_table(&opt_dc_chroma_huff, true)?;
1052
1053                // Get output writer from marker_writer
1054                let output = marker_writer.into_inner();
1055                let mut bit_writer = BitWriter::new(output);
1056
1057                // Encode each scan with per-scan AC tables
1058                for scan in &scans {
1059                    bit_writer.flush()?;
1060                    let mut inner = bit_writer.into_inner();
1061
1062                    let is_dc_scan = scan.ss == 0 && scan.se == 0;
1063
1064                    if !is_dc_scan {
1065                        // AC scan: build per-scan optimal Huffman table
1066                        let comp_idx = scan.component_index[0] as usize;
1067                        let blocks = match comp_idx {
1068                            0 => &y_blocks,
1069                            1 => &cb_blocks,
1070                            2 => &cr_blocks,
1071                            _ => &y_blocks,
1072                        };
1073                        let (block_cols, block_rows) = if comp_idx == 0 {
1074                            (width.div_ceil(DCTSIZE), height.div_ceil(DCTSIZE))
1075                        } else {
1076                            (
1077                                chroma_width.div_ceil(DCTSIZE),
1078                                chroma_height.div_ceil(DCTSIZE),
1079                            )
1080                        };
1081
1082                        // Count frequencies for this scan only
1083                        let mut ac_freq = FrequencyCounter::new();
1084                        self.count_ac_scan_symbols(
1085                            scan,
1086                            blocks,
1087                            mcu_rows,
1088                            mcu_cols,
1089                            luma_h,
1090                            luma_v,
1091                            comp_idx,
1092                            block_cols,
1093                            block_rows,
1094                            &mut ac_freq,
1095                        );
1096
1097                        // Build optimal table and write DHT
1098                        let ac_huff = ac_freq.generate_table()?;
1099                        let table_idx = if comp_idx == 0 { 0 } else { 1 };
1100                        write_dht_marker(&mut inner, table_idx, true, &ac_huff)?;
1101
1102                        // Write SOS and encode
1103                        write_sos_marker(&mut inner, scan, &components)?;
1104                        bit_writer = BitWriter::new(inner);
1105
1106                        let ac_derived = DerivedTable::from_huff_table(&ac_huff, false)?;
1107                        let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
1108
1109                        self.encode_progressive_scan(
1110                            scan,
1111                            &y_blocks,
1112                            &cb_blocks,
1113                            &cr_blocks,
1114                            mcu_rows,
1115                            mcu_cols,
1116                            luma_h,
1117                            luma_v,
1118                            width,
1119                            height,
1120                            chroma_width,
1121                            chroma_height,
1122                            &opt_dc_luma,
1123                            &opt_dc_chroma,
1124                            &ac_derived,
1125                            &ac_derived, // Not used for AC scans, but needed for signature
1126                            &mut prog_encoder,
1127                        )?;
1128                        prog_encoder.finish_scan(Some(&ac_derived))?;
1129                    } else {
1130                        // DC scan: use global DC tables
1131                        write_sos_marker(&mut inner, scan, &components)?;
1132                        bit_writer = BitWriter::new(inner);
1133
1134                        let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
1135                        self.encode_progressive_scan(
1136                            scan,
1137                            &y_blocks,
1138                            &cb_blocks,
1139                            &cr_blocks,
1140                            mcu_rows,
1141                            mcu_cols,
1142                            luma_h,
1143                            luma_v,
1144                            width,
1145                            height,
1146                            chroma_width,
1147                            chroma_height,
1148                            &opt_dc_luma,
1149                            &opt_dc_chroma,
1150                            &ac_luma_derived, // Not used for DC scans
1151                            &ac_chroma_derived,
1152                            &mut prog_encoder,
1153                        )?;
1154                        prog_encoder.finish_scan(None)?;
1155                    }
1156                }
1157
1158                // Flush and write EOI
1159                bit_writer.flush()?;
1160                let mut output = bit_writer.into_inner();
1161                output.write_all(&[0xFF, 0xD9])?;
1162            } else {
1163                // Standard tables mode (no optimization)
1164                let output = marker_writer.into_inner();
1165                let mut bit_writer = BitWriter::new(output);
1166
1167                for scan in &scans {
1168                    bit_writer.flush()?;
1169                    let mut inner = bit_writer.into_inner();
1170                    write_sos_marker(&mut inner, scan, &components)?;
1171
1172                    bit_writer = BitWriter::new(inner);
1173                    let mut prog_encoder = ProgressiveEncoder::new_standard_tables(&mut bit_writer);
1174
1175                    self.encode_progressive_scan(
1176                        scan,
1177                        &y_blocks,
1178                        &cb_blocks,
1179                        &cr_blocks,
1180                        mcu_rows,
1181                        mcu_cols,
1182                        luma_h,
1183                        luma_v,
1184                        width,
1185                        height,
1186                        chroma_width,
1187                        chroma_height,
1188                        &dc_luma_derived,
1189                        &dc_chroma_derived,
1190                        &ac_luma_derived,
1191                        &ac_chroma_derived,
1192                        &mut prog_encoder,
1193                    )?;
1194
1195                    let ac_table = if scan.ss > 0 {
1196                        if scan.component_index[0] == 0 {
1197                            Some(&ac_luma_derived)
1198                        } else {
1199                            Some(&ac_chroma_derived)
1200                        }
1201                    } else {
1202                        None
1203                    };
1204                    prog_encoder.finish_scan(ac_table)?;
1205                }
1206
1207                bit_writer.flush()?;
1208                let mut output = bit_writer.into_inner();
1209                output.write_all(&[0xFF, 0xD9])?;
1210            }
1211        } else if self.optimize_huffman {
1212            // Baseline mode with Huffman optimization (2-pass)
1213            // Pass 1: Collect blocks and count frequencies
1214            let mcu_rows = mcu_height / (DCTSIZE * luma_v as usize);
1215            let mcu_cols = mcu_width / (DCTSIZE * luma_h as usize);
1216            let num_y_blocks = mcu_rows
1217                .checked_mul(mcu_cols)
1218                .and_then(|n| n.checked_mul(luma_h as usize))
1219                .and_then(|n| n.checked_mul(luma_v as usize))
1220                .ok_or(Error::AllocationFailed)?;
1221            let num_chroma_blocks = mcu_rows
1222                .checked_mul(mcu_cols)
1223                .ok_or(Error::AllocationFailed)?;
1224
1225            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_y_blocks)?;
1226            let mut cb_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
1227            let mut cr_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
1228
1229            // Optionally collect raw DCT for DC trellis
1230            let dc_trellis_enabled = self.trellis.enabled && self.trellis.dc_enabled;
1231            let mut y_raw_dct = if dc_trellis_enabled {
1232                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_y_blocks)?)
1233            } else {
1234                None
1235            };
1236            let mut cb_raw_dct = if dc_trellis_enabled {
1237                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
1238            } else {
1239                None
1240            };
1241            let mut cr_raw_dct = if dc_trellis_enabled {
1242                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
1243            } else {
1244                None
1245            };
1246
1247            self.collect_blocks(
1248                &y_mcu,
1249                mcu_width,
1250                mcu_height,
1251                &cb_mcu,
1252                &cr_mcu,
1253                mcu_chroma_w,
1254                mcu_chroma_h,
1255                &luma_qtable.values,
1256                &chroma_qtable.values,
1257                &ac_luma_derived,
1258                &ac_chroma_derived,
1259                &mut y_blocks,
1260                &mut cb_blocks,
1261                &mut cr_blocks,
1262                y_raw_dct.as_deref_mut(),
1263                cb_raw_dct.as_deref_mut(),
1264                cr_raw_dct.as_deref_mut(),
1265                luma_h,
1266                luma_v,
1267            )?;
1268
1269            // Run DC trellis optimization if enabled
1270            // C mozjpeg processes DC trellis row by row (each row is an independent chain)
1271            if dc_trellis_enabled {
1272                let h = luma_h as usize;
1273                let v = luma_v as usize;
1274                let y_block_cols = mcu_cols * h;
1275                let y_block_rows = mcu_rows * v;
1276
1277                if let Some(ref y_raw) = y_raw_dct {
1278                    run_dc_trellis_by_row(
1279                        y_raw,
1280                        &mut y_blocks,
1281                        luma_qtable.values[0],
1282                        &dc_luma_derived,
1283                        self.trellis.lambda_log_scale1,
1284                        self.trellis.lambda_log_scale2,
1285                        y_block_rows,
1286                        y_block_cols,
1287                        mcu_cols,
1288                        h,
1289                        v,
1290                    );
1291                }
1292                // Chroma has 1x1 per MCU, so MCU order = row order
1293                if let Some(ref cb_raw) = cb_raw_dct {
1294                    run_dc_trellis_by_row(
1295                        cb_raw,
1296                        &mut cb_blocks,
1297                        chroma_qtable.values[0],
1298                        &dc_chroma_derived,
1299                        self.trellis.lambda_log_scale1,
1300                        self.trellis.lambda_log_scale2,
1301                        mcu_rows,
1302                        mcu_cols,
1303                        mcu_cols,
1304                        1,
1305                        1,
1306                    );
1307                }
1308                if let Some(ref cr_raw) = cr_raw_dct {
1309                    run_dc_trellis_by_row(
1310                        cr_raw,
1311                        &mut cr_blocks,
1312                        chroma_qtable.values[0],
1313                        &dc_chroma_derived,
1314                        self.trellis.lambda_log_scale1,
1315                        self.trellis.lambda_log_scale2,
1316                        mcu_rows,
1317                        mcu_cols,
1318                        mcu_cols,
1319                        1,
1320                        1,
1321                    );
1322                }
1323            }
1324
1325            // Count symbol frequencies
1326            let mut dc_luma_freq = FrequencyCounter::new();
1327            let mut dc_chroma_freq = FrequencyCounter::new();
1328            let mut ac_luma_freq = FrequencyCounter::new();
1329            let mut ac_chroma_freq = FrequencyCounter::new();
1330
1331            let mut counter = SymbolCounter::new();
1332            let blocks_per_mcu_y = (luma_h * luma_v) as usize;
1333            let mut y_idx = 0;
1334            let mut c_idx = 0;
1335
1336            for _mcu_row in 0..mcu_rows {
1337                for _mcu_col in 0..mcu_cols {
1338                    // Y blocks
1339                    for _ in 0..blocks_per_mcu_y {
1340                        counter.count_block(
1341                            &y_blocks[y_idx],
1342                            0,
1343                            &mut dc_luma_freq,
1344                            &mut ac_luma_freq,
1345                        );
1346                        y_idx += 1;
1347                    }
1348                    // Cb block
1349                    counter.count_block(
1350                        &cb_blocks[c_idx],
1351                        1,
1352                        &mut dc_chroma_freq,
1353                        &mut ac_chroma_freq,
1354                    );
1355                    // Cr block
1356                    counter.count_block(
1357                        &cr_blocks[c_idx],
1358                        2,
1359                        &mut dc_chroma_freq,
1360                        &mut ac_chroma_freq,
1361                    );
1362                    c_idx += 1;
1363                }
1364            }
1365
1366            // Generate optimized Huffman tables
1367            let opt_dc_luma_huff = dc_luma_freq.generate_table()?;
1368            let opt_dc_chroma_huff = dc_chroma_freq.generate_table()?;
1369            let opt_ac_luma_huff = ac_luma_freq.generate_table()?;
1370            let opt_ac_chroma_huff = ac_chroma_freq.generate_table()?;
1371
1372            let opt_dc_luma = DerivedTable::from_huff_table(&opt_dc_luma_huff, true)?;
1373            let opt_dc_chroma = DerivedTable::from_huff_table(&opt_dc_chroma_huff, true)?;
1374            let opt_ac_luma = DerivedTable::from_huff_table(&opt_ac_luma_huff, false)?;
1375            let opt_ac_chroma = DerivedTable::from_huff_table(&opt_ac_chroma_huff, false)?;
1376
1377            // Write DHT with optimized tables - combined into single marker
1378            marker_writer.write_dht_multiple(&[
1379                (0, false, &opt_dc_luma_huff),
1380                (1, false, &opt_dc_chroma_huff),
1381                (0, true, &opt_ac_luma_huff),
1382                (1, true, &opt_ac_chroma_huff),
1383            ])?;
1384
1385            // Write SOS and encode
1386            let scans = generate_baseline_scan(3);
1387            let scan = &scans[0];
1388            marker_writer.write_sos(scan, &components)?;
1389
1390            let output = marker_writer.into_inner();
1391            let mut bit_writer = BitWriter::new(output);
1392            let mut entropy = EntropyEncoder::new(&mut bit_writer);
1393
1394            // Encode from stored blocks with restart marker support
1395            y_idx = 0;
1396            c_idx = 0;
1397            let restart_interval = self.restart_interval as usize;
1398            let mut mcu_count = 0usize;
1399            let mut restart_num = 0u8;
1400
1401            for _mcu_row in 0..mcu_rows {
1402                for _mcu_col in 0..mcu_cols {
1403                    // Emit restart marker if needed (before this MCU, not first)
1404                    if restart_interval > 0
1405                        && mcu_count > 0
1406                        && mcu_count.is_multiple_of(restart_interval)
1407                    {
1408                        entropy.emit_restart(restart_num)?;
1409                        restart_num = restart_num.wrapping_add(1) & 0x07;
1410                    }
1411
1412                    // Y blocks
1413                    for _ in 0..blocks_per_mcu_y {
1414                        entropy.encode_block(&y_blocks[y_idx], 0, &opt_dc_luma, &opt_ac_luma)?;
1415                        y_idx += 1;
1416                    }
1417                    // Cb block
1418                    entropy.encode_block(&cb_blocks[c_idx], 1, &opt_dc_chroma, &opt_ac_chroma)?;
1419                    // Cr block
1420                    entropy.encode_block(&cr_blocks[c_idx], 2, &opt_dc_chroma, &opt_ac_chroma)?;
1421                    c_idx += 1;
1422                    mcu_count += 1;
1423                }
1424            }
1425
1426            bit_writer.flush()?;
1427            let mut output = bit_writer.into_inner();
1428            output.write_all(&[0xFF, 0xD9])?;
1429        } else {
1430            // Baseline mode: Encode directly (streaming)
1431            let scans = generate_baseline_scan(3);
1432            let scan = &scans[0]; // Baseline has only one scan
1433            marker_writer.write_sos(scan, &components)?;
1434
1435            // Encode MCU data
1436            let output = marker_writer.into_inner();
1437            let mut bit_writer = BitWriter::new(output);
1438            let mut entropy = EntropyEncoder::new(&mut bit_writer);
1439
1440            self.encode_mcus(
1441                &y_mcu,
1442                mcu_width,
1443                mcu_height,
1444                &cb_mcu,
1445                &cr_mcu,
1446                mcu_chroma_w,
1447                mcu_chroma_h,
1448                &luma_qtable.values,
1449                &chroma_qtable.values,
1450                &dc_luma_derived,
1451                &dc_chroma_derived,
1452                &ac_luma_derived,
1453                &ac_chroma_derived,
1454                &mut entropy,
1455                luma_h,
1456                luma_v,
1457            )?;
1458
1459            // Flush bits and get output back
1460            bit_writer.flush()?;
1461            let mut output = bit_writer.into_inner();
1462
1463            // EOI
1464            output.write_all(&[0xFF, 0xD9])?;
1465        }
1466
1467        Ok(())
1468    }
1469
1470    /// Encode all MCUs (Minimum Coded Units).
1471    #[allow(clippy::too_many_arguments)]
1472    fn encode_mcus<W: Write>(
1473        &self,
1474        y_plane: &[u8],
1475        y_width: usize,
1476        y_height: usize,
1477        cb_plane: &[u8],
1478        cr_plane: &[u8],
1479        chroma_width: usize,
1480        _chroma_height: usize,
1481        luma_qtable: &[u16; DCTSIZE2],
1482        chroma_qtable: &[u16; DCTSIZE2],
1483        dc_luma: &DerivedTable,
1484        dc_chroma: &DerivedTable,
1485        ac_luma: &DerivedTable,
1486        ac_chroma: &DerivedTable,
1487        entropy: &mut EntropyEncoder<W>,
1488        h_samp: u8,
1489        v_samp: u8,
1490    ) -> Result<()> {
1491        let mcu_rows = y_height / (DCTSIZE * v_samp as usize);
1492        let mcu_cols = y_width / (DCTSIZE * h_samp as usize);
1493        let total_mcus = mcu_rows * mcu_cols;
1494
1495        let mut dct_block = [0i16; DCTSIZE2];
1496        let mut quant_block = [0i16; DCTSIZE2];
1497
1498        // Restart marker tracking
1499        let restart_interval = self.restart_interval as usize;
1500        let mut mcu_count = 0usize;
1501        let mut restart_num = 0u8;
1502
1503        for mcu_row in 0..mcu_rows {
1504            for mcu_col in 0..mcu_cols {
1505                // Check if we need to emit a restart marker BEFORE this MCU
1506                // (except for the first MCU)
1507                if restart_interval > 0
1508                    && mcu_count > 0
1509                    && mcu_count.is_multiple_of(restart_interval)
1510                {
1511                    entropy.emit_restart(restart_num)?;
1512                    restart_num = restart_num.wrapping_add(1) & 0x07;
1513                }
1514
1515                // Encode Y blocks (may be multiple per MCU for subsampling)
1516                for v in 0..v_samp as usize {
1517                    for h in 0..h_samp as usize {
1518                        let block_row = mcu_row * v_samp as usize + v;
1519                        let block_col = mcu_col * h_samp as usize + h;
1520
1521                        self.encode_block(
1522                            y_plane,
1523                            y_width,
1524                            block_row,
1525                            block_col,
1526                            luma_qtable,
1527                            dc_luma,
1528                            ac_luma,
1529                            0, // Y component
1530                            entropy,
1531                            &mut dct_block,
1532                            &mut quant_block,
1533                        )?;
1534                    }
1535                }
1536
1537                // Encode Cb block
1538                self.encode_block(
1539                    cb_plane,
1540                    chroma_width,
1541                    mcu_row,
1542                    mcu_col,
1543                    chroma_qtable,
1544                    dc_chroma,
1545                    ac_chroma,
1546                    1, // Cb component
1547                    entropy,
1548                    &mut dct_block,
1549                    &mut quant_block,
1550                )?;
1551
1552                // Encode Cr block
1553                self.encode_block(
1554                    cr_plane,
1555                    chroma_width,
1556                    mcu_row,
1557                    mcu_col,
1558                    chroma_qtable,
1559                    dc_chroma,
1560                    ac_chroma,
1561                    2, // Cr component
1562                    entropy,
1563                    &mut dct_block,
1564                    &mut quant_block,
1565                )?;
1566
1567                mcu_count += 1;
1568            }
1569        }
1570
1571        // Suppress unused variable warning
1572        let _ = total_mcus;
1573
1574        Ok(())
1575    }
1576
1577    /// Encode a single 8x8 block.
1578    #[allow(clippy::too_many_arguments)]
1579    fn encode_block<W: Write>(
1580        &self,
1581        plane: &[u8],
1582        plane_width: usize,
1583        block_row: usize,
1584        block_col: usize,
1585        qtable: &[u16; DCTSIZE2],
1586        dc_table: &DerivedTable,
1587        ac_table: &DerivedTable,
1588        component: usize,
1589        entropy: &mut EntropyEncoder<W>,
1590        dct_block: &mut [i16; DCTSIZE2],
1591        quant_block: &mut [i16; DCTSIZE2],
1592    ) -> Result<()> {
1593        // Extract 8x8 block from plane
1594        let mut samples = [0u8; DCTSIZE2];
1595        let base_y = block_row * DCTSIZE;
1596        let base_x = block_col * DCTSIZE;
1597
1598        for row in 0..DCTSIZE {
1599            let src_offset = (base_y + row) * plane_width + base_x;
1600            let dst_offset = row * DCTSIZE;
1601            samples[dst_offset..dst_offset + DCTSIZE]
1602                .copy_from_slice(&plane[src_offset..src_offset + DCTSIZE]);
1603        }
1604
1605        // Level shift (center around 0 for DCT)
1606        let mut shifted = [0i16; DCTSIZE2];
1607        for i in 0..DCTSIZE2 {
1608            shifted[i] = (samples[i] as i16) - 128;
1609        }
1610
1611        // Apply overshoot deringing if enabled (reduces ringing on white backgrounds)
1612        if self.overshoot_deringing {
1613            preprocess_deringing(&mut shifted, qtable[0]);
1614        }
1615
1616        // Forward DCT (output scaled by factor of 8)
1617        (self.simd.forward_dct)(&shifted, dct_block);
1618
1619        // Convert to i32 for quantization
1620        let mut dct_i32 = [0i32; DCTSIZE2];
1621        for i in 0..DCTSIZE2 {
1622            dct_i32[i] = dct_block[i] as i32;
1623        }
1624
1625        // Use trellis quantization if enabled
1626        // Both paths expect raw DCT (scaled by 8) and handle the scaling internally
1627        if self.trellis.enabled {
1628            trellis_quantize_block(&dct_i32, quant_block, qtable, ac_table, &self.trellis);
1629        } else {
1630            // Non-trellis path: use single-step quantization matching C mozjpeg
1631            // This takes raw DCT (scaled by 8) and uses q_scaled = 8 * qtable[i]
1632            quantize_block_raw(&dct_i32, qtable, quant_block);
1633        }
1634
1635        // Entropy encode
1636        entropy.encode_block(quant_block, component, dc_table, ac_table)?;
1637
1638        Ok(())
1639    }
1640
1641    /// Collect all quantized DCT blocks for progressive encoding.
1642    /// Also collects raw DCT blocks if DC trellis is enabled.
1643    #[allow(clippy::too_many_arguments)]
1644    fn collect_blocks(
1645        &self,
1646        y_plane: &[u8],
1647        y_width: usize,
1648        y_height: usize,
1649        cb_plane: &[u8],
1650        cr_plane: &[u8],
1651        chroma_width: usize,
1652        _chroma_height: usize,
1653        luma_qtable: &[u16; DCTSIZE2],
1654        chroma_qtable: &[u16; DCTSIZE2],
1655        ac_luma: &DerivedTable,
1656        ac_chroma: &DerivedTable,
1657        y_blocks: &mut [[i16; DCTSIZE2]],
1658        cb_blocks: &mut [[i16; DCTSIZE2]],
1659        cr_blocks: &mut [[i16; DCTSIZE2]],
1660        mut y_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
1661        mut cb_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
1662        mut cr_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
1663        h_samp: u8,
1664        v_samp: u8,
1665    ) -> Result<()> {
1666        let mcu_rows = y_height / (DCTSIZE * v_samp as usize);
1667        let mcu_cols = y_width / (DCTSIZE * h_samp as usize);
1668
1669        let mut y_idx = 0;
1670        let mut c_idx = 0;
1671        let mut dct_block = [0i16; DCTSIZE2];
1672
1673        for mcu_row in 0..mcu_rows {
1674            for mcu_col in 0..mcu_cols {
1675                // Collect Y blocks (may be multiple per MCU for subsampling)
1676                for v in 0..v_samp as usize {
1677                    for h in 0..h_samp as usize {
1678                        let block_row = mcu_row * v_samp as usize + v;
1679                        let block_col = mcu_col * h_samp as usize + h;
1680
1681                        // Get mutable reference to raw DCT output if collecting
1682                        let raw_dct_out = y_raw_dct.as_mut().map(|arr| &mut arr[y_idx][..]);
1683                        self.process_block_to_storage_with_raw(
1684                            y_plane,
1685                            y_width,
1686                            block_row,
1687                            block_col,
1688                            luma_qtable,
1689                            ac_luma,
1690                            &mut y_blocks[y_idx],
1691                            &mut dct_block,
1692                            raw_dct_out,
1693                        )?;
1694                        y_idx += 1;
1695                    }
1696                }
1697
1698                // Collect Cb block
1699                let raw_dct_out = cb_raw_dct.as_mut().map(|arr| &mut arr[c_idx][..]);
1700                self.process_block_to_storage_with_raw(
1701                    cb_plane,
1702                    chroma_width,
1703                    mcu_row,
1704                    mcu_col,
1705                    chroma_qtable,
1706                    ac_chroma,
1707                    &mut cb_blocks[c_idx],
1708                    &mut dct_block,
1709                    raw_dct_out,
1710                )?;
1711
1712                // Collect Cr block
1713                let raw_dct_out = cr_raw_dct.as_mut().map(|arr| &mut arr[c_idx][..]);
1714                self.process_block_to_storage_with_raw(
1715                    cr_plane,
1716                    chroma_width,
1717                    mcu_row,
1718                    mcu_col,
1719                    chroma_qtable,
1720                    ac_chroma,
1721                    &mut cr_blocks[c_idx],
1722                    &mut dct_block,
1723                    raw_dct_out,
1724                )?;
1725
1726                c_idx += 1;
1727            }
1728        }
1729
1730        Ok(())
1731    }
1732
1733    /// Process a block: DCT + quantize, storing the result.
1734    /// Optionally stores raw DCT coefficients for DC trellis.
1735    #[allow(clippy::too_many_arguments)]
1736    fn process_block_to_storage_with_raw(
1737        &self,
1738        plane: &[u8],
1739        plane_width: usize,
1740        block_row: usize,
1741        block_col: usize,
1742        qtable: &[u16; DCTSIZE2],
1743        ac_table: &DerivedTable,
1744        out_block: &mut [i16; DCTSIZE2],
1745        dct_block: &mut [i16; DCTSIZE2],
1746        raw_dct_out: Option<&mut [i32]>,
1747    ) -> Result<()> {
1748        // Extract 8x8 block from plane
1749        let mut samples = [0u8; DCTSIZE2];
1750        let base_y = block_row * DCTSIZE;
1751        let base_x = block_col * DCTSIZE;
1752
1753        for row in 0..DCTSIZE {
1754            let src_offset = (base_y + row) * plane_width + base_x;
1755            let dst_offset = row * DCTSIZE;
1756            samples[dst_offset..dst_offset + DCTSIZE]
1757                .copy_from_slice(&plane[src_offset..src_offset + DCTSIZE]);
1758        }
1759
1760        // Level shift (center around 0 for DCT)
1761        let mut shifted = [0i16; DCTSIZE2];
1762        for i in 0..DCTSIZE2 {
1763            shifted[i] = (samples[i] as i16) - 128;
1764        }
1765
1766        // Apply overshoot deringing if enabled (reduces ringing on white backgrounds)
1767        if self.overshoot_deringing {
1768            preprocess_deringing(&mut shifted, qtable[0]);
1769        }
1770
1771        // Forward DCT (output scaled by factor of 8)
1772        (self.simd.forward_dct)(&shifted, dct_block);
1773
1774        // Convert to i32 for quantization
1775        let mut dct_i32 = [0i32; DCTSIZE2];
1776        for i in 0..DCTSIZE2 {
1777            dct_i32[i] = dct_block[i] as i32;
1778        }
1779
1780        // Store raw DCT if requested (for DC trellis)
1781        if let Some(raw_out) = raw_dct_out {
1782            raw_out.copy_from_slice(&dct_i32);
1783        }
1784
1785        // Use trellis quantization if enabled
1786        // Both paths expect raw DCT (scaled by 8) and handle the scaling internally
1787        if self.trellis.enabled {
1788            trellis_quantize_block(&dct_i32, out_block, qtable, ac_table, &self.trellis);
1789        } else {
1790            // Non-trellis path: use single-step quantization matching C mozjpeg
1791            // This takes raw DCT (scaled by 8) and uses q_scaled = 8 * qtable[i]
1792            quantize_block_raw(&dct_i32, qtable, out_block);
1793        }
1794
1795        Ok(())
1796    }
1797
1798    /// Optimize progressive scan configuration (C mozjpeg-compatible).
1799    ///
1800    /// This implements the optimize_scans feature from C mozjpeg:
1801    /// 1. Generate 64 individual candidate scans
1802    /// 2. Trial-encode scans SEQUENTIALLY to get accurate sizes
1803    /// 3. Use ScanSelector to find optimal Al levels and frequency splits
1804    /// 4. Build the final scan script from the selection
1805    ///
1806    /// IMPORTANT: Scans must be encoded sequentially (not independently) because
1807    /// refinement scans (Ah > 0) need context from previous scans to produce
1808    /// correct output sizes.
1809    #[allow(clippy::too_many_arguments)]
1810    fn optimize_progressive_scans(
1811        &self,
1812        num_components: u8,
1813        y_blocks: &[[i16; DCTSIZE2]],
1814        cb_blocks: &[[i16; DCTSIZE2]],
1815        cr_blocks: &[[i16; DCTSIZE2]],
1816        mcu_rows: usize,
1817        mcu_cols: usize,
1818        h_samp: u8,
1819        v_samp: u8,
1820        actual_width: usize,
1821        actual_height: usize,
1822        chroma_width: usize,
1823        chroma_height: usize,
1824        dc_luma: &DerivedTable,
1825        dc_chroma: &DerivedTable,
1826        ac_luma: &DerivedTable,
1827        ac_chroma: &DerivedTable,
1828    ) -> Result<Vec<crate::types::ScanInfo>> {
1829        let config = ScanSearchConfig::default();
1830        let candidate_scans = generate_search_scans(num_components, &config);
1831
1832        // Use ScanTrialEncoder for sequential trial encoding with proper state tracking
1833        let mut trial_encoder = ScanTrialEncoder::new(
1834            y_blocks,
1835            cb_blocks,
1836            cr_blocks,
1837            dc_luma,
1838            dc_chroma,
1839            ac_luma,
1840            ac_chroma,
1841            mcu_rows,
1842            mcu_cols,
1843            h_samp,
1844            v_samp,
1845            actual_width,
1846            actual_height,
1847            chroma_width,
1848            chroma_height,
1849        );
1850
1851        // Trial-encode all scans sequentially to get accurate sizes
1852        let scan_sizes = trial_encoder.encode_all_scans(&candidate_scans)?;
1853
1854        // Use ScanSelector to find the optimal configuration
1855        let selector = ScanSelector::new(num_components, config.clone());
1856        let result = selector.select_best(&scan_sizes);
1857
1858        // Build the final scan script from the selection
1859        Ok(result.build_final_scans(num_components, &config))
1860    }
1861
1862    /// Encode a single progressive scan.
1863    #[allow(clippy::too_many_arguments)]
1864    fn encode_progressive_scan<W: Write>(
1865        &self,
1866        scan: &crate::types::ScanInfo,
1867        y_blocks: &[[i16; DCTSIZE2]],
1868        cb_blocks: &[[i16; DCTSIZE2]],
1869        cr_blocks: &[[i16; DCTSIZE2]],
1870        mcu_rows: usize,
1871        mcu_cols: usize,
1872        h_samp: u8,
1873        v_samp: u8,
1874        actual_width: usize,
1875        actual_height: usize,
1876        chroma_width: usize,
1877        chroma_height: usize,
1878        dc_luma: &DerivedTable,
1879        dc_chroma: &DerivedTable,
1880        ac_luma: &DerivedTable,
1881        ac_chroma: &DerivedTable,
1882        encoder: &mut ProgressiveEncoder<W>,
1883    ) -> Result<()> {
1884        let is_dc_scan = scan.ss == 0 && scan.se == 0;
1885        let is_refinement = scan.ah != 0;
1886
1887        if is_dc_scan {
1888            // DC scan - can be interleaved (multiple components)
1889            self.encode_dc_scan(
1890                scan,
1891                y_blocks,
1892                cb_blocks,
1893                cr_blocks,
1894                mcu_rows,
1895                mcu_cols,
1896                h_samp,
1897                v_samp,
1898                dc_luma,
1899                dc_chroma,
1900                is_refinement,
1901                encoder,
1902            )?;
1903        } else {
1904            // AC scan - single component only (non-interleaved)
1905            // For non-interleaved scans, use actual component block dimensions
1906            let comp_idx = scan.component_index[0] as usize;
1907            let blocks = match comp_idx {
1908                0 => y_blocks,
1909                1 => cb_blocks,
1910                2 => cr_blocks,
1911                _ => return Err(Error::InvalidComponentIndex(comp_idx)),
1912            };
1913            let ac_table = if comp_idx == 0 { ac_luma } else { ac_chroma };
1914
1915            // Calculate actual block dimensions for this component.
1916            // Non-interleaved AC scans encode only the actual image blocks, not MCU padding.
1917            // This differs from interleaved DC scans which encode all MCU blocks.
1918            // Reference: ITU-T T.81 Section F.2.3
1919            let (block_cols, block_rows) = if comp_idx == 0 {
1920                // Y component: full resolution
1921                (
1922                    actual_width.div_ceil(DCTSIZE),
1923                    actual_height.div_ceil(DCTSIZE),
1924                )
1925            } else {
1926                // Chroma components: subsampled resolution
1927                (
1928                    chroma_width.div_ceil(DCTSIZE),
1929                    chroma_height.div_ceil(DCTSIZE),
1930                )
1931            };
1932
1933            self.encode_ac_scan(
1934                scan,
1935                blocks,
1936                mcu_rows,
1937                mcu_cols,
1938                h_samp,
1939                v_samp,
1940                comp_idx,
1941                block_cols,
1942                block_rows,
1943                ac_table,
1944                is_refinement,
1945                encoder,
1946            )?;
1947        }
1948
1949        Ok(())
1950    }
1951
1952    /// Encode a DC scan (Ss=Se=0).
1953    #[allow(clippy::too_many_arguments)]
1954    fn encode_dc_scan<W: Write>(
1955        &self,
1956        scan: &crate::types::ScanInfo,
1957        y_blocks: &[[i16; DCTSIZE2]],
1958        cb_blocks: &[[i16; DCTSIZE2]],
1959        cr_blocks: &[[i16; DCTSIZE2]],
1960        mcu_rows: usize,
1961        mcu_cols: usize,
1962        h_samp: u8,
1963        v_samp: u8,
1964        dc_luma: &DerivedTable,
1965        dc_chroma: &DerivedTable,
1966        is_refinement: bool,
1967        encoder: &mut ProgressiveEncoder<W>,
1968    ) -> Result<()> {
1969        let blocks_per_mcu_y = (h_samp * v_samp) as usize;
1970        let mut y_idx = 0;
1971        let mut c_idx = 0;
1972
1973        for _mcu_row in 0..mcu_rows {
1974            for _mcu_col in 0..mcu_cols {
1975                // Encode Y blocks
1976                for _ in 0..blocks_per_mcu_y {
1977                    if is_refinement {
1978                        encoder.encode_dc_refine(&y_blocks[y_idx], scan.al)?;
1979                    } else {
1980                        encoder.encode_dc_first(&y_blocks[y_idx], 0, dc_luma, scan.al)?;
1981                    }
1982                    y_idx += 1;
1983                }
1984
1985                // Encode Cb
1986                if is_refinement {
1987                    encoder.encode_dc_refine(&cb_blocks[c_idx], scan.al)?;
1988                } else {
1989                    encoder.encode_dc_first(&cb_blocks[c_idx], 1, dc_chroma, scan.al)?;
1990                }
1991
1992                // Encode Cr
1993                if is_refinement {
1994                    encoder.encode_dc_refine(&cr_blocks[c_idx], scan.al)?;
1995                } else {
1996                    encoder.encode_dc_first(&cr_blocks[c_idx], 2, dc_chroma, scan.al)?;
1997                }
1998
1999                c_idx += 1;
2000            }
2001        }
2002
2003        Ok(())
2004    }
2005
2006    /// Encode an AC scan (Ss > 0).
2007    ///
2008    /// **IMPORTANT**: Progressive AC scans are always non-interleaved, meaning blocks
2009    /// must be encoded in component raster order (row-major within the component's
2010    /// block grid), NOT in MCU-interleaved order.
2011    ///
2012    /// For non-interleaved scans, the number of blocks is determined by the actual
2013    /// component dimensions (ceil(width/8) × ceil(height/8)), NOT the MCU-padded
2014    /// dimensions. This is different from interleaved DC scans which use MCU order.
2015    /// The padding blocks (beyond actual image dimensions) have DC coefficients but
2016    /// no AC coefficients - the decoder only outputs the actual image dimensions.
2017    ///
2018    /// Reference: ITU-T T.81 Section F.2.3 - "The scan data for a non-interleaved
2019    /// scan shall consist of a sequence of entropy-coded segments... The data units
2020    /// are processed in the order defined by the scan component."
2021    #[allow(clippy::too_many_arguments)]
2022    fn encode_ac_scan<W: Write>(
2023        &self,
2024        scan: &crate::types::ScanInfo,
2025        blocks: &[[i16; DCTSIZE2]],
2026        _mcu_rows: usize,
2027        mcu_cols: usize,
2028        h_samp: u8,
2029        v_samp: u8,
2030        comp_idx: usize,
2031        block_cols: usize,
2032        block_rows: usize,
2033        ac_table: &DerivedTable,
2034        is_refinement: bool,
2035        encoder: &mut ProgressiveEncoder<W>,
2036    ) -> Result<()> {
2037        // For Y component with subsampling, blocks are stored in MCU-interleaved order
2038        // but AC scans must encode them in component raster order.
2039        // For chroma components (1 block per MCU), the orders are identical.
2040        //
2041        // For non-interleaved scans, encode only the actual image blocks (block_rows × block_cols),
2042        // not all MCU-padded blocks. Padding blocks have DC coefficients but no AC coefficients.
2043
2044        let blocks_per_mcu = if comp_idx == 0 {
2045            (h_samp * v_samp) as usize
2046        } else {
2047            1
2048        };
2049
2050        if blocks_per_mcu == 1 {
2051            // Chroma or 4:4:4 Y: storage order = raster order
2052            let total_blocks = block_rows * block_cols;
2053            for block in blocks.iter().take(total_blocks) {
2054                if is_refinement {
2055                    encoder
2056                        .encode_ac_refine(block, scan.ss, scan.se, scan.ah, scan.al, ac_table)?;
2057                } else {
2058                    encoder.encode_ac_first(block, scan.ss, scan.se, scan.al, ac_table)?;
2059                }
2060            }
2061        } else {
2062            // Y component with subsampling (h_samp > 1 or v_samp > 1)
2063            // Convert from MCU-interleaved storage to component raster order
2064            let h = h_samp as usize;
2065            let v = v_samp as usize;
2066
2067            for block_row in 0..block_rows {
2068                for block_col in 0..block_cols {
2069                    // Convert raster position to MCU-interleaved storage index
2070                    let mcu_row = block_row / v;
2071                    let mcu_col = block_col / h;
2072                    let v_idx = block_row % v;
2073                    let h_idx = block_col % h;
2074                    let storage_idx = mcu_row * (mcu_cols * blocks_per_mcu)
2075                        + mcu_col * blocks_per_mcu
2076                        + v_idx * h
2077                        + h_idx;
2078
2079                    if is_refinement {
2080                        encoder.encode_ac_refine(
2081                            &blocks[storage_idx],
2082                            scan.ss,
2083                            scan.se,
2084                            scan.ah,
2085                            scan.al,
2086                            ac_table,
2087                        )?;
2088                    } else {
2089                        encoder.encode_ac_first(
2090                            &blocks[storage_idx],
2091                            scan.ss,
2092                            scan.se,
2093                            scan.al,
2094                            ac_table,
2095                        )?;
2096                    }
2097                }
2098            }
2099        }
2100
2101        Ok(())
2102    }
2103
2104    /// Count DC symbols for a progressive DC scan.
2105    #[allow(clippy::too_many_arguments)]
2106    fn count_dc_scan_symbols(
2107        &self,
2108        scan: &crate::types::ScanInfo,
2109        y_blocks: &[[i16; DCTSIZE2]],
2110        cb_blocks: &[[i16; DCTSIZE2]],
2111        cr_blocks: &[[i16; DCTSIZE2]],
2112        mcu_rows: usize,
2113        mcu_cols: usize,
2114        h_samp: u8,
2115        v_samp: u8,
2116        dc_luma_freq: &mut FrequencyCounter,
2117        dc_chroma_freq: &mut FrequencyCounter,
2118    ) {
2119        let blocks_per_mcu_y = (h_samp * v_samp) as usize;
2120        let mut y_idx = 0;
2121        let mut c_idx = 0;
2122        let mut counter = ProgressiveSymbolCounter::new();
2123
2124        for _mcu_row in 0..mcu_rows {
2125            for _mcu_col in 0..mcu_cols {
2126                // Y blocks
2127                for _ in 0..blocks_per_mcu_y {
2128                    counter.count_dc_first(&y_blocks[y_idx], 0, scan.al, dc_luma_freq);
2129                    y_idx += 1;
2130                }
2131                // Cb block
2132                counter.count_dc_first(&cb_blocks[c_idx], 1, scan.al, dc_chroma_freq);
2133                // Cr block
2134                counter.count_dc_first(&cr_blocks[c_idx], 2, scan.al, dc_chroma_freq);
2135                c_idx += 1;
2136            }
2137        }
2138    }
2139
2140    /// Count AC symbols for a progressive AC scan.
2141    ///
2142    /// Must iterate blocks in the same order as `encode_ac_scan` (component raster order)
2143    /// to ensure EOBRUN counts match and Huffman tables are correct.
2144    ///
2145    /// Uses actual block dimensions (not MCU-padded) for non-interleaved scans.
2146    #[allow(clippy::too_many_arguments)]
2147    fn count_ac_scan_symbols(
2148        &self,
2149        scan: &crate::types::ScanInfo,
2150        blocks: &[[i16; DCTSIZE2]],
2151        _mcu_rows: usize,
2152        mcu_cols: usize,
2153        h_samp: u8,
2154        v_samp: u8,
2155        comp_idx: usize,
2156        block_cols: usize,
2157        block_rows: usize,
2158        ac_freq: &mut FrequencyCounter,
2159    ) {
2160        let blocks_per_mcu = if comp_idx == 0 {
2161            (h_samp * v_samp) as usize
2162        } else {
2163            1
2164        };
2165
2166        let mut counter = ProgressiveSymbolCounter::new();
2167        let is_refinement = scan.ah != 0;
2168
2169        if blocks_per_mcu == 1 {
2170            // Chroma or 4:4:4 Y: storage order = raster order
2171            let total_blocks = block_rows * block_cols;
2172            for block in blocks.iter().take(total_blocks) {
2173                if is_refinement {
2174                    counter.count_ac_refine(block, scan.ss, scan.se, scan.ah, scan.al, ac_freq);
2175                } else {
2176                    counter.count_ac_first(block, scan.ss, scan.se, scan.al, ac_freq);
2177                }
2178            }
2179        } else {
2180            // Y component with subsampling - iterate in raster order (matching encode_ac_scan)
2181            let h = h_samp as usize;
2182            let v = v_samp as usize;
2183
2184            for block_row in 0..block_rows {
2185                for block_col in 0..block_cols {
2186                    // Convert raster position to MCU-interleaved storage index
2187                    let mcu_row = block_row / v;
2188                    let mcu_col = block_col / h;
2189                    let v_idx = block_row % v;
2190                    let h_idx = block_col % h;
2191                    let storage_idx = mcu_row * (mcu_cols * blocks_per_mcu)
2192                        + mcu_col * blocks_per_mcu
2193                        + v_idx * h
2194                        + h_idx;
2195
2196                    if is_refinement {
2197                        counter.count_ac_refine(
2198                            &blocks[storage_idx],
2199                            scan.ss,
2200                            scan.se,
2201                            scan.ah,
2202                            scan.al,
2203                            ac_freq,
2204                        );
2205                    } else {
2206                        counter.count_ac_first(
2207                            &blocks[storage_idx],
2208                            scan.ss,
2209                            scan.se,
2210                            scan.al,
2211                            ac_freq,
2212                        );
2213                    }
2214                }
2215            }
2216        }
2217
2218        // Flush any pending EOBRUN
2219        counter.finish_scan(Some(ac_freq));
2220    }
2221}
2222
2223// ============================================================================
2224// Encode Trait Implementation
2225// ============================================================================
2226
2227impl Encode for Encoder {
2228    fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
2229        self.encode_rgb(rgb_data, width, height)
2230    }
2231
2232    fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
2233        self.encode_gray(gray_data, width, height)
2234    }
2235}
2236
2237// Note: StreamingEncoder and EncodingStream are in the `streaming` module.
2238
2239// Add streaming() method to Encoder
impl Encoder {
    /// Create a streaming encoder.
    ///
    /// Returns a freshly constructed [`StreamingEncoder`] (via
    /// `StreamingEncoder::new()`), which supports scanline-by-scanline encoding.
    /// Note that streaming mode does NOT support trellis quantization, progressive
    /// mode, or Huffman optimization (these require buffering the entire image).
    ///
    /// This is an associated function: it takes no `self`, so no settings are
    /// carried over from an existing `Encoder`.
    ///
    /// For full-featured encoding with all mozjpeg optimizations, use [`Encoder::new()`]
    /// with [`encode_rgb()`](Encoder::encode_rgb) or [`encode_gray()`](Encoder::encode_gray).
    ///
    /// # Example
    ///
    /// ```ignore
    /// use mozjpeg_rs::Encoder;
    /// use std::fs::File;
    ///
    /// let file = File::create("output.jpg")?;
    /// let mut stream = Encoder::streaming()
    ///     .quality(85)
    ///     .start_rgb(1920, 1080, file)?;
    ///
    /// // Write scanlines...
    /// stream.finish()?;
    /// ```
    pub fn streaming() -> StreamingEncoder {
        StreamingEncoder::new()
    }
}
2268
/// Unit tests that need access to private encoder fields and helpers.
/// Tests of the public API live in tests/encode_tests.rs.
#[cfg(test)]
mod tests {
    use super::*;

    /// A default encoder uses quality 75, baseline mode, 4:2:0 subsampling,
    /// trellis quantization and optimized Huffman tables.
    #[test]
    fn test_encoder_defaults() {
        let encoder = Encoder::new();

        assert_eq!(encoder.quality, 75);
        assert!(!encoder.progressive);
        assert_eq!(encoder.subsampling, Subsampling::S420);
        assert!(encoder.trellis.enabled);
        assert!(encoder.optimize_huffman);
    }

    /// Builder methods store their arguments in the corresponding fields.
    #[test]
    fn test_encoder_builder_fields() {
        let configured = Encoder::new()
            .quality(90)
            .progressive(true)
            .subsampling(Subsampling::S444);

        assert_eq!(configured.quality, 90);
        assert!(configured.progressive);
        assert_eq!(configured.subsampling, Subsampling::S444);
    }

    /// Out-of-range quality values are clamped to 1..=100.
    #[test]
    fn test_quality_clamping() {
        assert_eq!(Encoder::new().quality(0).quality, 1);
        assert_eq!(Encoder::new().quality(150).quality, 100);
    }

    /// The first two entries keep their position under zigzag reordering.
    #[test]
    fn test_natural_to_zigzag() {
        // Identity table: entry i holds the value i.
        let natural: [u16; 64] = std::array::from_fn(|i| i as u16);
        let zigzag = natural_to_zigzag(&natural);

        assert_eq!(zigzag[0], 0);
        assert_eq!(zigzag[1], 1);
    }

    /// The max-compression preset switches on every optimization flag.
    #[test]
    fn test_max_compression_uses_all_optimizations() {
        let preset = Encoder::max_compression();

        assert!(preset.trellis.enabled);
        assert!(preset.progressive);
        assert!(preset.optimize_huffman);
        assert!(preset.optimize_scans);
    }
}