mozjpeg_rs/
encode.rs

1//! JPEG encoder pipeline.
2//!
3//! This module provides two encoder types:
4//!
5//! - [`Encoder`]: Full-featured encoder with trellis quantization, progressive mode,
6//!   and Huffman optimization. Batch encoding only.
7//! - [`StreamingEncoder`]: Streaming-capable encoder without optimizations.
8//!   Supports both batch and scanline-by-scanline encoding.
9//!
10//! Both implement the [`Encode`] trait for batch encoding.
11//!
12//! # Examples
13//!
14//! ```ignore
15//! use mozjpeg_rs::{Encoder, Preset};
16//!
17//! // Full-featured batch encoding
18//! let jpeg = Encoder::new(Preset::default())
19//!     .quality(85)
20//!     .encode_rgb(&pixels, width, height)?;
21//!
22//! // Streaming encoding (memory-efficient for large images)
23//! let mut stream = Encoder::streaming()
24//!     .quality(85)
25//!     .start(width, height, file)?;
26//! for row in scanlines.chunks(16) {
27//!     stream.write_scanlines(row)?;
28//! }
29//! stream.finish()?;
30//! ```
31
32use std::io::Write;
33use std::sync::atomic::{AtomicBool, Ordering};
34use std::time::{Duration, Instant};
35
36use crate::bitstream::BitWriter;
37use crate::consts::{QuantTableIdx, DCTSIZE, DCTSIZE2};
38use crate::deringing::preprocess_deringing;
39use crate::entropy::{EntropyEncoder, ProgressiveEncoder, ProgressiveSymbolCounter, SymbolCounter};
40use crate::error::{Error, Result};
41use crate::huffman::DerivedTable;
42use crate::huffman::FrequencyCounter;
43use crate::marker::MarkerWriter;
44use crate::progressive::{generate_baseline_scan, generate_mozjpeg_max_compression_scans};
45use crate::quant::{create_quant_tables, quantize_block_raw};
46use crate::sample;
47use crate::scan_optimize::{generate_search_scans, ScanSearchConfig, ScanSelector};
48use crate::scan_trial::ScanTrialEncoder;
49#[cfg(target_arch = "x86_64")]
50use crate::simd::x86_64::entropy::SimdEntropyEncoder;
51use crate::simd::SimdOps;
52use crate::trellis::trellis_quantize_block;
53use crate::types::{Limits, PixelDensity, Preset, Subsampling, TrellisConfig};
54
55mod helpers;
56mod streaming;
57
58pub(crate) use helpers::{
59    create_components, create_std_ac_chroma_table, create_std_ac_luma_table,
60    create_std_dc_chroma_table, create_std_dc_luma_table, create_ycbcr_components,
61    natural_to_zigzag, run_dc_trellis_by_row, try_alloc_vec, try_alloc_vec_array, write_dht_marker,
62    write_sos_marker,
63};
64pub use streaming::{EncodingStream, StreamingEncoder};
65
66// ============================================================================
67// Cancellation Support
68// ============================================================================
69
70/// Internal context for cancellation checking during encoding.
71///
72/// This is passed through the encoding pipeline to allow periodic
73/// cancellation checks without function signature changes everywhere.
74#[derive(Clone, Copy)]
75pub(crate) struct CancellationContext<'a> {
76    /// Optional cancellation flag - if set to true, encoding should abort.
77    pub cancel: Option<&'a AtomicBool>,
78    /// Optional deadline - if current time exceeds this, encoding should abort.
79    pub deadline: Option<Instant>,
80}
81
82impl<'a> CancellationContext<'a> {
83    /// Create a context with no cancellation (always succeeds).
84    #[allow(dead_code)]
85    pub const fn none() -> Self {
86        Self {
87            cancel: None,
88            deadline: None,
89        }
90    }
91
92    /// Create a context from optional cancel flag and timeout.
93    #[allow(dead_code)]
94    pub fn new(cancel: Option<&'a AtomicBool>, timeout: Option<Duration>) -> Self {
95        Self {
96            cancel,
97            deadline: timeout.map(|d| Instant::now() + d),
98        }
99    }
100
101    /// Check if cancellation has been requested.
102    ///
103    /// Returns `Ok(())` if encoding should continue, or `Err` if cancelled/timed out.
104    #[inline]
105    pub fn check(&self) -> Result<()> {
106        if let Some(c) = self.cancel {
107            if c.load(Ordering::Relaxed) {
108                return Err(Error::Cancelled);
109            }
110        }
111        if let Some(d) = self.deadline {
112            if Instant::now() > d {
113                return Err(Error::TimedOut);
114            }
115        }
116        Ok(())
117    }
118
119    /// Check cancellation every N iterations (to reduce overhead).
120    ///
121    /// Only performs the check when `iteration % interval == 0`.
122    #[inline]
123    #[allow(dead_code)]
124    pub fn check_periodic(&self, iteration: usize, interval: usize) -> Result<()> {
125        if iteration.is_multiple_of(interval) {
126            self.check()
127        } else {
128            Ok(())
129        }
130    }
131}
132
133// ============================================================================
134// Encode Trait (internal, for potential future streaming API)
135// ============================================================================
136
/// Trait for JPEG encoding (batch mode).
///
/// Implemented by both [`Encoder`] and [`StreamingEncoder`]. Each method takes
/// a complete image buffer and returns the encoded JPEG bytes, or an error
/// through the crate's [`Result`] type.
#[allow(dead_code)]
pub trait Encode {
    /// Encode RGB image data to JPEG.
    ///
    /// # Arguments
    /// * `rgb_data` - RGB pixel data (3 bytes per pixel, row-major order)
    /// * `width` - Image width in pixels
    /// * `height` - Image height in pixels
    ///
    /// # Returns
    /// The encoded JPEG as a byte vector.
    fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>>;

    /// Encode grayscale image data to JPEG.
    ///
    /// # Arguments
    /// * `gray_data` - Grayscale pixel data (1 byte per pixel, row-major order)
    /// * `width` - Image width in pixels
    /// * `height` - Image height in pixels
    ///
    /// # Returns
    /// The encoded JPEG as a byte vector.
    fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>>;
}
158
/// JPEG encoder with configurable quality and features.
///
/// An `Encoder` is a plain configuration value: it is created from a
/// [`Preset`] (see [`Encoder::new`]) or one of the preset constructors, and
/// then adjusted through the consuming builder methods (`quality`,
/// `progressive`, `subsampling`, ...), each of which returns the updated
/// encoder.
#[derive(Debug, Clone)]
pub struct Encoder {
    /// Quality level (1-100); the `quality` builder clamps into this range.
    quality: u8,
    /// Enable progressive mode
    progressive: bool,
    /// Chroma subsampling mode
    subsampling: Subsampling,
    /// Quantization table variant
    quant_table_idx: QuantTableIdx,
    /// Custom luminance quantization table (overrides quant_table_idx if set)
    custom_luma_qtable: Option<[u16; DCTSIZE2]>,
    /// Custom chrominance quantization table (overrides quant_table_idx if set)
    custom_chroma_qtable: Option<[u16; DCTSIZE2]>,
    /// Trellis quantization configuration
    trellis: TrellisConfig,
    /// Force baseline-compatible output
    force_baseline: bool,
    /// Optimize Huffman tables (requires 2-pass)
    optimize_huffman: bool,
    /// Enable overshoot deringing (reduces ringing on white backgrounds)
    overshoot_deringing: bool,
    /// Optimize progressive scan configuration (tries multiple configs, picks smallest)
    optimize_scans: bool,
    /// Restart interval in MCUs (0 = disabled)
    restart_interval: u16,
    /// Pixel density for JFIF APP0 marker
    pixel_density: PixelDensity,
    /// EXIF data to embed (raw TIFF structure, without "Exif\0\0" header).
    /// `None` when no EXIF data was supplied (the builder maps empty input to `None`).
    exif_data: Option<Vec<u8>>,
    /// ICC color profile to embed (will be chunked into APP2 markers).
    /// `None` when no profile was supplied (the builder maps empty input to `None`).
    icc_profile: Option<Vec<u8>>,
    /// Custom APP markers to embed as `(marker number 0-15, data)` pairs,
    /// kept in the order they were added.
    custom_markers: Vec<(u8, Vec<u8>)>,
    /// SIMD operations dispatch (detected once at construction)
    simd: SimdOps,
    /// Smoothing factor (0-100, 0 = disabled)
    /// Applies a weighted average filter to reduce fine-scale noise.
    /// Useful for converting dithered images (like GIFs) to JPEG.
    smoothing: u8,
    /// Resource limits (dimensions, memory, ICC size); a limit of 0 is not enforced.
    limits: Limits,
}
203
204impl Default for Encoder {
205    fn default() -> Self {
206        Self::new(Preset::default())
207    }
208}
209
210impl Encoder {
211    /// Create an encoder with the specified preset.
212    ///
213    /// # Arguments
214    ///
215    /// * `preset` - Encoding preset (see [`Preset`] for details):
216    ///   - [`BaselineFastest`](Preset::BaselineFastest): No optimizations, fastest encoding
217    ///   - [`BaselineBalanced`](Preset::BaselineBalanced): Baseline with all optimizations
218    ///   - [`ProgressiveBalanced`](Preset::ProgressiveBalanced): Progressive with optimizations (default)
219    ///   - [`ProgressiveSmallest`](Preset::ProgressiveSmallest): Maximum compression
220    ///
221    /// # Preset Comparison
222    ///
223    /// | Preset | Time | Size | Best For |
224    /// |--------|------|------|----------|
225    /// | `BaselineFastest` | ~2ms | baseline | Real-time, thumbnails |
226    /// | `BaselineBalanced` | ~7ms | -13% | Sequential playback |
227    /// | `ProgressiveBalanced` | ~9ms | -13% | Web images (default) |
228    /// | `ProgressiveSmallest` | ~21ms | -14% | Storage, archival |
229    ///
230    /// *Benchmarks: 512×512 Q75 image*
231    ///
232    /// # Example
233    ///
234    /// ```no_run
235    /// use mozjpeg_rs::{Encoder, Preset};
236    ///
237    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
238    ///
239    /// // Default: progressive with good balance
240    /// let jpeg = Encoder::new(Preset::default())
241    ///     .quality(85)
242    ///     .encode_rgb(&pixels, 256, 256)
243    ///     .unwrap();
244    ///
245    /// // Fastest for real-time applications
246    /// let jpeg = Encoder::new(Preset::BaselineFastest)
247    ///     .quality(80)
248    ///     .encode_rgb(&pixels, 256, 256)
249    ///     .unwrap();
250    ///
251    /// // Maximum compression (matches C mozjpeg)
252    /// let jpeg = Encoder::new(Preset::ProgressiveSmallest)
253    ///     .quality(85)
254    ///     .encode_rgb(&pixels, 256, 256)
255    ///     .unwrap();
256    /// ```
257    pub fn new(preset: Preset) -> Self {
258        match preset {
259            Preset::BaselineFastest => Self::fastest(),
260            Preset::BaselineBalanced => Self::baseline_optimized(),
261            Preset::ProgressiveBalanced => Self::progressive_balanced(),
262            Preset::ProgressiveSmallest => Self::max_compression(),
263        }
264    }
265
266    /// Create an encoder with the most optimized baseline (non-progressive) settings.
267    ///
268    /// This is the recommended starting point for most use cases. It produces
269    /// sequential (non-progressive) JPEGs with all mozjpeg optimizations enabled:
270    /// trellis quantization, Huffman optimization, and overshoot deringing.
271    ///
272    /// # Default Settings
273    ///
274    /// | Setting | Value | Notes |
275    /// |---------|-------|-------|
276    /// | quality | 75 | Good balance of size/quality |
277    /// | progressive | **false** | Sequential baseline JPEG |
278    /// | optimize_scans | **false** | N/A for baseline mode |
279    /// | subsampling | 4:2:0 | Standard chroma subsampling |
280    /// | trellis | **enabled** | AC + DC trellis quantization |
281    /// | optimize_huffman | **true** | 2-pass for optimal Huffman tables |
282    /// | overshoot_deringing | **true** | Reduces ringing on hard edges |
283    /// | quant_tables | ImageMagick | Same as C mozjpeg default |
284    /// | force_baseline | false | Allows 16-bit DQT at very low Q |
285    ///
286    /// # Comparison with C mozjpeg
287    ///
288    /// **Important:** This differs from C mozjpeg's `jpeg_set_defaults()`!
289    ///
290    /// C mozjpeg uses `JCP_MAX_COMPRESSION` profile by default, which enables
291    /// progressive mode and optimize_scans. This produces ~20% smaller files
292    /// but with slower encoding and progressive rendering.
293    ///
294    /// | Setting | `baseline_optimized()` | C mozjpeg default |
295    /// |---------|------------------------|-------------------|
296    /// | progressive | **false** | true |
297    /// | optimize_scans | **false** | true |
298    /// | trellis | true | true |
299    /// | deringing | true | true |
300    ///
301    /// To match C mozjpeg's default behavior, use [`max_compression()`](Self::max_compression).
302    ///
303    /// # Example
304    ///
305    /// ```no_run
306    /// use mozjpeg_rs::Encoder;
307    ///
308    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
309    /// let jpeg = Encoder::baseline_optimized()
310    ///     .quality(85)
311    ///     .encode_rgb(&pixels, 256, 256)
312    ///     .unwrap();
313    /// ```
314    pub fn baseline_optimized() -> Self {
315        Self {
316            quality: 75,
317            progressive: false,
318            subsampling: Subsampling::S420,
319            quant_table_idx: QuantTableIdx::ImageMagick,
320            custom_luma_qtable: None,
321            custom_chroma_qtable: None,
322            trellis: TrellisConfig::default(),
323            force_baseline: false,
324            optimize_huffman: true,
325            overshoot_deringing: true,
326            optimize_scans: false,
327            restart_interval: 0,
328            pixel_density: PixelDensity::default(),
329            exif_data: None,
330            icc_profile: None,
331            custom_markers: Vec::new(),
332            simd: SimdOps::detect(),
333            smoothing: 0,
334            limits: Limits::none(),
335        }
336    }
337
338    /// Create encoder with maximum compression (matches C mozjpeg defaults).
339    ///
340    /// This matches the `JCP_MAX_COMPRESSION` profile used by C mozjpeg's
341    /// `jpeg_set_defaults()` and the `mozjpeg` crate.
342    ///
343    /// # Settings (differences from `new()` in **bold**)
344    ///
345    /// | Setting | Value | Notes |
346    /// |---------|-------|-------|
347    /// | quality | 75 | Same as `new()` |
348    /// | progressive | **true** | Multi-scan progressive JPEG |
349    /// | optimize_scans | **true** | Tries multiple scan configs |
350    /// | subsampling | 4:2:0 | Same as `new()` |
351    /// | trellis | enabled | Same as `new()` |
352    /// | optimize_huffman | true | Same as `new()` |
353    /// | overshoot_deringing | true | Same as `new()` |
354    ///
355    /// # File Size Comparison
356    ///
357    /// Typical results at Q75 (256×256 image):
358    /// - `Encoder::baseline_optimized()`: ~650 bytes (baseline)
359    /// - `Encoder::max_compression()`: ~520 bytes (**~20% smaller**)
360    ///
361    /// # Example
362    ///
363    /// ```no_run
364    /// use mozjpeg_rs::Encoder;
365    ///
366    /// // Match C mozjpeg's default compression
367    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
368    /// let jpeg = Encoder::max_compression()
369    ///     .quality(85)
370    ///     .encode_rgb(&pixels, 256, 256)
371    ///     .unwrap();
372    /// ```
373    pub fn max_compression() -> Self {
374        Self {
375            quality: 75,
376            progressive: true,
377            subsampling: Subsampling::S420,
378            quant_table_idx: QuantTableIdx::ImageMagick,
379            custom_luma_qtable: None,
380            custom_chroma_qtable: None,
381            trellis: TrellisConfig::default(),
382            force_baseline: false,
383            optimize_huffman: true,
384            overshoot_deringing: true,
385            optimize_scans: true,
386            restart_interval: 0,
387            pixel_density: PixelDensity::default(),
388            exif_data: None,
389            icc_profile: None,
390            custom_markers: Vec::new(),
391            simd: SimdOps::detect(),
392            smoothing: 0,
393            limits: Limits::none(),
394        }
395    }
396
397    /// Create encoder with progressive mode and all optimizations except optimize_scans.
398    ///
399    /// This is the **recommended default** for most use cases. It provides:
400    /// - Progressive rendering (blurry-to-sharp loading)
401    /// - All mozjpeg optimizations (trellis, Huffman, deringing)
402    /// - Good balance between file size and encoding speed
403    ///
404    /// # Settings
405    ///
406    /// | Setting | Value | Notes |
407    /// |---------|-------|-------|
408    /// | progressive | **true** | Multi-scan progressive JPEG |
409    /// | optimize_scans | **false** | Uses fixed 9-scan config |
410    /// | trellis | enabled | AC + DC trellis quantization |
411    /// | optimize_huffman | true | 2-pass for optimal tables |
412    /// | overshoot_deringing | true | Reduces ringing on hard edges |
413    ///
414    /// # vs `max_compression()`
415    ///
416    /// This preset omits `optimize_scans` which:
417    /// - Saves ~100% encoding time (9ms vs 21ms at 512×512)
418    /// - Loses only ~1% file size reduction
419    ///
420    /// Use `max_compression()` only when file size is critical.
421    ///
422    /// # Example
423    ///
424    /// ```no_run
425    /// use mozjpeg_rs::Encoder;
426    ///
427    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
428    /// let jpeg = Encoder::progressive_balanced()
429    ///     .quality(85)
430    ///     .encode_rgb(&pixels, 256, 256)
431    ///     .unwrap();
432    /// ```
433    pub fn progressive_balanced() -> Self {
434        Self {
435            quality: 75,
436            progressive: true,
437            subsampling: Subsampling::S420,
438            quant_table_idx: QuantTableIdx::ImageMagick,
439            custom_luma_qtable: None,
440            custom_chroma_qtable: None,
441            trellis: TrellisConfig::default(),
442            force_baseline: false,
443            optimize_huffman: true,
444            overshoot_deringing: true,
445            optimize_scans: false, // Key difference from max_compression()
446            restart_interval: 0,
447            pixel_density: PixelDensity::default(),
448            exif_data: None,
449            icc_profile: None,
450            custom_markers: Vec::new(),
451            simd: SimdOps::detect(),
452            smoothing: 0,
453            limits: Limits::none(),
454        }
455    }
456
457    /// Create encoder with fastest settings (libjpeg-turbo compatible).
458    ///
459    /// Disables all mozjpeg-specific optimizations for maximum encoding speed.
460    /// Output is compatible with standard libjpeg/libjpeg-turbo.
461    ///
462    /// # Settings (differences from `new()` in **bold**)
463    ///
464    /// | Setting | Value | Notes |
465    /// |---------|-------|-------|
466    /// | quality | 75 | Same as `new()` |
467    /// | progressive | false | Same as `new()` |
468    /// | trellis | **disabled** | No trellis quantization |
469    /// | optimize_huffman | **false** | Uses default Huffman tables |
470    /// | overshoot_deringing | **false** | No deringing filter |
471    /// | force_baseline | **true** | 8-bit DQT only |
472    ///
473    /// # Performance
474    ///
475    /// Encoding is ~4-10x faster than `new()`, but files are ~10-20% larger.
476    ///
477    /// # Example
478    ///
479    /// ```no_run
480    /// use mozjpeg_rs::Encoder;
481    ///
482    /// // Fast encoding for real-time applications
483    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
484    /// let jpeg = Encoder::fastest()
485    ///     .quality(80)
486    ///     .encode_rgb(&pixels, 256, 256)
487    ///     .unwrap();
488    /// ```
489    pub fn fastest() -> Self {
490        Self {
491            quality: 75,
492            progressive: false,
493            subsampling: Subsampling::S420,
494            quant_table_idx: QuantTableIdx::ImageMagick,
495            custom_luma_qtable: None,
496            custom_chroma_qtable: None,
497            trellis: TrellisConfig::disabled(),
498            force_baseline: true,
499            optimize_huffman: false,
500            overshoot_deringing: false,
501            optimize_scans: false,
502            restart_interval: 0,
503            pixel_density: PixelDensity::default(),
504            exif_data: None,
505            icc_profile: None,
506            custom_markers: Vec::new(),
507            simd: SimdOps::detect(),
508            smoothing: 0,
509            limits: Limits::none(),
510        }
511    }
512
513    /// Set quality level (1-100).
514    ///
515    /// Higher values produce larger, higher-quality images.
516    pub fn quality(mut self, quality: u8) -> Self {
517        self.quality = quality.clamp(1, 100);
518        self
519    }
520
521    /// Enable or disable progressive mode.
522    pub fn progressive(mut self, enable: bool) -> Self {
523        self.progressive = enable;
524        self
525    }
526
527    /// Set chroma subsampling mode.
528    pub fn subsampling(mut self, mode: Subsampling) -> Self {
529        self.subsampling = mode;
530        self
531    }
532
533    /// Set quantization table variant.
534    pub fn quant_tables(mut self, idx: QuantTableIdx) -> Self {
535        self.quant_table_idx = idx;
536        self
537    }
538
539    /// Configure trellis quantization.
540    pub fn trellis(mut self, config: TrellisConfig) -> Self {
541        self.trellis = config;
542        self
543    }
544
545    /// Force baseline-compatible output.
546    pub fn force_baseline(mut self, enable: bool) -> Self {
547        self.force_baseline = enable;
548        self
549    }
550
551    /// Enable Huffman table optimization.
552    pub fn optimize_huffman(mut self, enable: bool) -> Self {
553        self.optimize_huffman = enable;
554        self
555    }
556
557    /// Enable overshoot deringing.
558    ///
559    /// Reduces visible ringing artifacts near hard edges, especially on white
560    /// backgrounds. Works by allowing encoded values to "overshoot" above 255
561    /// (which will clamp back to 255 when decoded) to create smoother waveforms.
562    ///
563    /// This is a mozjpeg-specific feature that can improve visual quality at
564    /// minimal file size cost. Enabled by default.
565    pub fn overshoot_deringing(mut self, enable: bool) -> Self {
566        self.overshoot_deringing = enable;
567        self
568    }
569
570    /// Enable or disable scan optimization for progressive mode.
571    ///
572    /// When enabled, the encoder tries multiple scan configurations and
573    /// picks the one that produces the smallest output. This can improve
574    /// compression by 1-3% but increases encoding time.
575    ///
576    /// Only has effect when progressive mode is enabled.
577    pub fn optimize_scans(mut self, enable: bool) -> Self {
578        self.optimize_scans = enable;
579        self
580    }
581
582    /// Set input smoothing factor (0-100).
583    ///
584    /// Applies a weighted average filter to reduce fine-scale noise in the
585    /// input image before encoding. This is particularly useful for converting
586    /// dithered images (like GIFs) to JPEG.
587    ///
588    /// - 0 = disabled (default)
589    /// - 10-50 = recommended for dithered images
590    /// - Higher values = more smoothing (may blur the image)
591    ///
592    /// # Example
593    /// ```
594    /// use mozjpeg_rs::Encoder;
595    ///
596    /// // Convert a dithered GIF to JPEG with smoothing
597    /// let encoder = Encoder::baseline_optimized()
598    ///     .quality(85)
599    ///     .smoothing(30);
600    /// ```
601    pub fn smoothing(mut self, factor: u8) -> Self {
602        self.smoothing = factor.min(100);
603        self
604    }
605
606    /// Set restart interval in MCUs.
607    ///
608    /// Restart markers are inserted every N MCUs, which can help with
609    /// error recovery and parallel decoding. Set to 0 to disable (default).
610    ///
611    /// Common values: 0 (disabled), or image width in MCUs for row-by-row restarts.
612    pub fn restart_interval(mut self, interval: u16) -> Self {
613        self.restart_interval = interval;
614        self
615    }
616
617    /// Set EXIF data to embed in the JPEG.
618    ///
619    /// # Arguments
620    /// * `data` - Raw EXIF data (TIFF structure). The "Exif\0\0" header
621    ///   will be added automatically.
622    ///
623    /// Pass empty or call without this method to omit EXIF data.
624    pub fn exif_data(mut self, data: Vec<u8>) -> Self {
625        self.exif_data = if data.is_empty() { None } else { Some(data) };
626        self
627    }
628
629    /// Set pixel density for the JFIF APP0 marker.
630    ///
631    /// This specifies the physical pixel density (DPI/DPC) or aspect ratio.
632    /// Note that most software ignores JFIF density in favor of EXIF metadata.
633    ///
634    /// # Example
635    /// ```
636    /// use mozjpeg_rs::{Encoder, PixelDensity};
637    ///
638    /// let encoder = Encoder::baseline_optimized()
639    ///     .pixel_density(PixelDensity::dpi(300, 300)); // 300 DPI
640    /// ```
641    pub fn pixel_density(mut self, density: PixelDensity) -> Self {
642        self.pixel_density = density;
643        self
644    }
645
646    /// Set ICC color profile to embed.
647    ///
648    /// The profile will be embedded in APP2 markers with the standard
649    /// "ICC_PROFILE" identifier. Large profiles are automatically chunked.
650    ///
651    /// # Arguments
652    /// * `profile` - Raw ICC profile data
653    pub fn icc_profile(mut self, profile: Vec<u8>) -> Self {
654        self.icc_profile = if profile.is_empty() {
655            None
656        } else {
657            Some(profile)
658        };
659        self
660    }
661
662    /// Add a custom APP marker.
663    ///
664    /// # Arguments
665    /// * `app_num` - APP marker number (0-15, e.g., 1 for EXIF, 2 for ICC)
666    /// * `data` - Raw marker data (including any identifier prefix)
667    ///
668    /// Multiple markers with the same number are allowed.
669    /// Markers are written in the order they are added.
670    pub fn add_marker(mut self, app_num: u8, data: Vec<u8>) -> Self {
671        if app_num <= 15 && !data.is_empty() {
672            self.custom_markers.push((app_num, data));
673        }
674        self
675    }
676
677    /// Set custom luminance quantization table.
678    ///
679    /// This overrides the table selected by `quant_tables()`.
680    /// Values should be in natural (row-major) order, not zigzag.
681    ///
682    /// # Arguments
683    /// * `table` - 64 quantization values (quality scaling still applies)
684    pub fn custom_luma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
685        self.custom_luma_qtable = Some(table);
686        self
687    }
688
689    /// Set custom chrominance quantization table.
690    ///
691    /// This overrides the table selected by `quant_tables()`.
692    /// Values should be in natural (row-major) order, not zigzag.
693    ///
694    /// # Arguments
695    /// * `table` - 64 quantization values (quality scaling still applies)
696    pub fn custom_chroma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
697        self.custom_chroma_qtable = Some(table);
698        self
699    }
700
701    // =========================================================================
702    // Resource Limits
703    // =========================================================================
704
705    /// Set resource limits for the encoder.
706    ///
707    /// Limits can restrict:
708    /// - Maximum image width and height
709    /// - Maximum pixel count (width × height)
710    /// - Maximum estimated memory allocation
711    /// - Maximum ICC profile size
712    ///
713    /// # Example
714    /// ```
715    /// use mozjpeg_rs::{Encoder, Preset, Limits};
716    ///
717    /// let limits = Limits::default()
718    ///     .max_width(4096)
719    ///     .max_height(4096)
720    ///     .max_pixel_count(16_000_000)
721    ///     .max_alloc_bytes(100 * 1024 * 1024);
722    ///
723    /// let encoder = Encoder::new(Preset::default())
724    ///     .limits(limits);
725    /// ```
726    pub fn limits(mut self, limits: Limits) -> Self {
727        self.limits = limits;
728        self
729    }
730
731    /// Check all resource limits before encoding.
732    ///
733    /// # Arguments
734    /// * `width` - Image width
735    /// * `height` - Image height
736    /// * `is_gray` - True for grayscale images (affects memory estimate)
737    fn check_limits(&self, width: u32, height: u32, is_gray: bool) -> Result<()> {
738        let limits = &self.limits;
739
740        // Check dimension limits
741        if (limits.max_width > 0 && width > limits.max_width)
742            || (limits.max_height > 0 && height > limits.max_height)
743        {
744            return Err(Error::DimensionLimitExceeded {
745                width,
746                height,
747                max_width: limits.max_width,
748                max_height: limits.max_height,
749            });
750        }
751
752        // Check pixel count limit
753        if limits.max_pixel_count > 0 {
754            let pixel_count = width as u64 * height as u64;
755            if pixel_count > limits.max_pixel_count {
756                return Err(Error::PixelCountExceeded {
757                    pixel_count,
758                    limit: limits.max_pixel_count,
759                });
760            }
761        }
762
763        // Check allocation limit
764        if limits.max_alloc_bytes > 0 {
765            let estimate = if is_gray {
766                self.estimate_resources_gray(width, height)
767            } else {
768                self.estimate_resources(width, height)
769            };
770            if estimate.peak_memory_bytes > limits.max_alloc_bytes {
771                return Err(Error::AllocationLimitExceeded {
772                    estimated: estimate.peak_memory_bytes,
773                    limit: limits.max_alloc_bytes,
774                });
775            }
776        }
777
778        // Check ICC profile size limit
779        if limits.max_icc_profile_bytes > 0 {
780            if let Some(ref icc) = self.icc_profile {
781                if icc.len() > limits.max_icc_profile_bytes {
782                    return Err(Error::IccProfileTooLarge {
783                        size: icc.len(),
784                        limit: limits.max_icc_profile_bytes,
785                    });
786                }
787            }
788        }
789
790        Ok(())
791    }
792
793    // =========================================================================
794    // Aliases for rimage/CLI-style naming
795    // =========================================================================
796
797    /// Set baseline mode (opposite of progressive).
798    ///
799    /// When `true`, produces a sequential JPEG (non-progressive).
800    /// This is equivalent to `progressive(false)`.
801    ///
802    /// # Example
803    /// ```
804    /// use mozjpeg_rs::Encoder;
805    ///
806    /// // These are equivalent:
807    /// let enc1 = Encoder::baseline_optimized().baseline(true);
808    /// let enc2 = Encoder::baseline_optimized().progressive(false);
809    /// ```
810    #[inline]
811    pub fn baseline(self, enable: bool) -> Self {
812        self.progressive(!enable)
813    }
814
815    /// Enable or disable Huffman coding optimization.
816    ///
817    /// Alias for [`optimize_huffman()`](Self::optimize_huffman).
818    /// This name matches mozjpeg's CLI flag naming.
819    #[inline]
820    pub fn optimize_coding(self, enable: bool) -> Self {
821        self.optimize_huffman(enable)
822    }
823
824    /// Set chroma subsampling mode.
825    ///
826    /// Alias for [`subsampling()`](Self::subsampling).
827    #[inline]
828    pub fn chroma_subsampling(self, mode: Subsampling) -> Self {
829        self.subsampling(mode)
830    }
831
832    /// Set quantization table variant.
833    ///
834    /// Alias for [`quant_tables()`](Self::quant_tables).
835    #[inline]
836    pub fn qtable(self, idx: QuantTableIdx) -> Self {
837        self.quant_tables(idx)
838    }
839
840    // =========================================================================
841    // Resource Estimation
842    // =========================================================================
843
844    /// Estimate resource usage for encoding an RGB image of the given dimensions.
845    ///
846    /// Returns peak memory usage (in bytes) and a relative CPU cost multiplier.
847    /// Useful for scheduling, enforcing resource limits, or providing feedback.
848    ///
849    /// # Arguments
850    /// * `width` - Image width in pixels
851    /// * `height` - Image height in pixels
852    ///
853    /// # Example
854    ///
855    /// ```
856    /// use mozjpeg_rs::{Encoder, Preset};
857    ///
858    /// let encoder = Encoder::new(Preset::ProgressiveBalanced).quality(85);
859    /// let estimate = encoder.estimate_resources(1920, 1080);
860    ///
861    /// println!("Peak memory: {} MB", estimate.peak_memory_bytes / 1_000_000);
862    /// println!("Relative CPU cost: {:.1}x", estimate.cpu_cost_multiplier);
863    /// ```
864    pub fn estimate_resources(&self, width: u32, height: u32) -> crate::types::ResourceEstimate {
865        let width = width as usize;
866        let height = height as usize;
867        let pixels = width * height;
868
869        // Calculate chroma dimensions based on subsampling
870        let (h_samp, v_samp) = self.subsampling.luma_factors();
871        let chroma_width = (width + h_samp as usize - 1) / h_samp as usize;
872        let chroma_height = (height + v_samp as usize - 1) / v_samp as usize;
873        let chroma_pixels = chroma_width * chroma_height;
874
875        // MCU-aligned dimensions
876        let mcu_h = 8 * h_samp as usize;
877        let mcu_v = 8 * v_samp as usize;
878        let mcu_width = (width + mcu_h - 1) / mcu_h * mcu_h;
879        let mcu_height = (height + mcu_v - 1) / mcu_v * mcu_v;
880
881        // Block counts
882        let y_blocks = (mcu_width / 8) * (mcu_height / 8);
883        let chroma_block_w = (chroma_width + 7) / 8;
884        let chroma_block_h = (chroma_height + 7) / 8;
885        let chroma_blocks = chroma_block_w * chroma_block_h;
886        let total_blocks = y_blocks + 2 * chroma_blocks;
887
888        // --- Memory estimation ---
889        let mut memory: usize = 0;
890
891        // Color conversion buffers (Y, Cb, Cr planes)
892        memory += 3 * pixels;
893
894        // Chroma subsampled buffers
895        memory += 2 * chroma_pixels;
896
897        // MCU-padded buffers
898        memory += mcu_width * mcu_height; // Y
899        let mcu_chroma_w = (chroma_width + 7) / 8 * 8;
900        let mcu_chroma_h = (chroma_height + 7) / 8 * 8;
901        memory += 2 * mcu_chroma_w * mcu_chroma_h; // Cb, Cr
902
903        // Block storage (needed for progressive or optimize_huffman)
904        let needs_block_storage = self.progressive || self.optimize_huffman;
905        if needs_block_storage {
906            // i16[64] per block = 128 bytes
907            memory += total_blocks * 128;
908        }
909
910        // Raw DCT storage (needed for DC trellis)
911        if self.trellis.dc_enabled {
912            // i32[64] per block = 256 bytes
913            memory += total_blocks * 256;
914        }
915
916        // Output buffer estimate (varies by quality, ~0.3-1.0x input for typical images)
917        // Use a conservative estimate based on quality
918        let output_ratio = if self.quality >= 95 {
919            0.8
920        } else if self.quality >= 85 {
921            0.5
922        } else if self.quality >= 75 {
923            0.3
924        } else {
925            0.2
926        };
927        memory += (pixels as f64 * 3.0 * output_ratio) as usize;
928
929        // --- CPU cost estimation ---
930        // Reference: BaselineFastest Q75 = 1.0
931        let mut cpu_cost = 1.0;
932
933        // Trellis AC quantization is the biggest CPU factor
934        if self.trellis.enabled {
935            cpu_cost += 3.5;
936        }
937
938        // DC trellis adds extra work
939        if self.trellis.dc_enabled {
940            cpu_cost += 0.5;
941        }
942
943        // Huffman optimization (frequency counting pass)
944        if self.optimize_huffman {
945            cpu_cost += 0.3;
946        }
947
948        // Progressive mode (multiple scan encoding)
949        if self.progressive {
950            cpu_cost += 1.5;
951        }
952
953        // optimize_scans (trial encoding many scan configurations)
954        if self.optimize_scans {
955            cpu_cost += 3.0;
956        }
957
958        // High quality increases trellis work (more candidates to evaluate)
959        // This matters most when trellis is enabled
960        if self.trellis.enabled && self.quality >= 85 {
961            let quality_factor = 1.0 + (self.quality as f64 - 85.0) / 30.0;
962            cpu_cost *= quality_factor;
963        }
964
965        crate::types::ResourceEstimate {
966            peak_memory_bytes: memory,
967            cpu_cost_multiplier: cpu_cost,
968            block_count: total_blocks,
969        }
970    }
971
972    /// Estimate resource usage for encoding a grayscale image.
973    ///
974    /// Similar to [`estimate_resources`](Self::estimate_resources) but for single-channel images.
975    pub fn estimate_resources_gray(
976        &self,
977        width: u32,
978        height: u32,
979    ) -> crate::types::ResourceEstimate {
980        let width = width as usize;
981        let height = height as usize;
982        let pixels = width * height;
983
984        // MCU-aligned dimensions (always 8x8 for grayscale)
985        let mcu_width = (width + 7) / 8 * 8;
986        let mcu_height = (height + 7) / 8 * 8;
987
988        // Block count
989        let blocks = (mcu_width / 8) * (mcu_height / 8);
990
991        // --- Memory estimation ---
992        let mut memory: usize = 0;
993
994        // MCU-padded buffer
995        memory += mcu_width * mcu_height;
996
997        // Block storage (needed for progressive or optimize_huffman)
998        let needs_block_storage = self.progressive || self.optimize_huffman;
999        if needs_block_storage {
1000            memory += blocks * 128;
1001        }
1002
1003        // Raw DCT storage (needed for DC trellis)
1004        if self.trellis.dc_enabled {
1005            memory += blocks * 256;
1006        }
1007
1008        // Output buffer estimate
1009        let output_ratio = if self.quality >= 95 {
1010            0.8
1011        } else if self.quality >= 85 {
1012            0.5
1013        } else if self.quality >= 75 {
1014            0.3
1015        } else {
1016            0.2
1017        };
1018        memory += (pixels as f64 * output_ratio) as usize;
1019
1020        // --- CPU cost (same formula, but less work due to single channel) ---
1021        let mut cpu_cost = 1.0;
1022
1023        if self.trellis.enabled {
1024            cpu_cost += 3.5;
1025        }
1026        if self.trellis.dc_enabled {
1027            cpu_cost += 0.5;
1028        }
1029        if self.optimize_huffman {
1030            cpu_cost += 0.3;
1031        }
1032        if self.progressive {
1033            cpu_cost += 1.0; // Less for grayscale (fewer scans)
1034        }
1035        if self.optimize_scans {
1036            cpu_cost += 2.0; // Less for grayscale
1037        }
1038        if self.trellis.enabled && self.quality >= 85 {
1039            let quality_factor = 1.0 + (self.quality as f64 - 85.0) / 30.0;
1040            cpu_cost *= quality_factor;
1041        }
1042
1043        // Grayscale is ~1/3 the work of RGB (single channel)
1044        cpu_cost /= 3.0;
1045
1046        crate::types::ResourceEstimate {
1047            peak_memory_bytes: memory,
1048            cpu_cost_multiplier: cpu_cost,
1049            block_count: blocks,
1050        }
1051    }
1052
1053    // =========================================================================
1054    // Encoding
1055    // =========================================================================
1056
1057    /// Encode RGB image data to JPEG.
1058    ///
1059    /// # Arguments
1060    /// * `rgb_data` - RGB pixel data (3 bytes per pixel, row-major)
1061    /// * `width` - Image width in pixels
1062    /// * `height` - Image height in pixels
1063    ///
1064    /// # Returns
1065    /// JPEG-encoded data as a `Vec<u8>`.
1066    pub fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
1067        // Validate dimensions: must be non-zero
1068        if width == 0 || height == 0 {
1069            return Err(Error::InvalidDimensions { width, height });
1070        }
1071
1072        // Check all resource limits
1073        self.check_limits(width, height, false)?;
1074
1075        // Use checked arithmetic to prevent overflow
1076        let expected_len = (width as usize)
1077            .checked_mul(height as usize)
1078            .and_then(|n| n.checked_mul(3))
1079            .ok_or(Error::InvalidDimensions { width, height })?;
1080
1081        if rgb_data.len() != expected_len {
1082            return Err(Error::BufferSizeMismatch {
1083                expected: expected_len,
1084                actual: rgb_data.len(),
1085            });
1086        }
1087
1088        // Apply smoothing if enabled
1089        let rgb_data = if self.smoothing > 0 {
1090            std::borrow::Cow::Owned(crate::smooth::smooth_rgb(
1091                rgb_data,
1092                width,
1093                height,
1094                self.smoothing,
1095            ))
1096        } else {
1097            std::borrow::Cow::Borrowed(rgb_data)
1098        };
1099
1100        let mut output = Vec::new();
1101        self.encode_rgb_to_writer(&rgb_data, width, height, &mut output)?;
1102        Ok(output)
1103    }
1104
1105    /// Encode grayscale image data to JPEG.
1106    ///
1107    /// # Arguments
1108    /// * `gray_data` - Grayscale pixel data (1 byte per pixel, row-major)
1109    /// * `width` - Image width in pixels
1110    /// * `height` - Image height in pixels
1111    ///
1112    /// # Returns
1113    /// JPEG-encoded data as a `Vec<u8>`.
1114    pub fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
1115        // Validate dimensions: must be non-zero
1116        if width == 0 || height == 0 {
1117            return Err(Error::InvalidDimensions { width, height });
1118        }
1119
1120        // Check all resource limits
1121        self.check_limits(width, height, true)?;
1122
1123        // Use checked arithmetic to prevent overflow
1124        let expected_len = (width as usize)
1125            .checked_mul(height as usize)
1126            .ok_or(Error::InvalidDimensions { width, height })?;
1127
1128        if gray_data.len() != expected_len {
1129            return Err(Error::BufferSizeMismatch {
1130                expected: expected_len,
1131                actual: gray_data.len(),
1132            });
1133        }
1134
1135        // Apply smoothing if enabled
1136        let gray_data = if self.smoothing > 0 {
1137            std::borrow::Cow::Owned(crate::smooth::smooth_grayscale(
1138                gray_data,
1139                width,
1140                height,
1141                self.smoothing,
1142            ))
1143        } else {
1144            std::borrow::Cow::Borrowed(gray_data)
1145        };
1146
1147        let mut output = Vec::new();
1148        self.encode_gray_to_writer(&gray_data, width, height, &mut output)?;
1149        Ok(output)
1150    }
1151
1152    /// Encode RGB image data to JPEG with cancellation and timeout support.
1153    ///
1154    /// This method allows encoding to be cancelled mid-operation via an atomic flag,
1155    /// or to automatically abort if a timeout is exceeded.
1156    ///
1157    /// # Arguments
1158    /// * `rgb_data` - RGB pixel data (3 bytes per pixel, row-major)
1159    /// * `width` - Image width in pixels
1160    /// * `height` - Image height in pixels
1161    /// * `cancel` - Optional cancellation flag. Set to `true` to abort encoding.
1162    /// * `timeout` - Optional maximum encoding duration.
1163    ///
1164    /// # Returns
1165    /// * `Ok(Vec<u8>)` - JPEG-encoded data
1166    /// * `Err(Error::Cancelled)` - If cancelled via the flag
1167    /// * `Err(Error::TimedOut)` - If the timeout was exceeded
1168    ///
1169    /// # Example
1170    /// ```no_run
1171    /// use mozjpeg_rs::{Encoder, Preset};
1172    /// use std::sync::atomic::AtomicBool;
1173    /// use std::time::Duration;
1174    ///
1175    /// let encoder = Encoder::new(Preset::ProgressiveBalanced);
1176    /// let pixels: Vec<u8> = vec![128; 1920 * 1080 * 3];
1177    /// let cancel = AtomicBool::new(false);
1178    ///
1179    /// // Encode with 5 second timeout
1180    /// let result = encoder.encode_rgb_cancellable(
1181    ///     &pixels, 1920, 1080,
1182    ///     Some(&cancel),
1183    ///     Some(Duration::from_secs(5)),
1184    /// );
1185    /// ```
1186    pub fn encode_rgb_cancellable(
1187        &self,
1188        rgb_data: &[u8],
1189        width: u32,
1190        height: u32,
1191        cancel: Option<&AtomicBool>,
1192        timeout: Option<Duration>,
1193    ) -> Result<Vec<u8>> {
1194        // Validate dimensions
1195        if width == 0 || height == 0 {
1196            return Err(Error::InvalidDimensions { width, height });
1197        }
1198
1199        // Check all resource limits
1200        self.check_limits(width, height, false)?;
1201
1202        // Check buffer size
1203        let expected_len = (width as usize)
1204            .checked_mul(height as usize)
1205            .and_then(|n| n.checked_mul(3))
1206            .ok_or(Error::InvalidDimensions { width, height })?;
1207
1208        if rgb_data.len() != expected_len {
1209            return Err(Error::BufferSizeMismatch {
1210                expected: expected_len,
1211                actual: rgb_data.len(),
1212            });
1213        }
1214
1215        // Create cancellation context
1216        let ctx = CancellationContext::new(cancel, timeout);
1217
1218        // Check for immediate cancellation
1219        ctx.check()?;
1220
1221        // Apply smoothing if enabled
1222        let rgb_data = if self.smoothing > 0 {
1223            std::borrow::Cow::Owned(crate::smooth::smooth_rgb(
1224                rgb_data,
1225                width,
1226                height,
1227                self.smoothing,
1228            ))
1229        } else {
1230            std::borrow::Cow::Borrowed(rgb_data)
1231        };
1232
1233        let mut output = Vec::new();
1234        // For now, use the regular encoder (cancellation hooks can be added to
1235        // internal functions in a follow-up). Check cancellation before and after.
1236        ctx.check()?;
1237        self.encode_rgb_to_writer(&rgb_data, width, height, &mut output)?;
1238        ctx.check()?;
1239
1240        Ok(output)
1241    }
1242
1243    /// Encode grayscale image data to JPEG with cancellation and timeout support.
1244    ///
1245    /// This method allows encoding to be cancelled mid-operation via an atomic flag,
1246    /// or to automatically abort if a timeout is exceeded.
1247    ///
1248    /// # Arguments
1249    /// * `gray_data` - Grayscale pixel data (1 byte per pixel, row-major)
1250    /// * `width` - Image width in pixels
1251    /// * `height` - Image height in pixels
1252    /// * `cancel` - Optional cancellation flag. Set to `true` to abort encoding.
1253    /// * `timeout` - Optional maximum encoding duration.
1254    ///
1255    /// # Returns
1256    /// * `Ok(Vec<u8>)` - JPEG-encoded data
1257    /// * `Err(Error::Cancelled)` - If cancelled via the flag
1258    /// * `Err(Error::TimedOut)` - If the timeout was exceeded
1259    pub fn encode_gray_cancellable(
1260        &self,
1261        gray_data: &[u8],
1262        width: u32,
1263        height: u32,
1264        cancel: Option<&AtomicBool>,
1265        timeout: Option<Duration>,
1266    ) -> Result<Vec<u8>> {
1267        // Validate dimensions
1268        if width == 0 || height == 0 {
1269            return Err(Error::InvalidDimensions { width, height });
1270        }
1271
1272        // Check all resource limits
1273        self.check_limits(width, height, true)?;
1274
1275        // Check buffer size
1276        let expected_len = (width as usize)
1277            .checked_mul(height as usize)
1278            .ok_or(Error::InvalidDimensions { width, height })?;
1279
1280        if gray_data.len() != expected_len {
1281            return Err(Error::BufferSizeMismatch {
1282                expected: expected_len,
1283                actual: gray_data.len(),
1284            });
1285        }
1286
1287        // Create cancellation context
1288        let ctx = CancellationContext::new(cancel, timeout);
1289
1290        // Check for immediate cancellation
1291        ctx.check()?;
1292
1293        // Apply smoothing if enabled
1294        let gray_data = if self.smoothing > 0 {
1295            std::borrow::Cow::Owned(crate::smooth::smooth_grayscale(
1296                gray_data,
1297                width,
1298                height,
1299                self.smoothing,
1300            ))
1301        } else {
1302            std::borrow::Cow::Borrowed(gray_data)
1303        };
1304
1305        let mut output = Vec::new();
1306        // For now, use the regular encoder (cancellation hooks can be added to
1307        // internal functions in a follow-up). Check cancellation before and after.
1308        ctx.check()?;
1309        self.encode_gray_to_writer(&gray_data, width, height, &mut output)?;
1310        ctx.check()?;
1311
1312        Ok(output)
1313    }
1314
1315    /// Encode grayscale image data to a writer.
1316    pub fn encode_gray_to_writer<W: Write>(
1317        &self,
1318        gray_data: &[u8],
1319        width: u32,
1320        height: u32,
1321        output: W,
1322    ) -> Result<()> {
1323        let width = width as usize;
1324        let height = height as usize;
1325
1326        // For grayscale, Y plane is the input directly (no conversion needed)
1327        let y_plane = gray_data;
1328
1329        // Grayscale uses 1x1 sampling
1330        let (mcu_width, mcu_height) = sample::mcu_aligned_dimensions(width, height, 1, 1);
1331
1332        let mcu_y_size = mcu_width
1333            .checked_mul(mcu_height)
1334            .ok_or(Error::AllocationFailed)?;
1335        let mut y_mcu = try_alloc_vec(0u8, mcu_y_size)?;
1336        sample::expand_to_mcu(y_plane, width, height, &mut y_mcu, mcu_width, mcu_height);
1337
1338        // Create quantization table (only luma needed)
1339        let luma_qtable = if let Some(ref custom) = self.custom_luma_qtable {
1340            crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
1341        } else {
1342            let (luma, _) =
1343                create_quant_tables(self.quality, self.quant_table_idx, self.force_baseline);
1344            luma
1345        };
1346
1347        // Create Huffman tables (only luma needed)
1348        let dc_luma_huff = create_std_dc_luma_table();
1349        let ac_luma_huff = create_std_ac_luma_table();
1350        let dc_luma_derived = DerivedTable::from_huff_table(&dc_luma_huff, true)?;
1351        let ac_luma_derived = DerivedTable::from_huff_table(&ac_luma_huff, false)?;
1352
1353        // Single component for grayscale
1354        let components = create_components(Subsampling::Gray);
1355
1356        // Write JPEG file
1357        let mut marker_writer = MarkerWriter::new(output);
1358
1359        // SOI
1360        marker_writer.write_soi()?;
1361
1362        // APP0 (JFIF) with pixel density
1363        marker_writer.write_jfif_app0(
1364            self.pixel_density.unit as u8,
1365            self.pixel_density.x,
1366            self.pixel_density.y,
1367        )?;
1368
1369        // EXIF (if present)
1370        if let Some(ref exif) = self.exif_data {
1371            marker_writer.write_app1_exif(exif)?;
1372        }
1373
1374        // ICC profile (if present)
1375        if let Some(ref icc) = self.icc_profile {
1376            marker_writer.write_icc_profile(icc)?;
1377        }
1378
1379        // Custom APP markers
1380        for (app_num, data) in &self.custom_markers {
1381            marker_writer.write_app(*app_num, data)?;
1382        }
1383
1384        // DQT (only luma table for grayscale)
1385        let luma_qtable_zz = natural_to_zigzag(&luma_qtable.values);
1386        marker_writer.write_dqt(0, &luma_qtable_zz, false)?;
1387
1388        // SOF (baseline or progressive)
1389        marker_writer.write_sof(
1390            self.progressive,
1391            8,
1392            height as u16,
1393            width as u16,
1394            &components,
1395        )?;
1396
1397        // DRI (restart interval)
1398        if self.restart_interval > 0 {
1399            marker_writer.write_dri(self.restart_interval)?;
1400        }
1401
1402        // DHT (only luma tables for grayscale) - written later for progressive
1403        if !self.progressive && !self.optimize_huffman {
1404            marker_writer
1405                .write_dht_multiple(&[(0, false, &dc_luma_huff), (0, true, &ac_luma_huff)])?;
1406        }
1407
1408        let mcu_rows = mcu_height / DCTSIZE;
1409        let mcu_cols = mcu_width / DCTSIZE;
1410        let num_blocks = mcu_rows
1411            .checked_mul(mcu_cols)
1412            .ok_or(Error::AllocationFailed)?;
1413
1414        if self.progressive {
1415            // Progressive mode: collect all blocks, then encode multiple scans
1416            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_blocks)?;
1417            let mut dct_block = [0i16; DCTSIZE2];
1418
1419            // Optionally collect raw DCT for DC trellis
1420            let dc_trellis_enabled = self.trellis.enabled && self.trellis.dc_enabled;
1421            let mut y_raw_dct = if dc_trellis_enabled {
1422                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_blocks)?)
1423            } else {
1424                None
1425            };
1426
1427            // Collect all blocks
1428            for mcu_row in 0..mcu_rows {
1429                for mcu_col in 0..mcu_cols {
1430                    let block_idx = mcu_row * mcu_cols + mcu_col;
1431                    self.process_block_to_storage_with_raw(
1432                        &y_mcu,
1433                        mcu_width,
1434                        mcu_row,
1435                        mcu_col,
1436                        &luma_qtable.values,
1437                        &ac_luma_derived,
1438                        &mut y_blocks[block_idx],
1439                        &mut dct_block,
1440                        y_raw_dct.as_mut().map(|v| v[block_idx].as_mut_slice()),
1441                    )?;
1442                }
1443            }
1444
1445            // Run DC trellis optimization if enabled
1446            if dc_trellis_enabled {
1447                if let Some(ref y_raw) = y_raw_dct {
1448                    run_dc_trellis_by_row(
1449                        y_raw,
1450                        &mut y_blocks,
1451                        luma_qtable.values[0],
1452                        &dc_luma_derived,
1453                        self.trellis.lambda_log_scale1,
1454                        self.trellis.lambda_log_scale2,
1455                        mcu_rows,
1456                        mcu_cols,
1457                        mcu_cols,
1458                        1,
1459                        1,
1460                    );
1461                }
1462            }
1463
1464            // Run EOB optimization if enabled (cross-block EOBRUN optimization)
1465            if self.trellis.enabled && self.trellis.eob_opt {
1466                use crate::trellis::{estimate_block_eob_info, optimize_eob_runs};
1467
1468                // Estimate EOB info for each block
1469                let eob_info: Vec<_> = y_blocks
1470                    .iter()
1471                    .map(|block| estimate_block_eob_info(block, &ac_luma_derived, 1, 63))
1472                    .collect();
1473
1474                // Optimize EOB runs across all blocks
1475                optimize_eob_runs(&mut y_blocks, &eob_info, &ac_luma_derived, 1, 63);
1476            }
1477
1478            // Generate progressive scan script for grayscale (1 component)
1479            let scans = generate_mozjpeg_max_compression_scans(1);
1480
1481            // Build optimized Huffman tables
1482            let mut dc_freq = FrequencyCounter::new();
1483            let mut dc_counter = ProgressiveSymbolCounter::new();
1484            for scan in &scans {
1485                let is_dc_first_scan = scan.ss == 0 && scan.se == 0 && scan.ah == 0;
1486                if is_dc_first_scan {
1487                    // Count DC symbols using progressive counter
1488                    for block in &y_blocks {
1489                        dc_counter.count_dc_first(block, 0, scan.al, &mut dc_freq);
1490                    }
1491                }
1492            }
1493
1494            let opt_dc_huff = dc_freq.generate_table()?;
1495            let opt_dc_derived = DerivedTable::from_huff_table(&opt_dc_huff, true)?;
1496
1497            // Write DC Huffman table upfront
1498            marker_writer.write_dht_multiple(&[(0, false, &opt_dc_huff)])?;
1499
1500            // Encode each scan
1501            let output = marker_writer.into_inner();
1502            let mut bit_writer = BitWriter::new(output);
1503
1504            for scan in &scans {
1505                let is_dc_scan = scan.ss == 0 && scan.se == 0;
1506
1507                if is_dc_scan {
1508                    // DC scan
1509                    marker_writer = MarkerWriter::new(bit_writer.into_inner());
1510                    marker_writer.write_sos(scan, &components)?;
1511                    bit_writer = BitWriter::new(marker_writer.into_inner());
1512
1513                    let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
1514
1515                    if scan.ah == 0 {
1516                        // DC first scan
1517                        for block in &y_blocks {
1518                            prog_encoder.encode_dc_first(block, 0, &opt_dc_derived, scan.al)?;
1519                        }
1520                    } else {
1521                        // DC refinement scan
1522                        for block in &y_blocks {
1523                            prog_encoder.encode_dc_refine(block, scan.al)?;
1524                        }
1525                    }
1526
1527                    prog_encoder.finish_scan(None)?;
1528                } else {
1529                    // AC scan - generate per-scan Huffman table
1530                    let mut ac_freq = FrequencyCounter::new();
1531                    let mut ac_counter = ProgressiveSymbolCounter::new();
1532
1533                    for block in &y_blocks {
1534                        if scan.ah == 0 {
1535                            ac_counter.count_ac_first(
1536                                block,
1537                                scan.ss,
1538                                scan.se,
1539                                scan.al,
1540                                &mut ac_freq,
1541                            );
1542                        } else {
1543                            ac_counter.count_ac_refine(
1544                                block,
1545                                scan.ss,
1546                                scan.se,
1547                                scan.ah,
1548                                scan.al,
1549                                &mut ac_freq,
1550                            );
1551                        }
1552                    }
1553                    ac_counter.finish_scan(Some(&mut ac_freq));
1554
1555                    let opt_ac_huff = ac_freq.generate_table()?;
1556                    let opt_ac_derived = DerivedTable::from_huff_table(&opt_ac_huff, false)?;
1557
1558                    // Write AC Huffman table and SOS
1559                    marker_writer = MarkerWriter::new(bit_writer.into_inner());
1560                    marker_writer.write_dht_multiple(&[(0, true, &opt_ac_huff)])?;
1561                    marker_writer.write_sos(scan, &components)?;
1562                    bit_writer = BitWriter::new(marker_writer.into_inner());
1563
1564                    let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
1565
1566                    for block in &y_blocks {
1567                        if scan.ah == 0 {
1568                            prog_encoder.encode_ac_first(
1569                                block,
1570                                scan.ss,
1571                                scan.se,
1572                                scan.al,
1573                                &opt_ac_derived,
1574                            )?;
1575                        } else {
1576                            prog_encoder.encode_ac_refine(
1577                                block,
1578                                scan.ss,
1579                                scan.se,
1580                                scan.ah,
1581                                scan.al,
1582                                &opt_ac_derived,
1583                            )?;
1584                        }
1585                    }
1586
1587                    prog_encoder.finish_scan(Some(&opt_ac_derived))?;
1588                }
1589            }
1590
1591            let mut output = bit_writer.into_inner();
1592            output.write_all(&[0xFF, 0xD9])?; // EOI
1593        } else if self.optimize_huffman {
1594            // 2-pass: collect blocks, count frequencies, then encode
1595            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_blocks)?;
1596            let mut dct_block = [0i16; DCTSIZE2];
1597
1598            // Collect all blocks using the same process as RGB encoding
1599            for mcu_row in 0..mcu_rows {
1600                for mcu_col in 0..mcu_cols {
1601                    let block_idx = mcu_row * mcu_cols + mcu_col;
1602                    self.process_block_to_storage_with_raw(
1603                        &y_mcu,
1604                        mcu_width,
1605                        mcu_row,
1606                        mcu_col,
1607                        &luma_qtable.values,
1608                        &ac_luma_derived,
1609                        &mut y_blocks[block_idx],
1610                        &mut dct_block,
1611                        None, // No raw DCT storage needed for grayscale
1612                    )?;
1613                }
1614            }
1615
1616            // Count frequencies using SymbolCounter
1617            let mut dc_freq = FrequencyCounter::new();
1618            let mut ac_freq = FrequencyCounter::new();
1619            let mut counter = SymbolCounter::new();
1620            for block in &y_blocks {
1621                counter.count_block(block, 0, &mut dc_freq, &mut ac_freq);
1622            }
1623
1624            // Generate optimized tables
1625            let opt_dc_huff = dc_freq.generate_table()?;
1626            let opt_ac_huff = ac_freq.generate_table()?;
1627            let opt_dc_derived = DerivedTable::from_huff_table(&opt_dc_huff, true)?;
1628            let opt_ac_derived = DerivedTable::from_huff_table(&opt_ac_huff, false)?;
1629
1630            // Write optimized Huffman tables
1631            marker_writer
1632                .write_dht_multiple(&[(0, false, &opt_dc_huff), (0, true, &opt_ac_huff)])?;
1633
1634            // Write SOS and encode
1635            let scans = generate_baseline_scan(1);
1636            marker_writer.write_sos(&scans[0], &components)?;
1637
1638            let output = marker_writer.into_inner();
1639            let mut bit_writer = BitWriter::new(output);
1640            let mut encoder = EntropyEncoder::new(&mut bit_writer);
1641
1642            // Restart marker support for grayscale (each block = 1 MCU)
1643            let restart_interval = self.restart_interval as usize;
1644            let mut restart_num = 0u8;
1645
1646            for (mcu_count, block) in y_blocks.iter().enumerate() {
1647                // Emit restart marker if needed
1648                if restart_interval > 0
1649                    && mcu_count > 0
1650                    && mcu_count.is_multiple_of(restart_interval)
1651                {
1652                    encoder.emit_restart(restart_num)?;
1653                    restart_num = restart_num.wrapping_add(1) & 0x07;
1654                }
1655                encoder.encode_block(block, 0, &opt_dc_derived, &opt_ac_derived)?;
1656            }
1657
1658            bit_writer.flush()?;
1659            let mut output = bit_writer.into_inner();
1660            output.write_all(&[0xFF, 0xD9])?; // EOI
1661        } else {
1662            // Single-pass encoding
1663            let scans = generate_baseline_scan(1);
1664            marker_writer.write_sos(&scans[0], &components)?;
1665
1666            let output = marker_writer.into_inner();
1667            let mut bit_writer = BitWriter::new(output);
1668            let mut encoder = EntropyEncoder::new(&mut bit_writer);
1669            let mut dct_block = [0i16; DCTSIZE2];
1670            let mut quant_block = [0i16; DCTSIZE2];
1671
1672            // Restart marker support
1673            let restart_interval = self.restart_interval as usize;
1674            let mut mcu_count = 0usize;
1675            let mut restart_num = 0u8;
1676
1677            for mcu_row in 0..mcu_rows {
1678                for mcu_col in 0..mcu_cols {
1679                    // Emit restart marker if needed
1680                    if restart_interval > 0
1681                        && mcu_count > 0
1682                        && mcu_count.is_multiple_of(restart_interval)
1683                    {
1684                        encoder.emit_restart(restart_num)?;
1685                        restart_num = restart_num.wrapping_add(1) & 0x07;
1686                    }
1687
1688                    // Process block directly to quant_block
1689                    self.process_block_to_storage_with_raw(
1690                        &y_mcu,
1691                        mcu_width,
1692                        mcu_row,
1693                        mcu_col,
1694                        &luma_qtable.values,
1695                        &ac_luma_derived,
1696                        &mut quant_block,
1697                        &mut dct_block,
1698                        None,
1699                    )?;
1700                    encoder.encode_block(&quant_block, 0, &dc_luma_derived, &ac_luma_derived)?;
1701                    mcu_count += 1;
1702                }
1703            }
1704
1705            bit_writer.flush()?;
1706            let mut output = bit_writer.into_inner();
1707            output.write_all(&[0xFF, 0xD9])?; // EOI
1708        }
1709
1710        Ok(())
1711    }
1712
1713    /// Encode pre-converted planar YCbCr image data to JPEG.
1714    ///
1715    /// This method accepts tightly packed YCbCr data (no row padding).
1716    /// For strided data, use [`encode_ycbcr_planar_strided`](Self::encode_ycbcr_planar_strided).
1717    ///
1718    /// # Arguments
1719    /// * `y` - Luma plane (width × height bytes, tightly packed)
1720    /// * `cb` - Cb chroma plane (chroma_width × chroma_height bytes)
1721    /// * `cr` - Cr chroma plane (chroma_width × chroma_height bytes)
1722    /// * `width` - Image width in pixels
1723    /// * `height` - Image height in pixels
1724    ///
1725    /// The chroma plane dimensions depend on the subsampling mode:
1726    /// - 4:4:4: chroma_width = width, chroma_height = height
1727    /// - 4:2:2: chroma_width = ceil(width/2), chroma_height = height
1728    /// - 4:2:0: chroma_width = ceil(width/2), chroma_height = ceil(height/2)
1729    ///
1730    /// # Returns
1731    /// JPEG-encoded data as a `Vec<u8>`.
1732    ///
1733    /// # Errors
1734    /// Returns an error if plane sizes don't match expected dimensions.
1735    pub fn encode_ycbcr_planar(
1736        &self,
1737        y: &[u8],
1738        cb: &[u8],
1739        cr: &[u8],
1740        width: u32,
1741        height: u32,
1742    ) -> Result<Vec<u8>> {
1743        // For packed data, stride equals width
1744        let (luma_h, luma_v) = self.subsampling.luma_factors();
1745        let (chroma_width, _) = sample::subsampled_dimensions(
1746            width as usize,
1747            height as usize,
1748            luma_h as usize,
1749            luma_v as usize,
1750        );
1751        self.encode_ycbcr_planar_strided(
1752            y,
1753            width as usize,
1754            cb,
1755            chroma_width,
1756            cr,
1757            chroma_width,
1758            width,
1759            height,
1760        )
1761    }
1762
1763    /// Encode pre-converted planar YCbCr image data to a writer.
1764    ///
1765    /// See [`encode_ycbcr_planar`](Self::encode_ycbcr_planar) for details.
1766    pub fn encode_ycbcr_planar_to_writer<W: Write>(
1767        &self,
1768        y: &[u8],
1769        cb: &[u8],
1770        cr: &[u8],
1771        width: u32,
1772        height: u32,
1773        output: W,
1774    ) -> Result<()> {
1775        // For packed data, stride equals width
1776        let (luma_h, luma_v) = self.subsampling.luma_factors();
1777        let (chroma_width, _) = sample::subsampled_dimensions(
1778            width as usize,
1779            height as usize,
1780            luma_h as usize,
1781            luma_v as usize,
1782        );
1783        self.encode_ycbcr_planar_strided_to_writer(
1784            y,
1785            width as usize,
1786            cb,
1787            chroma_width,
1788            cr,
1789            chroma_width,
1790            width,
1791            height,
1792            output,
1793        )
1794    }
1795
1796    /// Encode pre-converted planar YCbCr image data with arbitrary strides.
1797    ///
1798    /// This method accepts YCbCr data that has already been:
1799    /// 1. Converted from RGB to YCbCr color space
1800    /// 2. Downsampled according to the encoder's subsampling mode
1801    ///
1802    /// Use this when you have YCbCr data from video decoders or other sources
1803    /// that may have row padding (stride > width).
1804    ///
1805    /// # Arguments
1806    /// * `y` - Luma plane data
1807    /// * `y_stride` - Bytes per row in luma plane (must be >= width)
1808    /// * `cb` - Cb chroma plane data
1809    /// * `cb_stride` - Bytes per row in Cb plane (must be >= chroma_width)
1810    /// * `cr` - Cr chroma plane data
1811    /// * `cr_stride` - Bytes per row in Cr plane (must be >= chroma_width)
1812    /// * `width` - Image width in pixels
1813    /// * `height` - Image height in pixels
1814    ///
1815    /// The chroma plane dimensions depend on the subsampling mode:
1816    /// - 4:4:4: chroma_width = width, chroma_height = height
1817    /// - 4:2:2: chroma_width = ceil(width/2), chroma_height = height
1818    /// - 4:2:0: chroma_width = ceil(width/2), chroma_height = ceil(height/2)
1819    ///
1820    /// # Returns
1821    /// JPEG-encoded data as a `Vec<u8>`.
1822    ///
1823    /// # Errors
1824    /// Returns an error if:
1825    /// - Strides are less than the required width
1826    /// - Plane sizes don't match stride × height
1827    #[allow(clippy::too_many_arguments)]
1828    pub fn encode_ycbcr_planar_strided(
1829        &self,
1830        y: &[u8],
1831        y_stride: usize,
1832        cb: &[u8],
1833        cb_stride: usize,
1834        cr: &[u8],
1835        cr_stride: usize,
1836        width: u32,
1837        height: u32,
1838    ) -> Result<Vec<u8>> {
1839        let mut output = Vec::new();
1840        self.encode_ycbcr_planar_strided_to_writer(
1841            y,
1842            y_stride,
1843            cb,
1844            cb_stride,
1845            cr,
1846            cr_stride,
1847            width,
1848            height,
1849            &mut output,
1850        )?;
1851        Ok(output)
1852    }
1853
1854    /// Encode pre-converted planar YCbCr image data with arbitrary strides to a writer.
1855    ///
1856    /// See [`encode_ycbcr_planar_strided`](Self::encode_ycbcr_planar_strided) for details.
1857    #[allow(clippy::too_many_arguments)]
1858    pub fn encode_ycbcr_planar_strided_to_writer<W: Write>(
1859        &self,
1860        y: &[u8],
1861        y_stride: usize,
1862        cb: &[u8],
1863        cb_stride: usize,
1864        cr: &[u8],
1865        cr_stride: usize,
1866        width: u32,
1867        height: u32,
1868        output: W,
1869    ) -> Result<()> {
1870        let width = width as usize;
1871        let height = height as usize;
1872
1873        // Validate dimensions
1874        if width == 0 || height == 0 {
1875            return Err(Error::InvalidDimensions {
1876                width: width as u32,
1877                height: height as u32,
1878            });
1879        }
1880
1881        // Validate Y stride
1882        if y_stride < width {
1883            return Err(Error::InvalidSamplingFactor {
1884                h: y_stride as u8,
1885                v: width as u8,
1886            });
1887        }
1888
1889        let (luma_h, luma_v) = self.subsampling.luma_factors();
1890        let (chroma_width, chroma_height) =
1891            sample::subsampled_dimensions(width, height, luma_h as usize, luma_v as usize);
1892
1893        // Validate chroma strides
1894        if cb_stride < chroma_width {
1895            return Err(Error::InvalidSamplingFactor {
1896                h: cb_stride as u8,
1897                v: chroma_width as u8,
1898            });
1899        }
1900        if cr_stride < chroma_width {
1901            return Err(Error::InvalidSamplingFactor {
1902                h: cr_stride as u8,
1903                v: chroma_width as u8,
1904            });
1905        }
1906
1907        // Calculate expected plane sizes (stride × height)
1908        let y_size = y_stride
1909            .checked_mul(height)
1910            .ok_or(Error::InvalidDimensions {
1911                width: width as u32,
1912                height: height as u32,
1913            })?;
1914        let cb_size = cb_stride
1915            .checked_mul(chroma_height)
1916            .ok_or(Error::AllocationFailed)?;
1917        let cr_size = cr_stride
1918            .checked_mul(chroma_height)
1919            .ok_or(Error::AllocationFailed)?;
1920
1921        // Validate Y plane size
1922        if y.len() < y_size {
1923            return Err(Error::BufferSizeMismatch {
1924                expected: y_size,
1925                actual: y.len(),
1926            });
1927        }
1928
1929        // Validate Cb plane size
1930        if cb.len() < cb_size {
1931            return Err(Error::BufferSizeMismatch {
1932                expected: cb_size,
1933                actual: cb.len(),
1934            });
1935        }
1936
1937        // Validate Cr plane size
1938        if cr.len() < cr_size {
1939            return Err(Error::BufferSizeMismatch {
1940                expected: cr_size,
1941                actual: cr.len(),
1942            });
1943        }
1944
1945        // Expand planes to MCU-aligned dimensions
1946        let (mcu_width, mcu_height) =
1947            sample::mcu_aligned_dimensions(width, height, luma_h as usize, luma_v as usize);
1948        let (mcu_chroma_w, mcu_chroma_h) =
1949            (mcu_width / luma_h as usize, mcu_height / luma_v as usize);
1950
1951        let mcu_y_size = mcu_width
1952            .checked_mul(mcu_height)
1953            .ok_or(Error::AllocationFailed)?;
1954        let mcu_chroma_size = mcu_chroma_w
1955            .checked_mul(mcu_chroma_h)
1956            .ok_or(Error::AllocationFailed)?;
1957        let mut y_mcu = try_alloc_vec(0u8, mcu_y_size)?;
1958        let mut cb_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
1959        let mut cr_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
1960
1961        sample::expand_to_mcu_strided(
1962            y, width, y_stride, height, &mut y_mcu, mcu_width, mcu_height,
1963        );
1964        sample::expand_to_mcu_strided(
1965            cb,
1966            chroma_width,
1967            cb_stride,
1968            chroma_height,
1969            &mut cb_mcu,
1970            mcu_chroma_w,
1971            mcu_chroma_h,
1972        );
1973        sample::expand_to_mcu_strided(
1974            cr,
1975            chroma_width,
1976            cr_stride,
1977            chroma_height,
1978            &mut cr_mcu,
1979            mcu_chroma_w,
1980            mcu_chroma_h,
1981        );
1982
1983        // Encode using shared helper
1984        self.encode_ycbcr_mcu_to_writer(
1985            &y_mcu,
1986            &cb_mcu,
1987            &cr_mcu,
1988            width,
1989            height,
1990            mcu_width,
1991            mcu_height,
1992            chroma_width,
1993            chroma_height,
1994            mcu_chroma_w,
1995            mcu_chroma_h,
1996            output,
1997        )
1998    }
1999
2000    /// Encode RGB image data to a writer.
2001    pub fn encode_rgb_to_writer<W: Write>(
2002        &self,
2003        rgb_data: &[u8],
2004        width: u32,
2005        height: u32,
2006        output: W,
2007    ) -> Result<()> {
2008        let width = width as usize;
2009        let height = height as usize;
2010
2011        // Step 1: Convert RGB to YCbCr
2012        // Use checked arithmetic for num_pixels calculation
2013        let num_pixels = width.checked_mul(height).ok_or(Error::InvalidDimensions {
2014            width: width as u32,
2015            height: height as u32,
2016        })?;
2017
2018        let mut y_plane = try_alloc_vec(0u8, num_pixels)?;
2019        let mut cb_plane = try_alloc_vec(0u8, num_pixels)?;
2020        let mut cr_plane = try_alloc_vec(0u8, num_pixels)?;
2021
2022        (self.simd.color_convert_rgb_to_ycbcr)(
2023            rgb_data,
2024            &mut y_plane,
2025            &mut cb_plane,
2026            &mut cr_plane,
2027            num_pixels,
2028        );
2029
2030        // Step 2: Downsample chroma if needed
2031        let (luma_h, luma_v) = self.subsampling.luma_factors();
2032        let (chroma_width, chroma_height) =
2033            sample::subsampled_dimensions(width, height, luma_h as usize, luma_v as usize);
2034
2035        let chroma_size = chroma_width
2036            .checked_mul(chroma_height)
2037            .ok_or(Error::AllocationFailed)?;
2038        let mut cb_subsampled = try_alloc_vec(0u8, chroma_size)?;
2039        let mut cr_subsampled = try_alloc_vec(0u8, chroma_size)?;
2040
2041        sample::downsample_plane(
2042            &cb_plane,
2043            width,
2044            height,
2045            luma_h as usize,
2046            luma_v as usize,
2047            &mut cb_subsampled,
2048        );
2049        sample::downsample_plane(
2050            &cr_plane,
2051            width,
2052            height,
2053            luma_h as usize,
2054            luma_v as usize,
2055            &mut cr_subsampled,
2056        );
2057
2058        // Step 3: Expand planes to MCU-aligned dimensions
2059        let (mcu_width, mcu_height) =
2060            sample::mcu_aligned_dimensions(width, height, luma_h as usize, luma_v as usize);
2061        let (mcu_chroma_w, mcu_chroma_h) =
2062            (mcu_width / luma_h as usize, mcu_height / luma_v as usize);
2063
2064        let mcu_y_size = mcu_width
2065            .checked_mul(mcu_height)
2066            .ok_or(Error::AllocationFailed)?;
2067        let mcu_chroma_size = mcu_chroma_w
2068            .checked_mul(mcu_chroma_h)
2069            .ok_or(Error::AllocationFailed)?;
2070        let mut y_mcu = try_alloc_vec(0u8, mcu_y_size)?;
2071        let mut cb_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
2072        let mut cr_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
2073
2074        sample::expand_to_mcu(&y_plane, width, height, &mut y_mcu, mcu_width, mcu_height);
2075        sample::expand_to_mcu(
2076            &cb_subsampled,
2077            chroma_width,
2078            chroma_height,
2079            &mut cb_mcu,
2080            mcu_chroma_w,
2081            mcu_chroma_h,
2082        );
2083        sample::expand_to_mcu(
2084            &cr_subsampled,
2085            chroma_width,
2086            chroma_height,
2087            &mut cr_mcu,
2088            mcu_chroma_w,
2089            mcu_chroma_h,
2090        );
2091
2092        // Encode using shared helper
2093        self.encode_ycbcr_mcu_to_writer(
2094            &y_mcu,
2095            &cb_mcu,
2096            &cr_mcu,
2097            width,
2098            height,
2099            mcu_width,
2100            mcu_height,
2101            chroma_width,
2102            chroma_height,
2103            mcu_chroma_w,
2104            mcu_chroma_h,
2105            output,
2106        )
2107    }
2108
2109    /// Internal helper: Encode MCU-aligned YCbCr planes to JPEG.
2110    ///
2111    /// This is the shared encoding logic used by both `encode_rgb_to_writer`
2112    /// and `encode_ycbcr_planar_to_writer`.
2113    #[allow(clippy::too_many_arguments)]
2114    fn encode_ycbcr_mcu_to_writer<W: Write>(
2115        &self,
2116        y_mcu: &[u8],
2117        cb_mcu: &[u8],
2118        cr_mcu: &[u8],
2119        width: usize,
2120        height: usize,
2121        mcu_width: usize,
2122        mcu_height: usize,
2123        chroma_width: usize,
2124        chroma_height: usize,
2125        mcu_chroma_w: usize,
2126        mcu_chroma_h: usize,
2127        output: W,
2128    ) -> Result<()> {
2129        let (luma_h, luma_v) = self.subsampling.luma_factors();
2130
2131        // Step 4: Create quantization tables
2132        let (luma_qtable, chroma_qtable) = {
2133            let (default_luma, default_chroma) =
2134                create_quant_tables(self.quality, self.quant_table_idx, self.force_baseline);
2135            let luma = if let Some(ref custom) = self.custom_luma_qtable {
2136                crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
2137            } else {
2138                default_luma
2139            };
2140            let chroma = if let Some(ref custom) = self.custom_chroma_qtable {
2141                crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
2142            } else {
2143                default_chroma
2144            };
2145            (luma, chroma)
2146        };
2147
2148        // Step 5: Create Huffman tables (standard tables)
2149        let dc_luma_huff = create_std_dc_luma_table();
2150        let dc_chroma_huff = create_std_dc_chroma_table();
2151        let ac_luma_huff = create_std_ac_luma_table();
2152        let ac_chroma_huff = create_std_ac_chroma_table();
2153
2154        let dc_luma_derived = DerivedTable::from_huff_table(&dc_luma_huff, true)?;
2155        let dc_chroma_derived = DerivedTable::from_huff_table(&dc_chroma_huff, true)?;
2156        let ac_luma_derived = DerivedTable::from_huff_table(&ac_luma_huff, false)?;
2157        let ac_chroma_derived = DerivedTable::from_huff_table(&ac_chroma_huff, false)?;
2158
2159        // Step 6: Set up components
2160        let components = create_ycbcr_components(self.subsampling);
2161
2162        // Step 7: Write JPEG file
2163        let mut marker_writer = MarkerWriter::new(output);
2164
2165        // SOI
2166        marker_writer.write_soi()?;
2167
2168        // APP0 (JFIF) with pixel density
2169        marker_writer.write_jfif_app0(
2170            self.pixel_density.unit as u8,
2171            self.pixel_density.x,
2172            self.pixel_density.y,
2173        )?;
2174
2175        // APP1 (EXIF) - if present
2176        if let Some(ref exif) = self.exif_data {
2177            marker_writer.write_app1_exif(exif)?;
2178        }
2179
2180        // ICC profile (if present)
2181        if let Some(ref icc) = self.icc_profile {
2182            marker_writer.write_icc_profile(icc)?;
2183        }
2184
2185        // Custom APP markers
2186        for (app_num, data) in &self.custom_markers {
2187            marker_writer.write_app(*app_num, data)?;
2188        }
2189
2190        // DQT (quantization tables in zigzag order) - combined into single marker
2191        let luma_qtable_zz = natural_to_zigzag(&luma_qtable.values);
2192        let chroma_qtable_zz = natural_to_zigzag(&chroma_qtable.values);
2193        marker_writer
2194            .write_dqt_multiple(&[(0, &luma_qtable_zz, false), (1, &chroma_qtable_zz, false)])?;
2195
2196        // SOF
2197        marker_writer.write_sof(
2198            self.progressive,
2199            8,
2200            height as u16,
2201            width as u16,
2202            &components,
2203        )?;
2204
2205        // DRI (restart interval) - if enabled
2206        if self.restart_interval > 0 {
2207            marker_writer.write_dri(self.restart_interval)?;
2208        }
2209
2210        // DHT (Huffman tables) - written here for non-optimized modes,
2211        // or later after frequency counting for optimized modes
2212        if !self.optimize_huffman {
2213            // Combine all tables into single DHT marker for smaller file size
2214            marker_writer.write_dht_multiple(&[
2215                (0, false, &dc_luma_huff),
2216                (1, false, &dc_chroma_huff),
2217                (0, true, &ac_luma_huff),
2218                (1, true, &ac_chroma_huff),
2219            ])?;
2220        }
2221
2222        if self.progressive {
2223            // Progressive mode: Store all blocks, then encode multiple scans
2224            let mcu_rows = mcu_height / (DCTSIZE * luma_v as usize);
2225            let mcu_cols = mcu_width / (DCTSIZE * luma_h as usize);
2226            let num_y_blocks = mcu_rows
2227                .checked_mul(mcu_cols)
2228                .and_then(|n| n.checked_mul(luma_h as usize))
2229                .and_then(|n| n.checked_mul(luma_v as usize))
2230                .ok_or(Error::AllocationFailed)?;
2231            let num_chroma_blocks = mcu_rows
2232                .checked_mul(mcu_cols)
2233                .ok_or(Error::AllocationFailed)?;
2234
2235            // Collect all quantized blocks
2236            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_y_blocks)?;
2237            let mut cb_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
2238            let mut cr_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
2239
2240            // Optionally collect raw DCT for DC trellis
2241            let dc_trellis_enabled = self.trellis.enabled && self.trellis.dc_enabled;
2242            let mut y_raw_dct = if dc_trellis_enabled {
2243                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_y_blocks)?)
2244            } else {
2245                None
2246            };
2247            let mut cb_raw_dct = if dc_trellis_enabled {
2248                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
2249            } else {
2250                None
2251            };
2252            let mut cr_raw_dct = if dc_trellis_enabled {
2253                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
2254            } else {
2255                None
2256            };
2257
2258            self.collect_blocks(
2259                y_mcu,
2260                mcu_width,
2261                mcu_height,
2262                cb_mcu,
2263                cr_mcu,
2264                mcu_chroma_w,
2265                mcu_chroma_h,
2266                &luma_qtable.values,
2267                &chroma_qtable.values,
2268                &ac_luma_derived,
2269                &ac_chroma_derived,
2270                &mut y_blocks,
2271                &mut cb_blocks,
2272                &mut cr_blocks,
2273                y_raw_dct.as_deref_mut(),
2274                cb_raw_dct.as_deref_mut(),
2275                cr_raw_dct.as_deref_mut(),
2276                luma_h,
2277                luma_v,
2278            )?;
2279
2280            // Run DC trellis optimization if enabled
2281            // C mozjpeg processes DC trellis row by row (each row is an independent chain)
2282            if dc_trellis_enabled {
2283                let h = luma_h as usize;
2284                let v = luma_v as usize;
2285                let y_block_cols = mcu_cols * h;
2286                let y_block_rows = mcu_rows * v;
2287
2288                if let Some(ref y_raw) = y_raw_dct {
2289                    run_dc_trellis_by_row(
2290                        y_raw,
2291                        &mut y_blocks,
2292                        luma_qtable.values[0],
2293                        &dc_luma_derived,
2294                        self.trellis.lambda_log_scale1,
2295                        self.trellis.lambda_log_scale2,
2296                        y_block_rows,
2297                        y_block_cols,
2298                        mcu_cols,
2299                        h,
2300                        v,
2301                    );
2302                }
2303                // Chroma has 1x1 per MCU, so MCU order = row order
2304                if let Some(ref cb_raw) = cb_raw_dct {
2305                    run_dc_trellis_by_row(
2306                        cb_raw,
2307                        &mut cb_blocks,
2308                        chroma_qtable.values[0],
2309                        &dc_chroma_derived,
2310                        self.trellis.lambda_log_scale1,
2311                        self.trellis.lambda_log_scale2,
2312                        mcu_rows,
2313                        mcu_cols,
2314                        mcu_cols,
2315                        1,
2316                        1,
2317                    );
2318                }
2319                if let Some(ref cr_raw) = cr_raw_dct {
2320                    run_dc_trellis_by_row(
2321                        cr_raw,
2322                        &mut cr_blocks,
2323                        chroma_qtable.values[0],
2324                        &dc_chroma_derived,
2325                        self.trellis.lambda_log_scale1,
2326                        self.trellis.lambda_log_scale2,
2327                        mcu_rows,
2328                        mcu_cols,
2329                        mcu_cols,
2330                        1,
2331                        1,
2332                    );
2333                }
2334            }
2335
2336            // Run EOB optimization if enabled (cross-block EOBRUN optimization)
2337            if self.trellis.enabled && self.trellis.eob_opt {
2338                use crate::trellis::{estimate_block_eob_info, optimize_eob_runs};
2339
2340                // Y component
2341                let y_eob_info: Vec<_> = y_blocks
2342                    .iter()
2343                    .map(|block| estimate_block_eob_info(block, &ac_luma_derived, 1, 63))
2344                    .collect();
2345                optimize_eob_runs(&mut y_blocks, &y_eob_info, &ac_luma_derived, 1, 63);
2346
2347                // Cb component
2348                let cb_eob_info: Vec<_> = cb_blocks
2349                    .iter()
2350                    .map(|block| estimate_block_eob_info(block, &ac_chroma_derived, 1, 63))
2351                    .collect();
2352                optimize_eob_runs(&mut cb_blocks, &cb_eob_info, &ac_chroma_derived, 1, 63);
2353
2354                // Cr component
2355                let cr_eob_info: Vec<_> = cr_blocks
2356                    .iter()
2357                    .map(|block| estimate_block_eob_info(block, &ac_chroma_derived, 1, 63))
2358                    .collect();
2359                optimize_eob_runs(&mut cr_blocks, &cr_eob_info, &ac_chroma_derived, 1, 63);
2360            }
2361
2362            // Generate progressive scan script
2363            //
2364            // TEMPORARY: Always use 4-scan minimal script to avoid refinement scan bugs.
2365            // Our AC refinement encoding has bugs causing "failed to decode huffman code".
2366            // TODO: Fix AC refinement encoding and re-enable optimize_scans.
2367            let scans = if self.optimize_scans {
2368                // When optimize_scans is enabled, use the scan optimizer to find
2369                // the best frequency split and Al levels. However, SA refinement
2370                // (Ah > 0) is currently disabled due to encoding bugs.
2371                self.optimize_progressive_scans(
2372                    3, // num_components
2373                    &y_blocks,
2374                    &cb_blocks,
2375                    &cr_blocks,
2376                    mcu_rows,
2377                    mcu_cols,
2378                    luma_h,
2379                    luma_v,
2380                    width,
2381                    height,
2382                    chroma_width,
2383                    chroma_height,
2384                    &dc_luma_derived,
2385                    &dc_chroma_derived,
2386                    &ac_luma_derived,
2387                    &ac_chroma_derived,
2388                )?
2389            } else {
2390                // Use C mozjpeg's 9-scan JCP_MAX_COMPRESSION script.
2391                // This matches jcparam.c lines 932-947 (the JCP_MAX_COMPRESSION branch).
2392                // mozjpeg-sys defaults to JCP_MAX_COMPRESSION profile, which uses:
2393                // - DC with no successive approximation (Al=0)
2394                // - 8/9 frequency split for luma with successive approximation
2395                // - No successive approximation for chroma
2396                generate_mozjpeg_max_compression_scans(3)
2397            };
2398
2399            // Build Huffman tables and encode scans
2400            //
2401            // When optimize_scans=true, each AC scan gets its own optimal Huffman table
2402            // written immediately before the scan. This matches C mozjpeg behavior and
2403            // ensures the trial encoder's size estimates match actual encoded sizes.
2404            //
2405            // When optimize_huffman=true, use per-scan AC tables (matching C mozjpeg).
2406            // C automatically enables optimize_coding for progressive mode and does
2407            // 2 passes per scan: gather statistics, then output with optimal tables.
2408
2409            if self.optimize_huffman {
2410                // Per-scan AC tables mode: DC tables global, AC tables per-scan
2411                // This matches C mozjpeg's progressive behavior
2412
2413                // Count DC frequencies for first-pass DC scans only (Ah == 0)
2414                // DC refinement scans (Ah > 0) don't use Huffman coding - they output raw bits
2415                let mut dc_luma_freq = FrequencyCounter::new();
2416                let mut dc_chroma_freq = FrequencyCounter::new();
2417
2418                for scan in &scans {
2419                    let is_dc_first_scan = scan.ss == 0 && scan.se == 0 && scan.ah == 0;
2420                    if is_dc_first_scan {
2421                        self.count_dc_scan_symbols(
2422                            scan,
2423                            &y_blocks,
2424                            &cb_blocks,
2425                            &cr_blocks,
2426                            mcu_rows,
2427                            mcu_cols,
2428                            luma_h,
2429                            luma_v,
2430                            &mut dc_luma_freq,
2431                            &mut dc_chroma_freq,
2432                        );
2433                    }
2434                }
2435
2436                // Generate and write DC tables upfront
2437                let opt_dc_luma_huff = dc_luma_freq.generate_table()?;
2438                let opt_dc_chroma_huff = dc_chroma_freq.generate_table()?;
2439                marker_writer.write_dht_multiple(&[
2440                    (0, false, &opt_dc_luma_huff),
2441                    (1, false, &opt_dc_chroma_huff),
2442                ])?;
2443
2444                let opt_dc_luma = DerivedTable::from_huff_table(&opt_dc_luma_huff, true)?;
2445                let opt_dc_chroma = DerivedTable::from_huff_table(&opt_dc_chroma_huff, true)?;
2446
2447                // Get output writer from marker_writer
2448                let output = marker_writer.into_inner();
2449                let mut bit_writer = BitWriter::new(output);
2450
2451                // Encode each scan with per-scan AC tables
2452                for scan in &scans {
2453                    bit_writer.flush()?;
2454                    let mut inner = bit_writer.into_inner();
2455
2456                    let is_dc_scan = scan.ss == 0 && scan.se == 0;
2457
2458                    if !is_dc_scan {
2459                        // AC scan: build per-scan optimal Huffman table
2460                        let comp_idx = scan.component_index[0] as usize;
2461                        let blocks = match comp_idx {
2462                            0 => &y_blocks,
2463                            1 => &cb_blocks,
2464                            2 => &cr_blocks,
2465                            _ => &y_blocks,
2466                        };
2467                        let (block_cols, block_rows) = if comp_idx == 0 {
2468                            (width.div_ceil(DCTSIZE), height.div_ceil(DCTSIZE))
2469                        } else {
2470                            (
2471                                chroma_width.div_ceil(DCTSIZE),
2472                                chroma_height.div_ceil(DCTSIZE),
2473                            )
2474                        };
2475
2476                        // Count frequencies for this scan only
2477                        let mut ac_freq = FrequencyCounter::new();
2478                        self.count_ac_scan_symbols(
2479                            scan,
2480                            blocks,
2481                            mcu_rows,
2482                            mcu_cols,
2483                            luma_h,
2484                            luma_v,
2485                            comp_idx,
2486                            block_cols,
2487                            block_rows,
2488                            &mut ac_freq,
2489                        );
2490
2491                        // Build optimal table and write DHT
2492                        let ac_huff = ac_freq.generate_table()?;
2493                        let table_idx = if comp_idx == 0 { 0 } else { 1 };
2494                        write_dht_marker(&mut inner, table_idx, true, &ac_huff)?;
2495
2496                        // Write SOS and encode
2497                        write_sos_marker(&mut inner, scan, &components)?;
2498                        bit_writer = BitWriter::new(inner);
2499
2500                        let ac_derived = DerivedTable::from_huff_table(&ac_huff, false)?;
2501                        let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
2502
2503                        self.encode_progressive_scan(
2504                            scan,
2505                            &y_blocks,
2506                            &cb_blocks,
2507                            &cr_blocks,
2508                            mcu_rows,
2509                            mcu_cols,
2510                            luma_h,
2511                            luma_v,
2512                            width,
2513                            height,
2514                            chroma_width,
2515                            chroma_height,
2516                            &opt_dc_luma,
2517                            &opt_dc_chroma,
2518                            &ac_derived,
2519                            &ac_derived, // Not used for AC scans, but needed for signature
2520                            &mut prog_encoder,
2521                        )?;
2522                        prog_encoder.finish_scan(Some(&ac_derived))?;
2523                    } else {
2524                        // DC scan: use global DC tables
2525                        write_sos_marker(&mut inner, scan, &components)?;
2526                        bit_writer = BitWriter::new(inner);
2527
2528                        let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
2529                        self.encode_progressive_scan(
2530                            scan,
2531                            &y_blocks,
2532                            &cb_blocks,
2533                            &cr_blocks,
2534                            mcu_rows,
2535                            mcu_cols,
2536                            luma_h,
2537                            luma_v,
2538                            width,
2539                            height,
2540                            chroma_width,
2541                            chroma_height,
2542                            &opt_dc_luma,
2543                            &opt_dc_chroma,
2544                            &ac_luma_derived, // Not used for DC scans
2545                            &ac_chroma_derived,
2546                            &mut prog_encoder,
2547                        )?;
2548                        prog_encoder.finish_scan(None)?;
2549                    }
2550                }
2551
2552                // Flush and write EOI
2553                bit_writer.flush()?;
2554                let mut output = bit_writer.into_inner();
2555                output.write_all(&[0xFF, 0xD9])?;
2556            } else {
2557                // Standard tables mode (no optimization)
2558                let output = marker_writer.into_inner();
2559                let mut bit_writer = BitWriter::new(output);
2560
2561                for scan in &scans {
2562                    bit_writer.flush()?;
2563                    let mut inner = bit_writer.into_inner();
2564                    write_sos_marker(&mut inner, scan, &components)?;
2565
2566                    bit_writer = BitWriter::new(inner);
2567                    let mut prog_encoder = ProgressiveEncoder::new_standard_tables(&mut bit_writer);
2568
2569                    self.encode_progressive_scan(
2570                        scan,
2571                        &y_blocks,
2572                        &cb_blocks,
2573                        &cr_blocks,
2574                        mcu_rows,
2575                        mcu_cols,
2576                        luma_h,
2577                        luma_v,
2578                        width,
2579                        height,
2580                        chroma_width,
2581                        chroma_height,
2582                        &dc_luma_derived,
2583                        &dc_chroma_derived,
2584                        &ac_luma_derived,
2585                        &ac_chroma_derived,
2586                        &mut prog_encoder,
2587                    )?;
2588
2589                    let ac_table = if scan.ss > 0 {
2590                        if scan.component_index[0] == 0 {
2591                            Some(&ac_luma_derived)
2592                        } else {
2593                            Some(&ac_chroma_derived)
2594                        }
2595                    } else {
2596                        None
2597                    };
2598                    prog_encoder.finish_scan(ac_table)?;
2599                }
2600
2601                bit_writer.flush()?;
2602                let mut output = bit_writer.into_inner();
2603                output.write_all(&[0xFF, 0xD9])?;
2604            }
2605        } else if self.optimize_huffman {
2606            // Baseline mode with Huffman optimization (2-pass)
2607            // Pass 1: Collect blocks and count frequencies
2608            let mcu_rows = mcu_height / (DCTSIZE * luma_v as usize);
2609            let mcu_cols = mcu_width / (DCTSIZE * luma_h as usize);
2610            let num_y_blocks = mcu_rows
2611                .checked_mul(mcu_cols)
2612                .and_then(|n| n.checked_mul(luma_h as usize))
2613                .and_then(|n| n.checked_mul(luma_v as usize))
2614                .ok_or(Error::AllocationFailed)?;
2615            let num_chroma_blocks = mcu_rows
2616                .checked_mul(mcu_cols)
2617                .ok_or(Error::AllocationFailed)?;
2618
2619            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_y_blocks)?;
2620            let mut cb_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
2621            let mut cr_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
2622
2623            // Optionally collect raw DCT for DC trellis
2624            let dc_trellis_enabled = self.trellis.enabled && self.trellis.dc_enabled;
2625            let mut y_raw_dct = if dc_trellis_enabled {
2626                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_y_blocks)?)
2627            } else {
2628                None
2629            };
2630            let mut cb_raw_dct = if dc_trellis_enabled {
2631                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
2632            } else {
2633                None
2634            };
2635            let mut cr_raw_dct = if dc_trellis_enabled {
2636                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
2637            } else {
2638                None
2639            };
2640
2641            self.collect_blocks(
2642                y_mcu,
2643                mcu_width,
2644                mcu_height,
2645                cb_mcu,
2646                cr_mcu,
2647                mcu_chroma_w,
2648                mcu_chroma_h,
2649                &luma_qtable.values,
2650                &chroma_qtable.values,
2651                &ac_luma_derived,
2652                &ac_chroma_derived,
2653                &mut y_blocks,
2654                &mut cb_blocks,
2655                &mut cr_blocks,
2656                y_raw_dct.as_deref_mut(),
2657                cb_raw_dct.as_deref_mut(),
2658                cr_raw_dct.as_deref_mut(),
2659                luma_h,
2660                luma_v,
2661            )?;
2662
2663            // Run DC trellis optimization if enabled
2664            // C mozjpeg processes DC trellis row by row (each row is an independent chain)
2665            if dc_trellis_enabled {
2666                let h = luma_h as usize;
2667                let v = luma_v as usize;
2668                let y_block_cols = mcu_cols * h;
2669                let y_block_rows = mcu_rows * v;
2670
2671                if let Some(ref y_raw) = y_raw_dct {
2672                    run_dc_trellis_by_row(
2673                        y_raw,
2674                        &mut y_blocks,
2675                        luma_qtable.values[0],
2676                        &dc_luma_derived,
2677                        self.trellis.lambda_log_scale1,
2678                        self.trellis.lambda_log_scale2,
2679                        y_block_rows,
2680                        y_block_cols,
2681                        mcu_cols,
2682                        h,
2683                        v,
2684                    );
2685                }
2686                // Chroma has 1x1 per MCU, so MCU order = row order
2687                if let Some(ref cb_raw) = cb_raw_dct {
2688                    run_dc_trellis_by_row(
2689                        cb_raw,
2690                        &mut cb_blocks,
2691                        chroma_qtable.values[0],
2692                        &dc_chroma_derived,
2693                        self.trellis.lambda_log_scale1,
2694                        self.trellis.lambda_log_scale2,
2695                        mcu_rows,
2696                        mcu_cols,
2697                        mcu_cols,
2698                        1,
2699                        1,
2700                    );
2701                }
2702                if let Some(ref cr_raw) = cr_raw_dct {
2703                    run_dc_trellis_by_row(
2704                        cr_raw,
2705                        &mut cr_blocks,
2706                        chroma_qtable.values[0],
2707                        &dc_chroma_derived,
2708                        self.trellis.lambda_log_scale1,
2709                        self.trellis.lambda_log_scale2,
2710                        mcu_rows,
2711                        mcu_cols,
2712                        mcu_cols,
2713                        1,
2714                        1,
2715                    );
2716                }
2717            }
2718
2719            // Count symbol frequencies
2720            let mut dc_luma_freq = FrequencyCounter::new();
2721            let mut dc_chroma_freq = FrequencyCounter::new();
2722            let mut ac_luma_freq = FrequencyCounter::new();
2723            let mut ac_chroma_freq = FrequencyCounter::new();
2724
2725            let mut counter = SymbolCounter::new();
2726            let blocks_per_mcu_y = (luma_h * luma_v) as usize;
2727            let mut y_idx = 0;
2728            let mut c_idx = 0;
2729
2730            for _mcu_row in 0..mcu_rows {
2731                for _mcu_col in 0..mcu_cols {
2732                    // Y blocks
2733                    for _ in 0..blocks_per_mcu_y {
2734                        counter.count_block(
2735                            &y_blocks[y_idx],
2736                            0,
2737                            &mut dc_luma_freq,
2738                            &mut ac_luma_freq,
2739                        );
2740                        y_idx += 1;
2741                    }
2742                    // Cb block
2743                    counter.count_block(
2744                        &cb_blocks[c_idx],
2745                        1,
2746                        &mut dc_chroma_freq,
2747                        &mut ac_chroma_freq,
2748                    );
2749                    // Cr block
2750                    counter.count_block(
2751                        &cr_blocks[c_idx],
2752                        2,
2753                        &mut dc_chroma_freq,
2754                        &mut ac_chroma_freq,
2755                    );
2756                    c_idx += 1;
2757                }
2758            }
2759
2760            // Generate optimized Huffman tables
2761            let opt_dc_luma_huff = dc_luma_freq.generate_table()?;
2762            let opt_dc_chroma_huff = dc_chroma_freq.generate_table()?;
2763            let opt_ac_luma_huff = ac_luma_freq.generate_table()?;
2764            let opt_ac_chroma_huff = ac_chroma_freq.generate_table()?;
2765
2766            let opt_dc_luma = DerivedTable::from_huff_table(&opt_dc_luma_huff, true)?;
2767            let opt_dc_chroma = DerivedTable::from_huff_table(&opt_dc_chroma_huff, true)?;
2768            let opt_ac_luma = DerivedTable::from_huff_table(&opt_ac_luma_huff, false)?;
2769            let opt_ac_chroma = DerivedTable::from_huff_table(&opt_ac_chroma_huff, false)?;
2770
2771            // Write DHT with optimized tables - combined into single marker
2772            marker_writer.write_dht_multiple(&[
2773                (0, false, &opt_dc_luma_huff),
2774                (1, false, &opt_dc_chroma_huff),
2775                (0, true, &opt_ac_luma_huff),
2776                (1, true, &opt_ac_chroma_huff),
2777            ])?;
2778
2779            // Write SOS and encode
2780            let scans = generate_baseline_scan(3);
2781            let scan = &scans[0];
2782            marker_writer.write_sos(scan, &components)?;
2783
2784            let mut output = marker_writer.into_inner();
2785
2786            // Use SIMD entropy encoder on x86_64 for ~2x faster encoding
2787            #[cfg(target_arch = "x86_64")]
2788            {
2789                let mut simd_entropy = SimdEntropyEncoder::new();
2790
2791                // Encode from stored blocks with restart marker support
2792                y_idx = 0;
2793                c_idx = 0;
2794                let restart_interval = self.restart_interval as usize;
2795                let mut mcu_count = 0usize;
2796                let mut restart_num = 0u8;
2797
2798                for _mcu_row in 0..mcu_rows {
2799                    for _mcu_col in 0..mcu_cols {
2800                        // Emit restart marker if needed (before this MCU, not first)
2801                        if restart_interval > 0
2802                            && mcu_count > 0
2803                            && mcu_count.is_multiple_of(restart_interval)
2804                        {
2805                            simd_entropy.emit_restart(restart_num);
2806                            restart_num = restart_num.wrapping_add(1) & 0x07;
2807                        }
2808
2809                        // Y blocks
2810                        for _ in 0..blocks_per_mcu_y {
2811                            simd_entropy.encode_block(
2812                                &y_blocks[y_idx],
2813                                0,
2814                                &opt_dc_luma,
2815                                &opt_ac_luma,
2816                            );
2817                            y_idx += 1;
2818                        }
2819                        // Cb block
2820                        simd_entropy.encode_block(
2821                            &cb_blocks[c_idx],
2822                            1,
2823                            &opt_dc_chroma,
2824                            &opt_ac_chroma,
2825                        );
2826                        // Cr block
2827                        simd_entropy.encode_block(
2828                            &cr_blocks[c_idx],
2829                            2,
2830                            &opt_dc_chroma,
2831                            &opt_ac_chroma,
2832                        );
2833                        c_idx += 1;
2834                        mcu_count += 1;
2835                    }
2836                }
2837
2838                simd_entropy.flush();
2839                output.write_all(simd_entropy.get_buffer())?;
2840            }
2841
2842            // Fallback for non-x86_64 platforms
2843            #[cfg(not(target_arch = "x86_64"))]
2844            {
2845                let mut bit_writer = BitWriter::new(output);
2846                let mut entropy = EntropyEncoder::new(&mut bit_writer);
2847
2848                // Encode from stored blocks with restart marker support
2849                y_idx = 0;
2850                c_idx = 0;
2851                let restart_interval = self.restart_interval as usize;
2852                let mut mcu_count = 0usize;
2853                let mut restart_num = 0u8;
2854
2855                for _mcu_row in 0..mcu_rows {
2856                    for _mcu_col in 0..mcu_cols {
2857                        // Emit restart marker if needed (before this MCU, not first)
2858                        if restart_interval > 0
2859                            && mcu_count > 0
2860                            && mcu_count.is_multiple_of(restart_interval)
2861                        {
2862                            entropy.emit_restart(restart_num)?;
2863                            restart_num = restart_num.wrapping_add(1) & 0x07;
2864                        }
2865
2866                        // Y blocks
2867                        for _ in 0..blocks_per_mcu_y {
2868                            entropy.encode_block(
2869                                &y_blocks[y_idx],
2870                                0,
2871                                &opt_dc_luma,
2872                                &opt_ac_luma,
2873                            )?;
2874                            y_idx += 1;
2875                        }
2876                        // Cb block
2877                        entropy.encode_block(
2878                            &cb_blocks[c_idx],
2879                            1,
2880                            &opt_dc_chroma,
2881                            &opt_ac_chroma,
2882                        )?;
2883                        // Cr block
2884                        entropy.encode_block(
2885                            &cr_blocks[c_idx],
2886                            2,
2887                            &opt_dc_chroma,
2888                            &opt_ac_chroma,
2889                        )?;
2890                        c_idx += 1;
2891                        mcu_count += 1;
2892                    }
2893                }
2894
2895                bit_writer.flush()?;
2896                output = bit_writer.into_inner();
2897            }
2898
2899            output.write_all(&[0xFF, 0xD9])?;
2900        } else {
2901            // Baseline mode: Encode directly (streaming)
2902            let scans = generate_baseline_scan(3);
2903            let scan = &scans[0]; // Baseline has only one scan
2904            marker_writer.write_sos(scan, &components)?;
2905
2906            // Encode MCU data
2907            let output = marker_writer.into_inner();
2908            let mut bit_writer = BitWriter::new(output);
2909            let mut entropy = EntropyEncoder::new(&mut bit_writer);
2910
2911            self.encode_mcus(
2912                y_mcu,
2913                mcu_width,
2914                mcu_height,
2915                cb_mcu,
2916                cr_mcu,
2917                mcu_chroma_w,
2918                mcu_chroma_h,
2919                &luma_qtable.values,
2920                &chroma_qtable.values,
2921                &dc_luma_derived,
2922                &dc_chroma_derived,
2923                &ac_luma_derived,
2924                &ac_chroma_derived,
2925                &mut entropy,
2926                luma_h,
2927                luma_v,
2928            )?;
2929
2930            // Flush bits and get output back
2931            bit_writer.flush()?;
2932            let mut output = bit_writer.into_inner();
2933
2934            // EOI
2935            output.write_all(&[0xFF, 0xD9])?;
2936        }
2937
2938        Ok(())
2939    }
2940
2941    /// Encode all MCUs (Minimum Coded Units).
2942    #[allow(clippy::too_many_arguments)]
2943    fn encode_mcus<W: Write>(
2944        &self,
2945        y_plane: &[u8],
2946        y_width: usize,
2947        y_height: usize,
2948        cb_plane: &[u8],
2949        cr_plane: &[u8],
2950        chroma_width: usize,
2951        _chroma_height: usize,
2952        luma_qtable: &[u16; DCTSIZE2],
2953        chroma_qtable: &[u16; DCTSIZE2],
2954        dc_luma: &DerivedTable,
2955        dc_chroma: &DerivedTable,
2956        ac_luma: &DerivedTable,
2957        ac_chroma: &DerivedTable,
2958        entropy: &mut EntropyEncoder<W>,
2959        h_samp: u8,
2960        v_samp: u8,
2961    ) -> Result<()> {
2962        let mcu_rows = y_height / (DCTSIZE * v_samp as usize);
2963        let mcu_cols = y_width / (DCTSIZE * h_samp as usize);
2964        let total_mcus = mcu_rows * mcu_cols;
2965
2966        let mut dct_block = [0i16; DCTSIZE2];
2967        let mut quant_block = [0i16; DCTSIZE2];
2968
2969        // Restart marker tracking
2970        let restart_interval = self.restart_interval as usize;
2971        let mut mcu_count = 0usize;
2972        let mut restart_num = 0u8;
2973
2974        for mcu_row in 0..mcu_rows {
2975            for mcu_col in 0..mcu_cols {
2976                // Check if we need to emit a restart marker BEFORE this MCU
2977                // (except for the first MCU)
2978                if restart_interval > 0
2979                    && mcu_count > 0
2980                    && mcu_count.is_multiple_of(restart_interval)
2981                {
2982                    entropy.emit_restart(restart_num)?;
2983                    restart_num = restart_num.wrapping_add(1) & 0x07;
2984                }
2985
2986                // Encode Y blocks (may be multiple per MCU for subsampling)
2987                for v in 0..v_samp as usize {
2988                    for h in 0..h_samp as usize {
2989                        let block_row = mcu_row * v_samp as usize + v;
2990                        let block_col = mcu_col * h_samp as usize + h;
2991
2992                        self.encode_block(
2993                            y_plane,
2994                            y_width,
2995                            block_row,
2996                            block_col,
2997                            luma_qtable,
2998                            dc_luma,
2999                            ac_luma,
3000                            0, // Y component
3001                            entropy,
3002                            &mut dct_block,
3003                            &mut quant_block,
3004                        )?;
3005                    }
3006                }
3007
3008                // Encode Cb block
3009                self.encode_block(
3010                    cb_plane,
3011                    chroma_width,
3012                    mcu_row,
3013                    mcu_col,
3014                    chroma_qtable,
3015                    dc_chroma,
3016                    ac_chroma,
3017                    1, // Cb component
3018                    entropy,
3019                    &mut dct_block,
3020                    &mut quant_block,
3021                )?;
3022
3023                // Encode Cr block
3024                self.encode_block(
3025                    cr_plane,
3026                    chroma_width,
3027                    mcu_row,
3028                    mcu_col,
3029                    chroma_qtable,
3030                    dc_chroma,
3031                    ac_chroma,
3032                    2, // Cr component
3033                    entropy,
3034                    &mut dct_block,
3035                    &mut quant_block,
3036                )?;
3037
3038                mcu_count += 1;
3039            }
3040        }
3041
3042        // Suppress unused variable warning
3043        let _ = total_mcus;
3044
3045        Ok(())
3046    }
3047
3048    /// Encode a single 8x8 block.
3049    #[allow(clippy::too_many_arguments)]
3050    fn encode_block<W: Write>(
3051        &self,
3052        plane: &[u8],
3053        plane_width: usize,
3054        block_row: usize,
3055        block_col: usize,
3056        qtable: &[u16; DCTSIZE2],
3057        dc_table: &DerivedTable,
3058        ac_table: &DerivedTable,
3059        component: usize,
3060        entropy: &mut EntropyEncoder<W>,
3061        dct_block: &mut [i16; DCTSIZE2],
3062        quant_block: &mut [i16; DCTSIZE2],
3063    ) -> Result<()> {
3064        // Extract 8x8 block from plane
3065        let mut samples = [0u8; DCTSIZE2];
3066        let base_y = block_row * DCTSIZE;
3067        let base_x = block_col * DCTSIZE;
3068
3069        for row in 0..DCTSIZE {
3070            let src_offset = (base_y + row) * plane_width + base_x;
3071            let dst_offset = row * DCTSIZE;
3072            samples[dst_offset..dst_offset + DCTSIZE]
3073                .copy_from_slice(&plane[src_offset..src_offset + DCTSIZE]);
3074        }
3075
3076        // Level shift (center around 0 for DCT)
3077        let mut shifted = [0i16; DCTSIZE2];
3078        for i in 0..DCTSIZE2 {
3079            shifted[i] = (samples[i] as i16) - 128;
3080        }
3081
3082        // Apply overshoot deringing if enabled (reduces ringing on white backgrounds)
3083        if self.overshoot_deringing {
3084            preprocess_deringing(&mut shifted, qtable[0]);
3085        }
3086
3087        // Forward DCT (output scaled by factor of 8)
3088        self.simd.do_forward_dct(&shifted, dct_block);
3089
3090        // Convert to i32 for quantization
3091        let mut dct_i32 = [0i32; DCTSIZE2];
3092        for i in 0..DCTSIZE2 {
3093            dct_i32[i] = dct_block[i] as i32;
3094        }
3095
3096        // Use trellis quantization if enabled
3097        // Both paths expect raw DCT (scaled by 8) and handle the scaling internally
3098        if self.trellis.enabled {
3099            trellis_quantize_block(&dct_i32, quant_block, qtable, ac_table, &self.trellis);
3100        } else {
3101            // Non-trellis path: use single-step quantization matching C mozjpeg
3102            // This takes raw DCT (scaled by 8) and uses q_scaled = 8 * qtable[i]
3103            quantize_block_raw(&dct_i32, qtable, quant_block);
3104        }
3105
3106        // Entropy encode
3107        entropy.encode_block(quant_block, component, dc_table, ac_table)?;
3108
3109        Ok(())
3110    }
3111
3112    /// Collect all quantized DCT blocks for progressive encoding.
3113    /// Also collects raw DCT blocks if DC trellis is enabled.
3114    #[allow(clippy::too_many_arguments)]
3115    fn collect_blocks(
3116        &self,
3117        y_plane: &[u8],
3118        y_width: usize,
3119        y_height: usize,
3120        cb_plane: &[u8],
3121        cr_plane: &[u8],
3122        chroma_width: usize,
3123        _chroma_height: usize,
3124        luma_qtable: &[u16; DCTSIZE2],
3125        chroma_qtable: &[u16; DCTSIZE2],
3126        ac_luma: &DerivedTable,
3127        ac_chroma: &DerivedTable,
3128        y_blocks: &mut [[i16; DCTSIZE2]],
3129        cb_blocks: &mut [[i16; DCTSIZE2]],
3130        cr_blocks: &mut [[i16; DCTSIZE2]],
3131        mut y_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
3132        mut cb_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
3133        mut cr_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
3134        h_samp: u8,
3135        v_samp: u8,
3136    ) -> Result<()> {
3137        let mcu_rows = y_height / (DCTSIZE * v_samp as usize);
3138        let mcu_cols = y_width / (DCTSIZE * h_samp as usize);
3139
3140        let mut y_idx = 0;
3141        let mut c_idx = 0;
3142        let mut dct_block = [0i16; DCTSIZE2];
3143
3144        for mcu_row in 0..mcu_rows {
3145            for mcu_col in 0..mcu_cols {
3146                // Collect Y blocks (may be multiple per MCU for subsampling)
3147                for v in 0..v_samp as usize {
3148                    for h in 0..h_samp as usize {
3149                        let block_row = mcu_row * v_samp as usize + v;
3150                        let block_col = mcu_col * h_samp as usize + h;
3151
3152                        // Get mutable reference to raw DCT output if collecting
3153                        let raw_dct_out = y_raw_dct.as_mut().map(|arr| &mut arr[y_idx][..]);
3154                        self.process_block_to_storage_with_raw(
3155                            y_plane,
3156                            y_width,
3157                            block_row,
3158                            block_col,
3159                            luma_qtable,
3160                            ac_luma,
3161                            &mut y_blocks[y_idx],
3162                            &mut dct_block,
3163                            raw_dct_out,
3164                        )?;
3165                        y_idx += 1;
3166                    }
3167                }
3168
3169                // Collect Cb block
3170                let raw_dct_out = cb_raw_dct.as_mut().map(|arr| &mut arr[c_idx][..]);
3171                self.process_block_to_storage_with_raw(
3172                    cb_plane,
3173                    chroma_width,
3174                    mcu_row,
3175                    mcu_col,
3176                    chroma_qtable,
3177                    ac_chroma,
3178                    &mut cb_blocks[c_idx],
3179                    &mut dct_block,
3180                    raw_dct_out,
3181                )?;
3182
3183                // Collect Cr block
3184                let raw_dct_out = cr_raw_dct.as_mut().map(|arr| &mut arr[c_idx][..]);
3185                self.process_block_to_storage_with_raw(
3186                    cr_plane,
3187                    chroma_width,
3188                    mcu_row,
3189                    mcu_col,
3190                    chroma_qtable,
3191                    ac_chroma,
3192                    &mut cr_blocks[c_idx],
3193                    &mut dct_block,
3194                    raw_dct_out,
3195                )?;
3196
3197                c_idx += 1;
3198            }
3199        }
3200
3201        Ok(())
3202    }
3203
3204    /// Process a block: DCT + quantize, storing the result.
3205    /// Optionally stores raw DCT coefficients for DC trellis.
3206    #[allow(clippy::too_many_arguments)]
3207    fn process_block_to_storage_with_raw(
3208        &self,
3209        plane: &[u8],
3210        plane_width: usize,
3211        block_row: usize,
3212        block_col: usize,
3213        qtable: &[u16; DCTSIZE2],
3214        ac_table: &DerivedTable,
3215        out_block: &mut [i16; DCTSIZE2],
3216        dct_block: &mut [i16; DCTSIZE2],
3217        raw_dct_out: Option<&mut [i32]>,
3218    ) -> Result<()> {
3219        // Extract 8x8 block from plane
3220        let mut samples = [0u8; DCTSIZE2];
3221        let base_y = block_row * DCTSIZE;
3222        let base_x = block_col * DCTSIZE;
3223
3224        for row in 0..DCTSIZE {
3225            let src_offset = (base_y + row) * plane_width + base_x;
3226            let dst_offset = row * DCTSIZE;
3227            samples[dst_offset..dst_offset + DCTSIZE]
3228                .copy_from_slice(&plane[src_offset..src_offset + DCTSIZE]);
3229        }
3230
3231        // Level shift (center around 0 for DCT)
3232        let mut shifted = [0i16; DCTSIZE2];
3233        for i in 0..DCTSIZE2 {
3234            shifted[i] = (samples[i] as i16) - 128;
3235        }
3236
3237        // Apply overshoot deringing if enabled (reduces ringing on white backgrounds)
3238        if self.overshoot_deringing {
3239            preprocess_deringing(&mut shifted, qtable[0]);
3240        }
3241
3242        // Forward DCT (output scaled by factor of 8)
3243        self.simd.do_forward_dct(&shifted, dct_block);
3244
3245        // Convert to i32 for quantization
3246        let mut dct_i32 = [0i32; DCTSIZE2];
3247        for i in 0..DCTSIZE2 {
3248            dct_i32[i] = dct_block[i] as i32;
3249        }
3250
3251        // Store raw DCT if requested (for DC trellis)
3252        if let Some(raw_out) = raw_dct_out {
3253            raw_out.copy_from_slice(&dct_i32);
3254        }
3255
3256        // Use trellis quantization if enabled
3257        // Both paths expect raw DCT (scaled by 8) and handle the scaling internally
3258        if self.trellis.enabled {
3259            trellis_quantize_block(&dct_i32, out_block, qtable, ac_table, &self.trellis);
3260        } else {
3261            // Non-trellis path: use single-step quantization matching C mozjpeg
3262            // This takes raw DCT (scaled by 8) and uses q_scaled = 8 * qtable[i]
3263            quantize_block_raw(&dct_i32, qtable, out_block);
3264        }
3265
3266        Ok(())
3267    }
3268
3269    /// Optimize progressive scan configuration (C mozjpeg-compatible).
3270    ///
3271    /// This implements the optimize_scans feature from C mozjpeg:
3272    /// 1. Generate 64 individual candidate scans
3273    /// 2. Trial-encode scans SEQUENTIALLY to get accurate sizes
3274    /// 3. Use ScanSelector to find optimal Al levels and frequency splits
3275    /// 4. Build the final scan script from the selection
3276    ///
3277    /// IMPORTANT: Scans must be encoded sequentially (not independently) because
3278    /// refinement scans (Ah > 0) need context from previous scans to produce
3279    /// correct output sizes.
3280    #[allow(clippy::too_many_arguments)]
3281    fn optimize_progressive_scans(
3282        &self,
3283        num_components: u8,
3284        y_blocks: &[[i16; DCTSIZE2]],
3285        cb_blocks: &[[i16; DCTSIZE2]],
3286        cr_blocks: &[[i16; DCTSIZE2]],
3287        mcu_rows: usize,
3288        mcu_cols: usize,
3289        h_samp: u8,
3290        v_samp: u8,
3291        actual_width: usize,
3292        actual_height: usize,
3293        chroma_width: usize,
3294        chroma_height: usize,
3295        dc_luma: &DerivedTable,
3296        dc_chroma: &DerivedTable,
3297        ac_luma: &DerivedTable,
3298        ac_chroma: &DerivedTable,
3299    ) -> Result<Vec<crate::types::ScanInfo>> {
3300        let config = ScanSearchConfig::default();
3301        let candidate_scans = generate_search_scans(num_components, &config);
3302
3303        // Use ScanTrialEncoder for sequential trial encoding with proper state tracking
3304        let mut trial_encoder = ScanTrialEncoder::new(
3305            y_blocks,
3306            cb_blocks,
3307            cr_blocks,
3308            dc_luma,
3309            dc_chroma,
3310            ac_luma,
3311            ac_chroma,
3312            mcu_rows,
3313            mcu_cols,
3314            h_samp,
3315            v_samp,
3316            actual_width,
3317            actual_height,
3318            chroma_width,
3319            chroma_height,
3320        );
3321
3322        // Trial-encode all scans sequentially to get accurate sizes
3323        let scan_sizes = trial_encoder.encode_all_scans(&candidate_scans)?;
3324
3325        // Use ScanSelector to find the optimal configuration
3326        let selector = ScanSelector::new(num_components, config.clone());
3327        let result = selector.select_best(&scan_sizes);
3328
3329        // Build the final scan script from the selection
3330        Ok(result.build_final_scans(num_components, &config))
3331    }
3332
3333    /// Encode a single progressive scan.
3334    #[allow(clippy::too_many_arguments)]
3335    fn encode_progressive_scan<W: Write>(
3336        &self,
3337        scan: &crate::types::ScanInfo,
3338        y_blocks: &[[i16; DCTSIZE2]],
3339        cb_blocks: &[[i16; DCTSIZE2]],
3340        cr_blocks: &[[i16; DCTSIZE2]],
3341        mcu_rows: usize,
3342        mcu_cols: usize,
3343        h_samp: u8,
3344        v_samp: u8,
3345        actual_width: usize,
3346        actual_height: usize,
3347        chroma_width: usize,
3348        chroma_height: usize,
3349        dc_luma: &DerivedTable,
3350        dc_chroma: &DerivedTable,
3351        ac_luma: &DerivedTable,
3352        ac_chroma: &DerivedTable,
3353        encoder: &mut ProgressiveEncoder<W>,
3354    ) -> Result<()> {
3355        let is_dc_scan = scan.ss == 0 && scan.se == 0;
3356        let is_refinement = scan.ah != 0;
3357
3358        if is_dc_scan {
3359            // DC scan - can be interleaved (multiple components)
3360            self.encode_dc_scan(
3361                scan,
3362                y_blocks,
3363                cb_blocks,
3364                cr_blocks,
3365                mcu_rows,
3366                mcu_cols,
3367                h_samp,
3368                v_samp,
3369                dc_luma,
3370                dc_chroma,
3371                is_refinement,
3372                encoder,
3373            )?;
3374        } else {
3375            // AC scan - single component only (non-interleaved)
3376            // For non-interleaved scans, use actual component block dimensions
3377            let comp_idx = scan.component_index[0] as usize;
3378            let blocks = match comp_idx {
3379                0 => y_blocks,
3380                1 => cb_blocks,
3381                2 => cr_blocks,
3382                _ => return Err(Error::InvalidComponentIndex(comp_idx)),
3383            };
3384            let ac_table = if comp_idx == 0 { ac_luma } else { ac_chroma };
3385
3386            // Calculate actual block dimensions for this component.
3387            // Non-interleaved AC scans encode only the actual image blocks, not MCU padding.
3388            // This differs from interleaved DC scans which encode all MCU blocks.
3389            // Reference: ITU-T T.81 Section F.2.3
3390            let (block_cols, block_rows) = if comp_idx == 0 {
3391                // Y component: full resolution
3392                (
3393                    actual_width.div_ceil(DCTSIZE),
3394                    actual_height.div_ceil(DCTSIZE),
3395                )
3396            } else {
3397                // Chroma components: subsampled resolution
3398                (
3399                    chroma_width.div_ceil(DCTSIZE),
3400                    chroma_height.div_ceil(DCTSIZE),
3401                )
3402            };
3403
3404            self.encode_ac_scan(
3405                scan,
3406                blocks,
3407                mcu_rows,
3408                mcu_cols,
3409                h_samp,
3410                v_samp,
3411                comp_idx,
3412                block_cols,
3413                block_rows,
3414                ac_table,
3415                is_refinement,
3416                encoder,
3417            )?;
3418        }
3419
3420        Ok(())
3421    }
3422
3423    /// Encode a DC scan (Ss=Se=0).
3424    #[allow(clippy::too_many_arguments)]
3425    fn encode_dc_scan<W: Write>(
3426        &self,
3427        scan: &crate::types::ScanInfo,
3428        y_blocks: &[[i16; DCTSIZE2]],
3429        cb_blocks: &[[i16; DCTSIZE2]],
3430        cr_blocks: &[[i16; DCTSIZE2]],
3431        mcu_rows: usize,
3432        mcu_cols: usize,
3433        h_samp: u8,
3434        v_samp: u8,
3435        dc_luma: &DerivedTable,
3436        dc_chroma: &DerivedTable,
3437        is_refinement: bool,
3438        encoder: &mut ProgressiveEncoder<W>,
3439    ) -> Result<()> {
3440        let blocks_per_mcu_y = (h_samp * v_samp) as usize;
3441        let mut y_idx = 0;
3442        let mut c_idx = 0;
3443
3444        for _mcu_row in 0..mcu_rows {
3445            for _mcu_col in 0..mcu_cols {
3446                // Encode Y blocks
3447                for _ in 0..blocks_per_mcu_y {
3448                    if is_refinement {
3449                        encoder.encode_dc_refine(&y_blocks[y_idx], scan.al)?;
3450                    } else {
3451                        encoder.encode_dc_first(&y_blocks[y_idx], 0, dc_luma, scan.al)?;
3452                    }
3453                    y_idx += 1;
3454                }
3455
3456                // Encode Cb
3457                if is_refinement {
3458                    encoder.encode_dc_refine(&cb_blocks[c_idx], scan.al)?;
3459                } else {
3460                    encoder.encode_dc_first(&cb_blocks[c_idx], 1, dc_chroma, scan.al)?;
3461                }
3462
3463                // Encode Cr
3464                if is_refinement {
3465                    encoder.encode_dc_refine(&cr_blocks[c_idx], scan.al)?;
3466                } else {
3467                    encoder.encode_dc_first(&cr_blocks[c_idx], 2, dc_chroma, scan.al)?;
3468                }
3469
3470                c_idx += 1;
3471            }
3472        }
3473
3474        Ok(())
3475    }
3476
3477    /// Encode an AC scan (Ss > 0).
3478    ///
3479    /// **IMPORTANT**: Progressive AC scans are always non-interleaved, meaning blocks
3480    /// must be encoded in component raster order (row-major within the component's
3481    /// block grid), NOT in MCU-interleaved order.
3482    ///
3483    /// For non-interleaved scans, the number of blocks is determined by the actual
3484    /// component dimensions (ceil(width/8) × ceil(height/8)), NOT the MCU-padded
3485    /// dimensions. This is different from interleaved DC scans which use MCU order.
3486    /// The padding blocks (beyond actual image dimensions) have DC coefficients but
3487    /// no AC coefficients - the decoder only outputs the actual image dimensions.
3488    ///
3489    /// Reference: ITU-T T.81 Section F.2.3 - "The scan data for a non-interleaved
3490    /// scan shall consist of a sequence of entropy-coded segments... The data units
3491    /// are processed in the order defined by the scan component."
3492    #[allow(clippy::too_many_arguments)]
3493    fn encode_ac_scan<W: Write>(
3494        &self,
3495        scan: &crate::types::ScanInfo,
3496        blocks: &[[i16; DCTSIZE2]],
3497        _mcu_rows: usize,
3498        mcu_cols: usize,
3499        h_samp: u8,
3500        v_samp: u8,
3501        comp_idx: usize,
3502        block_cols: usize,
3503        block_rows: usize,
3504        ac_table: &DerivedTable,
3505        is_refinement: bool,
3506        encoder: &mut ProgressiveEncoder<W>,
3507    ) -> Result<()> {
3508        // For Y component with subsampling, blocks are stored in MCU-interleaved order
3509        // but AC scans must encode them in component raster order.
3510        // For chroma components (1 block per MCU), the orders are identical.
3511        //
3512        // For non-interleaved scans, encode only the actual image blocks (block_rows × block_cols),
3513        // not all MCU-padded blocks. Padding blocks have DC coefficients but no AC coefficients.
3514
3515        let blocks_per_mcu = if comp_idx == 0 {
3516            (h_samp * v_samp) as usize
3517        } else {
3518            1
3519        };
3520
3521        if blocks_per_mcu == 1 {
3522            // Chroma or 4:4:4 Y: storage order = raster order
3523            let total_blocks = block_rows * block_cols;
3524            for block in blocks.iter().take(total_blocks) {
3525                if is_refinement {
3526                    encoder
3527                        .encode_ac_refine(block, scan.ss, scan.se, scan.ah, scan.al, ac_table)?;
3528                } else {
3529                    encoder.encode_ac_first(block, scan.ss, scan.se, scan.al, ac_table)?;
3530                }
3531            }
3532        } else {
3533            // Y component with subsampling (h_samp > 1 or v_samp > 1)
3534            // Convert from MCU-interleaved storage to component raster order
3535            let h = h_samp as usize;
3536            let v = v_samp as usize;
3537
3538            for block_row in 0..block_rows {
3539                for block_col in 0..block_cols {
3540                    // Convert raster position to MCU-interleaved storage index
3541                    let mcu_row = block_row / v;
3542                    let mcu_col = block_col / h;
3543                    let v_idx = block_row % v;
3544                    let h_idx = block_col % h;
3545                    let storage_idx = mcu_row * (mcu_cols * blocks_per_mcu)
3546                        + mcu_col * blocks_per_mcu
3547                        + v_idx * h
3548                        + h_idx;
3549
3550                    if is_refinement {
3551                        encoder.encode_ac_refine(
3552                            &blocks[storage_idx],
3553                            scan.ss,
3554                            scan.se,
3555                            scan.ah,
3556                            scan.al,
3557                            ac_table,
3558                        )?;
3559                    } else {
3560                        encoder.encode_ac_first(
3561                            &blocks[storage_idx],
3562                            scan.ss,
3563                            scan.se,
3564                            scan.al,
3565                            ac_table,
3566                        )?;
3567                    }
3568                }
3569            }
3570        }
3571
3572        Ok(())
3573    }
3574
3575    /// Count DC symbols for a progressive DC scan.
3576    #[allow(clippy::too_many_arguments)]
3577    fn count_dc_scan_symbols(
3578        &self,
3579        scan: &crate::types::ScanInfo,
3580        y_blocks: &[[i16; DCTSIZE2]],
3581        cb_blocks: &[[i16; DCTSIZE2]],
3582        cr_blocks: &[[i16; DCTSIZE2]],
3583        mcu_rows: usize,
3584        mcu_cols: usize,
3585        h_samp: u8,
3586        v_samp: u8,
3587        dc_luma_freq: &mut FrequencyCounter,
3588        dc_chroma_freq: &mut FrequencyCounter,
3589    ) {
3590        let blocks_per_mcu_y = (h_samp * v_samp) as usize;
3591        let mut y_idx = 0;
3592        let mut c_idx = 0;
3593        let mut counter = ProgressiveSymbolCounter::new();
3594
3595        for _mcu_row in 0..mcu_rows {
3596            for _mcu_col in 0..mcu_cols {
3597                // Y blocks
3598                for _ in 0..blocks_per_mcu_y {
3599                    counter.count_dc_first(&y_blocks[y_idx], 0, scan.al, dc_luma_freq);
3600                    y_idx += 1;
3601                }
3602                // Cb block
3603                counter.count_dc_first(&cb_blocks[c_idx], 1, scan.al, dc_chroma_freq);
3604                // Cr block
3605                counter.count_dc_first(&cr_blocks[c_idx], 2, scan.al, dc_chroma_freq);
3606                c_idx += 1;
3607            }
3608        }
3609    }
3610
3611    /// Count AC symbols for a progressive AC scan.
3612    ///
3613    /// Must iterate blocks in the same order as `encode_ac_scan` (component raster order)
3614    /// to ensure EOBRUN counts match and Huffman tables are correct.
3615    ///
3616    /// Uses actual block dimensions (not MCU-padded) for non-interleaved scans.
3617    #[allow(clippy::too_many_arguments)]
3618    fn count_ac_scan_symbols(
3619        &self,
3620        scan: &crate::types::ScanInfo,
3621        blocks: &[[i16; DCTSIZE2]],
3622        _mcu_rows: usize,
3623        mcu_cols: usize,
3624        h_samp: u8,
3625        v_samp: u8,
3626        comp_idx: usize,
3627        block_cols: usize,
3628        block_rows: usize,
3629        ac_freq: &mut FrequencyCounter,
3630    ) {
3631        let blocks_per_mcu = if comp_idx == 0 {
3632            (h_samp * v_samp) as usize
3633        } else {
3634            1
3635        };
3636
3637        let mut counter = ProgressiveSymbolCounter::new();
3638        let is_refinement = scan.ah != 0;
3639
3640        if blocks_per_mcu == 1 {
3641            // Chroma or 4:4:4 Y: storage order = raster order
3642            let total_blocks = block_rows * block_cols;
3643            for block in blocks.iter().take(total_blocks) {
3644                if is_refinement {
3645                    counter.count_ac_refine(block, scan.ss, scan.se, scan.ah, scan.al, ac_freq);
3646                } else {
3647                    counter.count_ac_first(block, scan.ss, scan.se, scan.al, ac_freq);
3648                }
3649            }
3650        } else {
3651            // Y component with subsampling - iterate in raster order (matching encode_ac_scan)
3652            let h = h_samp as usize;
3653            let v = v_samp as usize;
3654
3655            for block_row in 0..block_rows {
3656                for block_col in 0..block_cols {
3657                    // Convert raster position to MCU-interleaved storage index
3658                    let mcu_row = block_row / v;
3659                    let mcu_col = block_col / h;
3660                    let v_idx = block_row % v;
3661                    let h_idx = block_col % h;
3662                    let storage_idx = mcu_row * (mcu_cols * blocks_per_mcu)
3663                        + mcu_col * blocks_per_mcu
3664                        + v_idx * h
3665                        + h_idx;
3666
3667                    if is_refinement {
3668                        counter.count_ac_refine(
3669                            &blocks[storage_idx],
3670                            scan.ss,
3671                            scan.se,
3672                            scan.ah,
3673                            scan.al,
3674                            ac_freq,
3675                        );
3676                    } else {
3677                        counter.count_ac_first(
3678                            &blocks[storage_idx],
3679                            scan.ss,
3680                            scan.se,
3681                            scan.al,
3682                            ac_freq,
3683                        );
3684                    }
3685                }
3686            }
3687        }
3688
3689        // Flush any pending EOBRUN
3690        counter.finish_scan(Some(ac_freq));
3691    }
3692}
3693
3694// ============================================================================
3695// Encode Trait Implementation
3696// ============================================================================
3697
3698impl Encode for Encoder {
3699    fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
3700        self.encode_rgb(rgb_data, width, height)
3701    }
3702
3703    fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
3704        self.encode_gray(gray_data, width, height)
3705    }
3706}
3707
3708// Note: StreamingEncoder and EncodingStream are in the `streaming` module.
3709
3710// Add streaming() method to Encoder
impl Encoder {
    // NOTE(review): the module-level example calls `.start(width, height, file)`
    // while the example below calls `.start_rgb(...)`; confirm which method
    // `StreamingEncoder` actually provides and align the two examples.
    /// Create a streaming encoder.
    ///
    /// Returns the [`StreamingEncoder`] produced by
    /// `StreamingEncoder::baseline_fastest()`, which supports
    /// scanline-by-scanline encoding.
    /// Note that streaming mode does NOT support trellis quantization, progressive
    /// mode, or Huffman optimization (these require buffering the entire image).
    ///
    /// For full-featured encoding with all mozjpeg optimizations, use [`Encoder::new`]
    /// (passing a [`Preset`]) with [`encode_rgb()`](Encoder::encode_rgb) or
    /// [`encode_gray()`](Encoder::encode_gray).
    ///
    /// # Example
    ///
    /// ```ignore
    /// use mozjpeg_rs::Encoder;
    /// use std::fs::File;
    ///
    /// let file = File::create("output.jpg")?;
    /// let mut stream = Encoder::streaming()
    ///     .quality(85)
    ///     .start_rgb(1920, 1080, file)?;
    ///
    /// // Write scanlines...
    /// stream.finish()?;
    /// ```
    pub fn streaming() -> StreamingEncoder {
        StreamingEncoder::baseline_fastest()
    }
}
3739
3740// ============================================================================
3741// C mozjpeg encoding (optional feature)
3742// ============================================================================
3743
#[cfg(feature = "mozjpeg-sys-config")]
impl Encoder {
    /// Convert this encoder to a C mozjpeg encoder.
    ///
    /// Returns a [`CMozjpeg`](crate::CMozjpeg) that can encode images using
    /// the C mozjpeg library with settings matching this Rust encoder.
    ///
    /// Every setting is copied (or cloned) out of `self`; `self` is left
    /// untouched. Custom quantization tables are only reported through the
    /// `has_custom_qtables` flag: the table contents themselves are not part
    /// of the returned value.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use mozjpeg_rs::{Encoder, Preset};
    ///
    /// let pixels: Vec<u8> = vec![128; 64 * 64 * 3];
    /// let encoder = Encoder::new(Preset::ProgressiveBalanced).quality(85);
    ///
    /// // Encode with C mozjpeg
    /// let c_jpeg = encoder.to_c_mozjpeg().encode_rgb(&pixels, 64, 64)?;
    ///
    /// // Compare with Rust encoder
    /// let rust_jpeg = encoder.encode_rgb(&pixels, 64, 64)?;
    /// # Ok::<(), mozjpeg_rs::Error>(())
    /// ```
    pub fn to_c_mozjpeg(&self) -> crate::compat::CMozjpeg {
        crate::compat::CMozjpeg {
            // Plain settings, copied field by field.
            quality: self.quality,
            force_baseline: self.force_baseline,
            subsampling: self.subsampling,
            progressive: self.progressive,
            optimize_huffman: self.optimize_huffman,
            optimize_scans: self.optimize_scans,
            trellis: self.trellis,
            overshoot_deringing: self.overshoot_deringing,
            smoothing: self.smoothing,
            restart_interval: self.restart_interval,
            quant_table_idx: self.quant_table_idx,
            // True as soon as either a custom luma or a custom chroma table is set.
            has_custom_qtables: self.custom_luma_qtable.is_some()
                || self.custom_chroma_qtable.is_some(),
            // Metadata is cloned rather than borrowed from `self`.
            exif_data: self.exif_data.clone(),
            icc_profile: self.icc_profile.clone(),
            custom_markers: self.custom_markers.clone(),
        }
    }
}
3787
3788/// Unit tests for private encoder internals.
3789/// Public API tests are in tests/encode_tests.rs.
3790#[cfg(test)]
3791mod tests {
3792    use super::*;
3793
3794    #[test]
3795    fn test_encoder_defaults() {
3796        // Default preset is ProgressiveBalanced
3797        let enc = Encoder::default();
3798        assert_eq!(enc.quality, 75);
3799        assert!(enc.progressive); // ProgressiveBalanced is progressive
3800        assert_eq!(enc.subsampling, Subsampling::S420);
3801        assert!(enc.trellis.enabled);
3802        assert!(enc.optimize_huffman);
3803        assert!(!enc.optimize_scans); // ProgressiveBalanced does NOT include optimize_scans
3804    }
3805
3806    #[test]
3807    fn test_encoder_presets() {
3808        let fastest = Encoder::new(Preset::BaselineFastest);
3809        assert!(!fastest.progressive);
3810        assert!(!fastest.trellis.enabled);
3811        assert!(!fastest.optimize_huffman);
3812
3813        let baseline = Encoder::new(Preset::BaselineBalanced);
3814        assert!(!baseline.progressive);
3815        assert!(baseline.trellis.enabled);
3816        assert!(baseline.optimize_huffman);
3817
3818        let prog_balanced = Encoder::new(Preset::ProgressiveBalanced);
3819        assert!(prog_balanced.progressive);
3820        assert!(prog_balanced.trellis.enabled);
3821        assert!(!prog_balanced.optimize_scans);
3822
3823        let prog_smallest = Encoder::new(Preset::ProgressiveSmallest);
3824        assert!(prog_smallest.progressive);
3825        assert!(prog_smallest.optimize_scans);
3826    }
3827
3828    #[test]
3829    fn test_encoder_builder_fields() {
3830        let enc = Encoder::baseline_optimized()
3831            .quality(90)
3832            .progressive(true)
3833            .subsampling(Subsampling::S444);
3834
3835        assert_eq!(enc.quality, 90);
3836        assert!(enc.progressive);
3837        assert_eq!(enc.subsampling, Subsampling::S444);
3838    }
3839
3840    #[test]
3841    fn test_quality_clamping() {
3842        let enc = Encoder::baseline_optimized().quality(0);
3843        assert_eq!(enc.quality, 1);
3844
3845        let enc = Encoder::baseline_optimized().quality(150);
3846        assert_eq!(enc.quality, 100);
3847    }
3848
3849    #[test]
3850    fn test_natural_to_zigzag() {
3851        let mut natural = [0u16; 64];
3852        for i in 0..64 {
3853            natural[i] = i as u16;
3854        }
3855        let zigzag = natural_to_zigzag(&natural);
3856
3857        assert_eq!(zigzag[0], 0);
3858        assert_eq!(zigzag[1], 1);
3859    }
3860
3861    #[test]
3862    fn test_max_compression_uses_all_optimizations() {
3863        let encoder = Encoder::max_compression();
3864        assert!(encoder.trellis.enabled);
3865        assert!(encoder.progressive);
3866        assert!(encoder.optimize_huffman);
3867        assert!(encoder.optimize_scans);
3868    }
3869
3870    #[test]
3871    fn test_encode_ycbcr_planar_444() {
3872        let width = 32u32;
3873        let height = 32u32;
3874
3875        // Create test image with gradient pattern
3876        let y_plane: Vec<u8> = (0..width * height)
3877            .map(|i| ((i % width) * 255 / width) as u8)
3878            .collect();
3879        let cb_plane: Vec<u8> = (0..width * height)
3880            .map(|i| ((i / width) * 255 / height) as u8)
3881            .collect();
3882        let cr_plane: Vec<u8> = vec![128u8; (width * height) as usize];
3883
3884        let encoder = Encoder::new(Preset::BaselineBalanced)
3885            .quality(85)
3886            .subsampling(Subsampling::S444);
3887
3888        let jpeg_data = encoder
3889            .encode_ycbcr_planar(&y_plane, &cb_plane, &cr_plane, width, height)
3890            .expect("encode_ycbcr_planar should succeed");
3891
3892        // Verify it's a valid JPEG
3893        assert!(jpeg_data.starts_with(&[0xFF, 0xD8, 0xFF])); // SOI + marker
3894        assert!(jpeg_data.ends_with(&[0xFF, 0xD9])); // EOI
3895        assert!(jpeg_data.len() > 200); // Reasonable size for 32x32
3896    }
3897
3898    #[test]
3899    fn test_encode_ycbcr_planar_420() {
3900        let width = 32u32;
3901        let height = 32u32;
3902
3903        // For 4:2:0, chroma planes are half resolution in each dimension
3904        let chroma_w = (width + 1) / 2;
3905        let chroma_h = (height + 1) / 2;
3906
3907        let y_plane: Vec<u8> = vec![128u8; (width * height) as usize];
3908        let cb_plane: Vec<u8> = vec![100u8; (chroma_w * chroma_h) as usize];
3909        let cr_plane: Vec<u8> = vec![150u8; (chroma_w * chroma_h) as usize];
3910
3911        let encoder = Encoder::new(Preset::BaselineBalanced)
3912            .quality(85)
3913            .subsampling(Subsampling::S420);
3914
3915        let jpeg_data = encoder
3916            .encode_ycbcr_planar(&y_plane, &cb_plane, &cr_plane, width, height)
3917            .expect("encode_ycbcr_planar with 4:2:0 should succeed");
3918
3919        // Verify it's a valid JPEG
3920        assert!(jpeg_data.starts_with(&[0xFF, 0xD8, 0xFF]));
3921        assert!(jpeg_data.ends_with(&[0xFF, 0xD9]));
3922    }
3923
3924    #[test]
3925    fn test_encode_ycbcr_planar_422() {
3926        let width = 32u32;
3927        let height = 32u32;
3928
3929        // For 4:2:2, chroma is half width, full height
3930        let chroma_w = (width + 1) / 2;
3931
3932        let y_plane: Vec<u8> = vec![128u8; (width * height) as usize];
3933        let cb_plane: Vec<u8> = vec![100u8; (chroma_w * height) as usize];
3934        let cr_plane: Vec<u8> = vec![150u8; (chroma_w * height) as usize];
3935
3936        let encoder = Encoder::new(Preset::BaselineBalanced)
3937            .quality(85)
3938            .subsampling(Subsampling::S422);
3939
3940        let jpeg_data = encoder
3941            .encode_ycbcr_planar(&y_plane, &cb_plane, &cr_plane, width, height)
3942            .expect("encode_ycbcr_planar with 4:2:2 should succeed");
3943
3944        assert!(jpeg_data.starts_with(&[0xFF, 0xD8, 0xFF]));
3945        assert!(jpeg_data.ends_with(&[0xFF, 0xD9]));
3946    }
3947
3948    #[test]
3949    fn test_encode_ycbcr_planar_wrong_size() {
3950        let width = 32u32;
3951        let height = 32u32;
3952
3953        // Correct Y plane but wrong chroma plane sizes for 4:2:0
3954        let y_plane: Vec<u8> = vec![128u8; (width * height) as usize];
3955        let cb_plane: Vec<u8> = vec![100u8; 10]; // Too small!
3956        let cr_plane: Vec<u8> = vec![150u8; 10]; // Too small!
3957
3958        let encoder = Encoder::new(Preset::BaselineBalanced)
3959            .quality(85)
3960            .subsampling(Subsampling::S420);
3961
3962        let result = encoder.encode_ycbcr_planar(&y_plane, &cb_plane, &cr_plane, width, height);
3963
3964        assert!(result.is_err());
3965    }
3966
3967    #[test]
3968    fn test_encode_ycbcr_planar_strided() {
3969        let width = 30u32; // Not a multiple of stride
3970        let height = 20u32;
3971        let y_stride = 32usize; // Stride with 2 bytes padding per row
3972
3973        // For 4:2:0, chroma is half resolution
3974        let chroma_width = 15usize;
3975        let chroma_height = 10usize;
3976        let cb_stride = 16usize; // Stride with 1 byte padding per row
3977
3978        // Create Y plane with stride (fill with gradient, padding with zeros)
3979        let mut y_plane = vec![0u8; y_stride * height as usize];
3980        for row in 0..height as usize {
3981            for col in 0..width as usize {
3982                y_plane[row * y_stride + col] = ((col * 255) / width as usize) as u8;
3983            }
3984        }
3985
3986        // Create chroma planes with stride
3987        let mut cb_plane = vec![0u8; cb_stride * chroma_height];
3988        let mut cr_plane = vec![0u8; cb_stride * chroma_height];
3989        for row in 0..chroma_height {
3990            for col in 0..chroma_width {
3991                cb_plane[row * cb_stride + col] = 100;
3992                cr_plane[row * cb_stride + col] = 150;
3993            }
3994        }
3995
3996        let encoder = Encoder::new(Preset::BaselineBalanced)
3997            .quality(85)
3998            .subsampling(Subsampling::S420);
3999
4000        let jpeg_data = encoder
4001            .encode_ycbcr_planar_strided(
4002                &y_plane, y_stride, &cb_plane, cb_stride, &cr_plane, cb_stride, width, height,
4003            )
4004            .expect("strided encoding should succeed");
4005
4006        // Verify it's a valid JPEG
4007        assert!(jpeg_data.starts_with(&[0xFF, 0xD8, 0xFF]));
4008        assert!(jpeg_data.ends_with(&[0xFF, 0xD9]));
4009    }
4010
4011    #[test]
4012    fn test_encode_ycbcr_planar_strided_matches_packed() {
4013        let width = 32u32;
4014        let height = 32u32;
4015
4016        // Create packed plane data
4017        let y_packed: Vec<u8> = (0..width * height).map(|i| (i % 256) as u8).collect();
4018        let chroma_w = (width + 1) / 2;
4019        let chroma_h = (height + 1) / 2;
4020        let cb_packed: Vec<u8> = vec![100u8; (chroma_w * chroma_h) as usize];
4021        let cr_packed: Vec<u8> = vec![150u8; (chroma_w * chroma_h) as usize];
4022
4023        let encoder = Encoder::new(Preset::BaselineBalanced)
4024            .quality(85)
4025            .subsampling(Subsampling::S420);
4026
4027        // Encode with packed API
4028        let jpeg_packed = encoder
4029            .encode_ycbcr_planar(&y_packed, &cb_packed, &cr_packed, width, height)
4030            .expect("packed encoding should succeed");
4031
4032        // Encode with strided API (stride == width means packed)
4033        let jpeg_strided = encoder
4034            .encode_ycbcr_planar_strided(
4035                &y_packed,
4036                width as usize,
4037                &cb_packed,
4038                chroma_w as usize,
4039                &cr_packed,
4040                chroma_w as usize,
4041                width,
4042                height,
4043            )
4044            .expect("strided encoding should succeed");
4045
4046        // Both should produce identical output
4047        assert_eq!(jpeg_packed, jpeg_strided);
4048    }
4049
4050    // =========================================================================
4051    // Resource Estimation Tests
4052    // =========================================================================
4053
4054    #[test]
4055    fn test_estimate_resources_basic() {
4056        let encoder = Encoder::new(Preset::BaselineBalanced);
4057        let estimate = encoder.estimate_resources(1920, 1080);
4058
4059        // Should have reasonable memory estimate (> input size)
4060        let input_size = 1920 * 1080 * 3;
4061        assert!(
4062            estimate.peak_memory_bytes > input_size,
4063            "Peak memory {} should exceed input size {}",
4064            estimate.peak_memory_bytes,
4065            input_size
4066        );
4067
4068        // Should have reasonable CPU cost (> 1.0 due to trellis)
4069        assert!(
4070            estimate.cpu_cost_multiplier > 1.0,
4071            "CPU cost {} should be > 1.0 for BaselineBalanced",
4072            estimate.cpu_cost_multiplier
4073        );
4074
4075        // Block count should match expected
4076        assert!(estimate.block_count > 0, "Block count should be > 0");
4077    }
4078
4079    #[test]
4080    fn test_estimate_resources_fastest_has_lower_cpu() {
4081        let fastest = Encoder::new(Preset::BaselineFastest);
4082        let balanced = Encoder::new(Preset::BaselineBalanced);
4083
4084        let est_fast = fastest.estimate_resources(512, 512);
4085        let est_balanced = balanced.estimate_resources(512, 512);
4086
4087        // Fastest should have lower CPU cost (no trellis)
4088        assert!(
4089            est_fast.cpu_cost_multiplier < est_balanced.cpu_cost_multiplier,
4090            "Fastest ({:.2}) should have lower CPU cost than Balanced ({:.2})",
4091            est_fast.cpu_cost_multiplier,
4092            est_balanced.cpu_cost_multiplier
4093        );
4094    }
4095
4096    #[test]
4097    fn test_estimate_resources_progressive_has_higher_cpu() {
4098        let baseline = Encoder::new(Preset::BaselineBalanced);
4099        let progressive = Encoder::new(Preset::ProgressiveBalanced);
4100
4101        let est_baseline = baseline.estimate_resources(512, 512);
4102        let est_prog = progressive.estimate_resources(512, 512);
4103
4104        // Progressive should have higher CPU cost (multiple scans)
4105        assert!(
4106            est_prog.cpu_cost_multiplier > est_baseline.cpu_cost_multiplier,
4107            "Progressive ({:.2}) should have higher CPU cost than Baseline ({:.2})",
4108            est_prog.cpu_cost_multiplier,
4109            est_baseline.cpu_cost_multiplier
4110        );
4111    }
4112
4113    #[test]
4114    fn test_estimate_resources_gray() {
4115        let encoder = Encoder::new(Preset::BaselineBalanced);
4116        let rgb_estimate = encoder.estimate_resources(512, 512);
4117        let gray_estimate = encoder.estimate_resources_gray(512, 512);
4118
4119        // Grayscale should use less memory (1 channel vs 3)
4120        assert!(
4121            gray_estimate.peak_memory_bytes < rgb_estimate.peak_memory_bytes,
4122            "Grayscale memory {} should be less than RGB {}",
4123            gray_estimate.peak_memory_bytes,
4124            rgb_estimate.peak_memory_bytes
4125        );
4126
4127        // Grayscale should have lower CPU cost
4128        assert!(
4129            gray_estimate.cpu_cost_multiplier < rgb_estimate.cpu_cost_multiplier,
4130            "Grayscale CPU {:.2} should be less than RGB {:.2}",
4131            gray_estimate.cpu_cost_multiplier,
4132            rgb_estimate.cpu_cost_multiplier
4133        );
4134    }
4135
4136    // =========================================================================
4137    // Resource Limit Tests
4138    // =========================================================================
4139
4140    #[test]
4141    fn test_dimension_limit_width() {
4142        let limits = Limits::default().max_width(100).max_height(100);
4143        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4144
4145        let pixels = vec![128u8; 200 * 50 * 3];
4146        let result = encoder.encode_rgb(&pixels, 200, 50);
4147
4148        assert!(matches!(result, Err(Error::DimensionLimitExceeded { .. })));
4149    }
4150
4151    #[test]
4152    fn test_dimension_limit_height() {
4153        let limits = Limits::default().max_width(100).max_height(100);
4154        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4155
4156        let pixels = vec![128u8; 50 * 200 * 3];
4157        let result = encoder.encode_rgb(&pixels, 50, 200);
4158
4159        assert!(matches!(result, Err(Error::DimensionLimitExceeded { .. })));
4160    }
4161
4162    #[test]
4163    fn test_dimension_limit_passes_when_within() {
4164        let limits = Limits::default().max_width(100).max_height(100);
4165        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4166
4167        let pixels = vec![128u8; 64 * 64 * 3];
4168        let result = encoder.encode_rgb(&pixels, 64, 64);
4169
4170        assert!(result.is_ok());
4171    }
4172
4173    #[test]
4174    fn test_allocation_limit() {
4175        let limits = Limits::default().max_alloc_bytes(1000); // Very small limit
4176        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4177
4178        let pixels = vec![128u8; 256 * 256 * 3];
4179        let result = encoder.encode_rgb(&pixels, 256, 256);
4180
4181        assert!(matches!(result, Err(Error::AllocationLimitExceeded { .. })));
4182    }
4183
4184    #[test]
4185    fn test_allocation_limit_passes_when_within() {
4186        let limits = Limits::default().max_alloc_bytes(10_000_000); // 10 MB limit
4187        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4188
4189        let pixels = vec![128u8; 64 * 64 * 3];
4190        let result = encoder.encode_rgb(&pixels, 64, 64);
4191
4192        assert!(result.is_ok());
4193    }
4194
4195    #[test]
4196    fn test_pixel_count_limit() {
4197        let limits = Limits::default().max_pixel_count(1000); // Very small limit
4198        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4199
4200        let pixels = vec![128u8; 64 * 64 * 3]; // 4096 pixels
4201        let result = encoder.encode_rgb(&pixels, 64, 64);
4202
4203        assert!(matches!(result, Err(Error::PixelCountExceeded { .. })));
4204    }
4205
4206    #[test]
4207    fn test_pixel_count_limit_passes_when_within() {
4208        let limits = Limits::default().max_pixel_count(10000); // 10000 pixels
4209        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4210
4211        let pixels = vec![128u8; 64 * 64 * 3]; // 4096 pixels
4212        let result = encoder.encode_rgb(&pixels, 64, 64);
4213
4214        assert!(result.is_ok());
4215    }
4216
4217    #[test]
4218    fn test_icc_profile_size_limit() {
4219        let limits = Limits::default().max_icc_profile_bytes(100);
4220        let encoder = Encoder::new(Preset::BaselineFastest)
4221            .limits(limits)
4222            .icc_profile(vec![0u8; 1000]); // 1000 byte ICC profile
4223
4224        let pixels = vec![128u8; 64 * 64 * 3];
4225        let result = encoder.encode_rgb(&pixels, 64, 64);
4226
4227        assert!(matches!(result, Err(Error::IccProfileTooLarge { .. })));
4228    }
4229
4230    #[test]
4231    fn test_icc_profile_size_limit_passes_when_within() {
4232        let limits = Limits::default().max_icc_profile_bytes(2000);
4233        let encoder = Encoder::new(Preset::BaselineFastest)
4234            .limits(limits)
4235            .icc_profile(vec![0u8; 1000]); // 1000 byte ICC profile
4236
4237        let pixels = vec![128u8; 64 * 64 * 3];
4238        let result = encoder.encode_rgb(&pixels, 64, 64);
4239
4240        assert!(result.is_ok());
4241    }
4242
4243    #[test]
4244    fn test_limits_disabled_by_default() {
4245        let encoder = Encoder::new(Preset::BaselineFastest);
4246        assert_eq!(encoder.limits, Limits::none());
4247    }
4248
4249    #[test]
4250    fn test_limits_has_limits() {
4251        assert!(!Limits::none().has_limits());
4252        assert!(Limits::default().max_width(100).has_limits());
4253        assert!(Limits::default().max_height(100).has_limits());
4254        assert!(Limits::default().max_pixel_count(1000).has_limits());
4255        assert!(Limits::default().max_alloc_bytes(1000).has_limits());
4256        assert!(Limits::default().max_icc_profile_bytes(1000).has_limits());
4257    }
4258
4259    // =========================================================================
4260    // Cancellation Tests
4261    // =========================================================================
4262
4263    #[test]
4264    fn test_cancellable_with_no_cancellation() {
4265        let encoder = Encoder::new(Preset::BaselineFastest);
4266        let pixels = vec![128u8; 64 * 64 * 3];
4267
4268        let result = encoder.encode_rgb_cancellable(&pixels, 64, 64, None, None);
4269
4270        assert!(result.is_ok());
4271    }
4272
4273    #[test]
4274    fn test_cancellable_immediate_cancel() {
4275        let encoder = Encoder::new(Preset::BaselineFastest);
4276        let pixels = vec![128u8; 64 * 64 * 3];
4277        let cancel = AtomicBool::new(true); // Already cancelled
4278
4279        let result = encoder.encode_rgb_cancellable(&pixels, 64, 64, Some(&cancel), None);
4280
4281        assert!(matches!(result, Err(Error::Cancelled)));
4282    }
4283
4284    #[test]
4285    fn test_cancellable_with_timeout() {
4286        let encoder = Encoder::new(Preset::BaselineFastest);
4287        let pixels = vec![128u8; 64 * 64 * 3];
4288
4289        // 10 second timeout - should complete well within this
4290        let result =
4291            encoder.encode_rgb_cancellable(&pixels, 64, 64, None, Some(Duration::from_secs(10)));
4292
4293        assert!(result.is_ok());
4294    }
4295
4296    #[test]
4297    fn test_cancellable_gray() {
4298        let encoder = Encoder::new(Preset::BaselineFastest);
4299        let pixels = vec![128u8; 64 * 64];
4300
4301        let result = encoder.encode_gray_cancellable(&pixels, 64, 64, None, None);
4302
4303        assert!(result.is_ok());
4304    }
4305
4306    #[test]
4307    fn test_cancellable_with_limits() {
4308        // Test that limits work in cancellable method too
4309        let limits = Limits::default().max_width(32);
4310        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4311
4312        let pixels = vec![128u8; 64 * 64 * 3];
4313        let result = encoder.encode_rgb_cancellable(&pixels, 64, 64, None, None);
4314
4315        assert!(matches!(result, Err(Error::DimensionLimitExceeded { .. })));
4316    }
4317
4318    #[test]
4319    fn test_cancellation_context_none() {
4320        let ctx = CancellationContext::none();
4321        assert!(ctx.check().is_ok());
4322    }
4323
4324    #[test]
4325    fn test_cancellation_context_with_cancel_flag() {
4326        use std::sync::atomic::Ordering;
4327
4328        let cancel = AtomicBool::new(false);
4329        let ctx = CancellationContext::new(Some(&cancel), None);
4330        assert!(ctx.check().is_ok());
4331
4332        cancel.store(true, Ordering::Relaxed);
4333        assert!(matches!(ctx.check(), Err(Error::Cancelled)));
4334    }
4335
4336    #[test]
4337    fn test_cancellation_context_with_expired_deadline() {
4338        // Create a deadline that's already passed
4339        let ctx = CancellationContext {
4340            cancel: None,
4341            deadline: Some(Instant::now() - Duration::from_secs(1)),
4342        };
4343
4344        assert!(matches!(ctx.check(), Err(Error::TimedOut)));
4345    }
4346
4347    #[test]
4348    fn test_dimension_exact_at_limit_passes() {
4349        // Dimensions exactly at limit should pass
4350        let limits = Limits::default().max_width(64).max_height(64);
4351        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4352
4353        let pixels = vec![128u8; 64 * 64 * 3];
4354        let result = encoder.encode_rgb(&pixels, 64, 64);
4355
4356        assert!(result.is_ok());
4357    }
4358
4359    #[test]
4360    fn test_pixel_count_exact_at_limit_passes() {
4361        // Pixel count exactly at limit should pass
4362        let limits = Limits::default().max_pixel_count(4096); // Exactly 64*64
4363        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4364
4365        let pixels = vec![128u8; 64 * 64 * 3];
4366        let result = encoder.encode_rgb(&pixels, 64, 64);
4367
4368        assert!(result.is_ok());
4369    }
4370
4371    #[test]
4372    fn test_multiple_limits_all_checked() {
4373        // Test that all limits are checked, not just the first
4374        let limits = Limits::default()
4375            .max_width(1000)
4376            .max_height(1000)
4377            .max_pixel_count(100); // This should fail
4378
4379        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4380        let pixels = vec![128u8; 64 * 64 * 3]; // 4096 pixels
4381
4382        let result = encoder.encode_rgb(&pixels, 64, 64);
4383        assert!(matches!(result, Err(Error::PixelCountExceeded { .. })));
4384    }
4385
4386    #[test]
4387    fn test_limits_with_grayscale() {
4388        let limits = Limits::default().max_pixel_count(100);
4389        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4390
4391        let pixels = vec![128u8; 64 * 64]; // Grayscale, 4096 pixels
4392        let result = encoder.encode_gray(&pixels, 64, 64);
4393
4394        assert!(matches!(result, Err(Error::PixelCountExceeded { .. })));
4395    }
4396
4397    #[test]
4398    fn test_estimate_resources_with_subsampling() {
4399        let encoder_444 = Encoder::new(Preset::BaselineBalanced).subsampling(Subsampling::S444);
4400        let encoder_420 = Encoder::new(Preset::BaselineBalanced).subsampling(Subsampling::S420);
4401
4402        let est_444 = encoder_444.estimate_resources(512, 512);
4403        let est_420 = encoder_420.estimate_resources(512, 512);
4404
4405        // 4:4:4 should use more memory than 4:2:0 (no chroma downsampling)
4406        assert!(
4407            est_444.peak_memory_bytes > est_420.peak_memory_bytes,
4408            "4:4:4 memory {} should exceed 4:2:0 memory {}",
4409            est_444.peak_memory_bytes,
4410            est_420.peak_memory_bytes
4411        );
4412    }
4413
4414    #[test]
4415    fn test_estimate_resources_block_count() {
4416        // With 4:2:0 subsampling (default): Y gets full blocks, chroma gets 1/4
4417        let encoder = Encoder::new(Preset::BaselineFastest);
4418
4419        // 64x64 image with 4:2:0:
4420        // Y blocks: 8x8 = 64
4421        // Chroma: 32x32 pixels, 4x4 blocks each = 16 per component
4422        // Total: 64 + 16 + 16 = 96
4423        let estimate = encoder.estimate_resources(64, 64);
4424        assert_eq!(estimate.block_count, 96);
4425
4426        // With 4:4:4 subsampling: all components get full blocks
4427        let encoder_444 = Encoder::new(Preset::BaselineFastest).subsampling(Subsampling::S444);
4428        let estimate_444 = encoder_444.estimate_resources(64, 64);
4429        // 64 blocks * 3 components = 192
4430        assert_eq!(estimate_444.block_count, 192);
4431    }
4432
4433    #[test]
4434    fn test_cancellable_gray_with_limits() {
4435        let limits = Limits::default().max_width(32);
4436        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4437
4438        let pixels = vec![128u8; 64 * 64];
4439        let result = encoder.encode_gray_cancellable(&pixels, 64, 64, None, None);
4440
4441        assert!(matches!(result, Err(Error::DimensionLimitExceeded { .. })));
4442    }
4443}