mozjpeg_rs/
encode.rs

1//! JPEG encoder pipeline.
2//!
3//! This module provides two encoder types:
4//!
5//! - [`Encoder`]: Full-featured encoder with trellis quantization, progressive mode,
6//!   and Huffman optimization. Batch encoding only.
7//! - [`StreamingEncoder`]: Streaming-capable encoder without optimizations.
8//!   Supports both batch and scanline-by-scanline encoding.
9//!
10//! Both implement the [`Encode`] trait for batch encoding.
11//!
12//! # Examples
13//!
14//! ```ignore
15//! use mozjpeg_rs::{Encoder, Preset};
16//!
17//! // Full-featured batch encoding
18//! let jpeg = Encoder::new(Preset::default())
19//!     .quality(85)
20//!     .encode_rgb(&pixels, width, height)?;
21//!
22//! // Streaming encoding (memory-efficient for large images)
23//! let mut stream = Encoder::streaming()
24//!     .quality(85)
25//!     .start(width, height, file)?;
26//! for row in scanlines.chunks(16) {
27//!     stream.write_scanlines(row)?;
28//! }
29//! stream.finish()?;
30//! ```
31
32use std::io::Write;
33use std::sync::atomic::{AtomicBool, Ordering};
34use std::time::{Duration, Instant};
35
36use crate::bitstream::BitWriter;
37use crate::consts::{QuantTableIdx, DCTSIZE, DCTSIZE2};
38use crate::deringing::preprocess_deringing;
39use crate::entropy::{EntropyEncoder, ProgressiveEncoder, ProgressiveSymbolCounter, SymbolCounter};
40use crate::error::{Error, Result};
41use crate::huffman::DerivedTable;
42use crate::huffman::FrequencyCounter;
43use crate::marker::MarkerWriter;
44use crate::progressive::{generate_baseline_scan, generate_mozjpeg_max_compression_scans};
45use crate::quant::{create_quant_tables, quantize_block_raw};
46use crate::sample;
47use crate::scan_optimize::{generate_search_scans, ScanSearchConfig, ScanSelector};
48use crate::scan_trial::ScanTrialEncoder;
49use crate::simd::SimdOps;
50use crate::trellis::trellis_quantize_block;
51use crate::types::{Limits, PixelDensity, Preset, Subsampling, TrellisConfig};
52
53mod helpers;
54mod streaming;
55
56pub(crate) use helpers::{
57    create_components, create_std_ac_chroma_table, create_std_ac_luma_table,
58    create_std_dc_chroma_table, create_std_dc_luma_table, create_ycbcr_components,
59    natural_to_zigzag, run_dc_trellis_by_row, try_alloc_vec, try_alloc_vec_array, write_dht_marker,
60    write_sos_marker,
61};
62pub use streaming::{EncodingStream, StreamingEncoder};
63
64// ============================================================================
65// Cancellation Support
66// ============================================================================
67
68/// Internal context for cancellation checking during encoding.
69///
70/// This is passed through the encoding pipeline to allow periodic
71/// cancellation checks without function signature changes everywhere.
72#[derive(Clone, Copy)]
73pub(crate) struct CancellationContext<'a> {
74    /// Optional cancellation flag - if set to true, encoding should abort.
75    pub cancel: Option<&'a AtomicBool>,
76    /// Optional deadline - if current time exceeds this, encoding should abort.
77    pub deadline: Option<Instant>,
78}
79
80impl<'a> CancellationContext<'a> {
81    /// Create a context with no cancellation (always succeeds).
82    #[allow(dead_code)]
83    pub const fn none() -> Self {
84        Self {
85            cancel: None,
86            deadline: None,
87        }
88    }
89
90    /// Create a context from optional cancel flag and timeout.
91    #[allow(dead_code)]
92    pub fn new(cancel: Option<&'a AtomicBool>, timeout: Option<Duration>) -> Self {
93        Self {
94            cancel,
95            deadline: timeout.map(|d| Instant::now() + d),
96        }
97    }
98
99    /// Check if cancellation has been requested.
100    ///
101    /// Returns `Ok(())` if encoding should continue, or `Err` if cancelled/timed out.
102    #[inline]
103    pub fn check(&self) -> Result<()> {
104        if let Some(c) = self.cancel {
105            if c.load(Ordering::Relaxed) {
106                return Err(Error::Cancelled);
107            }
108        }
109        if let Some(d) = self.deadline {
110            if Instant::now() > d {
111                return Err(Error::TimedOut);
112            }
113        }
114        Ok(())
115    }
116
117    /// Check cancellation every N iterations (to reduce overhead).
118    ///
119    /// Only performs the check when `iteration % interval == 0`.
120    #[inline]
121    #[allow(dead_code)]
122    pub fn check_periodic(&self, iteration: usize, interval: usize) -> Result<()> {
123        if iteration.is_multiple_of(interval) {
124            self.check()
125        } else {
126            Ok(())
127        }
128    }
129}
130
131// ============================================================================
132// Encode Trait (internal, for potential future streaming API)
133// ============================================================================
134
/// Trait for JPEG encoding (batch mode).
///
/// Implemented by both [`Encoder`] and [`StreamingEncoder`]. Each method takes
/// a complete image in memory and returns the encoded bytes in a `Vec<u8>`.
#[allow(dead_code)]
pub trait Encode {
    /// Encode RGB image data to JPEG.
    ///
    /// # Arguments
    /// * `rgb_data` - RGB pixel data (3 bytes per pixel, row-major order)
    /// * `width` - Image width in pixels
    /// * `height` - Image height in pixels
    ///
    /// # Errors
    /// Returns the crate's error type if encoding fails; the exact conditions
    /// depend on the implementing encoder.
    fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>>;

    /// Encode grayscale image data to JPEG.
    ///
    /// # Arguments
    /// * `gray_data` - Grayscale pixel data (1 byte per pixel, row-major order)
    /// * `width` - Image width in pixels
    /// * `height` - Image height in pixels
    ///
    /// # Errors
    /// Returns the crate's error type if encoding fails; the exact conditions
    /// depend on the implementing encoder.
    fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>>;
}
156
/// JPEG encoder with configurable quality and features.
///
/// Instances are created through [`Encoder::new`] or one of the preset
/// constructors and then customised with the consuming builder methods
/// (`quality()`, `progressive()`, ...), each of which returns the updated
/// encoder.
#[derive(Debug, Clone)]
pub struct Encoder {
    /// Quality level (1-100; the `quality()` setter clamps into this range)
    quality: u8,
    /// Enable progressive mode
    progressive: bool,
    /// Chroma subsampling mode
    subsampling: Subsampling,
    /// Quantization table variant
    quant_table_idx: QuantTableIdx,
    /// Custom luminance quantization table (overrides quant_table_idx if set)
    custom_luma_qtable: Option<[u16; DCTSIZE2]>,
    /// Custom chrominance quantization table (overrides quant_table_idx if set)
    custom_chroma_qtable: Option<[u16; DCTSIZE2]>,
    /// Trellis quantization configuration
    trellis: TrellisConfig,
    /// Force baseline-compatible output
    force_baseline: bool,
    /// Optimize Huffman tables (requires 2-pass)
    optimize_huffman: bool,
    /// Enable overshoot deringing (reduces ringing on white backgrounds)
    overshoot_deringing: bool,
    /// Optimize progressive scan configuration (tries multiple configs, picks smallest)
    optimize_scans: bool,
    /// Restart interval in MCUs (0 = disabled)
    restart_interval: u16,
    /// Pixel density for JFIF APP0 marker
    pixel_density: PixelDensity,
    /// EXIF data to embed (raw TIFF structure, without "Exif\0\0" header).
    /// `None` when no EXIF data was supplied (the setter maps empty input to `None`).
    exif_data: Option<Vec<u8>>,
    /// ICC color profile to embed (will be chunked into APP2 markers).
    /// `None` when no profile was supplied (the setter maps empty input to `None`).
    icc_profile: Option<Vec<u8>>,
    /// Custom APP markers to embed, as (marker number 0-15, data) pairs in
    /// the order they were added
    custom_markers: Vec<(u8, Vec<u8>)>,
    /// SIMD operations dispatch (detected once at construction)
    simd: SimdOps,
    /// Smoothing factor (0-100, 0 = disabled)
    /// Applies a weighted average filter to reduce fine-scale noise.
    /// Useful for converting dithered images (like GIFs) to JPEG.
    smoothing: u8,
    /// Resource limits (dimensions, memory, ICC size)
    limits: Limits,
}
201
202impl Default for Encoder {
203    fn default() -> Self {
204        Self::new(Preset::default())
205    }
206}
207
208impl Encoder {
209    /// Create an encoder with the specified preset.
210    ///
211    /// # Arguments
212    ///
213    /// * `preset` - Encoding preset (see [`Preset`] for details):
214    ///   - [`BaselineFastest`](Preset::BaselineFastest): No optimizations, fastest encoding
215    ///   - [`BaselineBalanced`](Preset::BaselineBalanced): Baseline with all optimizations
216    ///   - [`ProgressiveBalanced`](Preset::ProgressiveBalanced): Progressive with optimizations (default)
217    ///   - [`ProgressiveSmallest`](Preset::ProgressiveSmallest): Maximum compression
218    ///
219    /// # Preset Comparison
220    ///
221    /// | Preset | Time | Size | Best For |
222    /// |--------|------|------|----------|
223    /// | `BaselineFastest` | ~2ms | baseline | Real-time, thumbnails |
224    /// | `BaselineBalanced` | ~7ms | -13% | Sequential playback |
225    /// | `ProgressiveBalanced` | ~9ms | -13% | Web images (default) |
226    /// | `ProgressiveSmallest` | ~21ms | -14% | Storage, archival |
227    ///
228    /// *Benchmarks: 512×512 Q75 image*
229    ///
230    /// # Example
231    ///
232    /// ```no_run
233    /// use mozjpeg_rs::{Encoder, Preset};
234    ///
235    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
236    ///
237    /// // Default: progressive with good balance
238    /// let jpeg = Encoder::new(Preset::default())
239    ///     .quality(85)
240    ///     .encode_rgb(&pixels, 256, 256)
241    ///     .unwrap();
242    ///
243    /// // Fastest for real-time applications
244    /// let jpeg = Encoder::new(Preset::BaselineFastest)
245    ///     .quality(80)
246    ///     .encode_rgb(&pixels, 256, 256)
247    ///     .unwrap();
248    ///
249    /// // Maximum compression (matches C mozjpeg)
250    /// let jpeg = Encoder::new(Preset::ProgressiveSmallest)
251    ///     .quality(85)
252    ///     .encode_rgb(&pixels, 256, 256)
253    ///     .unwrap();
254    /// ```
255    pub fn new(preset: Preset) -> Self {
256        match preset {
257            Preset::BaselineFastest => Self::fastest(),
258            Preset::BaselineBalanced => Self::baseline_optimized(),
259            Preset::ProgressiveBalanced => Self::progressive_balanced(),
260            Preset::ProgressiveSmallest => Self::max_compression(),
261        }
262    }
263
264    /// Create an encoder with the most optimized baseline (non-progressive) settings.
265    ///
266    /// This is the recommended starting point for most use cases. It produces
267    /// sequential (non-progressive) JPEGs with all mozjpeg optimizations enabled:
268    /// trellis quantization, Huffman optimization, and overshoot deringing.
269    ///
270    /// # Default Settings
271    ///
272    /// | Setting | Value | Notes |
273    /// |---------|-------|-------|
274    /// | quality | 75 | Good balance of size/quality |
275    /// | progressive | **false** | Sequential baseline JPEG |
276    /// | optimize_scans | **false** | N/A for baseline mode |
277    /// | subsampling | 4:2:0 | Standard chroma subsampling |
278    /// | trellis | **enabled** | AC + DC trellis quantization |
279    /// | optimize_huffman | **true** | 2-pass for optimal Huffman tables |
280    /// | overshoot_deringing | **true** | Reduces ringing on hard edges |
281    /// | quant_tables | ImageMagick | Same as C mozjpeg default |
282    /// | force_baseline | false | Allows 16-bit DQT at very low Q |
283    ///
284    /// # Comparison with C mozjpeg
285    ///
286    /// **Important:** This differs from C mozjpeg's `jpeg_set_defaults()`!
287    ///
288    /// C mozjpeg uses `JCP_MAX_COMPRESSION` profile by default, which enables
289    /// progressive mode and optimize_scans. This produces ~20% smaller files
290    /// but with slower encoding and progressive rendering.
291    ///
292    /// | Setting | `baseline_optimized()` | C mozjpeg default |
293    /// |---------|------------------------|-------------------|
294    /// | progressive | **false** | true |
295    /// | optimize_scans | **false** | true |
296    /// | trellis | true | true |
297    /// | deringing | true | true |
298    ///
299    /// To match C mozjpeg's default behavior, use [`max_compression()`](Self::max_compression).
300    ///
301    /// # Example
302    ///
303    /// ```no_run
304    /// use mozjpeg_rs::Encoder;
305    ///
306    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
307    /// let jpeg = Encoder::baseline_optimized()
308    ///     .quality(85)
309    ///     .encode_rgb(&pixels, 256, 256)
310    ///     .unwrap();
311    /// ```
312    pub fn baseline_optimized() -> Self {
313        Self {
314            quality: 75,
315            progressive: false,
316            subsampling: Subsampling::S420,
317            quant_table_idx: QuantTableIdx::ImageMagick,
318            custom_luma_qtable: None,
319            custom_chroma_qtable: None,
320            trellis: TrellisConfig::default(),
321            force_baseline: false,
322            optimize_huffman: true,
323            overshoot_deringing: true,
324            optimize_scans: false,
325            restart_interval: 0,
326            pixel_density: PixelDensity::default(),
327            exif_data: None,
328            icc_profile: None,
329            custom_markers: Vec::new(),
330            simd: SimdOps::detect(),
331            smoothing: 0,
332            limits: Limits::none(),
333        }
334    }
335
336    /// Create encoder with maximum compression (matches C mozjpeg defaults).
337    ///
338    /// This matches the `JCP_MAX_COMPRESSION` profile used by C mozjpeg's
339    /// `jpeg_set_defaults()` and the `mozjpeg` crate.
340    ///
341    /// # Settings (differences from `new()` in **bold**)
342    ///
343    /// | Setting | Value | Notes |
344    /// |---------|-------|-------|
345    /// | quality | 75 | Same as `new()` |
346    /// | progressive | **true** | Multi-scan progressive JPEG |
347    /// | optimize_scans | **true** | Tries multiple scan configs |
348    /// | subsampling | 4:2:0 | Same as `new()` |
349    /// | trellis | enabled | Same as `new()` |
350    /// | optimize_huffman | true | Same as `new()` |
351    /// | overshoot_deringing | true | Same as `new()` |
352    ///
353    /// # File Size Comparison
354    ///
355    /// Typical results at Q75 (256×256 image):
356    /// - `Encoder::baseline_optimized()`: ~650 bytes (baseline)
357    /// - `Encoder::max_compression()`: ~520 bytes (**~20% smaller**)
358    ///
359    /// # Example
360    ///
361    /// ```no_run
362    /// use mozjpeg_rs::Encoder;
363    ///
364    /// // Match C mozjpeg's default compression
365    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
366    /// let jpeg = Encoder::max_compression()
367    ///     .quality(85)
368    ///     .encode_rgb(&pixels, 256, 256)
369    ///     .unwrap();
370    /// ```
371    pub fn max_compression() -> Self {
372        Self {
373            quality: 75,
374            progressive: true,
375            subsampling: Subsampling::S420,
376            quant_table_idx: QuantTableIdx::ImageMagick,
377            custom_luma_qtable: None,
378            custom_chroma_qtable: None,
379            trellis: TrellisConfig::default(),
380            force_baseline: false,
381            optimize_huffman: true,
382            overshoot_deringing: true,
383            optimize_scans: true,
384            restart_interval: 0,
385            pixel_density: PixelDensity::default(),
386            exif_data: None,
387            icc_profile: None,
388            custom_markers: Vec::new(),
389            simd: SimdOps::detect(),
390            smoothing: 0,
391            limits: Limits::none(),
392        }
393    }
394
395    /// Create encoder with progressive mode and all optimizations except optimize_scans.
396    ///
397    /// This is the **recommended default** for most use cases. It provides:
398    /// - Progressive rendering (blurry-to-sharp loading)
399    /// - All mozjpeg optimizations (trellis, Huffman, deringing)
400    /// - Good balance between file size and encoding speed
401    ///
402    /// # Settings
403    ///
404    /// | Setting | Value | Notes |
405    /// |---------|-------|-------|
406    /// | progressive | **true** | Multi-scan progressive JPEG |
407    /// | optimize_scans | **false** | Uses fixed 9-scan config |
408    /// | trellis | enabled | AC + DC trellis quantization |
409    /// | optimize_huffman | true | 2-pass for optimal tables |
410    /// | overshoot_deringing | true | Reduces ringing on hard edges |
411    ///
412    /// # vs `max_compression()`
413    ///
414    /// This preset omits `optimize_scans` which:
415    /// - Saves ~100% encoding time (9ms vs 21ms at 512×512)
416    /// - Loses only ~1% file size reduction
417    ///
418    /// Use `max_compression()` only when file size is critical.
419    ///
420    /// # Example
421    ///
422    /// ```no_run
423    /// use mozjpeg_rs::Encoder;
424    ///
425    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
426    /// let jpeg = Encoder::progressive_balanced()
427    ///     .quality(85)
428    ///     .encode_rgb(&pixels, 256, 256)
429    ///     .unwrap();
430    /// ```
431    pub fn progressive_balanced() -> Self {
432        Self {
433            quality: 75,
434            progressive: true,
435            subsampling: Subsampling::S420,
436            quant_table_idx: QuantTableIdx::ImageMagick,
437            custom_luma_qtable: None,
438            custom_chroma_qtable: None,
439            trellis: TrellisConfig::default(),
440            force_baseline: false,
441            optimize_huffman: true,
442            overshoot_deringing: true,
443            optimize_scans: false, // Key difference from max_compression()
444            restart_interval: 0,
445            pixel_density: PixelDensity::default(),
446            exif_data: None,
447            icc_profile: None,
448            custom_markers: Vec::new(),
449            simd: SimdOps::detect(),
450            smoothing: 0,
451            limits: Limits::none(),
452        }
453    }
454
455    /// Create encoder with fastest settings (libjpeg-turbo compatible).
456    ///
457    /// Disables all mozjpeg-specific optimizations for maximum encoding speed.
458    /// Output is compatible with standard libjpeg/libjpeg-turbo.
459    ///
460    /// # Settings (differences from `new()` in **bold**)
461    ///
462    /// | Setting | Value | Notes |
463    /// |---------|-------|-------|
464    /// | quality | 75 | Same as `new()` |
465    /// | progressive | false | Same as `new()` |
466    /// | trellis | **disabled** | No trellis quantization |
467    /// | optimize_huffman | **false** | Uses default Huffman tables |
468    /// | overshoot_deringing | **false** | No deringing filter |
469    /// | force_baseline | **true** | 8-bit DQT only |
470    ///
471    /// # Performance
472    ///
473    /// Encoding is ~4-10x faster than `new()`, but files are ~10-20% larger.
474    ///
475    /// # Example
476    ///
477    /// ```no_run
478    /// use mozjpeg_rs::Encoder;
479    ///
480    /// // Fast encoding for real-time applications
481    /// let pixels: Vec<u8> = vec![128; 256 * 256 * 3];
482    /// let jpeg = Encoder::fastest()
483    ///     .quality(80)
484    ///     .encode_rgb(&pixels, 256, 256)
485    ///     .unwrap();
486    /// ```
487    pub fn fastest() -> Self {
488        Self {
489            quality: 75,
490            progressive: false,
491            subsampling: Subsampling::S420,
492            quant_table_idx: QuantTableIdx::ImageMagick,
493            custom_luma_qtable: None,
494            custom_chroma_qtable: None,
495            trellis: TrellisConfig::disabled(),
496            force_baseline: true,
497            optimize_huffman: false,
498            overshoot_deringing: false,
499            optimize_scans: false,
500            restart_interval: 0,
501            pixel_density: PixelDensity::default(),
502            exif_data: None,
503            icc_profile: None,
504            custom_markers: Vec::new(),
505            simd: SimdOps::detect(),
506            smoothing: 0,
507            limits: Limits::none(),
508        }
509    }
510
511    /// Set quality level (1-100).
512    ///
513    /// Higher values produce larger, higher-quality images.
514    pub fn quality(mut self, quality: u8) -> Self {
515        self.quality = quality.clamp(1, 100);
516        self
517    }
518
519    /// Enable or disable progressive mode.
520    pub fn progressive(mut self, enable: bool) -> Self {
521        self.progressive = enable;
522        self
523    }
524
525    /// Set chroma subsampling mode.
526    pub fn subsampling(mut self, mode: Subsampling) -> Self {
527        self.subsampling = mode;
528        self
529    }
530
531    /// Set quantization table variant.
532    pub fn quant_tables(mut self, idx: QuantTableIdx) -> Self {
533        self.quant_table_idx = idx;
534        self
535    }
536
537    /// Configure trellis quantization.
538    pub fn trellis(mut self, config: TrellisConfig) -> Self {
539        self.trellis = config;
540        self
541    }
542
543    /// Force baseline-compatible output.
544    pub fn force_baseline(mut self, enable: bool) -> Self {
545        self.force_baseline = enable;
546        self
547    }
548
549    /// Enable Huffman table optimization.
550    pub fn optimize_huffman(mut self, enable: bool) -> Self {
551        self.optimize_huffman = enable;
552        self
553    }
554
555    /// Enable overshoot deringing.
556    ///
557    /// Reduces visible ringing artifacts near hard edges, especially on white
558    /// backgrounds. Works by allowing encoded values to "overshoot" above 255
559    /// (which will clamp back to 255 when decoded) to create smoother waveforms.
560    ///
561    /// This is a mozjpeg-specific feature that can improve visual quality at
562    /// minimal file size cost. Enabled by default.
563    pub fn overshoot_deringing(mut self, enable: bool) -> Self {
564        self.overshoot_deringing = enable;
565        self
566    }
567
568    /// Enable or disable scan optimization for progressive mode.
569    ///
570    /// When enabled, the encoder tries multiple scan configurations and
571    /// picks the one that produces the smallest output. This can improve
572    /// compression by 1-3% but increases encoding time.
573    ///
574    /// Only has effect when progressive mode is enabled.
575    pub fn optimize_scans(mut self, enable: bool) -> Self {
576        self.optimize_scans = enable;
577        self
578    }
579
580    /// Set input smoothing factor (0-100).
581    ///
582    /// Applies a weighted average filter to reduce fine-scale noise in the
583    /// input image before encoding. This is particularly useful for converting
584    /// dithered images (like GIFs) to JPEG.
585    ///
586    /// - 0 = disabled (default)
587    /// - 10-50 = recommended for dithered images
588    /// - Higher values = more smoothing (may blur the image)
589    ///
590    /// # Example
591    /// ```
592    /// use mozjpeg_rs::Encoder;
593    ///
594    /// // Convert a dithered GIF to JPEG with smoothing
595    /// let encoder = Encoder::baseline_optimized()
596    ///     .quality(85)
597    ///     .smoothing(30);
598    /// ```
599    pub fn smoothing(mut self, factor: u8) -> Self {
600        self.smoothing = factor.min(100);
601        self
602    }
603
604    /// Set restart interval in MCUs.
605    ///
606    /// Restart markers are inserted every N MCUs, which can help with
607    /// error recovery and parallel decoding. Set to 0 to disable (default).
608    ///
609    /// Common values: 0 (disabled), or image width in MCUs for row-by-row restarts.
610    pub fn restart_interval(mut self, interval: u16) -> Self {
611        self.restart_interval = interval;
612        self
613    }
614
615    /// Set EXIF data to embed in the JPEG.
616    ///
617    /// # Arguments
618    /// * `data` - Raw EXIF data (TIFF structure). The "Exif\0\0" header
619    ///   will be added automatically.
620    ///
621    /// Pass empty or call without this method to omit EXIF data.
622    pub fn exif_data(mut self, data: Vec<u8>) -> Self {
623        self.exif_data = if data.is_empty() { None } else { Some(data) };
624        self
625    }
626
627    /// Set pixel density for the JFIF APP0 marker.
628    ///
629    /// This specifies the physical pixel density (DPI/DPC) or aspect ratio.
630    /// Note that most software ignores JFIF density in favor of EXIF metadata.
631    ///
632    /// # Example
633    /// ```
634    /// use mozjpeg_rs::{Encoder, PixelDensity};
635    ///
636    /// let encoder = Encoder::baseline_optimized()
637    ///     .pixel_density(PixelDensity::dpi(300, 300)); // 300 DPI
638    /// ```
639    pub fn pixel_density(mut self, density: PixelDensity) -> Self {
640        self.pixel_density = density;
641        self
642    }
643
644    /// Set ICC color profile to embed.
645    ///
646    /// The profile will be embedded in APP2 markers with the standard
647    /// "ICC_PROFILE" identifier. Large profiles are automatically chunked.
648    ///
649    /// # Arguments
650    /// * `profile` - Raw ICC profile data
651    pub fn icc_profile(mut self, profile: Vec<u8>) -> Self {
652        self.icc_profile = if profile.is_empty() {
653            None
654        } else {
655            Some(profile)
656        };
657        self
658    }
659
660    /// Add a custom APP marker.
661    ///
662    /// # Arguments
663    /// * `app_num` - APP marker number (0-15, e.g., 1 for EXIF, 2 for ICC)
664    /// * `data` - Raw marker data (including any identifier prefix)
665    ///
666    /// Multiple markers with the same number are allowed.
667    /// Markers are written in the order they are added.
668    pub fn add_marker(mut self, app_num: u8, data: Vec<u8>) -> Self {
669        if app_num <= 15 && !data.is_empty() {
670            self.custom_markers.push((app_num, data));
671        }
672        self
673    }
674
675    /// Set custom luminance quantization table.
676    ///
677    /// This overrides the table selected by `quant_tables()`.
678    /// Values should be in natural (row-major) order, not zigzag.
679    ///
680    /// # Arguments
681    /// * `table` - 64 quantization values (quality scaling still applies)
682    pub fn custom_luma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
683        self.custom_luma_qtable = Some(table);
684        self
685    }
686
687    /// Set custom chrominance quantization table.
688    ///
689    /// This overrides the table selected by `quant_tables()`.
690    /// Values should be in natural (row-major) order, not zigzag.
691    ///
692    /// # Arguments
693    /// * `table` - 64 quantization values (quality scaling still applies)
694    pub fn custom_chroma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
695        self.custom_chroma_qtable = Some(table);
696        self
697    }
698
699    // =========================================================================
700    // Resource Limits
701    // =========================================================================
702
703    /// Set resource limits for the encoder.
704    ///
705    /// Limits can restrict:
706    /// - Maximum image width and height
707    /// - Maximum pixel count (width × height)
708    /// - Maximum estimated memory allocation
709    /// - Maximum ICC profile size
710    ///
711    /// # Example
712    /// ```
713    /// use mozjpeg_rs::{Encoder, Preset, Limits};
714    ///
715    /// let limits = Limits::default()
716    ///     .max_width(4096)
717    ///     .max_height(4096)
718    ///     .max_pixel_count(16_000_000)
719    ///     .max_alloc_bytes(100 * 1024 * 1024);
720    ///
721    /// let encoder = Encoder::new(Preset::default())
722    ///     .limits(limits);
723    /// ```
724    pub fn limits(mut self, limits: Limits) -> Self {
725        self.limits = limits;
726        self
727    }
728
729    /// Check all resource limits before encoding.
730    ///
731    /// # Arguments
732    /// * `width` - Image width
733    /// * `height` - Image height
734    /// * `is_gray` - True for grayscale images (affects memory estimate)
735    fn check_limits(&self, width: u32, height: u32, is_gray: bool) -> Result<()> {
736        let limits = &self.limits;
737
738        // Check dimension limits
739        if (limits.max_width > 0 && width > limits.max_width)
740            || (limits.max_height > 0 && height > limits.max_height)
741        {
742            return Err(Error::DimensionLimitExceeded {
743                width,
744                height,
745                max_width: limits.max_width,
746                max_height: limits.max_height,
747            });
748        }
749
750        // Check pixel count limit
751        if limits.max_pixel_count > 0 {
752            let pixel_count = width as u64 * height as u64;
753            if pixel_count > limits.max_pixel_count {
754                return Err(Error::PixelCountExceeded {
755                    pixel_count,
756                    limit: limits.max_pixel_count,
757                });
758            }
759        }
760
761        // Check allocation limit
762        if limits.max_alloc_bytes > 0 {
763            let estimate = if is_gray {
764                self.estimate_resources_gray(width, height)
765            } else {
766                self.estimate_resources(width, height)
767            };
768            if estimate.peak_memory_bytes > limits.max_alloc_bytes {
769                return Err(Error::AllocationLimitExceeded {
770                    estimated: estimate.peak_memory_bytes,
771                    limit: limits.max_alloc_bytes,
772                });
773            }
774        }
775
776        // Check ICC profile size limit
777        if limits.max_icc_profile_bytes > 0 {
778            if let Some(ref icc) = self.icc_profile {
779                if icc.len() > limits.max_icc_profile_bytes {
780                    return Err(Error::IccProfileTooLarge {
781                        size: icc.len(),
782                        limit: limits.max_icc_profile_bytes,
783                    });
784                }
785            }
786        }
787
788        Ok(())
789    }
790
791    // =========================================================================
792    // Aliases for rimage/CLI-style naming
793    // =========================================================================
794
795    /// Set baseline mode (opposite of progressive).
796    ///
797    /// When `true`, produces a sequential JPEG (non-progressive).
798    /// This is equivalent to `progressive(false)`.
799    ///
800    /// # Example
801    /// ```
802    /// use mozjpeg_rs::Encoder;
803    ///
804    /// // These are equivalent:
805    /// let enc1 = Encoder::baseline_optimized().baseline(true);
806    /// let enc2 = Encoder::baseline_optimized().progressive(false);
807    /// ```
808    #[inline]
809    pub fn baseline(self, enable: bool) -> Self {
810        self.progressive(!enable)
811    }
812
813    /// Enable or disable Huffman coding optimization.
814    ///
815    /// Alias for [`optimize_huffman()`](Self::optimize_huffman).
816    /// This name matches mozjpeg's CLI flag naming.
817    #[inline]
818    pub fn optimize_coding(self, enable: bool) -> Self {
819        self.optimize_huffman(enable)
820    }
821
822    /// Set chroma subsampling mode.
823    ///
824    /// Alias for [`subsampling()`](Self::subsampling).
825    #[inline]
826    pub fn chroma_subsampling(self, mode: Subsampling) -> Self {
827        self.subsampling(mode)
828    }
829
830    /// Set quantization table variant.
831    ///
832    /// Alias for [`quant_tables()`](Self::quant_tables).
833    #[inline]
834    pub fn qtable(self, idx: QuantTableIdx) -> Self {
835        self.quant_tables(idx)
836    }
837
838    // =========================================================================
839    // Resource Estimation
840    // =========================================================================
841
842    /// Estimate resource usage for encoding an RGB image of the given dimensions.
843    ///
844    /// Returns peak memory usage (in bytes) and a relative CPU cost multiplier.
845    /// Useful for scheduling, enforcing resource limits, or providing feedback.
846    ///
847    /// # Arguments
848    /// * `width` - Image width in pixels
849    /// * `height` - Image height in pixels
850    ///
851    /// # Example
852    ///
853    /// ```
854    /// use mozjpeg_rs::{Encoder, Preset};
855    ///
856    /// let encoder = Encoder::new(Preset::ProgressiveBalanced).quality(85);
857    /// let estimate = encoder.estimate_resources(1920, 1080);
858    ///
859    /// println!("Peak memory: {} MB", estimate.peak_memory_bytes / 1_000_000);
860    /// println!("Relative CPU cost: {:.1}x", estimate.cpu_cost_multiplier);
861    /// ```
862    pub fn estimate_resources(&self, width: u32, height: u32) -> crate::types::ResourceEstimate {
863        let width = width as usize;
864        let height = height as usize;
865        let pixels = width * height;
866
867        // Calculate chroma dimensions based on subsampling
868        let (h_samp, v_samp) = self.subsampling.luma_factors();
869        let chroma_width = (width + h_samp as usize - 1) / h_samp as usize;
870        let chroma_height = (height + v_samp as usize - 1) / v_samp as usize;
871        let chroma_pixels = chroma_width * chroma_height;
872
873        // MCU-aligned dimensions
874        let mcu_h = 8 * h_samp as usize;
875        let mcu_v = 8 * v_samp as usize;
876        let mcu_width = (width + mcu_h - 1) / mcu_h * mcu_h;
877        let mcu_height = (height + mcu_v - 1) / mcu_v * mcu_v;
878
879        // Block counts
880        let y_blocks = (mcu_width / 8) * (mcu_height / 8);
881        let chroma_block_w = (chroma_width + 7) / 8;
882        let chroma_block_h = (chroma_height + 7) / 8;
883        let chroma_blocks = chroma_block_w * chroma_block_h;
884        let total_blocks = y_blocks + 2 * chroma_blocks;
885
886        // --- Memory estimation ---
887        let mut memory: usize = 0;
888
889        // Color conversion buffers (Y, Cb, Cr planes)
890        memory += 3 * pixels;
891
892        // Chroma subsampled buffers
893        memory += 2 * chroma_pixels;
894
895        // MCU-padded buffers
896        memory += mcu_width * mcu_height; // Y
897        let mcu_chroma_w = (chroma_width + 7) / 8 * 8;
898        let mcu_chroma_h = (chroma_height + 7) / 8 * 8;
899        memory += 2 * mcu_chroma_w * mcu_chroma_h; // Cb, Cr
900
901        // Block storage (needed for progressive or optimize_huffman)
902        let needs_block_storage = self.progressive || self.optimize_huffman;
903        if needs_block_storage {
904            // i16[64] per block = 128 bytes
905            memory += total_blocks * 128;
906        }
907
908        // Raw DCT storage (needed for DC trellis)
909        if self.trellis.dc_enabled {
910            // i32[64] per block = 256 bytes
911            memory += total_blocks * 256;
912        }
913
914        // Output buffer estimate (varies by quality, ~0.3-1.0x input for typical images)
915        // Use a conservative estimate based on quality
916        let output_ratio = if self.quality >= 95 {
917            0.8
918        } else if self.quality >= 85 {
919            0.5
920        } else if self.quality >= 75 {
921            0.3
922        } else {
923            0.2
924        };
925        memory += (pixels as f64 * 3.0 * output_ratio) as usize;
926
927        // --- CPU cost estimation ---
928        // Reference: BaselineFastest Q75 = 1.0
929        let mut cpu_cost = 1.0;
930
931        // Trellis AC quantization is the biggest CPU factor
932        if self.trellis.enabled {
933            cpu_cost += 3.5;
934        }
935
936        // DC trellis adds extra work
937        if self.trellis.dc_enabled {
938            cpu_cost += 0.5;
939        }
940
941        // Huffman optimization (frequency counting pass)
942        if self.optimize_huffman {
943            cpu_cost += 0.3;
944        }
945
946        // Progressive mode (multiple scan encoding)
947        if self.progressive {
948            cpu_cost += 1.5;
949        }
950
951        // optimize_scans (trial encoding many scan configurations)
952        if self.optimize_scans {
953            cpu_cost += 3.0;
954        }
955
956        // High quality increases trellis work (more candidates to evaluate)
957        // This matters most when trellis is enabled
958        if self.trellis.enabled && self.quality >= 85 {
959            let quality_factor = 1.0 + (self.quality as f64 - 85.0) / 30.0;
960            cpu_cost *= quality_factor;
961        }
962
963        crate::types::ResourceEstimate {
964            peak_memory_bytes: memory,
965            cpu_cost_multiplier: cpu_cost,
966            block_count: total_blocks,
967        }
968    }
969
970    /// Estimate resource usage for encoding a grayscale image.
971    ///
972    /// Similar to [`estimate_resources`](Self::estimate_resources) but for single-channel images.
973    pub fn estimate_resources_gray(
974        &self,
975        width: u32,
976        height: u32,
977    ) -> crate::types::ResourceEstimate {
978        let width = width as usize;
979        let height = height as usize;
980        let pixels = width * height;
981
982        // MCU-aligned dimensions (always 8x8 for grayscale)
983        let mcu_width = (width + 7) / 8 * 8;
984        let mcu_height = (height + 7) / 8 * 8;
985
986        // Block count
987        let blocks = (mcu_width / 8) * (mcu_height / 8);
988
989        // --- Memory estimation ---
990        let mut memory: usize = 0;
991
992        // MCU-padded buffer
993        memory += mcu_width * mcu_height;
994
995        // Block storage (needed for progressive or optimize_huffman)
996        let needs_block_storage = self.progressive || self.optimize_huffman;
997        if needs_block_storage {
998            memory += blocks * 128;
999        }
1000
1001        // Raw DCT storage (needed for DC trellis)
1002        if self.trellis.dc_enabled {
1003            memory += blocks * 256;
1004        }
1005
1006        // Output buffer estimate
1007        let output_ratio = if self.quality >= 95 {
1008            0.8
1009        } else if self.quality >= 85 {
1010            0.5
1011        } else if self.quality >= 75 {
1012            0.3
1013        } else {
1014            0.2
1015        };
1016        memory += (pixels as f64 * output_ratio) as usize;
1017
1018        // --- CPU cost (same formula, but less work due to single channel) ---
1019        let mut cpu_cost = 1.0;
1020
1021        if self.trellis.enabled {
1022            cpu_cost += 3.5;
1023        }
1024        if self.trellis.dc_enabled {
1025            cpu_cost += 0.5;
1026        }
1027        if self.optimize_huffman {
1028            cpu_cost += 0.3;
1029        }
1030        if self.progressive {
1031            cpu_cost += 1.0; // Less for grayscale (fewer scans)
1032        }
1033        if self.optimize_scans {
1034            cpu_cost += 2.0; // Less for grayscale
1035        }
1036        if self.trellis.enabled && self.quality >= 85 {
1037            let quality_factor = 1.0 + (self.quality as f64 - 85.0) / 30.0;
1038            cpu_cost *= quality_factor;
1039        }
1040
1041        // Grayscale is ~1/3 the work of RGB (single channel)
1042        cpu_cost /= 3.0;
1043
1044        crate::types::ResourceEstimate {
1045            peak_memory_bytes: memory,
1046            cpu_cost_multiplier: cpu_cost,
1047            block_count: blocks,
1048        }
1049    }
1050
1051    // =========================================================================
1052    // Encoding
1053    // =========================================================================
1054
1055    /// Encode RGB image data to JPEG.
1056    ///
1057    /// # Arguments
1058    /// * `rgb_data` - RGB pixel data (3 bytes per pixel, row-major)
1059    /// * `width` - Image width in pixels
1060    /// * `height` - Image height in pixels
1061    ///
1062    /// # Returns
1063    /// JPEG-encoded data as a `Vec<u8>`.
1064    pub fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
1065        // Validate dimensions: must be non-zero
1066        if width == 0 || height == 0 {
1067            return Err(Error::InvalidDimensions { width, height });
1068        }
1069
1070        // Check all resource limits
1071        self.check_limits(width, height, false)?;
1072
1073        // Use checked arithmetic to prevent overflow
1074        let expected_len = (width as usize)
1075            .checked_mul(height as usize)
1076            .and_then(|n| n.checked_mul(3))
1077            .ok_or(Error::InvalidDimensions { width, height })?;
1078
1079        if rgb_data.len() != expected_len {
1080            return Err(Error::BufferSizeMismatch {
1081                expected: expected_len,
1082                actual: rgb_data.len(),
1083            });
1084        }
1085
1086        // Apply smoothing if enabled
1087        let rgb_data = if self.smoothing > 0 {
1088            std::borrow::Cow::Owned(crate::smooth::smooth_rgb(
1089                rgb_data,
1090                width,
1091                height,
1092                self.smoothing,
1093            ))
1094        } else {
1095            std::borrow::Cow::Borrowed(rgb_data)
1096        };
1097
1098        let mut output = Vec::new();
1099        self.encode_rgb_to_writer(&rgb_data, width, height, &mut output)?;
1100        Ok(output)
1101    }
1102
1103    /// Encode grayscale image data to JPEG.
1104    ///
1105    /// # Arguments
1106    /// * `gray_data` - Grayscale pixel data (1 byte per pixel, row-major)
1107    /// * `width` - Image width in pixels
1108    /// * `height` - Image height in pixels
1109    ///
1110    /// # Returns
1111    /// JPEG-encoded data as a `Vec<u8>`.
1112    pub fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
1113        // Validate dimensions: must be non-zero
1114        if width == 0 || height == 0 {
1115            return Err(Error::InvalidDimensions { width, height });
1116        }
1117
1118        // Check all resource limits
1119        self.check_limits(width, height, true)?;
1120
1121        // Use checked arithmetic to prevent overflow
1122        let expected_len = (width as usize)
1123            .checked_mul(height as usize)
1124            .ok_or(Error::InvalidDimensions { width, height })?;
1125
1126        if gray_data.len() != expected_len {
1127            return Err(Error::BufferSizeMismatch {
1128                expected: expected_len,
1129                actual: gray_data.len(),
1130            });
1131        }
1132
1133        // Apply smoothing if enabled
1134        let gray_data = if self.smoothing > 0 {
1135            std::borrow::Cow::Owned(crate::smooth::smooth_grayscale(
1136                gray_data,
1137                width,
1138                height,
1139                self.smoothing,
1140            ))
1141        } else {
1142            std::borrow::Cow::Borrowed(gray_data)
1143        };
1144
1145        let mut output = Vec::new();
1146        self.encode_gray_to_writer(&gray_data, width, height, &mut output)?;
1147        Ok(output)
1148    }
1149
1150    /// Encode RGB image data to JPEG with cancellation and timeout support.
1151    ///
1152    /// This method allows encoding to be cancelled mid-operation via an atomic flag,
1153    /// or to automatically abort if a timeout is exceeded.
1154    ///
1155    /// # Arguments
1156    /// * `rgb_data` - RGB pixel data (3 bytes per pixel, row-major)
1157    /// * `width` - Image width in pixels
1158    /// * `height` - Image height in pixels
1159    /// * `cancel` - Optional cancellation flag. Set to `true` to abort encoding.
1160    /// * `timeout` - Optional maximum encoding duration.
1161    ///
1162    /// # Returns
1163    /// * `Ok(Vec<u8>)` - JPEG-encoded data
1164    /// * `Err(Error::Cancelled)` - If cancelled via the flag
1165    /// * `Err(Error::TimedOut)` - If the timeout was exceeded
1166    ///
1167    /// # Example
1168    /// ```no_run
1169    /// use mozjpeg_rs::{Encoder, Preset};
1170    /// use std::sync::atomic::AtomicBool;
1171    /// use std::time::Duration;
1172    ///
1173    /// let encoder = Encoder::new(Preset::ProgressiveBalanced);
1174    /// let pixels: Vec<u8> = vec![128; 1920 * 1080 * 3];
1175    /// let cancel = AtomicBool::new(false);
1176    ///
1177    /// // Encode with 5 second timeout
1178    /// let result = encoder.encode_rgb_cancellable(
1179    ///     &pixels, 1920, 1080,
1180    ///     Some(&cancel),
1181    ///     Some(Duration::from_secs(5)),
1182    /// );
1183    /// ```
1184    pub fn encode_rgb_cancellable(
1185        &self,
1186        rgb_data: &[u8],
1187        width: u32,
1188        height: u32,
1189        cancel: Option<&AtomicBool>,
1190        timeout: Option<Duration>,
1191    ) -> Result<Vec<u8>> {
1192        // Validate dimensions
1193        if width == 0 || height == 0 {
1194            return Err(Error::InvalidDimensions { width, height });
1195        }
1196
1197        // Check all resource limits
1198        self.check_limits(width, height, false)?;
1199
1200        // Check buffer size
1201        let expected_len = (width as usize)
1202            .checked_mul(height as usize)
1203            .and_then(|n| n.checked_mul(3))
1204            .ok_or(Error::InvalidDimensions { width, height })?;
1205
1206        if rgb_data.len() != expected_len {
1207            return Err(Error::BufferSizeMismatch {
1208                expected: expected_len,
1209                actual: rgb_data.len(),
1210            });
1211        }
1212
1213        // Create cancellation context
1214        let ctx = CancellationContext::new(cancel, timeout);
1215
1216        // Check for immediate cancellation
1217        ctx.check()?;
1218
1219        // Apply smoothing if enabled
1220        let rgb_data = if self.smoothing > 0 {
1221            std::borrow::Cow::Owned(crate::smooth::smooth_rgb(
1222                rgb_data,
1223                width,
1224                height,
1225                self.smoothing,
1226            ))
1227        } else {
1228            std::borrow::Cow::Borrowed(rgb_data)
1229        };
1230
1231        let mut output = Vec::new();
1232        // For now, use the regular encoder (cancellation hooks can be added to
1233        // internal functions in a follow-up). Check cancellation before and after.
1234        ctx.check()?;
1235        self.encode_rgb_to_writer(&rgb_data, width, height, &mut output)?;
1236        ctx.check()?;
1237
1238        Ok(output)
1239    }
1240
1241    /// Encode grayscale image data to JPEG with cancellation and timeout support.
1242    ///
1243    /// This method allows encoding to be cancelled mid-operation via an atomic flag,
1244    /// or to automatically abort if a timeout is exceeded.
1245    ///
1246    /// # Arguments
1247    /// * `gray_data` - Grayscale pixel data (1 byte per pixel, row-major)
1248    /// * `width` - Image width in pixels
1249    /// * `height` - Image height in pixels
1250    /// * `cancel` - Optional cancellation flag. Set to `true` to abort encoding.
1251    /// * `timeout` - Optional maximum encoding duration.
1252    ///
1253    /// # Returns
1254    /// * `Ok(Vec<u8>)` - JPEG-encoded data
1255    /// * `Err(Error::Cancelled)` - If cancelled via the flag
1256    /// * `Err(Error::TimedOut)` - If the timeout was exceeded
1257    pub fn encode_gray_cancellable(
1258        &self,
1259        gray_data: &[u8],
1260        width: u32,
1261        height: u32,
1262        cancel: Option<&AtomicBool>,
1263        timeout: Option<Duration>,
1264    ) -> Result<Vec<u8>> {
1265        // Validate dimensions
1266        if width == 0 || height == 0 {
1267            return Err(Error::InvalidDimensions { width, height });
1268        }
1269
1270        // Check all resource limits
1271        self.check_limits(width, height, true)?;
1272
1273        // Check buffer size
1274        let expected_len = (width as usize)
1275            .checked_mul(height as usize)
1276            .ok_or(Error::InvalidDimensions { width, height })?;
1277
1278        if gray_data.len() != expected_len {
1279            return Err(Error::BufferSizeMismatch {
1280                expected: expected_len,
1281                actual: gray_data.len(),
1282            });
1283        }
1284
1285        // Create cancellation context
1286        let ctx = CancellationContext::new(cancel, timeout);
1287
1288        // Check for immediate cancellation
1289        ctx.check()?;
1290
1291        // Apply smoothing if enabled
1292        let gray_data = if self.smoothing > 0 {
1293            std::borrow::Cow::Owned(crate::smooth::smooth_grayscale(
1294                gray_data,
1295                width,
1296                height,
1297                self.smoothing,
1298            ))
1299        } else {
1300            std::borrow::Cow::Borrowed(gray_data)
1301        };
1302
1303        let mut output = Vec::new();
1304        // For now, use the regular encoder (cancellation hooks can be added to
1305        // internal functions in a follow-up). Check cancellation before and after.
1306        ctx.check()?;
1307        self.encode_gray_to_writer(&gray_data, width, height, &mut output)?;
1308        ctx.check()?;
1309
1310        Ok(output)
1311    }
1312
1313    /// Encode grayscale image data to a writer.
1314    pub fn encode_gray_to_writer<W: Write>(
1315        &self,
1316        gray_data: &[u8],
1317        width: u32,
1318        height: u32,
1319        output: W,
1320    ) -> Result<()> {
1321        let width = width as usize;
1322        let height = height as usize;
1323
1324        // For grayscale, Y plane is the input directly (no conversion needed)
1325        let y_plane = gray_data;
1326
1327        // Grayscale uses 1x1 sampling
1328        let (mcu_width, mcu_height) = sample::mcu_aligned_dimensions(width, height, 1, 1);
1329
1330        let mcu_y_size = mcu_width
1331            .checked_mul(mcu_height)
1332            .ok_or(Error::AllocationFailed)?;
1333        let mut y_mcu = try_alloc_vec(0u8, mcu_y_size)?;
1334        sample::expand_to_mcu(y_plane, width, height, &mut y_mcu, mcu_width, mcu_height);
1335
1336        // Create quantization table (only luma needed)
1337        let luma_qtable = if let Some(ref custom) = self.custom_luma_qtable {
1338            crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
1339        } else {
1340            let (luma, _) =
1341                create_quant_tables(self.quality, self.quant_table_idx, self.force_baseline);
1342            luma
1343        };
1344
1345        // Create Huffman tables (only luma needed)
1346        let dc_luma_huff = create_std_dc_luma_table();
1347        let ac_luma_huff = create_std_ac_luma_table();
1348        let dc_luma_derived = DerivedTable::from_huff_table(&dc_luma_huff, true)?;
1349        let ac_luma_derived = DerivedTable::from_huff_table(&ac_luma_huff, false)?;
1350
1351        // Single component for grayscale
1352        let components = create_components(Subsampling::Gray);
1353
1354        // Write JPEG file
1355        let mut marker_writer = MarkerWriter::new(output);
1356
1357        // SOI
1358        marker_writer.write_soi()?;
1359
1360        // APP0 (JFIF) with pixel density
1361        marker_writer.write_jfif_app0(
1362            self.pixel_density.unit as u8,
1363            self.pixel_density.x,
1364            self.pixel_density.y,
1365        )?;
1366
1367        // EXIF (if present)
1368        if let Some(ref exif) = self.exif_data {
1369            marker_writer.write_app1_exif(exif)?;
1370        }
1371
1372        // ICC profile (if present)
1373        if let Some(ref icc) = self.icc_profile {
1374            marker_writer.write_icc_profile(icc)?;
1375        }
1376
1377        // Custom APP markers
1378        for (app_num, data) in &self.custom_markers {
1379            marker_writer.write_app(*app_num, data)?;
1380        }
1381
1382        // DQT (only luma table for grayscale)
1383        let luma_qtable_zz = natural_to_zigzag(&luma_qtable.values);
1384        marker_writer.write_dqt(0, &luma_qtable_zz, false)?;
1385
1386        // SOF (baseline or progressive)
1387        marker_writer.write_sof(
1388            self.progressive,
1389            8,
1390            height as u16,
1391            width as u16,
1392            &components,
1393        )?;
1394
1395        // DRI (restart interval)
1396        if self.restart_interval > 0 {
1397            marker_writer.write_dri(self.restart_interval)?;
1398        }
1399
1400        // DHT (only luma tables for grayscale) - written later for progressive
1401        if !self.progressive && !self.optimize_huffman {
1402            marker_writer
1403                .write_dht_multiple(&[(0, false, &dc_luma_huff), (0, true, &ac_luma_huff)])?;
1404        }
1405
1406        let mcu_rows = mcu_height / DCTSIZE;
1407        let mcu_cols = mcu_width / DCTSIZE;
1408        let num_blocks = mcu_rows
1409            .checked_mul(mcu_cols)
1410            .ok_or(Error::AllocationFailed)?;
1411
1412        if self.progressive {
1413            // Progressive mode: collect all blocks, then encode multiple scans
1414            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_blocks)?;
1415            let mut dct_block = [0i16; DCTSIZE2];
1416
1417            // Optionally collect raw DCT for DC trellis
1418            let dc_trellis_enabled = self.trellis.enabled && self.trellis.dc_enabled;
1419            let mut y_raw_dct = if dc_trellis_enabled {
1420                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_blocks)?)
1421            } else {
1422                None
1423            };
1424
1425            // Collect all blocks
1426            for mcu_row in 0..mcu_rows {
1427                for mcu_col in 0..mcu_cols {
1428                    let block_idx = mcu_row * mcu_cols + mcu_col;
1429                    self.process_block_to_storage_with_raw(
1430                        &y_mcu,
1431                        mcu_width,
1432                        mcu_row,
1433                        mcu_col,
1434                        &luma_qtable.values,
1435                        &ac_luma_derived,
1436                        &mut y_blocks[block_idx],
1437                        &mut dct_block,
1438                        y_raw_dct.as_mut().map(|v| v[block_idx].as_mut_slice()),
1439                    )?;
1440                }
1441            }
1442
1443            // Run DC trellis optimization if enabled
1444            if dc_trellis_enabled {
1445                if let Some(ref y_raw) = y_raw_dct {
1446                    run_dc_trellis_by_row(
1447                        y_raw,
1448                        &mut y_blocks,
1449                        luma_qtable.values[0],
1450                        &dc_luma_derived,
1451                        self.trellis.lambda_log_scale1,
1452                        self.trellis.lambda_log_scale2,
1453                        mcu_rows,
1454                        mcu_cols,
1455                        mcu_cols,
1456                        1,
1457                        1,
1458                    );
1459                }
1460            }
1461
1462            // Run EOB optimization if enabled (cross-block EOBRUN optimization)
1463            if self.trellis.enabled && self.trellis.eob_opt {
1464                use crate::trellis::{estimate_block_eob_info, optimize_eob_runs};
1465
1466                // Estimate EOB info for each block
1467                let eob_info: Vec<_> = y_blocks
1468                    .iter()
1469                    .map(|block| estimate_block_eob_info(block, &ac_luma_derived, 1, 63))
1470                    .collect();
1471
1472                // Optimize EOB runs across all blocks
1473                optimize_eob_runs(&mut y_blocks, &eob_info, &ac_luma_derived, 1, 63);
1474            }
1475
1476            // Generate progressive scan script for grayscale (1 component)
1477            let scans = generate_mozjpeg_max_compression_scans(1);
1478
1479            // Build optimized Huffman tables
1480            let mut dc_freq = FrequencyCounter::new();
1481            let mut dc_counter = ProgressiveSymbolCounter::new();
1482            for scan in &scans {
1483                let is_dc_first_scan = scan.ss == 0 && scan.se == 0 && scan.ah == 0;
1484                if is_dc_first_scan {
1485                    // Count DC symbols using progressive counter
1486                    for block in &y_blocks {
1487                        dc_counter.count_dc_first(block, 0, scan.al, &mut dc_freq);
1488                    }
1489                }
1490            }
1491
1492            let opt_dc_huff = dc_freq.generate_table()?;
1493            let opt_dc_derived = DerivedTable::from_huff_table(&opt_dc_huff, true)?;
1494
1495            // Write DC Huffman table upfront
1496            marker_writer.write_dht_multiple(&[(0, false, &opt_dc_huff)])?;
1497
1498            // Encode each scan
1499            let output = marker_writer.into_inner();
1500            let mut bit_writer = BitWriter::new(output);
1501
1502            for scan in &scans {
1503                let is_dc_scan = scan.ss == 0 && scan.se == 0;
1504
1505                if is_dc_scan {
1506                    // DC scan
1507                    marker_writer = MarkerWriter::new(bit_writer.into_inner());
1508                    marker_writer.write_sos(scan, &components)?;
1509                    bit_writer = BitWriter::new(marker_writer.into_inner());
1510
1511                    let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
1512
1513                    if scan.ah == 0 {
1514                        // DC first scan
1515                        for block in &y_blocks {
1516                            prog_encoder.encode_dc_first(block, 0, &opt_dc_derived, scan.al)?;
1517                        }
1518                    } else {
1519                        // DC refinement scan
1520                        for block in &y_blocks {
1521                            prog_encoder.encode_dc_refine(block, scan.al)?;
1522                        }
1523                    }
1524
1525                    prog_encoder.finish_scan(None)?;
1526                } else {
1527                    // AC scan - generate per-scan Huffman table
1528                    let mut ac_freq = FrequencyCounter::new();
1529                    let mut ac_counter = ProgressiveSymbolCounter::new();
1530
1531                    for block in &y_blocks {
1532                        if scan.ah == 0 {
1533                            ac_counter.count_ac_first(
1534                                block,
1535                                scan.ss,
1536                                scan.se,
1537                                scan.al,
1538                                &mut ac_freq,
1539                            );
1540                        } else {
1541                            ac_counter.count_ac_refine(
1542                                block,
1543                                scan.ss,
1544                                scan.se,
1545                                scan.ah,
1546                                scan.al,
1547                                &mut ac_freq,
1548                            );
1549                        }
1550                    }
1551                    ac_counter.finish_scan(Some(&mut ac_freq));
1552
1553                    let opt_ac_huff = ac_freq.generate_table()?;
1554                    let opt_ac_derived = DerivedTable::from_huff_table(&opt_ac_huff, false)?;
1555
1556                    // Write AC Huffman table and SOS
1557                    marker_writer = MarkerWriter::new(bit_writer.into_inner());
1558                    marker_writer.write_dht_multiple(&[(0, true, &opt_ac_huff)])?;
1559                    marker_writer.write_sos(scan, &components)?;
1560                    bit_writer = BitWriter::new(marker_writer.into_inner());
1561
1562                    let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
1563
1564                    for block in &y_blocks {
1565                        if scan.ah == 0 {
1566                            prog_encoder.encode_ac_first(
1567                                block,
1568                                scan.ss,
1569                                scan.se,
1570                                scan.al,
1571                                &opt_ac_derived,
1572                            )?;
1573                        } else {
1574                            prog_encoder.encode_ac_refine(
1575                                block,
1576                                scan.ss,
1577                                scan.se,
1578                                scan.ah,
1579                                scan.al,
1580                                &opt_ac_derived,
1581                            )?;
1582                        }
1583                    }
1584
1585                    prog_encoder.finish_scan(Some(&opt_ac_derived))?;
1586                }
1587            }
1588
1589            let mut output = bit_writer.into_inner();
1590            output.write_all(&[0xFF, 0xD9])?; // EOI
1591        } else if self.optimize_huffman {
1592            // 2-pass: collect blocks, count frequencies, then encode
1593            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_blocks)?;
1594            let mut dct_block = [0i16; DCTSIZE2];
1595
1596            // Collect all blocks using the same process as RGB encoding
1597            for mcu_row in 0..mcu_rows {
1598                for mcu_col in 0..mcu_cols {
1599                    let block_idx = mcu_row * mcu_cols + mcu_col;
1600                    self.process_block_to_storage_with_raw(
1601                        &y_mcu,
1602                        mcu_width,
1603                        mcu_row,
1604                        mcu_col,
1605                        &luma_qtable.values,
1606                        &ac_luma_derived,
1607                        &mut y_blocks[block_idx],
1608                        &mut dct_block,
1609                        None, // No raw DCT storage needed for grayscale
1610                    )?;
1611                }
1612            }
1613
1614            // Count frequencies using SymbolCounter
1615            let mut dc_freq = FrequencyCounter::new();
1616            let mut ac_freq = FrequencyCounter::new();
1617            let mut counter = SymbolCounter::new();
1618            for block in &y_blocks {
1619                counter.count_block(block, 0, &mut dc_freq, &mut ac_freq);
1620            }
1621
1622            // Generate optimized tables
1623            let opt_dc_huff = dc_freq.generate_table()?;
1624            let opt_ac_huff = ac_freq.generate_table()?;
1625            let opt_dc_derived = DerivedTable::from_huff_table(&opt_dc_huff, true)?;
1626            let opt_ac_derived = DerivedTable::from_huff_table(&opt_ac_huff, false)?;
1627
1628            // Write optimized Huffman tables
1629            marker_writer
1630                .write_dht_multiple(&[(0, false, &opt_dc_huff), (0, true, &opt_ac_huff)])?;
1631
1632            // Write SOS and encode
1633            let scans = generate_baseline_scan(1);
1634            marker_writer.write_sos(&scans[0], &components)?;
1635
1636            let output = marker_writer.into_inner();
1637            let mut bit_writer = BitWriter::new(output);
1638            let mut encoder = EntropyEncoder::new(&mut bit_writer);
1639
1640            // Restart marker support for grayscale (each block = 1 MCU)
1641            let restart_interval = self.restart_interval as usize;
1642            let mut restart_num = 0u8;
1643
1644            for (mcu_count, block) in y_blocks.iter().enumerate() {
1645                // Emit restart marker if needed
1646                if restart_interval > 0
1647                    && mcu_count > 0
1648                    && mcu_count.is_multiple_of(restart_interval)
1649                {
1650                    encoder.emit_restart(restart_num)?;
1651                    restart_num = restart_num.wrapping_add(1) & 0x07;
1652                }
1653                encoder.encode_block(block, 0, &opt_dc_derived, &opt_ac_derived)?;
1654            }
1655
1656            bit_writer.flush()?;
1657            let mut output = bit_writer.into_inner();
1658            output.write_all(&[0xFF, 0xD9])?; // EOI
1659        } else {
1660            // Single-pass encoding
1661            let scans = generate_baseline_scan(1);
1662            marker_writer.write_sos(&scans[0], &components)?;
1663
1664            let output = marker_writer.into_inner();
1665            let mut bit_writer = BitWriter::new(output);
1666            let mut encoder = EntropyEncoder::new(&mut bit_writer);
1667            let mut dct_block = [0i16; DCTSIZE2];
1668            let mut quant_block = [0i16; DCTSIZE2];
1669
1670            // Restart marker support
1671            let restart_interval = self.restart_interval as usize;
1672            let mut mcu_count = 0usize;
1673            let mut restart_num = 0u8;
1674
1675            for mcu_row in 0..mcu_rows {
1676                for mcu_col in 0..mcu_cols {
1677                    // Emit restart marker if needed
1678                    if restart_interval > 0
1679                        && mcu_count > 0
1680                        && mcu_count.is_multiple_of(restart_interval)
1681                    {
1682                        encoder.emit_restart(restart_num)?;
1683                        restart_num = restart_num.wrapping_add(1) & 0x07;
1684                    }
1685
1686                    // Process block directly to quant_block
1687                    self.process_block_to_storage_with_raw(
1688                        &y_mcu,
1689                        mcu_width,
1690                        mcu_row,
1691                        mcu_col,
1692                        &luma_qtable.values,
1693                        &ac_luma_derived,
1694                        &mut quant_block,
1695                        &mut dct_block,
1696                        None,
1697                    )?;
1698                    encoder.encode_block(&quant_block, 0, &dc_luma_derived, &ac_luma_derived)?;
1699                    mcu_count += 1;
1700                }
1701            }
1702
1703            bit_writer.flush()?;
1704            let mut output = bit_writer.into_inner();
1705            output.write_all(&[0xFF, 0xD9])?; // EOI
1706        }
1707
1708        Ok(())
1709    }
1710
1711    /// Encode pre-converted planar YCbCr image data to JPEG.
1712    ///
1713    /// This method accepts tightly packed YCbCr data (no row padding).
1714    /// For strided data, use [`encode_ycbcr_planar_strided`](Self::encode_ycbcr_planar_strided).
1715    ///
1716    /// # Arguments
1717    /// * `y` - Luma plane (width × height bytes, tightly packed)
1718    /// * `cb` - Cb chroma plane (chroma_width × chroma_height bytes)
1719    /// * `cr` - Cr chroma plane (chroma_width × chroma_height bytes)
1720    /// * `width` - Image width in pixels
1721    /// * `height` - Image height in pixels
1722    ///
1723    /// The chroma plane dimensions depend on the subsampling mode:
1724    /// - 4:4:4: chroma_width = width, chroma_height = height
1725    /// - 4:2:2: chroma_width = ceil(width/2), chroma_height = height
1726    /// - 4:2:0: chroma_width = ceil(width/2), chroma_height = ceil(height/2)
1727    ///
1728    /// # Returns
1729    /// JPEG-encoded data as a `Vec<u8>`.
1730    ///
1731    /// # Errors
1732    /// Returns an error if plane sizes don't match expected dimensions.
1733    pub fn encode_ycbcr_planar(
1734        &self,
1735        y: &[u8],
1736        cb: &[u8],
1737        cr: &[u8],
1738        width: u32,
1739        height: u32,
1740    ) -> Result<Vec<u8>> {
1741        // For packed data, stride equals width
1742        let (luma_h, luma_v) = self.subsampling.luma_factors();
1743        let (chroma_width, _) = sample::subsampled_dimensions(
1744            width as usize,
1745            height as usize,
1746            luma_h as usize,
1747            luma_v as usize,
1748        );
1749        self.encode_ycbcr_planar_strided(
1750            y,
1751            width as usize,
1752            cb,
1753            chroma_width,
1754            cr,
1755            chroma_width,
1756            width,
1757            height,
1758        )
1759    }
1760
1761    /// Encode pre-converted planar YCbCr image data to a writer.
1762    ///
1763    /// See [`encode_ycbcr_planar`](Self::encode_ycbcr_planar) for details.
1764    pub fn encode_ycbcr_planar_to_writer<W: Write>(
1765        &self,
1766        y: &[u8],
1767        cb: &[u8],
1768        cr: &[u8],
1769        width: u32,
1770        height: u32,
1771        output: W,
1772    ) -> Result<()> {
1773        // For packed data, stride equals width
1774        let (luma_h, luma_v) = self.subsampling.luma_factors();
1775        let (chroma_width, _) = sample::subsampled_dimensions(
1776            width as usize,
1777            height as usize,
1778            luma_h as usize,
1779            luma_v as usize,
1780        );
1781        self.encode_ycbcr_planar_strided_to_writer(
1782            y,
1783            width as usize,
1784            cb,
1785            chroma_width,
1786            cr,
1787            chroma_width,
1788            width,
1789            height,
1790            output,
1791        )
1792    }
1793
1794    /// Encode pre-converted planar YCbCr image data with arbitrary strides.
1795    ///
1796    /// This method accepts YCbCr data that has already been:
1797    /// 1. Converted from RGB to YCbCr color space
1798    /// 2. Downsampled according to the encoder's subsampling mode
1799    ///
1800    /// Use this when you have YCbCr data from video decoders or other sources
1801    /// that may have row padding (stride > width).
1802    ///
1803    /// # Arguments
1804    /// * `y` - Luma plane data
1805    /// * `y_stride` - Bytes per row in luma plane (must be >= width)
1806    /// * `cb` - Cb chroma plane data
1807    /// * `cb_stride` - Bytes per row in Cb plane (must be >= chroma_width)
1808    /// * `cr` - Cr chroma plane data
1809    /// * `cr_stride` - Bytes per row in Cr plane (must be >= chroma_width)
1810    /// * `width` - Image width in pixels
1811    /// * `height` - Image height in pixels
1812    ///
1813    /// The chroma plane dimensions depend on the subsampling mode:
1814    /// - 4:4:4: chroma_width = width, chroma_height = height
1815    /// - 4:2:2: chroma_width = ceil(width/2), chroma_height = height
1816    /// - 4:2:0: chroma_width = ceil(width/2), chroma_height = ceil(height/2)
1817    ///
1818    /// # Returns
1819    /// JPEG-encoded data as a `Vec<u8>`.
1820    ///
1821    /// # Errors
1822    /// Returns an error if:
1823    /// - Strides are less than the required width
1824    /// - Plane sizes don't match stride × height
1825    #[allow(clippy::too_many_arguments)]
1826    pub fn encode_ycbcr_planar_strided(
1827        &self,
1828        y: &[u8],
1829        y_stride: usize,
1830        cb: &[u8],
1831        cb_stride: usize,
1832        cr: &[u8],
1833        cr_stride: usize,
1834        width: u32,
1835        height: u32,
1836    ) -> Result<Vec<u8>> {
1837        let mut output = Vec::new();
1838        self.encode_ycbcr_planar_strided_to_writer(
1839            y,
1840            y_stride,
1841            cb,
1842            cb_stride,
1843            cr,
1844            cr_stride,
1845            width,
1846            height,
1847            &mut output,
1848        )?;
1849        Ok(output)
1850    }
1851
1852    /// Encode pre-converted planar YCbCr image data with arbitrary strides to a writer.
1853    ///
1854    /// See [`encode_ycbcr_planar_strided`](Self::encode_ycbcr_planar_strided) for details.
1855    #[allow(clippy::too_many_arguments)]
1856    pub fn encode_ycbcr_planar_strided_to_writer<W: Write>(
1857        &self,
1858        y: &[u8],
1859        y_stride: usize,
1860        cb: &[u8],
1861        cb_stride: usize,
1862        cr: &[u8],
1863        cr_stride: usize,
1864        width: u32,
1865        height: u32,
1866        output: W,
1867    ) -> Result<()> {
1868        let width = width as usize;
1869        let height = height as usize;
1870
1871        // Validate dimensions
1872        if width == 0 || height == 0 {
1873            return Err(Error::InvalidDimensions {
1874                width: width as u32,
1875                height: height as u32,
1876            });
1877        }
1878
1879        // Validate Y stride
1880        if y_stride < width {
1881            return Err(Error::InvalidSamplingFactor {
1882                h: y_stride as u8,
1883                v: width as u8,
1884            });
1885        }
1886
1887        let (luma_h, luma_v) = self.subsampling.luma_factors();
1888        let (chroma_width, chroma_height) =
1889            sample::subsampled_dimensions(width, height, luma_h as usize, luma_v as usize);
1890
1891        // Validate chroma strides
1892        if cb_stride < chroma_width {
1893            return Err(Error::InvalidSamplingFactor {
1894                h: cb_stride as u8,
1895                v: chroma_width as u8,
1896            });
1897        }
1898        if cr_stride < chroma_width {
1899            return Err(Error::InvalidSamplingFactor {
1900                h: cr_stride as u8,
1901                v: chroma_width as u8,
1902            });
1903        }
1904
1905        // Calculate expected plane sizes (stride × height)
1906        let y_size = y_stride
1907            .checked_mul(height)
1908            .ok_or(Error::InvalidDimensions {
1909                width: width as u32,
1910                height: height as u32,
1911            })?;
1912        let cb_size = cb_stride
1913            .checked_mul(chroma_height)
1914            .ok_or(Error::AllocationFailed)?;
1915        let cr_size = cr_stride
1916            .checked_mul(chroma_height)
1917            .ok_or(Error::AllocationFailed)?;
1918
1919        // Validate Y plane size
1920        if y.len() < y_size {
1921            return Err(Error::BufferSizeMismatch {
1922                expected: y_size,
1923                actual: y.len(),
1924            });
1925        }
1926
1927        // Validate Cb plane size
1928        if cb.len() < cb_size {
1929            return Err(Error::BufferSizeMismatch {
1930                expected: cb_size,
1931                actual: cb.len(),
1932            });
1933        }
1934
1935        // Validate Cr plane size
1936        if cr.len() < cr_size {
1937            return Err(Error::BufferSizeMismatch {
1938                expected: cr_size,
1939                actual: cr.len(),
1940            });
1941        }
1942
1943        // Expand planes to MCU-aligned dimensions
1944        let (mcu_width, mcu_height) =
1945            sample::mcu_aligned_dimensions(width, height, luma_h as usize, luma_v as usize);
1946        let (mcu_chroma_w, mcu_chroma_h) =
1947            (mcu_width / luma_h as usize, mcu_height / luma_v as usize);
1948
1949        let mcu_y_size = mcu_width
1950            .checked_mul(mcu_height)
1951            .ok_or(Error::AllocationFailed)?;
1952        let mcu_chroma_size = mcu_chroma_w
1953            .checked_mul(mcu_chroma_h)
1954            .ok_or(Error::AllocationFailed)?;
1955        let mut y_mcu = try_alloc_vec(0u8, mcu_y_size)?;
1956        let mut cb_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
1957        let mut cr_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
1958
1959        sample::expand_to_mcu_strided(
1960            y, width, y_stride, height, &mut y_mcu, mcu_width, mcu_height,
1961        );
1962        sample::expand_to_mcu_strided(
1963            cb,
1964            chroma_width,
1965            cb_stride,
1966            chroma_height,
1967            &mut cb_mcu,
1968            mcu_chroma_w,
1969            mcu_chroma_h,
1970        );
1971        sample::expand_to_mcu_strided(
1972            cr,
1973            chroma_width,
1974            cr_stride,
1975            chroma_height,
1976            &mut cr_mcu,
1977            mcu_chroma_w,
1978            mcu_chroma_h,
1979        );
1980
1981        // Encode using shared helper
1982        self.encode_ycbcr_mcu_to_writer(
1983            &y_mcu,
1984            &cb_mcu,
1985            &cr_mcu,
1986            width,
1987            height,
1988            mcu_width,
1989            mcu_height,
1990            chroma_width,
1991            chroma_height,
1992            mcu_chroma_w,
1993            mcu_chroma_h,
1994            output,
1995        )
1996    }
1997
1998    /// Encode RGB image data to a writer.
1999    pub fn encode_rgb_to_writer<W: Write>(
2000        &self,
2001        rgb_data: &[u8],
2002        width: u32,
2003        height: u32,
2004        output: W,
2005    ) -> Result<()> {
2006        let width = width as usize;
2007        let height = height as usize;
2008
2009        // Step 1: Convert RGB to YCbCr
2010        // Use checked arithmetic for num_pixels calculation
2011        let num_pixels = width.checked_mul(height).ok_or(Error::InvalidDimensions {
2012            width: width as u32,
2013            height: height as u32,
2014        })?;
2015
2016        let mut y_plane = try_alloc_vec(0u8, num_pixels)?;
2017        let mut cb_plane = try_alloc_vec(0u8, num_pixels)?;
2018        let mut cr_plane = try_alloc_vec(0u8, num_pixels)?;
2019
2020        (self.simd.color_convert_rgb_to_ycbcr)(
2021            rgb_data,
2022            &mut y_plane,
2023            &mut cb_plane,
2024            &mut cr_plane,
2025            num_pixels,
2026        );
2027
2028        // Step 2: Downsample chroma if needed
2029        let (luma_h, luma_v) = self.subsampling.luma_factors();
2030        let (chroma_width, chroma_height) =
2031            sample::subsampled_dimensions(width, height, luma_h as usize, luma_v as usize);
2032
2033        let chroma_size = chroma_width
2034            .checked_mul(chroma_height)
2035            .ok_or(Error::AllocationFailed)?;
2036        let mut cb_subsampled = try_alloc_vec(0u8, chroma_size)?;
2037        let mut cr_subsampled = try_alloc_vec(0u8, chroma_size)?;
2038
2039        sample::downsample_plane(
2040            &cb_plane,
2041            width,
2042            height,
2043            luma_h as usize,
2044            luma_v as usize,
2045            &mut cb_subsampled,
2046        );
2047        sample::downsample_plane(
2048            &cr_plane,
2049            width,
2050            height,
2051            luma_h as usize,
2052            luma_v as usize,
2053            &mut cr_subsampled,
2054        );
2055
2056        // Step 3: Expand planes to MCU-aligned dimensions
2057        let (mcu_width, mcu_height) =
2058            sample::mcu_aligned_dimensions(width, height, luma_h as usize, luma_v as usize);
2059        let (mcu_chroma_w, mcu_chroma_h) =
2060            (mcu_width / luma_h as usize, mcu_height / luma_v as usize);
2061
2062        let mcu_y_size = mcu_width
2063            .checked_mul(mcu_height)
2064            .ok_or(Error::AllocationFailed)?;
2065        let mcu_chroma_size = mcu_chroma_w
2066            .checked_mul(mcu_chroma_h)
2067            .ok_or(Error::AllocationFailed)?;
2068        let mut y_mcu = try_alloc_vec(0u8, mcu_y_size)?;
2069        let mut cb_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
2070        let mut cr_mcu = try_alloc_vec(0u8, mcu_chroma_size)?;
2071
2072        sample::expand_to_mcu(&y_plane, width, height, &mut y_mcu, mcu_width, mcu_height);
2073        sample::expand_to_mcu(
2074            &cb_subsampled,
2075            chroma_width,
2076            chroma_height,
2077            &mut cb_mcu,
2078            mcu_chroma_w,
2079            mcu_chroma_h,
2080        );
2081        sample::expand_to_mcu(
2082            &cr_subsampled,
2083            chroma_width,
2084            chroma_height,
2085            &mut cr_mcu,
2086            mcu_chroma_w,
2087            mcu_chroma_h,
2088        );
2089
2090        // Encode using shared helper
2091        self.encode_ycbcr_mcu_to_writer(
2092            &y_mcu,
2093            &cb_mcu,
2094            &cr_mcu,
2095            width,
2096            height,
2097            mcu_width,
2098            mcu_height,
2099            chroma_width,
2100            chroma_height,
2101            mcu_chroma_w,
2102            mcu_chroma_h,
2103            output,
2104        )
2105    }
2106
2107    /// Internal helper: Encode MCU-aligned YCbCr planes to JPEG.
2108    ///
2109    /// This is the shared encoding logic used by both `encode_rgb_to_writer`
2110    /// and `encode_ycbcr_planar_to_writer`.
2111    #[allow(clippy::too_many_arguments)]
2112    fn encode_ycbcr_mcu_to_writer<W: Write>(
2113        &self,
2114        y_mcu: &[u8],
2115        cb_mcu: &[u8],
2116        cr_mcu: &[u8],
2117        width: usize,
2118        height: usize,
2119        mcu_width: usize,
2120        mcu_height: usize,
2121        chroma_width: usize,
2122        chroma_height: usize,
2123        mcu_chroma_w: usize,
2124        mcu_chroma_h: usize,
2125        output: W,
2126    ) -> Result<()> {
2127        let (luma_h, luma_v) = self.subsampling.luma_factors();
2128
2129        // Step 4: Create quantization tables
2130        let (luma_qtable, chroma_qtable) = {
2131            let (default_luma, default_chroma) =
2132                create_quant_tables(self.quality, self.quant_table_idx, self.force_baseline);
2133            let luma = if let Some(ref custom) = self.custom_luma_qtable {
2134                crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
2135            } else {
2136                default_luma
2137            };
2138            let chroma = if let Some(ref custom) = self.custom_chroma_qtable {
2139                crate::quant::create_quant_table(custom, self.quality, self.force_baseline)
2140            } else {
2141                default_chroma
2142            };
2143            (luma, chroma)
2144        };
2145
2146        // Step 5: Create Huffman tables (standard tables)
2147        let dc_luma_huff = create_std_dc_luma_table();
2148        let dc_chroma_huff = create_std_dc_chroma_table();
2149        let ac_luma_huff = create_std_ac_luma_table();
2150        let ac_chroma_huff = create_std_ac_chroma_table();
2151
2152        let dc_luma_derived = DerivedTable::from_huff_table(&dc_luma_huff, true)?;
2153        let dc_chroma_derived = DerivedTable::from_huff_table(&dc_chroma_huff, true)?;
2154        let ac_luma_derived = DerivedTable::from_huff_table(&ac_luma_huff, false)?;
2155        let ac_chroma_derived = DerivedTable::from_huff_table(&ac_chroma_huff, false)?;
2156
2157        // Step 6: Set up components
2158        let components = create_ycbcr_components(self.subsampling);
2159
2160        // Step 7: Write JPEG file
2161        let mut marker_writer = MarkerWriter::new(output);
2162
2163        // SOI
2164        marker_writer.write_soi()?;
2165
2166        // APP0 (JFIF) with pixel density
2167        marker_writer.write_jfif_app0(
2168            self.pixel_density.unit as u8,
2169            self.pixel_density.x,
2170            self.pixel_density.y,
2171        )?;
2172
2173        // APP1 (EXIF) - if present
2174        if let Some(ref exif) = self.exif_data {
2175            marker_writer.write_app1_exif(exif)?;
2176        }
2177
2178        // ICC profile (if present)
2179        if let Some(ref icc) = self.icc_profile {
2180            marker_writer.write_icc_profile(icc)?;
2181        }
2182
2183        // Custom APP markers
2184        for (app_num, data) in &self.custom_markers {
2185            marker_writer.write_app(*app_num, data)?;
2186        }
2187
2188        // DQT (quantization tables in zigzag order) - combined into single marker
2189        let luma_qtable_zz = natural_to_zigzag(&luma_qtable.values);
2190        let chroma_qtable_zz = natural_to_zigzag(&chroma_qtable.values);
2191        marker_writer
2192            .write_dqt_multiple(&[(0, &luma_qtable_zz, false), (1, &chroma_qtable_zz, false)])?;
2193
2194        // SOF
2195        marker_writer.write_sof(
2196            self.progressive,
2197            8,
2198            height as u16,
2199            width as u16,
2200            &components,
2201        )?;
2202
2203        // DRI (restart interval) - if enabled
2204        if self.restart_interval > 0 {
2205            marker_writer.write_dri(self.restart_interval)?;
2206        }
2207
2208        // DHT (Huffman tables) - written here for non-optimized modes,
2209        // or later after frequency counting for optimized modes
2210        if !self.optimize_huffman {
2211            // Combine all tables into single DHT marker for smaller file size
2212            marker_writer.write_dht_multiple(&[
2213                (0, false, &dc_luma_huff),
2214                (1, false, &dc_chroma_huff),
2215                (0, true, &ac_luma_huff),
2216                (1, true, &ac_chroma_huff),
2217            ])?;
2218        }
2219
2220        if self.progressive {
2221            // Progressive mode: Store all blocks, then encode multiple scans
2222            let mcu_rows = mcu_height / (DCTSIZE * luma_v as usize);
2223            let mcu_cols = mcu_width / (DCTSIZE * luma_h as usize);
2224            let num_y_blocks = mcu_rows
2225                .checked_mul(mcu_cols)
2226                .and_then(|n| n.checked_mul(luma_h as usize))
2227                .and_then(|n| n.checked_mul(luma_v as usize))
2228                .ok_or(Error::AllocationFailed)?;
2229            let num_chroma_blocks = mcu_rows
2230                .checked_mul(mcu_cols)
2231                .ok_or(Error::AllocationFailed)?;
2232
2233            // Collect all quantized blocks
2234            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_y_blocks)?;
2235            let mut cb_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
2236            let mut cr_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
2237
2238            // Optionally collect raw DCT for DC trellis
2239            let dc_trellis_enabled = self.trellis.enabled && self.trellis.dc_enabled;
2240            let mut y_raw_dct = if dc_trellis_enabled {
2241                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_y_blocks)?)
2242            } else {
2243                None
2244            };
2245            let mut cb_raw_dct = if dc_trellis_enabled {
2246                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
2247            } else {
2248                None
2249            };
2250            let mut cr_raw_dct = if dc_trellis_enabled {
2251                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
2252            } else {
2253                None
2254            };
2255
2256            self.collect_blocks(
2257                y_mcu,
2258                mcu_width,
2259                mcu_height,
2260                cb_mcu,
2261                cr_mcu,
2262                mcu_chroma_w,
2263                mcu_chroma_h,
2264                &luma_qtable.values,
2265                &chroma_qtable.values,
2266                &ac_luma_derived,
2267                &ac_chroma_derived,
2268                &mut y_blocks,
2269                &mut cb_blocks,
2270                &mut cr_blocks,
2271                y_raw_dct.as_deref_mut(),
2272                cb_raw_dct.as_deref_mut(),
2273                cr_raw_dct.as_deref_mut(),
2274                luma_h,
2275                luma_v,
2276            )?;
2277
2278            // Run DC trellis optimization if enabled
2279            // C mozjpeg processes DC trellis row by row (each row is an independent chain)
2280            if dc_trellis_enabled {
2281                let h = luma_h as usize;
2282                let v = luma_v as usize;
2283                let y_block_cols = mcu_cols * h;
2284                let y_block_rows = mcu_rows * v;
2285
2286                if let Some(ref y_raw) = y_raw_dct {
2287                    run_dc_trellis_by_row(
2288                        y_raw,
2289                        &mut y_blocks,
2290                        luma_qtable.values[0],
2291                        &dc_luma_derived,
2292                        self.trellis.lambda_log_scale1,
2293                        self.trellis.lambda_log_scale2,
2294                        y_block_rows,
2295                        y_block_cols,
2296                        mcu_cols,
2297                        h,
2298                        v,
2299                    );
2300                }
2301                // Chroma has 1x1 per MCU, so MCU order = row order
2302                if let Some(ref cb_raw) = cb_raw_dct {
2303                    run_dc_trellis_by_row(
2304                        cb_raw,
2305                        &mut cb_blocks,
2306                        chroma_qtable.values[0],
2307                        &dc_chroma_derived,
2308                        self.trellis.lambda_log_scale1,
2309                        self.trellis.lambda_log_scale2,
2310                        mcu_rows,
2311                        mcu_cols,
2312                        mcu_cols,
2313                        1,
2314                        1,
2315                    );
2316                }
2317                if let Some(ref cr_raw) = cr_raw_dct {
2318                    run_dc_trellis_by_row(
2319                        cr_raw,
2320                        &mut cr_blocks,
2321                        chroma_qtable.values[0],
2322                        &dc_chroma_derived,
2323                        self.trellis.lambda_log_scale1,
2324                        self.trellis.lambda_log_scale2,
2325                        mcu_rows,
2326                        mcu_cols,
2327                        mcu_cols,
2328                        1,
2329                        1,
2330                    );
2331                }
2332            }
2333
2334            // Run EOB optimization if enabled (cross-block EOBRUN optimization)
2335            if self.trellis.enabled && self.trellis.eob_opt {
2336                use crate::trellis::{estimate_block_eob_info, optimize_eob_runs};
2337
2338                // Y component
2339                let y_eob_info: Vec<_> = y_blocks
2340                    .iter()
2341                    .map(|block| estimate_block_eob_info(block, &ac_luma_derived, 1, 63))
2342                    .collect();
2343                optimize_eob_runs(&mut y_blocks, &y_eob_info, &ac_luma_derived, 1, 63);
2344
2345                // Cb component
2346                let cb_eob_info: Vec<_> = cb_blocks
2347                    .iter()
2348                    .map(|block| estimate_block_eob_info(block, &ac_chroma_derived, 1, 63))
2349                    .collect();
2350                optimize_eob_runs(&mut cb_blocks, &cb_eob_info, &ac_chroma_derived, 1, 63);
2351
2352                // Cr component
2353                let cr_eob_info: Vec<_> = cr_blocks
2354                    .iter()
2355                    .map(|block| estimate_block_eob_info(block, &ac_chroma_derived, 1, 63))
2356                    .collect();
2357                optimize_eob_runs(&mut cr_blocks, &cr_eob_info, &ac_chroma_derived, 1, 63);
2358            }
2359
2360            // Generate progressive scan script
2361            //
2362            // TEMPORARY: Always use 4-scan minimal script to avoid refinement scan bugs.
2363            // Our AC refinement encoding has bugs causing "failed to decode huffman code".
2364            // TODO: Fix AC refinement encoding and re-enable optimize_scans.
2365            let scans = if self.optimize_scans {
2366                // When optimize_scans is enabled, use the scan optimizer to find
2367                // the best frequency split and Al levels. However, SA refinement
2368                // (Ah > 0) is currently disabled due to encoding bugs.
2369                self.optimize_progressive_scans(
2370                    3, // num_components
2371                    &y_blocks,
2372                    &cb_blocks,
2373                    &cr_blocks,
2374                    mcu_rows,
2375                    mcu_cols,
2376                    luma_h,
2377                    luma_v,
2378                    width,
2379                    height,
2380                    chroma_width,
2381                    chroma_height,
2382                    &dc_luma_derived,
2383                    &dc_chroma_derived,
2384                    &ac_luma_derived,
2385                    &ac_chroma_derived,
2386                )?
2387            } else {
2388                // Use C mozjpeg's 9-scan JCP_MAX_COMPRESSION script.
2389                // This matches jcparam.c lines 932-947 (the JCP_MAX_COMPRESSION branch).
2390                // mozjpeg-sys defaults to JCP_MAX_COMPRESSION profile, which uses:
2391                // - DC with no successive approximation (Al=0)
2392                // - 8/9 frequency split for luma with successive approximation
2393                // - No successive approximation for chroma
2394                generate_mozjpeg_max_compression_scans(3)
2395            };
2396
2397            // Build Huffman tables and encode scans
2398            //
2399            // When optimize_scans=true, each AC scan gets its own optimal Huffman table
2400            // written immediately before the scan. This matches C mozjpeg behavior and
2401            // ensures the trial encoder's size estimates match actual encoded sizes.
2402            //
2403            // When optimize_huffman=true, use per-scan AC tables (matching C mozjpeg).
2404            // C automatically enables optimize_coding for progressive mode and does
2405            // 2 passes per scan: gather statistics, then output with optimal tables.
2406
2407            if self.optimize_huffman {
2408                // Per-scan AC tables mode: DC tables global, AC tables per-scan
2409                // This matches C mozjpeg's progressive behavior
2410
2411                // Count DC frequencies for first-pass DC scans only (Ah == 0)
2412                // DC refinement scans (Ah > 0) don't use Huffman coding - they output raw bits
2413                let mut dc_luma_freq = FrequencyCounter::new();
2414                let mut dc_chroma_freq = FrequencyCounter::new();
2415
2416                for scan in &scans {
2417                    let is_dc_first_scan = scan.ss == 0 && scan.se == 0 && scan.ah == 0;
2418                    if is_dc_first_scan {
2419                        self.count_dc_scan_symbols(
2420                            scan,
2421                            &y_blocks,
2422                            &cb_blocks,
2423                            &cr_blocks,
2424                            mcu_rows,
2425                            mcu_cols,
2426                            luma_h,
2427                            luma_v,
2428                            &mut dc_luma_freq,
2429                            &mut dc_chroma_freq,
2430                        );
2431                    }
2432                }
2433
2434                // Generate and write DC tables upfront
2435                let opt_dc_luma_huff = dc_luma_freq.generate_table()?;
2436                let opt_dc_chroma_huff = dc_chroma_freq.generate_table()?;
2437                marker_writer.write_dht_multiple(&[
2438                    (0, false, &opt_dc_luma_huff),
2439                    (1, false, &opt_dc_chroma_huff),
2440                ])?;
2441
2442                let opt_dc_luma = DerivedTable::from_huff_table(&opt_dc_luma_huff, true)?;
2443                let opt_dc_chroma = DerivedTable::from_huff_table(&opt_dc_chroma_huff, true)?;
2444
2445                // Get output writer from marker_writer
2446                let output = marker_writer.into_inner();
2447                let mut bit_writer = BitWriter::new(output);
2448
2449                // Encode each scan with per-scan AC tables
2450                for scan in &scans {
2451                    bit_writer.flush()?;
2452                    let mut inner = bit_writer.into_inner();
2453
2454                    let is_dc_scan = scan.ss == 0 && scan.se == 0;
2455
2456                    if !is_dc_scan {
2457                        // AC scan: build per-scan optimal Huffman table
2458                        let comp_idx = scan.component_index[0] as usize;
2459                        let blocks = match comp_idx {
2460                            0 => &y_blocks,
2461                            1 => &cb_blocks,
2462                            2 => &cr_blocks,
2463                            _ => &y_blocks,
2464                        };
2465                        let (block_cols, block_rows) = if comp_idx == 0 {
2466                            (width.div_ceil(DCTSIZE), height.div_ceil(DCTSIZE))
2467                        } else {
2468                            (
2469                                chroma_width.div_ceil(DCTSIZE),
2470                                chroma_height.div_ceil(DCTSIZE),
2471                            )
2472                        };
2473
2474                        // Count frequencies for this scan only
2475                        let mut ac_freq = FrequencyCounter::new();
2476                        self.count_ac_scan_symbols(
2477                            scan,
2478                            blocks,
2479                            mcu_rows,
2480                            mcu_cols,
2481                            luma_h,
2482                            luma_v,
2483                            comp_idx,
2484                            block_cols,
2485                            block_rows,
2486                            &mut ac_freq,
2487                        );
2488
2489                        // Build optimal table and write DHT
2490                        let ac_huff = ac_freq.generate_table()?;
2491                        let table_idx = if comp_idx == 0 { 0 } else { 1 };
2492                        write_dht_marker(&mut inner, table_idx, true, &ac_huff)?;
2493
2494                        // Write SOS and encode
2495                        write_sos_marker(&mut inner, scan, &components)?;
2496                        bit_writer = BitWriter::new(inner);
2497
2498                        let ac_derived = DerivedTable::from_huff_table(&ac_huff, false)?;
2499                        let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
2500
2501                        self.encode_progressive_scan(
2502                            scan,
2503                            &y_blocks,
2504                            &cb_blocks,
2505                            &cr_blocks,
2506                            mcu_rows,
2507                            mcu_cols,
2508                            luma_h,
2509                            luma_v,
2510                            width,
2511                            height,
2512                            chroma_width,
2513                            chroma_height,
2514                            &opt_dc_luma,
2515                            &opt_dc_chroma,
2516                            &ac_derived,
2517                            &ac_derived, // Not used for AC scans, but needed for signature
2518                            &mut prog_encoder,
2519                        )?;
2520                        prog_encoder.finish_scan(Some(&ac_derived))?;
2521                    } else {
2522                        // DC scan: use global DC tables
2523                        write_sos_marker(&mut inner, scan, &components)?;
2524                        bit_writer = BitWriter::new(inner);
2525
2526                        let mut prog_encoder = ProgressiveEncoder::new(&mut bit_writer);
2527                        self.encode_progressive_scan(
2528                            scan,
2529                            &y_blocks,
2530                            &cb_blocks,
2531                            &cr_blocks,
2532                            mcu_rows,
2533                            mcu_cols,
2534                            luma_h,
2535                            luma_v,
2536                            width,
2537                            height,
2538                            chroma_width,
2539                            chroma_height,
2540                            &opt_dc_luma,
2541                            &opt_dc_chroma,
2542                            &ac_luma_derived, // Not used for DC scans
2543                            &ac_chroma_derived,
2544                            &mut prog_encoder,
2545                        )?;
2546                        prog_encoder.finish_scan(None)?;
2547                    }
2548                }
2549
2550                // Flush and write EOI
2551                bit_writer.flush()?;
2552                let mut output = bit_writer.into_inner();
2553                output.write_all(&[0xFF, 0xD9])?;
2554            } else {
2555                // Standard tables mode (no optimization)
2556                let output = marker_writer.into_inner();
2557                let mut bit_writer = BitWriter::new(output);
2558
2559                for scan in &scans {
2560                    bit_writer.flush()?;
2561                    let mut inner = bit_writer.into_inner();
2562                    write_sos_marker(&mut inner, scan, &components)?;
2563
2564                    bit_writer = BitWriter::new(inner);
2565                    let mut prog_encoder = ProgressiveEncoder::new_standard_tables(&mut bit_writer);
2566
2567                    self.encode_progressive_scan(
2568                        scan,
2569                        &y_blocks,
2570                        &cb_blocks,
2571                        &cr_blocks,
2572                        mcu_rows,
2573                        mcu_cols,
2574                        luma_h,
2575                        luma_v,
2576                        width,
2577                        height,
2578                        chroma_width,
2579                        chroma_height,
2580                        &dc_luma_derived,
2581                        &dc_chroma_derived,
2582                        &ac_luma_derived,
2583                        &ac_chroma_derived,
2584                        &mut prog_encoder,
2585                    )?;
2586
2587                    let ac_table = if scan.ss > 0 {
2588                        if scan.component_index[0] == 0 {
2589                            Some(&ac_luma_derived)
2590                        } else {
2591                            Some(&ac_chroma_derived)
2592                        }
2593                    } else {
2594                        None
2595                    };
2596                    prog_encoder.finish_scan(ac_table)?;
2597                }
2598
2599                bit_writer.flush()?;
2600                let mut output = bit_writer.into_inner();
2601                output.write_all(&[0xFF, 0xD9])?;
2602            }
2603        } else if self.optimize_huffman {
2604            // Baseline mode with Huffman optimization (2-pass)
2605            // Pass 1: Collect blocks and count frequencies
2606            let mcu_rows = mcu_height / (DCTSIZE * luma_v as usize);
2607            let mcu_cols = mcu_width / (DCTSIZE * luma_h as usize);
2608            let num_y_blocks = mcu_rows
2609                .checked_mul(mcu_cols)
2610                .and_then(|n| n.checked_mul(luma_h as usize))
2611                .and_then(|n| n.checked_mul(luma_v as usize))
2612                .ok_or(Error::AllocationFailed)?;
2613            let num_chroma_blocks = mcu_rows
2614                .checked_mul(mcu_cols)
2615                .ok_or(Error::AllocationFailed)?;
2616
2617            let mut y_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_y_blocks)?;
2618            let mut cb_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
2619            let mut cr_blocks = try_alloc_vec_array::<i16, DCTSIZE2>(num_chroma_blocks)?;
2620
2621            // Optionally collect raw DCT for DC trellis
2622            let dc_trellis_enabled = self.trellis.enabled && self.trellis.dc_enabled;
2623            let mut y_raw_dct = if dc_trellis_enabled {
2624                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_y_blocks)?)
2625            } else {
2626                None
2627            };
2628            let mut cb_raw_dct = if dc_trellis_enabled {
2629                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
2630            } else {
2631                None
2632            };
2633            let mut cr_raw_dct = if dc_trellis_enabled {
2634                Some(try_alloc_vec_array::<i32, DCTSIZE2>(num_chroma_blocks)?)
2635            } else {
2636                None
2637            };
2638
2639            self.collect_blocks(
2640                y_mcu,
2641                mcu_width,
2642                mcu_height,
2643                cb_mcu,
2644                cr_mcu,
2645                mcu_chroma_w,
2646                mcu_chroma_h,
2647                &luma_qtable.values,
2648                &chroma_qtable.values,
2649                &ac_luma_derived,
2650                &ac_chroma_derived,
2651                &mut y_blocks,
2652                &mut cb_blocks,
2653                &mut cr_blocks,
2654                y_raw_dct.as_deref_mut(),
2655                cb_raw_dct.as_deref_mut(),
2656                cr_raw_dct.as_deref_mut(),
2657                luma_h,
2658                luma_v,
2659            )?;
2660
2661            // Run DC trellis optimization if enabled
2662            // C mozjpeg processes DC trellis row by row (each row is an independent chain)
2663            if dc_trellis_enabled {
2664                let h = luma_h as usize;
2665                let v = luma_v as usize;
2666                let y_block_cols = mcu_cols * h;
2667                let y_block_rows = mcu_rows * v;
2668
2669                if let Some(ref y_raw) = y_raw_dct {
2670                    run_dc_trellis_by_row(
2671                        y_raw,
2672                        &mut y_blocks,
2673                        luma_qtable.values[0],
2674                        &dc_luma_derived,
2675                        self.trellis.lambda_log_scale1,
2676                        self.trellis.lambda_log_scale2,
2677                        y_block_rows,
2678                        y_block_cols,
2679                        mcu_cols,
2680                        h,
2681                        v,
2682                    );
2683                }
2684                // Chroma has 1x1 per MCU, so MCU order = row order
2685                if let Some(ref cb_raw) = cb_raw_dct {
2686                    run_dc_trellis_by_row(
2687                        cb_raw,
2688                        &mut cb_blocks,
2689                        chroma_qtable.values[0],
2690                        &dc_chroma_derived,
2691                        self.trellis.lambda_log_scale1,
2692                        self.trellis.lambda_log_scale2,
2693                        mcu_rows,
2694                        mcu_cols,
2695                        mcu_cols,
2696                        1,
2697                        1,
2698                    );
2699                }
2700                if let Some(ref cr_raw) = cr_raw_dct {
2701                    run_dc_trellis_by_row(
2702                        cr_raw,
2703                        &mut cr_blocks,
2704                        chroma_qtable.values[0],
2705                        &dc_chroma_derived,
2706                        self.trellis.lambda_log_scale1,
2707                        self.trellis.lambda_log_scale2,
2708                        mcu_rows,
2709                        mcu_cols,
2710                        mcu_cols,
2711                        1,
2712                        1,
2713                    );
2714                }
2715            }
2716
2717            // Count symbol frequencies
2718            let mut dc_luma_freq = FrequencyCounter::new();
2719            let mut dc_chroma_freq = FrequencyCounter::new();
2720            let mut ac_luma_freq = FrequencyCounter::new();
2721            let mut ac_chroma_freq = FrequencyCounter::new();
2722
2723            let mut counter = SymbolCounter::new();
2724            let blocks_per_mcu_y = (luma_h * luma_v) as usize;
2725            let mut y_idx = 0;
2726            let mut c_idx = 0;
2727
2728            for _mcu_row in 0..mcu_rows {
2729                for _mcu_col in 0..mcu_cols {
2730                    // Y blocks
2731                    for _ in 0..blocks_per_mcu_y {
2732                        counter.count_block(
2733                            &y_blocks[y_idx],
2734                            0,
2735                            &mut dc_luma_freq,
2736                            &mut ac_luma_freq,
2737                        );
2738                        y_idx += 1;
2739                    }
2740                    // Cb block
2741                    counter.count_block(
2742                        &cb_blocks[c_idx],
2743                        1,
2744                        &mut dc_chroma_freq,
2745                        &mut ac_chroma_freq,
2746                    );
2747                    // Cr block
2748                    counter.count_block(
2749                        &cr_blocks[c_idx],
2750                        2,
2751                        &mut dc_chroma_freq,
2752                        &mut ac_chroma_freq,
2753                    );
2754                    c_idx += 1;
2755                }
2756            }
2757
2758            // Generate optimized Huffman tables
2759            let opt_dc_luma_huff = dc_luma_freq.generate_table()?;
2760            let opt_dc_chroma_huff = dc_chroma_freq.generate_table()?;
2761            let opt_ac_luma_huff = ac_luma_freq.generate_table()?;
2762            let opt_ac_chroma_huff = ac_chroma_freq.generate_table()?;
2763
2764            let opt_dc_luma = DerivedTable::from_huff_table(&opt_dc_luma_huff, true)?;
2765            let opt_dc_chroma = DerivedTable::from_huff_table(&opt_dc_chroma_huff, true)?;
2766            let opt_ac_luma = DerivedTable::from_huff_table(&opt_ac_luma_huff, false)?;
2767            let opt_ac_chroma = DerivedTable::from_huff_table(&opt_ac_chroma_huff, false)?;
2768
2769            // Write DHT with optimized tables - combined into single marker
2770            marker_writer.write_dht_multiple(&[
2771                (0, false, &opt_dc_luma_huff),
2772                (1, false, &opt_dc_chroma_huff),
2773                (0, true, &opt_ac_luma_huff),
2774                (1, true, &opt_ac_chroma_huff),
2775            ])?;
2776
2777            // Write SOS and encode
2778            let scans = generate_baseline_scan(3);
2779            let scan = &scans[0];
2780            marker_writer.write_sos(scan, &components)?;
2781
2782            let output = marker_writer.into_inner();
2783            let mut bit_writer = BitWriter::new(output);
2784            let mut entropy = EntropyEncoder::new(&mut bit_writer);
2785
2786            // Encode from stored blocks with restart marker support
2787            y_idx = 0;
2788            c_idx = 0;
2789            let restart_interval = self.restart_interval as usize;
2790            let mut mcu_count = 0usize;
2791            let mut restart_num = 0u8;
2792
2793            for _mcu_row in 0..mcu_rows {
2794                for _mcu_col in 0..mcu_cols {
2795                    // Emit restart marker if needed (before this MCU, not first)
2796                    if restart_interval > 0
2797                        && mcu_count > 0
2798                        && mcu_count.is_multiple_of(restart_interval)
2799                    {
2800                        entropy.emit_restart(restart_num)?;
2801                        restart_num = restart_num.wrapping_add(1) & 0x07;
2802                    }
2803
2804                    // Y blocks
2805                    for _ in 0..blocks_per_mcu_y {
2806                        entropy.encode_block(&y_blocks[y_idx], 0, &opt_dc_luma, &opt_ac_luma)?;
2807                        y_idx += 1;
2808                    }
2809                    // Cb block
2810                    entropy.encode_block(&cb_blocks[c_idx], 1, &opt_dc_chroma, &opt_ac_chroma)?;
2811                    // Cr block
2812                    entropy.encode_block(&cr_blocks[c_idx], 2, &opt_dc_chroma, &opt_ac_chroma)?;
2813                    c_idx += 1;
2814                    mcu_count += 1;
2815                }
2816            }
2817
2818            bit_writer.flush()?;
2819            let mut output = bit_writer.into_inner();
2820            output.write_all(&[0xFF, 0xD9])?;
2821        } else {
2822            // Baseline mode: Encode directly (streaming)
2823            let scans = generate_baseline_scan(3);
2824            let scan = &scans[0]; // Baseline has only one scan
2825            marker_writer.write_sos(scan, &components)?;
2826
2827            // Encode MCU data
2828            let output = marker_writer.into_inner();
2829            let mut bit_writer = BitWriter::new(output);
2830            let mut entropy = EntropyEncoder::new(&mut bit_writer);
2831
2832            self.encode_mcus(
2833                y_mcu,
2834                mcu_width,
2835                mcu_height,
2836                cb_mcu,
2837                cr_mcu,
2838                mcu_chroma_w,
2839                mcu_chroma_h,
2840                &luma_qtable.values,
2841                &chroma_qtable.values,
2842                &dc_luma_derived,
2843                &dc_chroma_derived,
2844                &ac_luma_derived,
2845                &ac_chroma_derived,
2846                &mut entropy,
2847                luma_h,
2848                luma_v,
2849            )?;
2850
2851            // Flush bits and get output back
2852            bit_writer.flush()?;
2853            let mut output = bit_writer.into_inner();
2854
2855            // EOI
2856            output.write_all(&[0xFF, 0xD9])?;
2857        }
2858
2859        Ok(())
2860    }
2861
2862    /// Encode all MCUs (Minimum Coded Units).
2863    #[allow(clippy::too_many_arguments)]
2864    fn encode_mcus<W: Write>(
2865        &self,
2866        y_plane: &[u8],
2867        y_width: usize,
2868        y_height: usize,
2869        cb_plane: &[u8],
2870        cr_plane: &[u8],
2871        chroma_width: usize,
2872        _chroma_height: usize,
2873        luma_qtable: &[u16; DCTSIZE2],
2874        chroma_qtable: &[u16; DCTSIZE2],
2875        dc_luma: &DerivedTable,
2876        dc_chroma: &DerivedTable,
2877        ac_luma: &DerivedTable,
2878        ac_chroma: &DerivedTable,
2879        entropy: &mut EntropyEncoder<W>,
2880        h_samp: u8,
2881        v_samp: u8,
2882    ) -> Result<()> {
2883        let mcu_rows = y_height / (DCTSIZE * v_samp as usize);
2884        let mcu_cols = y_width / (DCTSIZE * h_samp as usize);
2885        let total_mcus = mcu_rows * mcu_cols;
2886
2887        let mut dct_block = [0i16; DCTSIZE2];
2888        let mut quant_block = [0i16; DCTSIZE2];
2889
2890        // Restart marker tracking
2891        let restart_interval = self.restart_interval as usize;
2892        let mut mcu_count = 0usize;
2893        let mut restart_num = 0u8;
2894
2895        for mcu_row in 0..mcu_rows {
2896            for mcu_col in 0..mcu_cols {
2897                // Check if we need to emit a restart marker BEFORE this MCU
2898                // (except for the first MCU)
2899                if restart_interval > 0
2900                    && mcu_count > 0
2901                    && mcu_count.is_multiple_of(restart_interval)
2902                {
2903                    entropy.emit_restart(restart_num)?;
2904                    restart_num = restart_num.wrapping_add(1) & 0x07;
2905                }
2906
2907                // Encode Y blocks (may be multiple per MCU for subsampling)
2908                for v in 0..v_samp as usize {
2909                    for h in 0..h_samp as usize {
2910                        let block_row = mcu_row * v_samp as usize + v;
2911                        let block_col = mcu_col * h_samp as usize + h;
2912
2913                        self.encode_block(
2914                            y_plane,
2915                            y_width,
2916                            block_row,
2917                            block_col,
2918                            luma_qtable,
2919                            dc_luma,
2920                            ac_luma,
2921                            0, // Y component
2922                            entropy,
2923                            &mut dct_block,
2924                            &mut quant_block,
2925                        )?;
2926                    }
2927                }
2928
2929                // Encode Cb block
2930                self.encode_block(
2931                    cb_plane,
2932                    chroma_width,
2933                    mcu_row,
2934                    mcu_col,
2935                    chroma_qtable,
2936                    dc_chroma,
2937                    ac_chroma,
2938                    1, // Cb component
2939                    entropy,
2940                    &mut dct_block,
2941                    &mut quant_block,
2942                )?;
2943
2944                // Encode Cr block
2945                self.encode_block(
2946                    cr_plane,
2947                    chroma_width,
2948                    mcu_row,
2949                    mcu_col,
2950                    chroma_qtable,
2951                    dc_chroma,
2952                    ac_chroma,
2953                    2, // Cr component
2954                    entropy,
2955                    &mut dct_block,
2956                    &mut quant_block,
2957                )?;
2958
2959                mcu_count += 1;
2960            }
2961        }
2962
2963        // Suppress unused variable warning
2964        let _ = total_mcus;
2965
2966        Ok(())
2967    }
2968
2969    /// Encode a single 8x8 block.
2970    #[allow(clippy::too_many_arguments)]
2971    fn encode_block<W: Write>(
2972        &self,
2973        plane: &[u8],
2974        plane_width: usize,
2975        block_row: usize,
2976        block_col: usize,
2977        qtable: &[u16; DCTSIZE2],
2978        dc_table: &DerivedTable,
2979        ac_table: &DerivedTable,
2980        component: usize,
2981        entropy: &mut EntropyEncoder<W>,
2982        dct_block: &mut [i16; DCTSIZE2],
2983        quant_block: &mut [i16; DCTSIZE2],
2984    ) -> Result<()> {
2985        // Extract 8x8 block from plane
2986        let mut samples = [0u8; DCTSIZE2];
2987        let base_y = block_row * DCTSIZE;
2988        let base_x = block_col * DCTSIZE;
2989
2990        for row in 0..DCTSIZE {
2991            let src_offset = (base_y + row) * plane_width + base_x;
2992            let dst_offset = row * DCTSIZE;
2993            samples[dst_offset..dst_offset + DCTSIZE]
2994                .copy_from_slice(&plane[src_offset..src_offset + DCTSIZE]);
2995        }
2996
2997        // Level shift (center around 0 for DCT)
2998        let mut shifted = [0i16; DCTSIZE2];
2999        for i in 0..DCTSIZE2 {
3000            shifted[i] = (samples[i] as i16) - 128;
3001        }
3002
3003        // Apply overshoot deringing if enabled (reduces ringing on white backgrounds)
3004        if self.overshoot_deringing {
3005            preprocess_deringing(&mut shifted, qtable[0]);
3006        }
3007
3008        // Forward DCT (output scaled by factor of 8)
3009        (self.simd.forward_dct)(&shifted, dct_block);
3010
3011        // Convert to i32 for quantization
3012        let mut dct_i32 = [0i32; DCTSIZE2];
3013        for i in 0..DCTSIZE2 {
3014            dct_i32[i] = dct_block[i] as i32;
3015        }
3016
3017        // Use trellis quantization if enabled
3018        // Both paths expect raw DCT (scaled by 8) and handle the scaling internally
3019        if self.trellis.enabled {
3020            trellis_quantize_block(&dct_i32, quant_block, qtable, ac_table, &self.trellis);
3021        } else {
3022            // Non-trellis path: use single-step quantization matching C mozjpeg
3023            // This takes raw DCT (scaled by 8) and uses q_scaled = 8 * qtable[i]
3024            quantize_block_raw(&dct_i32, qtable, quant_block);
3025        }
3026
3027        // Entropy encode
3028        entropy.encode_block(quant_block, component, dc_table, ac_table)?;
3029
3030        Ok(())
3031    }
3032
3033    /// Collect all quantized DCT blocks for progressive encoding.
3034    /// Also collects raw DCT blocks if DC trellis is enabled.
3035    #[allow(clippy::too_many_arguments)]
3036    fn collect_blocks(
3037        &self,
3038        y_plane: &[u8],
3039        y_width: usize,
3040        y_height: usize,
3041        cb_plane: &[u8],
3042        cr_plane: &[u8],
3043        chroma_width: usize,
3044        _chroma_height: usize,
3045        luma_qtable: &[u16; DCTSIZE2],
3046        chroma_qtable: &[u16; DCTSIZE2],
3047        ac_luma: &DerivedTable,
3048        ac_chroma: &DerivedTable,
3049        y_blocks: &mut [[i16; DCTSIZE2]],
3050        cb_blocks: &mut [[i16; DCTSIZE2]],
3051        cr_blocks: &mut [[i16; DCTSIZE2]],
3052        mut y_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
3053        mut cb_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
3054        mut cr_raw_dct: Option<&mut [[i32; DCTSIZE2]]>,
3055        h_samp: u8,
3056        v_samp: u8,
3057    ) -> Result<()> {
3058        let mcu_rows = y_height / (DCTSIZE * v_samp as usize);
3059        let mcu_cols = y_width / (DCTSIZE * h_samp as usize);
3060
3061        let mut y_idx = 0;
3062        let mut c_idx = 0;
3063        let mut dct_block = [0i16; DCTSIZE2];
3064
3065        for mcu_row in 0..mcu_rows {
3066            for mcu_col in 0..mcu_cols {
3067                // Collect Y blocks (may be multiple per MCU for subsampling)
3068                for v in 0..v_samp as usize {
3069                    for h in 0..h_samp as usize {
3070                        let block_row = mcu_row * v_samp as usize + v;
3071                        let block_col = mcu_col * h_samp as usize + h;
3072
3073                        // Get mutable reference to raw DCT output if collecting
3074                        let raw_dct_out = y_raw_dct.as_mut().map(|arr| &mut arr[y_idx][..]);
3075                        self.process_block_to_storage_with_raw(
3076                            y_plane,
3077                            y_width,
3078                            block_row,
3079                            block_col,
3080                            luma_qtable,
3081                            ac_luma,
3082                            &mut y_blocks[y_idx],
3083                            &mut dct_block,
3084                            raw_dct_out,
3085                        )?;
3086                        y_idx += 1;
3087                    }
3088                }
3089
3090                // Collect Cb block
3091                let raw_dct_out = cb_raw_dct.as_mut().map(|arr| &mut arr[c_idx][..]);
3092                self.process_block_to_storage_with_raw(
3093                    cb_plane,
3094                    chroma_width,
3095                    mcu_row,
3096                    mcu_col,
3097                    chroma_qtable,
3098                    ac_chroma,
3099                    &mut cb_blocks[c_idx],
3100                    &mut dct_block,
3101                    raw_dct_out,
3102                )?;
3103
3104                // Collect Cr block
3105                let raw_dct_out = cr_raw_dct.as_mut().map(|arr| &mut arr[c_idx][..]);
3106                self.process_block_to_storage_with_raw(
3107                    cr_plane,
3108                    chroma_width,
3109                    mcu_row,
3110                    mcu_col,
3111                    chroma_qtable,
3112                    ac_chroma,
3113                    &mut cr_blocks[c_idx],
3114                    &mut dct_block,
3115                    raw_dct_out,
3116                )?;
3117
3118                c_idx += 1;
3119            }
3120        }
3121
3122        Ok(())
3123    }
3124
3125    /// Process a block: DCT + quantize, storing the result.
3126    /// Optionally stores raw DCT coefficients for DC trellis.
3127    #[allow(clippy::too_many_arguments)]
3128    fn process_block_to_storage_with_raw(
3129        &self,
3130        plane: &[u8],
3131        plane_width: usize,
3132        block_row: usize,
3133        block_col: usize,
3134        qtable: &[u16; DCTSIZE2],
3135        ac_table: &DerivedTable,
3136        out_block: &mut [i16; DCTSIZE2],
3137        dct_block: &mut [i16; DCTSIZE2],
3138        raw_dct_out: Option<&mut [i32]>,
3139    ) -> Result<()> {
3140        // Extract 8x8 block from plane
3141        let mut samples = [0u8; DCTSIZE2];
3142        let base_y = block_row * DCTSIZE;
3143        let base_x = block_col * DCTSIZE;
3144
3145        for row in 0..DCTSIZE {
3146            let src_offset = (base_y + row) * plane_width + base_x;
3147            let dst_offset = row * DCTSIZE;
3148            samples[dst_offset..dst_offset + DCTSIZE]
3149                .copy_from_slice(&plane[src_offset..src_offset + DCTSIZE]);
3150        }
3151
3152        // Level shift (center around 0 for DCT)
3153        let mut shifted = [0i16; DCTSIZE2];
3154        for i in 0..DCTSIZE2 {
3155            shifted[i] = (samples[i] as i16) - 128;
3156        }
3157
3158        // Apply overshoot deringing if enabled (reduces ringing on white backgrounds)
3159        if self.overshoot_deringing {
3160            preprocess_deringing(&mut shifted, qtable[0]);
3161        }
3162
3163        // Forward DCT (output scaled by factor of 8)
3164        (self.simd.forward_dct)(&shifted, dct_block);
3165
3166        // Convert to i32 for quantization
3167        let mut dct_i32 = [0i32; DCTSIZE2];
3168        for i in 0..DCTSIZE2 {
3169            dct_i32[i] = dct_block[i] as i32;
3170        }
3171
3172        // Store raw DCT if requested (for DC trellis)
3173        if let Some(raw_out) = raw_dct_out {
3174            raw_out.copy_from_slice(&dct_i32);
3175        }
3176
3177        // Use trellis quantization if enabled
3178        // Both paths expect raw DCT (scaled by 8) and handle the scaling internally
3179        if self.trellis.enabled {
3180            trellis_quantize_block(&dct_i32, out_block, qtable, ac_table, &self.trellis);
3181        } else {
3182            // Non-trellis path: use single-step quantization matching C mozjpeg
3183            // This takes raw DCT (scaled by 8) and uses q_scaled = 8 * qtable[i]
3184            quantize_block_raw(&dct_i32, qtable, out_block);
3185        }
3186
3187        Ok(())
3188    }
3189
3190    /// Optimize progressive scan configuration (C mozjpeg-compatible).
3191    ///
3192    /// This implements the optimize_scans feature from C mozjpeg:
3193    /// 1. Generate 64 individual candidate scans
3194    /// 2. Trial-encode scans SEQUENTIALLY to get accurate sizes
3195    /// 3. Use ScanSelector to find optimal Al levels and frequency splits
3196    /// 4. Build the final scan script from the selection
3197    ///
3198    /// IMPORTANT: Scans must be encoded sequentially (not independently) because
3199    /// refinement scans (Ah > 0) need context from previous scans to produce
3200    /// correct output sizes.
3201    #[allow(clippy::too_many_arguments)]
3202    fn optimize_progressive_scans(
3203        &self,
3204        num_components: u8,
3205        y_blocks: &[[i16; DCTSIZE2]],
3206        cb_blocks: &[[i16; DCTSIZE2]],
3207        cr_blocks: &[[i16; DCTSIZE2]],
3208        mcu_rows: usize,
3209        mcu_cols: usize,
3210        h_samp: u8,
3211        v_samp: u8,
3212        actual_width: usize,
3213        actual_height: usize,
3214        chroma_width: usize,
3215        chroma_height: usize,
3216        dc_luma: &DerivedTable,
3217        dc_chroma: &DerivedTable,
3218        ac_luma: &DerivedTable,
3219        ac_chroma: &DerivedTable,
3220    ) -> Result<Vec<crate::types::ScanInfo>> {
3221        let config = ScanSearchConfig::default();
3222        let candidate_scans = generate_search_scans(num_components, &config);
3223
3224        // Use ScanTrialEncoder for sequential trial encoding with proper state tracking
3225        let mut trial_encoder = ScanTrialEncoder::new(
3226            y_blocks,
3227            cb_blocks,
3228            cr_blocks,
3229            dc_luma,
3230            dc_chroma,
3231            ac_luma,
3232            ac_chroma,
3233            mcu_rows,
3234            mcu_cols,
3235            h_samp,
3236            v_samp,
3237            actual_width,
3238            actual_height,
3239            chroma_width,
3240            chroma_height,
3241        );
3242
3243        // Trial-encode all scans sequentially to get accurate sizes
3244        let scan_sizes = trial_encoder.encode_all_scans(&candidate_scans)?;
3245
3246        // Use ScanSelector to find the optimal configuration
3247        let selector = ScanSelector::new(num_components, config.clone());
3248        let result = selector.select_best(&scan_sizes);
3249
3250        // Build the final scan script from the selection
3251        Ok(result.build_final_scans(num_components, &config))
3252    }
3253
3254    /// Encode a single progressive scan.
3255    #[allow(clippy::too_many_arguments)]
3256    fn encode_progressive_scan<W: Write>(
3257        &self,
3258        scan: &crate::types::ScanInfo,
3259        y_blocks: &[[i16; DCTSIZE2]],
3260        cb_blocks: &[[i16; DCTSIZE2]],
3261        cr_blocks: &[[i16; DCTSIZE2]],
3262        mcu_rows: usize,
3263        mcu_cols: usize,
3264        h_samp: u8,
3265        v_samp: u8,
3266        actual_width: usize,
3267        actual_height: usize,
3268        chroma_width: usize,
3269        chroma_height: usize,
3270        dc_luma: &DerivedTable,
3271        dc_chroma: &DerivedTable,
3272        ac_luma: &DerivedTable,
3273        ac_chroma: &DerivedTable,
3274        encoder: &mut ProgressiveEncoder<W>,
3275    ) -> Result<()> {
3276        let is_dc_scan = scan.ss == 0 && scan.se == 0;
3277        let is_refinement = scan.ah != 0;
3278
3279        if is_dc_scan {
3280            // DC scan - can be interleaved (multiple components)
3281            self.encode_dc_scan(
3282                scan,
3283                y_blocks,
3284                cb_blocks,
3285                cr_blocks,
3286                mcu_rows,
3287                mcu_cols,
3288                h_samp,
3289                v_samp,
3290                dc_luma,
3291                dc_chroma,
3292                is_refinement,
3293                encoder,
3294            )?;
3295        } else {
3296            // AC scan - single component only (non-interleaved)
3297            // For non-interleaved scans, use actual component block dimensions
3298            let comp_idx = scan.component_index[0] as usize;
3299            let blocks = match comp_idx {
3300                0 => y_blocks,
3301                1 => cb_blocks,
3302                2 => cr_blocks,
3303                _ => return Err(Error::InvalidComponentIndex(comp_idx)),
3304            };
3305            let ac_table = if comp_idx == 0 { ac_luma } else { ac_chroma };
3306
3307            // Calculate actual block dimensions for this component.
3308            // Non-interleaved AC scans encode only the actual image blocks, not MCU padding.
3309            // This differs from interleaved DC scans which encode all MCU blocks.
3310            // Reference: ITU-T T.81 Section F.2.3
3311            let (block_cols, block_rows) = if comp_idx == 0 {
3312                // Y component: full resolution
3313                (
3314                    actual_width.div_ceil(DCTSIZE),
3315                    actual_height.div_ceil(DCTSIZE),
3316                )
3317            } else {
3318                // Chroma components: subsampled resolution
3319                (
3320                    chroma_width.div_ceil(DCTSIZE),
3321                    chroma_height.div_ceil(DCTSIZE),
3322                )
3323            };
3324
3325            self.encode_ac_scan(
3326                scan,
3327                blocks,
3328                mcu_rows,
3329                mcu_cols,
3330                h_samp,
3331                v_samp,
3332                comp_idx,
3333                block_cols,
3334                block_rows,
3335                ac_table,
3336                is_refinement,
3337                encoder,
3338            )?;
3339        }
3340
3341        Ok(())
3342    }
3343
3344    /// Encode a DC scan (Ss=Se=0).
3345    #[allow(clippy::too_many_arguments)]
3346    fn encode_dc_scan<W: Write>(
3347        &self,
3348        scan: &crate::types::ScanInfo,
3349        y_blocks: &[[i16; DCTSIZE2]],
3350        cb_blocks: &[[i16; DCTSIZE2]],
3351        cr_blocks: &[[i16; DCTSIZE2]],
3352        mcu_rows: usize,
3353        mcu_cols: usize,
3354        h_samp: u8,
3355        v_samp: u8,
3356        dc_luma: &DerivedTable,
3357        dc_chroma: &DerivedTable,
3358        is_refinement: bool,
3359        encoder: &mut ProgressiveEncoder<W>,
3360    ) -> Result<()> {
3361        let blocks_per_mcu_y = (h_samp * v_samp) as usize;
3362        let mut y_idx = 0;
3363        let mut c_idx = 0;
3364
3365        for _mcu_row in 0..mcu_rows {
3366            for _mcu_col in 0..mcu_cols {
3367                // Encode Y blocks
3368                for _ in 0..blocks_per_mcu_y {
3369                    if is_refinement {
3370                        encoder.encode_dc_refine(&y_blocks[y_idx], scan.al)?;
3371                    } else {
3372                        encoder.encode_dc_first(&y_blocks[y_idx], 0, dc_luma, scan.al)?;
3373                    }
3374                    y_idx += 1;
3375                }
3376
3377                // Encode Cb
3378                if is_refinement {
3379                    encoder.encode_dc_refine(&cb_blocks[c_idx], scan.al)?;
3380                } else {
3381                    encoder.encode_dc_first(&cb_blocks[c_idx], 1, dc_chroma, scan.al)?;
3382                }
3383
3384                // Encode Cr
3385                if is_refinement {
3386                    encoder.encode_dc_refine(&cr_blocks[c_idx], scan.al)?;
3387                } else {
3388                    encoder.encode_dc_first(&cr_blocks[c_idx], 2, dc_chroma, scan.al)?;
3389                }
3390
3391                c_idx += 1;
3392            }
3393        }
3394
3395        Ok(())
3396    }
3397
3398    /// Encode an AC scan (Ss > 0).
3399    ///
3400    /// **IMPORTANT**: Progressive AC scans are always non-interleaved, meaning blocks
3401    /// must be encoded in component raster order (row-major within the component's
3402    /// block grid), NOT in MCU-interleaved order.
3403    ///
3404    /// For non-interleaved scans, the number of blocks is determined by the actual
3405    /// component dimensions (ceil(width/8) × ceil(height/8)), NOT the MCU-padded
3406    /// dimensions. This is different from interleaved DC scans which use MCU order.
3407    /// The padding blocks (beyond actual image dimensions) have DC coefficients but
3408    /// no AC coefficients - the decoder only outputs the actual image dimensions.
3409    ///
3410    /// Reference: ITU-T T.81 Section F.2.3 - "The scan data for a non-interleaved
3411    /// scan shall consist of a sequence of entropy-coded segments... The data units
3412    /// are processed in the order defined by the scan component."
3413    #[allow(clippy::too_many_arguments)]
3414    fn encode_ac_scan<W: Write>(
3415        &self,
3416        scan: &crate::types::ScanInfo,
3417        blocks: &[[i16; DCTSIZE2]],
3418        _mcu_rows: usize,
3419        mcu_cols: usize,
3420        h_samp: u8,
3421        v_samp: u8,
3422        comp_idx: usize,
3423        block_cols: usize,
3424        block_rows: usize,
3425        ac_table: &DerivedTable,
3426        is_refinement: bool,
3427        encoder: &mut ProgressiveEncoder<W>,
3428    ) -> Result<()> {
3429        // For Y component with subsampling, blocks are stored in MCU-interleaved order
3430        // but AC scans must encode them in component raster order.
3431        // For chroma components (1 block per MCU), the orders are identical.
3432        //
3433        // For non-interleaved scans, encode only the actual image blocks (block_rows × block_cols),
3434        // not all MCU-padded blocks. Padding blocks have DC coefficients but no AC coefficients.
3435
3436        let blocks_per_mcu = if comp_idx == 0 {
3437            (h_samp * v_samp) as usize
3438        } else {
3439            1
3440        };
3441
3442        if blocks_per_mcu == 1 {
3443            // Chroma or 4:4:4 Y: storage order = raster order
3444            let total_blocks = block_rows * block_cols;
3445            for block in blocks.iter().take(total_blocks) {
3446                if is_refinement {
3447                    encoder
3448                        .encode_ac_refine(block, scan.ss, scan.se, scan.ah, scan.al, ac_table)?;
3449                } else {
3450                    encoder.encode_ac_first(block, scan.ss, scan.se, scan.al, ac_table)?;
3451                }
3452            }
3453        } else {
3454            // Y component with subsampling (h_samp > 1 or v_samp > 1)
3455            // Convert from MCU-interleaved storage to component raster order
3456            let h = h_samp as usize;
3457            let v = v_samp as usize;
3458
3459            for block_row in 0..block_rows {
3460                for block_col in 0..block_cols {
3461                    // Convert raster position to MCU-interleaved storage index
3462                    let mcu_row = block_row / v;
3463                    let mcu_col = block_col / h;
3464                    let v_idx = block_row % v;
3465                    let h_idx = block_col % h;
3466                    let storage_idx = mcu_row * (mcu_cols * blocks_per_mcu)
3467                        + mcu_col * blocks_per_mcu
3468                        + v_idx * h
3469                        + h_idx;
3470
3471                    if is_refinement {
3472                        encoder.encode_ac_refine(
3473                            &blocks[storage_idx],
3474                            scan.ss,
3475                            scan.se,
3476                            scan.ah,
3477                            scan.al,
3478                            ac_table,
3479                        )?;
3480                    } else {
3481                        encoder.encode_ac_first(
3482                            &blocks[storage_idx],
3483                            scan.ss,
3484                            scan.se,
3485                            scan.al,
3486                            ac_table,
3487                        )?;
3488                    }
3489                }
3490            }
3491        }
3492
3493        Ok(())
3494    }
3495
3496    /// Count DC symbols for a progressive DC scan.
3497    #[allow(clippy::too_many_arguments)]
3498    fn count_dc_scan_symbols(
3499        &self,
3500        scan: &crate::types::ScanInfo,
3501        y_blocks: &[[i16; DCTSIZE2]],
3502        cb_blocks: &[[i16; DCTSIZE2]],
3503        cr_blocks: &[[i16; DCTSIZE2]],
3504        mcu_rows: usize,
3505        mcu_cols: usize,
3506        h_samp: u8,
3507        v_samp: u8,
3508        dc_luma_freq: &mut FrequencyCounter,
3509        dc_chroma_freq: &mut FrequencyCounter,
3510    ) {
3511        let blocks_per_mcu_y = (h_samp * v_samp) as usize;
3512        let mut y_idx = 0;
3513        let mut c_idx = 0;
3514        let mut counter = ProgressiveSymbolCounter::new();
3515
3516        for _mcu_row in 0..mcu_rows {
3517            for _mcu_col in 0..mcu_cols {
3518                // Y blocks
3519                for _ in 0..blocks_per_mcu_y {
3520                    counter.count_dc_first(&y_blocks[y_idx], 0, scan.al, dc_luma_freq);
3521                    y_idx += 1;
3522                }
3523                // Cb block
3524                counter.count_dc_first(&cb_blocks[c_idx], 1, scan.al, dc_chroma_freq);
3525                // Cr block
3526                counter.count_dc_first(&cr_blocks[c_idx], 2, scan.al, dc_chroma_freq);
3527                c_idx += 1;
3528            }
3529        }
3530    }
3531
3532    /// Count AC symbols for a progressive AC scan.
3533    ///
3534    /// Must iterate blocks in the same order as `encode_ac_scan` (component raster order)
3535    /// to ensure EOBRUN counts match and Huffman tables are correct.
3536    ///
3537    /// Uses actual block dimensions (not MCU-padded) for non-interleaved scans.
3538    #[allow(clippy::too_many_arguments)]
3539    fn count_ac_scan_symbols(
3540        &self,
3541        scan: &crate::types::ScanInfo,
3542        blocks: &[[i16; DCTSIZE2]],
3543        _mcu_rows: usize,
3544        mcu_cols: usize,
3545        h_samp: u8,
3546        v_samp: u8,
3547        comp_idx: usize,
3548        block_cols: usize,
3549        block_rows: usize,
3550        ac_freq: &mut FrequencyCounter,
3551    ) {
3552        let blocks_per_mcu = if comp_idx == 0 {
3553            (h_samp * v_samp) as usize
3554        } else {
3555            1
3556        };
3557
3558        let mut counter = ProgressiveSymbolCounter::new();
3559        let is_refinement = scan.ah != 0;
3560
3561        if blocks_per_mcu == 1 {
3562            // Chroma or 4:4:4 Y: storage order = raster order
3563            let total_blocks = block_rows * block_cols;
3564            for block in blocks.iter().take(total_blocks) {
3565                if is_refinement {
3566                    counter.count_ac_refine(block, scan.ss, scan.se, scan.ah, scan.al, ac_freq);
3567                } else {
3568                    counter.count_ac_first(block, scan.ss, scan.se, scan.al, ac_freq);
3569                }
3570            }
3571        } else {
3572            // Y component with subsampling - iterate in raster order (matching encode_ac_scan)
3573            let h = h_samp as usize;
3574            let v = v_samp as usize;
3575
3576            for block_row in 0..block_rows {
3577                for block_col in 0..block_cols {
3578                    // Convert raster position to MCU-interleaved storage index
3579                    let mcu_row = block_row / v;
3580                    let mcu_col = block_col / h;
3581                    let v_idx = block_row % v;
3582                    let h_idx = block_col % h;
3583                    let storage_idx = mcu_row * (mcu_cols * blocks_per_mcu)
3584                        + mcu_col * blocks_per_mcu
3585                        + v_idx * h
3586                        + h_idx;
3587
3588                    if is_refinement {
3589                        counter.count_ac_refine(
3590                            &blocks[storage_idx],
3591                            scan.ss,
3592                            scan.se,
3593                            scan.ah,
3594                            scan.al,
3595                            ac_freq,
3596                        );
3597                    } else {
3598                        counter.count_ac_first(
3599                            &blocks[storage_idx],
3600                            scan.ss,
3601                            scan.se,
3602                            scan.al,
3603                            ac_freq,
3604                        );
3605                    }
3606                }
3607            }
3608        }
3609
3610        // Flush any pending EOBRUN
3611        counter.finish_scan(Some(ac_freq));
3612    }
3613}
3614
3615// ============================================================================
3616// Encode Trait Implementation
3617// ============================================================================
3618
3619impl Encode for Encoder {
3620    fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
3621        self.encode_rgb(rgb_data, width, height)
3622    }
3623
3624    fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
3625        self.encode_gray(gray_data, width, height)
3626    }
3627}
3628
3629// Note: StreamingEncoder and EncodingStream are in the `streaming` module.
3630
3631// Add streaming() method to Encoder
3632impl Encoder {
3633    /// Create a streaming encoder.
3634    ///
3635    /// Returns a [`StreamingEncoder`] which supports scanline-by-scanline encoding.
3636    /// Note that streaming mode does NOT support trellis quantization, progressive
3637    /// mode, or Huffman optimization (these require buffering the entire image).
3638    ///
3639    /// For full-featured encoding with all mozjpeg optimizations, use [`Encoder::new(Preset)`]
3640    /// with [`encode_rgb()`](Encoder::encode_rgb) or [`encode_gray()`](Encoder::encode_gray).
3641    ///
3642    /// # Example
3643    ///
3644    /// ```ignore
3645    /// use mozjpeg_rs::Encoder;
3646    /// use std::fs::File;
3647    ///
3648    /// let file = File::create("output.jpg")?;
3649    /// let mut stream = Encoder::streaming()
3650    ///     .quality(85)
3651    ///     .start_rgb(1920, 1080, file)?;
3652    ///
3653    /// // Write scanlines...
3654    /// stream.finish()?;
3655    /// ```
3656    pub fn streaming() -> StreamingEncoder {
3657        StreamingEncoder::baseline_fastest()
3658    }
3659}
3660
3661// ============================================================================
3662// C mozjpeg encoding (optional feature)
3663// ============================================================================
3664
#[cfg(feature = "mozjpeg-sys-config")]
impl Encoder {
    /// Build a C mozjpeg encoder configured like this encoder.
    ///
    /// The returned [`CMozjpeg`](crate::CMozjpeg) carries a copy of this
    /// encoder's settings, so the same image can be encoded by the C library
    /// and by the Rust implementation for comparison. Custom quantization
    /// tables are not copied; only whether any were set is recorded.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use mozjpeg_rs::{Encoder, Preset};
    ///
    /// let pixels: Vec<u8> = vec![128; 64 * 64 * 3];
    /// let encoder = Encoder::new(Preset::ProgressiveBalanced).quality(85);
    ///
    /// // Encode with C mozjpeg
    /// let c_jpeg = encoder.to_c_mozjpeg().encode_rgb(&pixels, 64, 64)?;
    ///
    /// // Compare with Rust encoder
    /// let rust_jpeg = encoder.encode_rgb(&pixels, 64, 64)?;
    /// # Ok::<(), mozjpeg_rs::Error>(())
    /// ```
    pub fn to_c_mozjpeg(&self) -> crate::compat::CMozjpeg {
        // Only the presence of custom tables is forwarded, not their contents.
        let has_custom_qtables =
            self.custom_luma_qtable.is_some() || self.custom_chroma_qtable.is_some();

        crate::compat::CMozjpeg {
            // Quality and output format.
            quality: self.quality,
            force_baseline: self.force_baseline,
            subsampling: self.subsampling,
            progressive: self.progressive,
            // Compression optimizations.
            optimize_huffman: self.optimize_huffman,
            optimize_scans: self.optimize_scans,
            trellis: self.trellis,
            overshoot_deringing: self.overshoot_deringing,
            smoothing: self.smoothing,
            restart_interval: self.restart_interval,
            // Quantization tables.
            quant_table_idx: self.quant_table_idx,
            has_custom_qtables,
            // Metadata is cloned so the C encoder owns its own copy.
            exif_data: self.exif_data.clone(),
            icc_profile: self.icc_profile.clone(),
            custom_markers: self.custom_markers.clone(),
        }
    }
}
3708
3709/// Unit tests for private encoder internals.
3710/// Public API tests are in tests/encode_tests.rs.
3711#[cfg(test)]
3712mod tests {
3713    use super::*;
3714
3715    #[test]
3716    fn test_encoder_defaults() {
3717        // Default preset is ProgressiveBalanced
3718        let enc = Encoder::default();
3719        assert_eq!(enc.quality, 75);
3720        assert!(enc.progressive); // ProgressiveBalanced is progressive
3721        assert_eq!(enc.subsampling, Subsampling::S420);
3722        assert!(enc.trellis.enabled);
3723        assert!(enc.optimize_huffman);
3724        assert!(!enc.optimize_scans); // ProgressiveBalanced does NOT include optimize_scans
3725    }
3726
3727    #[test]
3728    fn test_encoder_presets() {
3729        let fastest = Encoder::new(Preset::BaselineFastest);
3730        assert!(!fastest.progressive);
3731        assert!(!fastest.trellis.enabled);
3732        assert!(!fastest.optimize_huffman);
3733
3734        let baseline = Encoder::new(Preset::BaselineBalanced);
3735        assert!(!baseline.progressive);
3736        assert!(baseline.trellis.enabled);
3737        assert!(baseline.optimize_huffman);
3738
3739        let prog_balanced = Encoder::new(Preset::ProgressiveBalanced);
3740        assert!(prog_balanced.progressive);
3741        assert!(prog_balanced.trellis.enabled);
3742        assert!(!prog_balanced.optimize_scans);
3743
3744        let prog_smallest = Encoder::new(Preset::ProgressiveSmallest);
3745        assert!(prog_smallest.progressive);
3746        assert!(prog_smallest.optimize_scans);
3747    }
3748
3749    #[test]
3750    fn test_encoder_builder_fields() {
3751        let enc = Encoder::baseline_optimized()
3752            .quality(90)
3753            .progressive(true)
3754            .subsampling(Subsampling::S444);
3755
3756        assert_eq!(enc.quality, 90);
3757        assert!(enc.progressive);
3758        assert_eq!(enc.subsampling, Subsampling::S444);
3759    }
3760
3761    #[test]
3762    fn test_quality_clamping() {
3763        let enc = Encoder::baseline_optimized().quality(0);
3764        assert_eq!(enc.quality, 1);
3765
3766        let enc = Encoder::baseline_optimized().quality(150);
3767        assert_eq!(enc.quality, 100);
3768    }
3769
3770    #[test]
3771    fn test_natural_to_zigzag() {
3772        let mut natural = [0u16; 64];
3773        for i in 0..64 {
3774            natural[i] = i as u16;
3775        }
3776        let zigzag = natural_to_zigzag(&natural);
3777
3778        assert_eq!(zigzag[0], 0);
3779        assert_eq!(zigzag[1], 1);
3780    }
3781
3782    #[test]
3783    fn test_max_compression_uses_all_optimizations() {
3784        let encoder = Encoder::max_compression();
3785        assert!(encoder.trellis.enabled);
3786        assert!(encoder.progressive);
3787        assert!(encoder.optimize_huffman);
3788        assert!(encoder.optimize_scans);
3789    }
3790
3791    #[test]
3792    fn test_encode_ycbcr_planar_444() {
3793        let width = 32u32;
3794        let height = 32u32;
3795
3796        // Create test image with gradient pattern
3797        let y_plane: Vec<u8> = (0..width * height)
3798            .map(|i| ((i % width) * 255 / width) as u8)
3799            .collect();
3800        let cb_plane: Vec<u8> = (0..width * height)
3801            .map(|i| ((i / width) * 255 / height) as u8)
3802            .collect();
3803        let cr_plane: Vec<u8> = vec![128u8; (width * height) as usize];
3804
3805        let encoder = Encoder::new(Preset::BaselineBalanced)
3806            .quality(85)
3807            .subsampling(Subsampling::S444);
3808
3809        let jpeg_data = encoder
3810            .encode_ycbcr_planar(&y_plane, &cb_plane, &cr_plane, width, height)
3811            .expect("encode_ycbcr_planar should succeed");
3812
3813        // Verify it's a valid JPEG
3814        assert!(jpeg_data.starts_with(&[0xFF, 0xD8, 0xFF])); // SOI + marker
3815        assert!(jpeg_data.ends_with(&[0xFF, 0xD9])); // EOI
3816        assert!(jpeg_data.len() > 200); // Reasonable size for 32x32
3817    }
3818
3819    #[test]
3820    fn test_encode_ycbcr_planar_420() {
3821        let width = 32u32;
3822        let height = 32u32;
3823
3824        // For 4:2:0, chroma planes are half resolution in each dimension
3825        let chroma_w = (width + 1) / 2;
3826        let chroma_h = (height + 1) / 2;
3827
3828        let y_plane: Vec<u8> = vec![128u8; (width * height) as usize];
3829        let cb_plane: Vec<u8> = vec![100u8; (chroma_w * chroma_h) as usize];
3830        let cr_plane: Vec<u8> = vec![150u8; (chroma_w * chroma_h) as usize];
3831
3832        let encoder = Encoder::new(Preset::BaselineBalanced)
3833            .quality(85)
3834            .subsampling(Subsampling::S420);
3835
3836        let jpeg_data = encoder
3837            .encode_ycbcr_planar(&y_plane, &cb_plane, &cr_plane, width, height)
3838            .expect("encode_ycbcr_planar with 4:2:0 should succeed");
3839
3840        // Verify it's a valid JPEG
3841        assert!(jpeg_data.starts_with(&[0xFF, 0xD8, 0xFF]));
3842        assert!(jpeg_data.ends_with(&[0xFF, 0xD9]));
3843    }
3844
3845    #[test]
3846    fn test_encode_ycbcr_planar_422() {
3847        let width = 32u32;
3848        let height = 32u32;
3849
3850        // For 4:2:2, chroma is half width, full height
3851        let chroma_w = (width + 1) / 2;
3852
3853        let y_plane: Vec<u8> = vec![128u8; (width * height) as usize];
3854        let cb_plane: Vec<u8> = vec![100u8; (chroma_w * height) as usize];
3855        let cr_plane: Vec<u8> = vec![150u8; (chroma_w * height) as usize];
3856
3857        let encoder = Encoder::new(Preset::BaselineBalanced)
3858            .quality(85)
3859            .subsampling(Subsampling::S422);
3860
3861        let jpeg_data = encoder
3862            .encode_ycbcr_planar(&y_plane, &cb_plane, &cr_plane, width, height)
3863            .expect("encode_ycbcr_planar with 4:2:2 should succeed");
3864
3865        assert!(jpeg_data.starts_with(&[0xFF, 0xD8, 0xFF]));
3866        assert!(jpeg_data.ends_with(&[0xFF, 0xD9]));
3867    }
3868
3869    #[test]
3870    fn test_encode_ycbcr_planar_wrong_size() {
3871        let width = 32u32;
3872        let height = 32u32;
3873
3874        // Correct Y plane but wrong chroma plane sizes for 4:2:0
3875        let y_plane: Vec<u8> = vec![128u8; (width * height) as usize];
3876        let cb_plane: Vec<u8> = vec![100u8; 10]; // Too small!
3877        let cr_plane: Vec<u8> = vec![150u8; 10]; // Too small!
3878
3879        let encoder = Encoder::new(Preset::BaselineBalanced)
3880            .quality(85)
3881            .subsampling(Subsampling::S420);
3882
3883        let result = encoder.encode_ycbcr_planar(&y_plane, &cb_plane, &cr_plane, width, height);
3884
3885        assert!(result.is_err());
3886    }
3887
3888    #[test]
3889    fn test_encode_ycbcr_planar_strided() {
3890        let width = 30u32; // Not a multiple of stride
3891        let height = 20u32;
3892        let y_stride = 32usize; // Stride with 2 bytes padding per row
3893
3894        // For 4:2:0, chroma is half resolution
3895        let chroma_width = 15usize;
3896        let chroma_height = 10usize;
3897        let cb_stride = 16usize; // Stride with 1 byte padding per row
3898
3899        // Create Y plane with stride (fill with gradient, padding with zeros)
3900        let mut y_plane = vec![0u8; y_stride * height as usize];
3901        for row in 0..height as usize {
3902            for col in 0..width as usize {
3903                y_plane[row * y_stride + col] = ((col * 255) / width as usize) as u8;
3904            }
3905        }
3906
3907        // Create chroma planes with stride
3908        let mut cb_plane = vec![0u8; cb_stride * chroma_height];
3909        let mut cr_plane = vec![0u8; cb_stride * chroma_height];
3910        for row in 0..chroma_height {
3911            for col in 0..chroma_width {
3912                cb_plane[row * cb_stride + col] = 100;
3913                cr_plane[row * cb_stride + col] = 150;
3914            }
3915        }
3916
3917        let encoder = Encoder::new(Preset::BaselineBalanced)
3918            .quality(85)
3919            .subsampling(Subsampling::S420);
3920
3921        let jpeg_data = encoder
3922            .encode_ycbcr_planar_strided(
3923                &y_plane, y_stride, &cb_plane, cb_stride, &cr_plane, cb_stride, width, height,
3924            )
3925            .expect("strided encoding should succeed");
3926
3927        // Verify it's a valid JPEG
3928        assert!(jpeg_data.starts_with(&[0xFF, 0xD8, 0xFF]));
3929        assert!(jpeg_data.ends_with(&[0xFF, 0xD9]));
3930    }
3931
3932    #[test]
3933    fn test_encode_ycbcr_planar_strided_matches_packed() {
3934        let width = 32u32;
3935        let height = 32u32;
3936
3937        // Create packed plane data
3938        let y_packed: Vec<u8> = (0..width * height).map(|i| (i % 256) as u8).collect();
3939        let chroma_w = (width + 1) / 2;
3940        let chroma_h = (height + 1) / 2;
3941        let cb_packed: Vec<u8> = vec![100u8; (chroma_w * chroma_h) as usize];
3942        let cr_packed: Vec<u8> = vec![150u8; (chroma_w * chroma_h) as usize];
3943
3944        let encoder = Encoder::new(Preset::BaselineBalanced)
3945            .quality(85)
3946            .subsampling(Subsampling::S420);
3947
3948        // Encode with packed API
3949        let jpeg_packed = encoder
3950            .encode_ycbcr_planar(&y_packed, &cb_packed, &cr_packed, width, height)
3951            .expect("packed encoding should succeed");
3952
3953        // Encode with strided API (stride == width means packed)
3954        let jpeg_strided = encoder
3955            .encode_ycbcr_planar_strided(
3956                &y_packed,
3957                width as usize,
3958                &cb_packed,
3959                chroma_w as usize,
3960                &cr_packed,
3961                chroma_w as usize,
3962                width,
3963                height,
3964            )
3965            .expect("strided encoding should succeed");
3966
3967        // Both should produce identical output
3968        assert_eq!(jpeg_packed, jpeg_strided);
3969    }
3970
3971    // =========================================================================
3972    // Resource Estimation Tests
3973    // =========================================================================
3974
3975    #[test]
3976    fn test_estimate_resources_basic() {
3977        let encoder = Encoder::new(Preset::BaselineBalanced);
3978        let estimate = encoder.estimate_resources(1920, 1080);
3979
3980        // Should have reasonable memory estimate (> input size)
3981        let input_size = 1920 * 1080 * 3;
3982        assert!(
3983            estimate.peak_memory_bytes > input_size,
3984            "Peak memory {} should exceed input size {}",
3985            estimate.peak_memory_bytes,
3986            input_size
3987        );
3988
3989        // Should have reasonable CPU cost (> 1.0 due to trellis)
3990        assert!(
3991            estimate.cpu_cost_multiplier > 1.0,
3992            "CPU cost {} should be > 1.0 for BaselineBalanced",
3993            estimate.cpu_cost_multiplier
3994        );
3995
3996        // Block count should match expected
3997        assert!(estimate.block_count > 0, "Block count should be > 0");
3998    }
3999
4000    #[test]
4001    fn test_estimate_resources_fastest_has_lower_cpu() {
4002        let fastest = Encoder::new(Preset::BaselineFastest);
4003        let balanced = Encoder::new(Preset::BaselineBalanced);
4004
4005        let est_fast = fastest.estimate_resources(512, 512);
4006        let est_balanced = balanced.estimate_resources(512, 512);
4007
4008        // Fastest should have lower CPU cost (no trellis)
4009        assert!(
4010            est_fast.cpu_cost_multiplier < est_balanced.cpu_cost_multiplier,
4011            "Fastest ({:.2}) should have lower CPU cost than Balanced ({:.2})",
4012            est_fast.cpu_cost_multiplier,
4013            est_balanced.cpu_cost_multiplier
4014        );
4015    }
4016
4017    #[test]
4018    fn test_estimate_resources_progressive_has_higher_cpu() {
4019        let baseline = Encoder::new(Preset::BaselineBalanced);
4020        let progressive = Encoder::new(Preset::ProgressiveBalanced);
4021
4022        let est_baseline = baseline.estimate_resources(512, 512);
4023        let est_prog = progressive.estimate_resources(512, 512);
4024
4025        // Progressive should have higher CPU cost (multiple scans)
4026        assert!(
4027            est_prog.cpu_cost_multiplier > est_baseline.cpu_cost_multiplier,
4028            "Progressive ({:.2}) should have higher CPU cost than Baseline ({:.2})",
4029            est_prog.cpu_cost_multiplier,
4030            est_baseline.cpu_cost_multiplier
4031        );
4032    }
4033
4034    #[test]
4035    fn test_estimate_resources_gray() {
4036        let encoder = Encoder::new(Preset::BaselineBalanced);
4037        let rgb_estimate = encoder.estimate_resources(512, 512);
4038        let gray_estimate = encoder.estimate_resources_gray(512, 512);
4039
4040        // Grayscale should use less memory (1 channel vs 3)
4041        assert!(
4042            gray_estimate.peak_memory_bytes < rgb_estimate.peak_memory_bytes,
4043            "Grayscale memory {} should be less than RGB {}",
4044            gray_estimate.peak_memory_bytes,
4045            rgb_estimate.peak_memory_bytes
4046        );
4047
4048        // Grayscale should have lower CPU cost
4049        assert!(
4050            gray_estimate.cpu_cost_multiplier < rgb_estimate.cpu_cost_multiplier,
4051            "Grayscale CPU {:.2} should be less than RGB {:.2}",
4052            gray_estimate.cpu_cost_multiplier,
4053            rgb_estimate.cpu_cost_multiplier
4054        );
4055    }
4056
4057    // =========================================================================
4058    // Resource Limit Tests
4059    // =========================================================================
4060
4061    #[test]
4062    fn test_dimension_limit_width() {
4063        let limits = Limits::default().max_width(100).max_height(100);
4064        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4065
4066        let pixels = vec![128u8; 200 * 50 * 3];
4067        let result = encoder.encode_rgb(&pixels, 200, 50);
4068
4069        assert!(matches!(result, Err(Error::DimensionLimitExceeded { .. })));
4070    }
4071
4072    #[test]
4073    fn test_dimension_limit_height() {
4074        let limits = Limits::default().max_width(100).max_height(100);
4075        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4076
4077        let pixels = vec![128u8; 50 * 200 * 3];
4078        let result = encoder.encode_rgb(&pixels, 50, 200);
4079
4080        assert!(matches!(result, Err(Error::DimensionLimitExceeded { .. })));
4081    }
4082
4083    #[test]
4084    fn test_dimension_limit_passes_when_within() {
4085        let limits = Limits::default().max_width(100).max_height(100);
4086        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4087
4088        let pixels = vec![128u8; 64 * 64 * 3];
4089        let result = encoder.encode_rgb(&pixels, 64, 64);
4090
4091        assert!(result.is_ok());
4092    }
4093
4094    #[test]
4095    fn test_allocation_limit() {
4096        let limits = Limits::default().max_alloc_bytes(1000); // Very small limit
4097        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4098
4099        let pixels = vec![128u8; 256 * 256 * 3];
4100        let result = encoder.encode_rgb(&pixels, 256, 256);
4101
4102        assert!(matches!(result, Err(Error::AllocationLimitExceeded { .. })));
4103    }
4104
4105    #[test]
4106    fn test_allocation_limit_passes_when_within() {
4107        let limits = Limits::default().max_alloc_bytes(10_000_000); // 10 MB limit
4108        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4109
4110        let pixels = vec![128u8; 64 * 64 * 3];
4111        let result = encoder.encode_rgb(&pixels, 64, 64);
4112
4113        assert!(result.is_ok());
4114    }
4115
4116    #[test]
4117    fn test_pixel_count_limit() {
4118        let limits = Limits::default().max_pixel_count(1000); // Very small limit
4119        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4120
4121        let pixels = vec![128u8; 64 * 64 * 3]; // 4096 pixels
4122        let result = encoder.encode_rgb(&pixels, 64, 64);
4123
4124        assert!(matches!(result, Err(Error::PixelCountExceeded { .. })));
4125    }
4126
4127    #[test]
4128    fn test_pixel_count_limit_passes_when_within() {
4129        let limits = Limits::default().max_pixel_count(10000); // 10000 pixels
4130        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4131
4132        let pixels = vec![128u8; 64 * 64 * 3]; // 4096 pixels
4133        let result = encoder.encode_rgb(&pixels, 64, 64);
4134
4135        assert!(result.is_ok());
4136    }
4137
4138    #[test]
4139    fn test_icc_profile_size_limit() {
4140        let limits = Limits::default().max_icc_profile_bytes(100);
4141        let encoder = Encoder::new(Preset::BaselineFastest)
4142            .limits(limits)
4143            .icc_profile(vec![0u8; 1000]); // 1000 byte ICC profile
4144
4145        let pixels = vec![128u8; 64 * 64 * 3];
4146        let result = encoder.encode_rgb(&pixels, 64, 64);
4147
4148        assert!(matches!(result, Err(Error::IccProfileTooLarge { .. })));
4149    }
4150
4151    #[test]
4152    fn test_icc_profile_size_limit_passes_when_within() {
4153        let limits = Limits::default().max_icc_profile_bytes(2000);
4154        let encoder = Encoder::new(Preset::BaselineFastest)
4155            .limits(limits)
4156            .icc_profile(vec![0u8; 1000]); // 1000 byte ICC profile
4157
4158        let pixels = vec![128u8; 64 * 64 * 3];
4159        let result = encoder.encode_rgb(&pixels, 64, 64);
4160
4161        assert!(result.is_ok());
4162    }
4163
4164    #[test]
4165    fn test_limits_disabled_by_default() {
4166        let encoder = Encoder::new(Preset::BaselineFastest);
4167        assert_eq!(encoder.limits, Limits::none());
4168    }
4169
4170    #[test]
4171    fn test_limits_has_limits() {
4172        assert!(!Limits::none().has_limits());
4173        assert!(Limits::default().max_width(100).has_limits());
4174        assert!(Limits::default().max_height(100).has_limits());
4175        assert!(Limits::default().max_pixel_count(1000).has_limits());
4176        assert!(Limits::default().max_alloc_bytes(1000).has_limits());
4177        assert!(Limits::default().max_icc_profile_bytes(1000).has_limits());
4178    }
4179
4180    // =========================================================================
4181    // Cancellation Tests
4182    // =========================================================================
4183
4184    #[test]
4185    fn test_cancellable_with_no_cancellation() {
4186        let encoder = Encoder::new(Preset::BaselineFastest);
4187        let pixels = vec![128u8; 64 * 64 * 3];
4188
4189        let result = encoder.encode_rgb_cancellable(&pixels, 64, 64, None, None);
4190
4191        assert!(result.is_ok());
4192    }
4193
4194    #[test]
4195    fn test_cancellable_immediate_cancel() {
4196        let encoder = Encoder::new(Preset::BaselineFastest);
4197        let pixels = vec![128u8; 64 * 64 * 3];
4198        let cancel = AtomicBool::new(true); // Already cancelled
4199
4200        let result = encoder.encode_rgb_cancellable(&pixels, 64, 64, Some(&cancel), None);
4201
4202        assert!(matches!(result, Err(Error::Cancelled)));
4203    }
4204
4205    #[test]
4206    fn test_cancellable_with_timeout() {
4207        let encoder = Encoder::new(Preset::BaselineFastest);
4208        let pixels = vec![128u8; 64 * 64 * 3];
4209
4210        // 10 second timeout - should complete well within this
4211        let result =
4212            encoder.encode_rgb_cancellable(&pixels, 64, 64, None, Some(Duration::from_secs(10)));
4213
4214        assert!(result.is_ok());
4215    }
4216
4217    #[test]
4218    fn test_cancellable_gray() {
4219        let encoder = Encoder::new(Preset::BaselineFastest);
4220        let pixels = vec![128u8; 64 * 64];
4221
4222        let result = encoder.encode_gray_cancellable(&pixels, 64, 64, None, None);
4223
4224        assert!(result.is_ok());
4225    }
4226
4227    #[test]
4228    fn test_cancellable_with_limits() {
4229        // Test that limits work in cancellable method too
4230        let limits = Limits::default().max_width(32);
4231        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4232
4233        let pixels = vec![128u8; 64 * 64 * 3];
4234        let result = encoder.encode_rgb_cancellable(&pixels, 64, 64, None, None);
4235
4236        assert!(matches!(result, Err(Error::DimensionLimitExceeded { .. })));
4237    }
4238
4239    #[test]
4240    fn test_cancellation_context_none() {
4241        let ctx = CancellationContext::none();
4242        assert!(ctx.check().is_ok());
4243    }
4244
4245    #[test]
4246    fn test_cancellation_context_with_cancel_flag() {
4247        use std::sync::atomic::Ordering;
4248
4249        let cancel = AtomicBool::new(false);
4250        let ctx = CancellationContext::new(Some(&cancel), None);
4251        assert!(ctx.check().is_ok());
4252
4253        cancel.store(true, Ordering::Relaxed);
4254        assert!(matches!(ctx.check(), Err(Error::Cancelled)));
4255    }
4256
4257    #[test]
4258    fn test_cancellation_context_with_expired_deadline() {
4259        // Create a deadline that's already passed
4260        let ctx = CancellationContext {
4261            cancel: None,
4262            deadline: Some(Instant::now() - Duration::from_secs(1)),
4263        };
4264
4265        assert!(matches!(ctx.check(), Err(Error::TimedOut)));
4266    }
4267
4268    #[test]
4269    fn test_dimension_exact_at_limit_passes() {
4270        // Dimensions exactly at limit should pass
4271        let limits = Limits::default().max_width(64).max_height(64);
4272        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4273
4274        let pixels = vec![128u8; 64 * 64 * 3];
4275        let result = encoder.encode_rgb(&pixels, 64, 64);
4276
4277        assert!(result.is_ok());
4278    }
4279
4280    #[test]
4281    fn test_pixel_count_exact_at_limit_passes() {
4282        // Pixel count exactly at limit should pass
4283        let limits = Limits::default().max_pixel_count(4096); // Exactly 64*64
4284        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4285
4286        let pixels = vec![128u8; 64 * 64 * 3];
4287        let result = encoder.encode_rgb(&pixels, 64, 64);
4288
4289        assert!(result.is_ok());
4290    }
4291
4292    #[test]
4293    fn test_multiple_limits_all_checked() {
4294        // Test that all limits are checked, not just the first
4295        let limits = Limits::default()
4296            .max_width(1000)
4297            .max_height(1000)
4298            .max_pixel_count(100); // This should fail
4299
4300        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4301        let pixels = vec![128u8; 64 * 64 * 3]; // 4096 pixels
4302
4303        let result = encoder.encode_rgb(&pixels, 64, 64);
4304        assert!(matches!(result, Err(Error::PixelCountExceeded { .. })));
4305    }
4306
4307    #[test]
4308    fn test_limits_with_grayscale() {
4309        let limits = Limits::default().max_pixel_count(100);
4310        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4311
4312        let pixels = vec![128u8; 64 * 64]; // Grayscale, 4096 pixels
4313        let result = encoder.encode_gray(&pixels, 64, 64);
4314
4315        assert!(matches!(result, Err(Error::PixelCountExceeded { .. })));
4316    }
4317
4318    #[test]
4319    fn test_estimate_resources_with_subsampling() {
4320        let encoder_444 = Encoder::new(Preset::BaselineBalanced).subsampling(Subsampling::S444);
4321        let encoder_420 = Encoder::new(Preset::BaselineBalanced).subsampling(Subsampling::S420);
4322
4323        let est_444 = encoder_444.estimate_resources(512, 512);
4324        let est_420 = encoder_420.estimate_resources(512, 512);
4325
4326        // 4:4:4 should use more memory than 4:2:0 (no chroma downsampling)
4327        assert!(
4328            est_444.peak_memory_bytes > est_420.peak_memory_bytes,
4329            "4:4:4 memory {} should exceed 4:2:0 memory {}",
4330            est_444.peak_memory_bytes,
4331            est_420.peak_memory_bytes
4332        );
4333    }
4334
4335    #[test]
4336    fn test_estimate_resources_block_count() {
4337        // With 4:2:0 subsampling (default): Y gets full blocks, chroma gets 1/4
4338        let encoder = Encoder::new(Preset::BaselineFastest);
4339
4340        // 64x64 image with 4:2:0:
4341        // Y blocks: 8x8 = 64
4342        // Chroma: 32x32 pixels, 4x4 blocks each = 16 per component
4343        // Total: 64 + 16 + 16 = 96
4344        let estimate = encoder.estimate_resources(64, 64);
4345        assert_eq!(estimate.block_count, 96);
4346
4347        // With 4:4:4 subsampling: all components get full blocks
4348        let encoder_444 = Encoder::new(Preset::BaselineFastest).subsampling(Subsampling::S444);
4349        let estimate_444 = encoder_444.estimate_resources(64, 64);
4350        // 64 blocks * 3 components = 192
4351        assert_eq!(estimate_444.block_count, 192);
4352    }
4353
4354    #[test]
4355    fn test_cancellable_gray_with_limits() {
4356        let limits = Limits::default().max_width(32);
4357        let encoder = Encoder::new(Preset::BaselineFastest).limits(limits);
4358
4359        let pixels = vec![128u8; 64 * 64];
4360        let result = encoder.encode_gray_cancellable(&pixels, 64, 64, None, None);
4361
4362        assert!(matches!(result, Err(Error::DimensionLimitExceeded { .. })));
4363    }
4364}