mozjpeg_rs/encode/
streaming.rs

1//! Streaming JPEG encoder.
2//!
3//! This module provides [`StreamingEncoder`] and [`EncodingStream`] for
4//! scanline-by-scanline encoding, which is memory-efficient for large images.
5
6use std::io::Write;
7
8use crate::consts::{QuantTableIdx, DCTSIZE, DCTSIZE2, JPEG_NATURAL_ORDER};
9use crate::error::{Error, Result};
10use crate::huffman::DerivedTable;
11use crate::marker::MarkerWriter;
12use crate::progressive::generate_baseline_scan;
13use crate::quant::{create_quant_tables, quantize_block};
14use crate::simd::SimdOps;
15use crate::types::{ComponentInfo, PixelDensity, QuantTable, Subsampling};
16
17use super::{
18    create_std_ac_chroma_table, create_std_ac_luma_table, create_std_dc_chroma_table,
19    create_std_dc_luma_table, try_alloc_vec, Encode,
20};
21
/// Streaming JPEG encoder configuration.
///
/// This encoder supports scanline-by-scanline encoding, which is memory-efficient
/// for large images. It does NOT support trellis quantization, progressive mode,
/// or Huffman optimization (these require buffering the entire image).
///
/// Use [`Encoder`](super::Encoder) for full-featured batch encoding with optimizations.
///
/// The type is a by-value builder: every setter takes `self` and returns the updated
/// configuration, and [`start_rgb()`](Self::start_rgb) /
/// [`start_gray()`](Self::start_gray) consume it to open an [`EncodingStream`].
///
/// # Example
///
/// ```ignore
/// use mozjpeg_rs::Encoder;
///
/// // Create streaming encoder
/// let mut stream = Encoder::streaming()
///     .quality(85)
///     .start_rgb(1920, 1080, output_file)?;
///
/// // Write scanlines. Any whole number of rows is accepted; rows are buffered
/// // until a full MCU row (8 or 16 lines, depending on subsampling) is available,
/// // so chunks that are a multiple of that height fill the buffer exactly.
/// for chunk in rgb_scanlines.chunks(16 * 1920 * 3) {
///     stream.write_scanlines(chunk)?;
/// }
///
/// // Finalize the JPEG
/// stream.finish()?;
/// ```
#[derive(Debug, Clone)]
pub struct StreamingEncoder {
    /// Quality level (1-100). The `quality()` setter clamps its argument into
    /// this range.
    quality: u8,
    /// Chroma subsampling mode. Only consulted for 3-component streams; a
    /// grayscale stream always uses a single 8x8 block per MCU.
    subsampling: Subsampling,
    /// Quantization table variant passed to `create_quant_tables`.
    quant_table_idx: QuantTableIdx,
    /// Custom luminance quantization table. When set, entry `i` replaces the
    /// generated table's value at index `JPEG_NATURAL_ORDER[i]`.
    custom_luma_qtable: Option<[u16; DCTSIZE2]>,
    /// Custom chrominance quantization table, applied the same way as the
    /// luminance one.
    custom_chroma_qtable: Option<[u16; DCTSIZE2]>,
    /// Force baseline-compatible output. When `false`, the DQT segments are
    /// written with the 16-bit flag set.
    force_baseline: bool,
    /// Restart interval in MCUs (0 = disabled). A DRI marker is written only
    /// when this is non-zero.
    restart_interval: u16,
    /// Pixel density for JFIF APP0 marker
    pixel_density: PixelDensity,
    /// EXIF data to embed. `None` when no data, or only empty data, was supplied.
    exif_data: Option<Vec<u8>>,
    /// ICC color profile to embed. `None` when no profile, or only an empty
    /// profile, was supplied.
    icc_profile: Option<Vec<u8>>,
    /// Custom APP markers to embed, as `(APP number, payload)` pairs in the
    /// order they were added.
    custom_markers: Vec<(u8, Vec<u8>)>,
    /// SIMD operations dispatch
    simd: SimdOps,
}
75
76impl Default for StreamingEncoder {
77    fn default() -> Self {
78        Self::baseline_fastest()
79    }
80}
81
82impl StreamingEncoder {
83    /// Create a streaming encoder with fastest settings.
84    ///
85    /// This matches [`Preset::BaselineFastest`](crate::Preset::BaselineFastest) but for streaming.
86    ///
87    /// Streaming mode does NOT support any optimizations that require buffering
88    /// the entire image:
89    /// - No trellis quantization (requires global context)
90    /// - No progressive mode (requires multiple passes)
91    /// - No Huffman optimization (requires 2-pass)
92    ///
93    /// # Example
94    ///
95    /// ```ignore
96    /// use mozjpeg_rs::StreamingEncoder;
97    ///
98    /// let mut stream = StreamingEncoder::baseline_fastest()
99    ///     .quality(85)
100    ///     .start_rgb(1920, 1080, output_file)?;
101    /// ```
102    pub fn baseline_fastest() -> Self {
103        Self {
104            quality: 75,
105            subsampling: Subsampling::S420,
106            quant_table_idx: QuantTableIdx::ImageMagick,
107            custom_luma_qtable: None,
108            custom_chroma_qtable: None,
109            force_baseline: true,
110            restart_interval: 0,
111            pixel_density: PixelDensity::default(),
112            exif_data: None,
113            icc_profile: None,
114            custom_markers: Vec::new(),
115            simd: SimdOps::detect(),
116        }
117    }
118
119    /// Set quality level (1-100).
120    pub fn quality(mut self, quality: u8) -> Self {
121        self.quality = quality.clamp(1, 100);
122        self
123    }
124
125    /// Set chroma subsampling mode.
126    pub fn subsampling(mut self, mode: Subsampling) -> Self {
127        self.subsampling = mode;
128        self
129    }
130
131    /// Set quantization table variant.
132    pub fn quant_tables(mut self, idx: QuantTableIdx) -> Self {
133        self.quant_table_idx = idx;
134        self
135    }
136
137    /// Force baseline-compatible output.
138    pub fn force_baseline(mut self, enable: bool) -> Self {
139        self.force_baseline = enable;
140        self
141    }
142
143    /// Set restart interval in MCUs.
144    pub fn restart_interval(mut self, interval: u16) -> Self {
145        self.restart_interval = interval;
146        self
147    }
148
149    /// Set pixel density for the JFIF APP0 marker.
150    pub fn pixel_density(mut self, density: PixelDensity) -> Self {
151        self.pixel_density = density;
152        self
153    }
154
155    /// Set EXIF data to embed.
156    pub fn exif_data(mut self, data: Vec<u8>) -> Self {
157        self.exif_data = if data.is_empty() { None } else { Some(data) };
158        self
159    }
160
161    /// Set ICC color profile to embed.
162    pub fn icc_profile(mut self, profile: Vec<u8>) -> Self {
163        self.icc_profile = if profile.is_empty() {
164            None
165        } else {
166            Some(profile)
167        };
168        self
169    }
170
171    /// Add a custom APP marker.
172    pub fn add_marker(mut self, app_num: u8, data: Vec<u8>) -> Self {
173        if app_num <= 15 && !data.is_empty() {
174            self.custom_markers.push((app_num, data));
175        }
176        self
177    }
178
179    /// Set custom luminance quantization table.
180    pub fn custom_luma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
181        self.custom_luma_qtable = Some(table);
182        self
183    }
184
185    /// Set custom chrominance quantization table.
186    pub fn custom_chroma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
187        self.custom_chroma_qtable = Some(table);
188        self
189    }
190
191    /// Start streaming RGB encoding to a writer.
192    ///
193    /// # Arguments
194    /// * `width` - Image width in pixels
195    /// * `height` - Image height in pixels
196    /// * `writer` - Output writer
197    ///
198    /// # Returns
199    /// An [`EncodingStream`] that accepts scanlines.
200    pub fn start_rgb<W: Write>(
201        self,
202        width: u32,
203        height: u32,
204        writer: W,
205    ) -> Result<EncodingStream<W>> {
206        EncodingStream::new_rgb(self, width, height, writer)
207    }
208
209    /// Start streaming grayscale encoding to a writer.
210    ///
211    /// # Arguments
212    /// * `width` - Image width in pixels
213    /// * `height` - Image height in pixels
214    /// * `writer` - Output writer
215    ///
216    /// # Returns
217    /// An [`EncodingStream`] that accepts scanlines.
218    pub fn start_gray<W: Write>(
219        self,
220        width: u32,
221        height: u32,
222        writer: W,
223    ) -> Result<EncodingStream<W>> {
224        EncodingStream::new_gray(self, width, height, writer)
225    }
226}
227
228/// Implement batch encoding for StreamingEncoder (without optimizations).
229impl Encode for StreamingEncoder {
230    fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
231        let mut output = Vec::new();
232        let mut stream = self.clone().start_rgb(width, height, &mut output)?;
233        stream.write_scanlines(rgb_data)?;
234        stream.finish()?;
235        Ok(output)
236    }
237
238    fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
239        let mut output = Vec::new();
240        let mut stream = self.clone().start_gray(width, height, &mut output)?;
241        stream.write_scanlines(gray_data)?;
242        stream.finish()?;
243        Ok(output)
244    }
245}
246
247// ============================================================================
248// EncodingStream - Active Streaming Session
249// ============================================================================
250
/// Active streaming encoding session.
///
/// Created by [`StreamingEncoder::start_rgb()`] or [`StreamingEncoder::start_gray()`].
/// Accepts scanlines via [`write_scanlines()`](Self::write_scanlines) and must be
/// finalized with [`finish()`](Self::finish).
///
/// All JPEG headers up to and including the SOS marker are written when the
/// stream is created. The image height goes into the frame header at that point
/// but is not kept in this struct.
pub struct EncodingStream<W: Write> {
    /// Output writer wrapped in marker writer
    writer: MarkerWriter<W>,
    /// Image width in pixels
    width: u32,
    /// Number of color components (1 for gray, 3 for RGB/YCbCr)
    num_components: u8,
    /// Bytes per input pixel; initialised to the same value as `num_components`
    bytes_per_pixel: u8,
    /// Chroma subsampling mode
    subsampling: Subsampling,
    /// MCU height in pixels (8 or 16 depending on subsampling)
    mcu_height: u32,
    /// MCU width in pixels (8 or 16 depending on subsampling)
    mcu_width: u32,
    /// Number of MCUs per row, i.e. `width / mcu_width` rounded up
    mcus_per_row: u32,
    /// Luminance quantization table (zigzag order)
    luma_qtable: QuantTable,
    /// Chrominance quantization table (zigzag order)
    chroma_qtable: QuantTable,
    /// DC Huffman table for luminance
    dc_luma_table: DerivedTable,
    /// AC Huffman table for luminance
    ac_luma_table: DerivedTable,
    /// DC Huffman table for chrominance
    dc_chroma_table: DerivedTable,
    /// AC Huffman table for chrominance
    ac_chroma_table: DerivedTable,
    /// Previous DC values for differential encoding: index 0 is luma, 1 is Cb,
    /// 2 is Cr
    prev_dc: [i32; 4],
    /// Scanlines accumulated for current MCU row
    /// (`mcu_height * width * num_components` bytes)
    scanline_buffer: Vec<u8>,
    /// Lines accumulated in buffer; reset to 0 after each encoded MCU row
    lines_in_buffer: u32,
    /// Bit buffer for entropy encoding
    bit_buffer: u64,
    /// Bits used in bit_buffer
    bits_in_buffer: u8,
    /// SIMD operations dispatch
    simd: SimdOps,
    /// Restart interval in MCUs (0 = restart markers disabled)
    restart_interval: u16,
    /// MCUs since last restart marker
    mcus_since_restart: u16,
    /// Next restart marker number (0-7)
    next_restart_num: u8,
}
304
305impl<W: Write> EncodingStream<W> {
306    /// Create a new RGB encoding stream.
307    fn new_rgb(config: StreamingEncoder, width: u32, height: u32, writer: W) -> Result<Self> {
308        Self::new(config, width, height, 3, writer)
309    }
310
311    /// Create a new grayscale encoding stream.
312    fn new_gray(config: StreamingEncoder, width: u32, height: u32, writer: W) -> Result<Self> {
313        Self::new(config, width, height, 1, writer)
314    }
315
316    /// Create a new encoding stream.
317    fn new(
318        config: StreamingEncoder,
319        width: u32,
320        height: u32,
321        num_components: u8,
322        writer: W,
323    ) -> Result<Self> {
324        // Validate dimensions
325        if width == 0 || height == 0 {
326            return Err(Error::InvalidDimensions { width, height });
327        }
328
329        // Determine MCU dimensions based on subsampling
330        let (mcu_width, mcu_height) = if num_components == 1 {
331            (DCTSIZE as u32, DCTSIZE as u32)
332        } else {
333            match config.subsampling {
334                Subsampling::S444 | Subsampling::Gray => (DCTSIZE as u32, DCTSIZE as u32),
335                Subsampling::S422 => (DCTSIZE as u32 * 2, DCTSIZE as u32),
336                Subsampling::S420 => (DCTSIZE as u32 * 2, DCTSIZE as u32 * 2),
337                Subsampling::S440 => (DCTSIZE as u32, DCTSIZE as u32 * 2),
338            }
339        };
340
341        let mcus_per_row = (width + mcu_width - 1) / mcu_width;
342
343        // Create quantization tables
344        let (luma_qtable, chroma_qtable) = create_quant_tables(
345            config.quality,
346            config.quant_table_idx,
347            config.force_baseline,
348        );
349
350        // Apply custom tables if specified
351        let luma_qtable = if let Some(custom) = config.custom_luma_qtable {
352            let mut values = luma_qtable.values;
353            for (i, &val) in custom.iter().enumerate() {
354                values[JPEG_NATURAL_ORDER[i] as usize] = val;
355            }
356            QuantTable::new(values)
357        } else {
358            luma_qtable
359        };
360
361        let chroma_qtable = if let Some(custom) = config.custom_chroma_qtable {
362            let mut values = chroma_qtable.values;
363            for (i, &val) in custom.iter().enumerate() {
364                values[JPEG_NATURAL_ORDER[i] as usize] = val;
365            }
366            QuantTable::new(values)
367        } else {
368            chroma_qtable
369        };
370
371        // Create standard Huffman tables (no optimization in streaming mode)
372        let dc_luma_htable = create_std_dc_luma_table();
373        let ac_luma_htable = create_std_ac_luma_table();
374        let dc_chroma_htable = create_std_dc_chroma_table();
375        let ac_chroma_htable = create_std_ac_chroma_table();
376
377        // Derive encoding tables
378        let dc_luma_table = DerivedTable::from_huff_table(&dc_luma_htable, true)?;
379        let ac_luma_table = DerivedTable::from_huff_table(&ac_luma_htable, false)?;
380        let dc_chroma_table = DerivedTable::from_huff_table(&dc_chroma_htable, true)?;
381        let ac_chroma_table = DerivedTable::from_huff_table(&ac_chroma_htable, false)?;
382
383        // Allocate scanline buffer for one MCU row
384        let buffer_size = (mcu_height as usize) * (width as usize) * (num_components as usize);
385        let scanline_buffer = try_alloc_vec(0u8, buffer_size)?;
386
387        let mut marker_writer = MarkerWriter::new(writer);
388
389        // Write JPEG headers
390        marker_writer.write_soi()?;
391        marker_writer.write_jfif_app0(
392            config.pixel_density.unit as u8,
393            config.pixel_density.x,
394            config.pixel_density.y,
395        )?;
396
397        // Write EXIF data if provided
398        if let Some(ref exif) = config.exif_data {
399            marker_writer.write_app1_exif(exif)?;
400        }
401
402        // Write ICC profile if provided
403        if let Some(ref icc) = config.icc_profile {
404            marker_writer.write_icc_profile(icc)?;
405        }
406
407        // Write custom markers
408        for (app_num, data) in &config.custom_markers {
409            marker_writer.write_app(*app_num, data)?;
410        }
411
412        // Write quantization tables
413        let use_16bit = !config.force_baseline;
414        if num_components == 1 {
415            marker_writer.write_dqt(0, &luma_qtable.values, use_16bit)?;
416        } else {
417            marker_writer.write_dqt_multiple(&[
418                (0, &luma_qtable.values, use_16bit),
419                (1, &chroma_qtable.values, use_16bit),
420            ])?;
421        }
422
423        // Write frame header
424        let components: Vec<ComponentInfo> = if num_components == 1 {
425            vec![ComponentInfo {
426                component_id: 1,
427                component_index: 0,
428                h_samp_factor: 1,
429                v_samp_factor: 1,
430                quant_tbl_no: 0,
431                dc_tbl_no: 0,
432                ac_tbl_no: 0,
433            }]
434        } else {
435            let (h_samp, v_samp) = match config.subsampling {
436                Subsampling::S444 | Subsampling::Gray => (1, 1),
437                Subsampling::S422 => (2, 1),
438                Subsampling::S420 => (2, 2),
439                Subsampling::S440 => (1, 2),
440            };
441            vec![
442                ComponentInfo {
443                    component_id: 1,
444                    component_index: 0,
445                    h_samp_factor: h_samp,
446                    v_samp_factor: v_samp,
447                    quant_tbl_no: 0,
448                    dc_tbl_no: 0,
449                    ac_tbl_no: 0,
450                },
451                ComponentInfo {
452                    component_id: 2,
453                    component_index: 1,
454                    h_samp_factor: 1,
455                    v_samp_factor: 1,
456                    quant_tbl_no: 1,
457                    dc_tbl_no: 1,
458                    ac_tbl_no: 1,
459                },
460                ComponentInfo {
461                    component_id: 3,
462                    component_index: 2,
463                    h_samp_factor: 1,
464                    v_samp_factor: 1,
465                    quant_tbl_no: 1,
466                    dc_tbl_no: 1,
467                    ac_tbl_no: 1,
468                },
469            ]
470        };
471
472        // Always baseline (not progressive) for streaming
473        marker_writer.write_sof(false, 8, height as u16, width as u16, &components)?;
474
475        // Write Huffman tables
476        if num_components == 1 {
477            marker_writer
478                .write_dht_multiple(&[(0, false, &dc_luma_htable), (0, true, &ac_luma_htable)])?;
479        } else {
480            marker_writer.write_dht_multiple(&[
481                (0, false, &dc_luma_htable),
482                (0, true, &ac_luma_htable),
483                (1, false, &dc_chroma_htable),
484                (1, true, &ac_chroma_htable),
485            ])?;
486        }
487
488        // Write restart interval if specified
489        if config.restart_interval > 0 {
490            marker_writer.write_dri(config.restart_interval)?;
491        }
492
493        // Write SOS marker
494        let scans = generate_baseline_scan(num_components);
495        marker_writer.write_sos(&scans[0], &components)?;
496
497        Ok(Self {
498            writer: marker_writer,
499            width,
500            num_components,
501            bytes_per_pixel: num_components,
502            subsampling: config.subsampling,
503            mcu_height,
504            mcu_width,
505            mcus_per_row,
506            luma_qtable,
507            chroma_qtable,
508            dc_luma_table,
509            ac_luma_table,
510            dc_chroma_table,
511            ac_chroma_table,
512            prev_dc: [0; 4],
513            scanline_buffer,
514            lines_in_buffer: 0,
515            bit_buffer: 0,
516            bits_in_buffer: 0,
517            simd: config.simd,
518            restart_interval: config.restart_interval,
519            mcus_since_restart: 0,
520            next_restart_num: 0,
521        })
522    }
523
524    /// Write scanlines to the encoder.
525    ///
526    /// Scanlines are buffered until a complete MCU row is available, then encoded.
527    /// The number of bytes should be `num_lines * width * bytes_per_pixel`.
528    ///
529    /// For best performance, write in multiples of the MCU height (8 or 16 lines).
530    pub fn write_scanlines(&mut self, data: &[u8]) -> Result<()> {
531        let bytes_per_line = self.width as usize * self.bytes_per_pixel as usize;
532        let lines_in_data = data.len() / bytes_per_line;
533
534        if data.len() != lines_in_data * bytes_per_line {
535            return Err(Error::BufferSizeMismatch {
536                expected: lines_in_data * bytes_per_line,
537                actual: data.len(),
538            });
539        }
540
541        let mut data_offset = 0;
542        let mut lines_remaining = lines_in_data as u32;
543
544        while lines_remaining > 0 {
545            // How many lines can we fit in the buffer?
546            let lines_to_copy =
547                (self.mcu_height - self.lines_in_buffer).min(lines_remaining) as usize;
548
549            // Copy lines to buffer
550            let buffer_offset = self.lines_in_buffer as usize * bytes_per_line;
551            let src_bytes = lines_to_copy * bytes_per_line;
552            self.scanline_buffer[buffer_offset..buffer_offset + src_bytes]
553                .copy_from_slice(&data[data_offset..data_offset + src_bytes]);
554
555            self.lines_in_buffer += lines_to_copy as u32;
556            data_offset += src_bytes;
557            lines_remaining -= lines_to_copy as u32;
558
559            // If buffer is full, encode the MCU row
560            if self.lines_in_buffer == self.mcu_height {
561                self.encode_mcu_row()?;
562                self.lines_in_buffer = 0;
563            }
564        }
565
566        Ok(())
567    }
568
569    /// Encode one complete MCU row from the scanline buffer.
570    fn encode_mcu_row(&mut self) -> Result<()> {
571        let width = self.width as usize;
572        let mcu_height = self.mcu_height as usize;
573
574        if self.num_components == 1 {
575            self.encode_gray_mcu_row(width, mcu_height)?;
576        } else {
577            self.encode_color_mcu_row(width, mcu_height)?;
578        }
579
580        Ok(())
581    }
582
583    /// Encode a grayscale MCU row.
584    fn encode_gray_mcu_row(&mut self, width: usize, mcu_height: usize) -> Result<()> {
585        let mcus_per_row = self.mcus_per_row as usize;
586
587        for mcu_x in 0..mcus_per_row {
588            // Handle restart markers
589            if self.restart_interval > 0 && self.mcus_since_restart == self.restart_interval {
590                self.write_restart_marker()?;
591            }
592
593            // Extract 8x8 block
594            let mut block = [0i16; DCTSIZE2];
595            let x_start = mcu_x * DCTSIZE;
596
597            for y in 0..DCTSIZE {
598                let src_y = y.min(mcu_height - 1);
599                for x in 0..DCTSIZE {
600                    let src_x = (x_start + x).min(width - 1);
601                    let pixel = self.scanline_buffer[src_y * width + src_x];
602                    // Level shift: 0..255 -> -128..127
603                    block[y * DCTSIZE + x] = pixel as i16 - 128;
604                }
605            }
606
607            // Forward DCT
608            let mut dct_block = [0i16; DCTSIZE2];
609            (self.simd.forward_dct)(&block, &mut dct_block);
610
611            // Convert to i32 for quantization
612            let mut dct_i32 = [0i32; DCTSIZE2];
613            for i in 0..DCTSIZE2 {
614                dct_i32[i] = dct_block[i] as i32;
615            }
616
617            // Quantize
618            let mut quantized = [0i16; DCTSIZE2];
619            quantize_block(&dct_i32, &self.luma_qtable.values, &mut quantized);
620
621            // Encode DC coefficient (differential)
622            let dc = quantized[0] as i32;
623            let dc_diff = dc - self.prev_dc[0];
624            self.prev_dc[0] = dc;
625
626            // Clone tables to avoid borrow conflicts
627            let dc_table = self.dc_luma_table.clone();
628            let ac_table = self.ac_luma_table.clone();
629            self.encode_dc(dc_diff, &dc_table)?;
630            self.encode_ac(&quantized, &ac_table)?;
631
632            if self.restart_interval > 0 {
633                self.mcus_since_restart += 1;
634            }
635        }
636
637        Ok(())
638    }
639
640    /// Encode a color (YCbCr) MCU row.
641    fn encode_color_mcu_row(&mut self, width: usize, mcu_height: usize) -> Result<()> {
642        let mcus_per_row = self.mcus_per_row as usize;
643
644        // Temporary storage for Y, Cb, Cr planes
645        let mcu_width = self.mcu_width as usize;
646        let mut y_plane = vec![0i16; mcu_width * mcu_height];
647        let mut cb_plane = vec![0i16; mcu_width * mcu_height];
648        let mut cr_plane = vec![0i16; mcu_width * mcu_height];
649
650        for mcu_x in 0..mcus_per_row {
651            // Handle restart markers
652            if self.restart_interval > 0 && self.mcus_since_restart == self.restart_interval {
653                self.write_restart_marker()?;
654            }
655
656            let x_start = mcu_x * mcu_width;
657
658            // Convert RGB to YCbCr for this MCU
659            for y in 0..mcu_height {
660                let src_y = y.min(mcu_height - 1);
661                for x in 0..mcu_width {
662                    let src_x = (x_start + x).min(width - 1);
663                    let pixel_idx = (src_y * width + src_x) * 3;
664
665                    let r = self.scanline_buffer[pixel_idx] as i32;
666                    let g = self.scanline_buffer[pixel_idx + 1] as i32;
667                    let b = self.scanline_buffer[pixel_idx + 2] as i32;
668
669                    // RGB to YCbCr conversion (BT.601)
670                    let y_val = ((77 * r + 150 * g + 29 * b + 128) >> 8) - 128;
671                    let cb_val = (-43 * r - 85 * g + 128 * b + 128) >> 8;
672                    let cr_val = (128 * r - 107 * g - 21 * b + 128) >> 8;
673
674                    let idx = y * mcu_width + x;
675                    y_plane[idx] = y_val as i16;
676                    cb_plane[idx] = cb_val as i16;
677                    cr_plane[idx] = cr_val as i16;
678                }
679            }
680
681            // Encode Y blocks
682            match self.subsampling {
683                Subsampling::S444 | Subsampling::Gray => {
684                    // One 8x8 Y block
685                    self.encode_luma_block(&y_plane, mcu_width, 0, 0)?;
686                }
687                Subsampling::S422 => {
688                    // Two 8x8 Y blocks horizontally
689                    self.encode_luma_block(&y_plane, mcu_width, 0, 0)?;
690                    self.encode_luma_block(&y_plane, mcu_width, DCTSIZE, 0)?;
691                }
692                Subsampling::S420 => {
693                    // Four 8x8 Y blocks (2x2)
694                    self.encode_luma_block(&y_plane, mcu_width, 0, 0)?;
695                    self.encode_luma_block(&y_plane, mcu_width, DCTSIZE, 0)?;
696                    self.encode_luma_block(&y_plane, mcu_width, 0, DCTSIZE)?;
697                    self.encode_luma_block(&y_plane, mcu_width, DCTSIZE, DCTSIZE)?;
698                }
699                Subsampling::S440 => {
700                    // Two 8x8 Y blocks vertically
701                    self.encode_luma_block(&y_plane, mcu_width, 0, 0)?;
702                    self.encode_luma_block(&y_plane, mcu_width, 0, DCTSIZE)?;
703                }
704            }
705
706            // Downsample and encode Cb, Cr
707            self.encode_chroma_block(&cb_plane, mcu_width, 1)?;
708            self.encode_chroma_block(&cr_plane, mcu_width, 2)?;
709
710            if self.restart_interval > 0 {
711                self.mcus_since_restart += 1;
712            }
713        }
714
715        Ok(())
716    }
717
718    /// Encode a single 8x8 luma block from the Y plane.
719    fn encode_luma_block(
720        &mut self,
721        y_plane: &[i16],
722        plane_width: usize,
723        x_off: usize,
724        y_off: usize,
725    ) -> Result<()> {
726        let mut block = [0i16; DCTSIZE2];
727
728        for y in 0..DCTSIZE {
729            for x in 0..DCTSIZE {
730                block[y * DCTSIZE + x] = y_plane[(y_off + y) * plane_width + x_off + x];
731            }
732        }
733
734        let mut dct_block = [0i16; DCTSIZE2];
735        (self.simd.forward_dct)(&block, &mut dct_block);
736
737        // Convert to i32 for quantization
738        let mut dct_i32 = [0i32; DCTSIZE2];
739        for i in 0..DCTSIZE2 {
740            dct_i32[i] = dct_block[i] as i32;
741        }
742
743        let mut quantized = [0i16; DCTSIZE2];
744        quantize_block(&dct_i32, &self.luma_qtable.values, &mut quantized);
745
746        let dc = quantized[0] as i32;
747        let dc_diff = dc - self.prev_dc[0];
748        self.prev_dc[0] = dc;
749
750        // Clone tables to avoid borrow conflicts
751        let dc_table = self.dc_luma_table.clone();
752        let ac_table = self.ac_luma_table.clone();
753        self.encode_dc(dc_diff, &dc_table)?;
754        self.encode_ac(&quantized, &ac_table)?;
755
756        Ok(())
757    }
758
759    /// Encode a chroma block with downsampling.
760    fn encode_chroma_block(
761        &mut self,
762        chroma_plane: &[i16],
763        plane_width: usize,
764        comp_idx: usize,
765    ) -> Result<()> {
766        let mut block = [0i16; DCTSIZE2];
767
768        // Downsample based on subsampling mode
769        match self.subsampling {
770            Subsampling::S444 | Subsampling::Gray => {
771                // No downsampling
772                for y in 0..DCTSIZE {
773                    for x in 0..DCTSIZE {
774                        block[y * DCTSIZE + x] = chroma_plane[y * plane_width + x];
775                    }
776                }
777            }
778            Subsampling::S422 => {
779                // 2:1 horizontal downsampling
780                for y in 0..DCTSIZE {
781                    for x in 0..DCTSIZE {
782                        let x2 = x * 2;
783                        let val = (chroma_plane[y * plane_width + x2] as i32
784                            + chroma_plane[y * plane_width + x2 + 1] as i32)
785                            / 2;
786                        block[y * DCTSIZE + x] = val as i16;
787                    }
788                }
789            }
790            Subsampling::S420 => {
791                // 2:1 horizontal and vertical downsampling
792                for y in 0..DCTSIZE {
793                    for x in 0..DCTSIZE {
794                        let x2 = x * 2;
795                        let y2 = y * 2;
796                        let val = (chroma_plane[y2 * plane_width + x2] as i32
797                            + chroma_plane[y2 * plane_width + x2 + 1] as i32
798                            + chroma_plane[(y2 + 1) * plane_width + x2] as i32
799                            + chroma_plane[(y2 + 1) * plane_width + x2 + 1] as i32)
800                            / 4;
801                        block[y * DCTSIZE + x] = val as i16;
802                    }
803                }
804            }
805            Subsampling::S440 => {
806                // 2:1 vertical downsampling
807                for y in 0..DCTSIZE {
808                    for x in 0..DCTSIZE {
809                        let y2 = y * 2;
810                        let val = (chroma_plane[y2 * plane_width + x] as i32
811                            + chroma_plane[(y2 + 1) * plane_width + x] as i32)
812                            / 2;
813                        block[y * DCTSIZE + x] = val as i16;
814                    }
815                }
816            }
817        }
818
819        let mut dct_block = [0i16; DCTSIZE2];
820        (self.simd.forward_dct)(&block, &mut dct_block);
821
822        // Convert to i32 for quantization
823        let mut dct_i32 = [0i32; DCTSIZE2];
824        for i in 0..DCTSIZE2 {
825            dct_i32[i] = dct_block[i] as i32;
826        }
827
828        let mut quantized = [0i16; DCTSIZE2];
829        quantize_block(&dct_i32, &self.chroma_qtable.values, &mut quantized);
830
831        let dc = quantized[0] as i32;
832        let dc_diff = dc - self.prev_dc[comp_idx];
833        self.prev_dc[comp_idx] = dc;
834
835        // Clone tables to avoid borrow conflicts
836        let dc_table = self.dc_chroma_table.clone();
837        let ac_table = self.ac_chroma_table.clone();
838        self.encode_dc(dc_diff, &dc_table)?;
839        self.encode_ac(&quantized, &ac_table)?;
840
841        Ok(())
842    }
843
844    /// Encode a DC coefficient (differential).
845    fn encode_dc(&mut self, diff: i32, table: &DerivedTable) -> Result<()> {
846        let (size, bits) = if diff == 0 {
847            (0, 0)
848        } else {
849            let abs_diff = diff.unsigned_abs();
850            let size = 32 - abs_diff.leading_zeros();
851            let bits = if diff > 0 {
852                diff as u32
853            } else {
854                (diff - 1) as u32 & ((1 << size) - 1)
855            };
856            (size, bits)
857        };
858
859        // Write Huffman code for size
860        let (code, code_len) = table.get_code(size as u8);
861        self.write_bits(code as u64, code_len)?;
862
863        // Write magnitude bits
864        if size > 0 {
865            self.write_bits(bits as u64, size as u8)?;
866        }
867
868        Ok(())
869    }
870
871    /// Encode AC coefficients in zigzag order.
872    fn encode_ac(&mut self, quantized: &[i16; DCTSIZE2], table: &DerivedTable) -> Result<()> {
873        let mut run = 0;
874
875        for i in 1..DCTSIZE2 {
876            let val = quantized[JPEG_NATURAL_ORDER[i] as usize];
877
878            if val == 0 {
879                run += 1;
880            } else {
881                // Write ZRL (16 zeros) codes if needed
882                while run >= 16 {
883                    let (code, code_len) = table.get_code(0xF0); // ZRL
884                    self.write_bits(code as u64, code_len)?;
885                    run -= 16;
886                }
887
888                // Compute size and bits
889                let abs_val = val.unsigned_abs() as u32;
890                let size = 32 - abs_val.leading_zeros();
891                let bits = if val > 0 {
892                    val as u32
893                } else {
894                    (val - 1) as u32 & ((1 << size) - 1)
895                };
896
897                // Symbol is (run << 4) | size
898                let symbol = ((run as u8) << 4) | (size as u8);
899                let (code, code_len) = table.get_code(symbol);
900                self.write_bits(code as u64, code_len)?;
901                self.write_bits(bits as u64, size as u8)?;
902
903                run = 0;
904            }
905        }
906
907        // End of block
908        if run > 0 {
909            let (code, code_len) = table.get_code(0x00); // EOB
910            self.write_bits(code as u64, code_len)?;
911        }
912
913        Ok(())
914    }
915
916    /// Write bits to the output with byte stuffing.
917    fn write_bits(&mut self, bits: u64, count: u8) -> Result<()> {
918        self.bit_buffer |= bits << (64 - self.bits_in_buffer - count);
919        self.bits_in_buffer += count;
920
921        while self.bits_in_buffer >= 8 {
922            let byte = (self.bit_buffer >> 56) as u8;
923            self.writer.get_mut().write_all(&[byte])?;
924
925            // Byte stuffing: 0xFF must be followed by 0x00
926            if byte == 0xFF {
927                self.writer.get_mut().write_all(&[0x00])?;
928            }
929
930            self.bit_buffer <<= 8;
931            self.bits_in_buffer -= 8;
932        }
933
934        Ok(())
935    }
936
937    /// Write a restart marker and reset DC predictors.
938    fn write_restart_marker(&mut self) -> Result<()> {
939        // Flush remaining bits with 1s padding
940        if self.bits_in_buffer > 0 {
941            let padding = 8 - self.bits_in_buffer;
942            self.write_bits((1u64 << padding) - 1, padding)?;
943        }
944
945        // Write RST marker
946        let marker = 0xD0 + self.next_restart_num;
947        self.writer.get_mut().write_all(&[0xFF, marker])?;
948
949        // Reset state
950        self.prev_dc = [0; 4];
951        self.mcus_since_restart = 0;
952        self.next_restart_num = (self.next_restart_num + 1) & 7;
953
954        Ok(())
955    }
956
957    /// Finish encoding and write the EOI marker.
958    ///
959    /// This must be called after all scanlines have been written.
960    /// Consumes the stream and returns the underlying writer.
961    pub fn finish(mut self) -> Result<W> {
962        // Encode any remaining lines in the buffer (partial MCU row)
963        if self.lines_in_buffer > 0 {
964            // Pad the buffer with the last line
965            let bytes_per_line = self.width as usize * self.bytes_per_pixel as usize;
966            let last_line_start = (self.lines_in_buffer as usize - 1) * bytes_per_line;
967            let last_line =
968                self.scanline_buffer[last_line_start..last_line_start + bytes_per_line].to_vec();
969
970            while self.lines_in_buffer < self.mcu_height {
971                let buffer_offset = self.lines_in_buffer as usize * bytes_per_line;
972                self.scanline_buffer[buffer_offset..buffer_offset + bytes_per_line]
973                    .copy_from_slice(&last_line);
974                self.lines_in_buffer += 1;
975            }
976
977            self.encode_mcu_row()?;
978        }
979
980        // Flush remaining bits with 1s padding
981        if self.bits_in_buffer > 0 {
982            let padding = 8 - self.bits_in_buffer;
983            self.write_bits((1u64 << padding) - 1, padding)?;
984        }
985
986        // Write EOI marker
987        self.writer.write_eoi()?;
988
989        Ok(self.writer.into_inner())
990    }
991}