mozjpeg_rs/encode/
streaming.rs

1//! Streaming JPEG encoder.
2//!
3//! This module provides [`StreamingEncoder`] and [`EncodingStream`] for
4//! scanline-by-scanline encoding, which is memory-efficient for large images.
5
6use std::io::Write;
7
8use crate::consts::{QuantTableIdx, DCTSIZE, DCTSIZE2, JPEG_NATURAL_ORDER};
9use crate::error::{Error, Result};
10use crate::huffman::DerivedTable;
11use crate::marker::MarkerWriter;
12use crate::progressive::generate_baseline_scan;
13use crate::quant::{create_quant_tables, quantize_block};
14use crate::simd::SimdOps;
15use crate::types::{ComponentInfo, PixelDensity, QuantTable, Subsampling};
16
17use super::{
18    create_std_ac_chroma_table, create_std_ac_luma_table, create_std_dc_chroma_table,
19    create_std_dc_luma_table, try_alloc_vec, Encode,
20};
21
22/// Streaming JPEG encoder configuration.
23///
24/// This encoder supports scanline-by-scanline encoding, which is memory-efficient
25/// for large images. It does NOT support trellis quantization, progressive mode,
26/// or Huffman optimization (these require buffering the entire image).
27///
28/// Use [`Encoder`](super::Encoder) for full-featured batch encoding with optimizations.
29///
30/// # Example
31///
32/// ```ignore
33/// use mozjpeg_rs::Encoder;
34///
35/// // Create streaming encoder
36/// let mut stream = Encoder::streaming()
37///     .quality(85)
38///     .start_rgb(1920, 1080, output_file)?;
39///
40/// // Write scanlines (must be in multiples of 8 or 16 depending on subsampling)
41/// for chunk in rgb_scanlines.chunks(16 * 1920 * 3) {
42///     stream.write_scanlines(chunk)?;
43/// }
44///
45/// // Finalize the JPEG
46/// stream.finish()?;
47/// ```
48#[derive(Debug, Clone)]
49pub struct StreamingEncoder {
50    /// Quality level (1-100)
51    quality: u8,
52    /// Chroma subsampling mode
53    subsampling: Subsampling,
54    /// Quantization table variant
55    quant_table_idx: QuantTableIdx,
56    /// Custom luminance quantization table
57    custom_luma_qtable: Option<[u16; DCTSIZE2]>,
58    /// Custom chrominance quantization table
59    custom_chroma_qtable: Option<[u16; DCTSIZE2]>,
60    /// Force baseline-compatible output
61    force_baseline: bool,
62    /// Restart interval in MCUs (0 = disabled)
63    restart_interval: u16,
64    /// Pixel density for JFIF APP0 marker
65    pixel_density: PixelDensity,
66    /// EXIF data to embed
67    exif_data: Option<Vec<u8>>,
68    /// ICC color profile to embed
69    icc_profile: Option<Vec<u8>>,
70    /// Custom APP markers to embed
71    custom_markers: Vec<(u8, Vec<u8>)>,
72    /// SIMD operations dispatch
73    simd: SimdOps,
74}
75
76impl Default for StreamingEncoder {
77    fn default() -> Self {
78        Self::new()
79    }
80}
81
82impl StreamingEncoder {
83    /// Create a new streaming encoder with default settings.
84    ///
85    /// Unlike [`Encoder::new()`], this uses settings optimized for streaming:
86    /// - No trellis quantization (requires global context)
87    /// - No progressive mode (requires buffering entire image)
88    /// - No Huffman optimization (requires 2-pass)
89    pub fn new() -> Self {
90        Self {
91            quality: 75,
92            subsampling: Subsampling::S420,
93            quant_table_idx: QuantTableIdx::ImageMagick,
94            custom_luma_qtable: None,
95            custom_chroma_qtable: None,
96            force_baseline: true,
97            restart_interval: 0,
98            pixel_density: PixelDensity::default(),
99            exif_data: None,
100            icc_profile: None,
101            custom_markers: Vec::new(),
102            simd: SimdOps::detect(),
103        }
104    }
105
106    /// Set quality level (1-100).
107    pub fn quality(mut self, quality: u8) -> Self {
108        self.quality = quality.clamp(1, 100);
109        self
110    }
111
112    /// Set chroma subsampling mode.
113    pub fn subsampling(mut self, mode: Subsampling) -> Self {
114        self.subsampling = mode;
115        self
116    }
117
118    /// Set quantization table variant.
119    pub fn quant_tables(mut self, idx: QuantTableIdx) -> Self {
120        self.quant_table_idx = idx;
121        self
122    }
123
124    /// Force baseline-compatible output.
125    pub fn force_baseline(mut self, enable: bool) -> Self {
126        self.force_baseline = enable;
127        self
128    }
129
130    /// Set restart interval in MCUs.
131    pub fn restart_interval(mut self, interval: u16) -> Self {
132        self.restart_interval = interval;
133        self
134    }
135
136    /// Set pixel density for the JFIF APP0 marker.
137    pub fn pixel_density(mut self, density: PixelDensity) -> Self {
138        self.pixel_density = density;
139        self
140    }
141
142    /// Set EXIF data to embed.
143    pub fn exif_data(mut self, data: Vec<u8>) -> Self {
144        self.exif_data = if data.is_empty() { None } else { Some(data) };
145        self
146    }
147
148    /// Set ICC color profile to embed.
149    pub fn icc_profile(mut self, profile: Vec<u8>) -> Self {
150        self.icc_profile = if profile.is_empty() {
151            None
152        } else {
153            Some(profile)
154        };
155        self
156    }
157
158    /// Add a custom APP marker.
159    pub fn add_marker(mut self, app_num: u8, data: Vec<u8>) -> Self {
160        if app_num <= 15 && !data.is_empty() {
161            self.custom_markers.push((app_num, data));
162        }
163        self
164    }
165
166    /// Set custom luminance quantization table.
167    pub fn custom_luma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
168        self.custom_luma_qtable = Some(table);
169        self
170    }
171
172    /// Set custom chrominance quantization table.
173    pub fn custom_chroma_qtable(mut self, table: [u16; DCTSIZE2]) -> Self {
174        self.custom_chroma_qtable = Some(table);
175        self
176    }
177
178    /// Start streaming RGB encoding to a writer.
179    ///
180    /// # Arguments
181    /// * `width` - Image width in pixels
182    /// * `height` - Image height in pixels
183    /// * `writer` - Output writer
184    ///
185    /// # Returns
186    /// An [`EncodingStream`] that accepts scanlines.
187    pub fn start_rgb<W: Write>(
188        self,
189        width: u32,
190        height: u32,
191        writer: W,
192    ) -> Result<EncodingStream<W>> {
193        EncodingStream::new_rgb(self, width, height, writer)
194    }
195
196    /// Start streaming grayscale encoding to a writer.
197    ///
198    /// # Arguments
199    /// * `width` - Image width in pixels
200    /// * `height` - Image height in pixels
201    /// * `writer` - Output writer
202    ///
203    /// # Returns
204    /// An [`EncodingStream`] that accepts scanlines.
205    pub fn start_gray<W: Write>(
206        self,
207        width: u32,
208        height: u32,
209        writer: W,
210    ) -> Result<EncodingStream<W>> {
211        EncodingStream::new_gray(self, width, height, writer)
212    }
213}
214
215/// Implement batch encoding for StreamingEncoder (without optimizations).
216impl Encode for StreamingEncoder {
217    fn encode_rgb(&self, rgb_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
218        let mut output = Vec::new();
219        let mut stream = self.clone().start_rgb(width, height, &mut output)?;
220        stream.write_scanlines(rgb_data)?;
221        stream.finish()?;
222        Ok(output)
223    }
224
225    fn encode_gray(&self, gray_data: &[u8], width: u32, height: u32) -> Result<Vec<u8>> {
226        let mut output = Vec::new();
227        let mut stream = self.clone().start_gray(width, height, &mut output)?;
228        stream.write_scanlines(gray_data)?;
229        stream.finish()?;
230        Ok(output)
231    }
232}
233
234// ============================================================================
235// EncodingStream - Active Streaming Session
236// ============================================================================
237
238/// Active streaming encoding session.
239///
240/// Created by [`StreamingEncoder::start_rgb()`] or [`StreamingEncoder::start_gray()`].
241/// Accepts scanlines via [`write_scanlines()`](Self::write_scanlines) and must be
242/// finalized with [`finish()`](Self::finish).
243pub struct EncodingStream<W: Write> {
244    /// Output writer wrapped in marker writer
245    writer: MarkerWriter<W>,
246    /// Image width
247    width: u32,
248    /// Number of color components (1 for gray, 3 for RGB/YCbCr)
249    num_components: u8,
250    /// Bytes per input pixel
251    bytes_per_pixel: u8,
252    /// Chroma subsampling mode
253    subsampling: Subsampling,
254    /// MCU height in pixels (8 or 16 depending on subsampling)
255    mcu_height: u32,
256    /// MCU width in pixels (8 or 16 depending on subsampling)
257    mcu_width: u32,
258    /// Number of MCUs per row
259    mcus_per_row: u32,
260    /// Luminance quantization table (zigzag order)
261    luma_qtable: QuantTable,
262    /// Chrominance quantization table (zigzag order)
263    chroma_qtable: QuantTable,
264    /// DC Huffman table for luminance
265    dc_luma_table: DerivedTable,
266    /// AC Huffman table for luminance
267    ac_luma_table: DerivedTable,
268    /// DC Huffman table for chrominance
269    dc_chroma_table: DerivedTable,
270    /// AC Huffman table for chrominance
271    ac_chroma_table: DerivedTable,
272    /// Previous DC values for differential encoding
273    prev_dc: [i32; 4],
274    /// Scanlines accumulated for current MCU row
275    scanline_buffer: Vec<u8>,
276    /// Lines accumulated in buffer
277    lines_in_buffer: u32,
278    /// Bit buffer for entropy encoding
279    bit_buffer: u64,
280    /// Bits used in bit_buffer
281    bits_in_buffer: u8,
282    /// SIMD operations dispatch
283    simd: SimdOps,
284    /// Restart interval tracking
285    restart_interval: u16,
286    /// MCUs since last restart marker
287    mcus_since_restart: u16,
288    /// Next restart marker number (0-7)
289    next_restart_num: u8,
290}
291
292impl<W: Write> EncodingStream<W> {
293    /// Create a new RGB encoding stream.
294    fn new_rgb(config: StreamingEncoder, width: u32, height: u32, writer: W) -> Result<Self> {
295        Self::new(config, width, height, 3, writer)
296    }
297
298    /// Create a new grayscale encoding stream.
299    fn new_gray(config: StreamingEncoder, width: u32, height: u32, writer: W) -> Result<Self> {
300        Self::new(config, width, height, 1, writer)
301    }
302
303    /// Create a new encoding stream.
304    fn new(
305        config: StreamingEncoder,
306        width: u32,
307        height: u32,
308        num_components: u8,
309        writer: W,
310    ) -> Result<Self> {
311        // Validate dimensions
312        if width == 0 || height == 0 {
313            return Err(Error::InvalidDimensions { width, height });
314        }
315
316        // Determine MCU dimensions based on subsampling
317        let (mcu_width, mcu_height) = if num_components == 1 {
318            (DCTSIZE as u32, DCTSIZE as u32)
319        } else {
320            match config.subsampling {
321                Subsampling::S444 | Subsampling::Gray => (DCTSIZE as u32, DCTSIZE as u32),
322                Subsampling::S422 => (DCTSIZE as u32 * 2, DCTSIZE as u32),
323                Subsampling::S420 => (DCTSIZE as u32 * 2, DCTSIZE as u32 * 2),
324                Subsampling::S440 => (DCTSIZE as u32, DCTSIZE as u32 * 2),
325            }
326        };
327
328        let mcus_per_row = (width + mcu_width - 1) / mcu_width;
329
330        // Create quantization tables
331        let (luma_qtable, chroma_qtable) = create_quant_tables(
332            config.quality,
333            config.quant_table_idx,
334            config.force_baseline,
335        );
336
337        // Apply custom tables if specified
338        let luma_qtable = if let Some(custom) = config.custom_luma_qtable {
339            let mut values = luma_qtable.values;
340            for (i, &val) in custom.iter().enumerate() {
341                values[JPEG_NATURAL_ORDER[i] as usize] = val;
342            }
343            QuantTable::new(values)
344        } else {
345            luma_qtable
346        };
347
348        let chroma_qtable = if let Some(custom) = config.custom_chroma_qtable {
349            let mut values = chroma_qtable.values;
350            for (i, &val) in custom.iter().enumerate() {
351                values[JPEG_NATURAL_ORDER[i] as usize] = val;
352            }
353            QuantTable::new(values)
354        } else {
355            chroma_qtable
356        };
357
358        // Create standard Huffman tables (no optimization in streaming mode)
359        let dc_luma_htable = create_std_dc_luma_table();
360        let ac_luma_htable = create_std_ac_luma_table();
361        let dc_chroma_htable = create_std_dc_chroma_table();
362        let ac_chroma_htable = create_std_ac_chroma_table();
363
364        // Derive encoding tables
365        let dc_luma_table = DerivedTable::from_huff_table(&dc_luma_htable, true)?;
366        let ac_luma_table = DerivedTable::from_huff_table(&ac_luma_htable, false)?;
367        let dc_chroma_table = DerivedTable::from_huff_table(&dc_chroma_htable, true)?;
368        let ac_chroma_table = DerivedTable::from_huff_table(&ac_chroma_htable, false)?;
369
370        // Allocate scanline buffer for one MCU row
371        let buffer_size = (mcu_height as usize) * (width as usize) * (num_components as usize);
372        let scanline_buffer = try_alloc_vec(0u8, buffer_size)?;
373
374        let mut marker_writer = MarkerWriter::new(writer);
375
376        // Write JPEG headers
377        marker_writer.write_soi()?;
378        marker_writer.write_jfif_app0(
379            config.pixel_density.unit as u8,
380            config.pixel_density.x,
381            config.pixel_density.y,
382        )?;
383
384        // Write EXIF data if provided
385        if let Some(ref exif) = config.exif_data {
386            marker_writer.write_app1_exif(exif)?;
387        }
388
389        // Write ICC profile if provided
390        if let Some(ref icc) = config.icc_profile {
391            marker_writer.write_icc_profile(icc)?;
392        }
393
394        // Write custom markers
395        for (app_num, data) in &config.custom_markers {
396            marker_writer.write_app(*app_num, data)?;
397        }
398
399        // Write quantization tables
400        let use_16bit = !config.force_baseline;
401        if num_components == 1 {
402            marker_writer.write_dqt(0, &luma_qtable.values, use_16bit)?;
403        } else {
404            marker_writer.write_dqt_multiple(&[
405                (0, &luma_qtable.values, use_16bit),
406                (1, &chroma_qtable.values, use_16bit),
407            ])?;
408        }
409
410        // Write frame header
411        let components: Vec<ComponentInfo> = if num_components == 1 {
412            vec![ComponentInfo {
413                component_id: 1,
414                component_index: 0,
415                h_samp_factor: 1,
416                v_samp_factor: 1,
417                quant_tbl_no: 0,
418                dc_tbl_no: 0,
419                ac_tbl_no: 0,
420            }]
421        } else {
422            let (h_samp, v_samp) = match config.subsampling {
423                Subsampling::S444 | Subsampling::Gray => (1, 1),
424                Subsampling::S422 => (2, 1),
425                Subsampling::S420 => (2, 2),
426                Subsampling::S440 => (1, 2),
427            };
428            vec![
429                ComponentInfo {
430                    component_id: 1,
431                    component_index: 0,
432                    h_samp_factor: h_samp,
433                    v_samp_factor: v_samp,
434                    quant_tbl_no: 0,
435                    dc_tbl_no: 0,
436                    ac_tbl_no: 0,
437                },
438                ComponentInfo {
439                    component_id: 2,
440                    component_index: 1,
441                    h_samp_factor: 1,
442                    v_samp_factor: 1,
443                    quant_tbl_no: 1,
444                    dc_tbl_no: 1,
445                    ac_tbl_no: 1,
446                },
447                ComponentInfo {
448                    component_id: 3,
449                    component_index: 2,
450                    h_samp_factor: 1,
451                    v_samp_factor: 1,
452                    quant_tbl_no: 1,
453                    dc_tbl_no: 1,
454                    ac_tbl_no: 1,
455                },
456            ]
457        };
458
459        // Always baseline (not progressive) for streaming
460        marker_writer.write_sof(false, 8, height as u16, width as u16, &components)?;
461
462        // Write Huffman tables
463        if num_components == 1 {
464            marker_writer
465                .write_dht_multiple(&[(0, false, &dc_luma_htable), (0, true, &ac_luma_htable)])?;
466        } else {
467            marker_writer.write_dht_multiple(&[
468                (0, false, &dc_luma_htable),
469                (0, true, &ac_luma_htable),
470                (1, false, &dc_chroma_htable),
471                (1, true, &ac_chroma_htable),
472            ])?;
473        }
474
475        // Write restart interval if specified
476        if config.restart_interval > 0 {
477            marker_writer.write_dri(config.restart_interval)?;
478        }
479
480        // Write SOS marker
481        let scans = generate_baseline_scan(num_components);
482        marker_writer.write_sos(&scans[0], &components)?;
483
484        Ok(Self {
485            writer: marker_writer,
486            width,
487            num_components,
488            bytes_per_pixel: num_components,
489            subsampling: config.subsampling,
490            mcu_height,
491            mcu_width,
492            mcus_per_row,
493            luma_qtable,
494            chroma_qtable,
495            dc_luma_table,
496            ac_luma_table,
497            dc_chroma_table,
498            ac_chroma_table,
499            prev_dc: [0; 4],
500            scanline_buffer,
501            lines_in_buffer: 0,
502            bit_buffer: 0,
503            bits_in_buffer: 0,
504            simd: config.simd,
505            restart_interval: config.restart_interval,
506            mcus_since_restart: 0,
507            next_restart_num: 0,
508        })
509    }
510
511    /// Write scanlines to the encoder.
512    ///
513    /// Scanlines are buffered until a complete MCU row is available, then encoded.
514    /// The number of bytes should be `num_lines * width * bytes_per_pixel`.
515    ///
516    /// For best performance, write in multiples of the MCU height (8 or 16 lines).
517    pub fn write_scanlines(&mut self, data: &[u8]) -> Result<()> {
518        let bytes_per_line = self.width as usize * self.bytes_per_pixel as usize;
519        let lines_in_data = data.len() / bytes_per_line;
520
521        if data.len() != lines_in_data * bytes_per_line {
522            return Err(Error::BufferSizeMismatch {
523                expected: lines_in_data * bytes_per_line,
524                actual: data.len(),
525            });
526        }
527
528        let mut data_offset = 0;
529        let mut lines_remaining = lines_in_data as u32;
530
531        while lines_remaining > 0 {
532            // How many lines can we fit in the buffer?
533            let lines_to_copy =
534                (self.mcu_height - self.lines_in_buffer).min(lines_remaining) as usize;
535
536            // Copy lines to buffer
537            let buffer_offset = self.lines_in_buffer as usize * bytes_per_line;
538            let src_bytes = lines_to_copy * bytes_per_line;
539            self.scanline_buffer[buffer_offset..buffer_offset + src_bytes]
540                .copy_from_slice(&data[data_offset..data_offset + src_bytes]);
541
542            self.lines_in_buffer += lines_to_copy as u32;
543            data_offset += src_bytes;
544            lines_remaining -= lines_to_copy as u32;
545
546            // If buffer is full, encode the MCU row
547            if self.lines_in_buffer == self.mcu_height {
548                self.encode_mcu_row()?;
549                self.lines_in_buffer = 0;
550            }
551        }
552
553        Ok(())
554    }
555
556    /// Encode one complete MCU row from the scanline buffer.
557    fn encode_mcu_row(&mut self) -> Result<()> {
558        let width = self.width as usize;
559        let mcu_height = self.mcu_height as usize;
560
561        if self.num_components == 1 {
562            self.encode_gray_mcu_row(width, mcu_height)?;
563        } else {
564            self.encode_color_mcu_row(width, mcu_height)?;
565        }
566
567        Ok(())
568    }
569
570    /// Encode a grayscale MCU row.
571    fn encode_gray_mcu_row(&mut self, width: usize, mcu_height: usize) -> Result<()> {
572        let mcus_per_row = self.mcus_per_row as usize;
573
574        for mcu_x in 0..mcus_per_row {
575            // Handle restart markers
576            if self.restart_interval > 0 && self.mcus_since_restart == self.restart_interval {
577                self.write_restart_marker()?;
578            }
579
580            // Extract 8x8 block
581            let mut block = [0i16; DCTSIZE2];
582            let x_start = mcu_x * DCTSIZE;
583
584            for y in 0..DCTSIZE {
585                let src_y = y.min(mcu_height - 1);
586                for x in 0..DCTSIZE {
587                    let src_x = (x_start + x).min(width - 1);
588                    let pixel = self.scanline_buffer[src_y * width + src_x];
589                    // Level shift: 0..255 -> -128..127
590                    block[y * DCTSIZE + x] = pixel as i16 - 128;
591                }
592            }
593
594            // Forward DCT
595            let mut dct_block = [0i16; DCTSIZE2];
596            (self.simd.forward_dct)(&block, &mut dct_block);
597
598            // Convert to i32 for quantization
599            let mut dct_i32 = [0i32; DCTSIZE2];
600            for i in 0..DCTSIZE2 {
601                dct_i32[i] = dct_block[i] as i32;
602            }
603
604            // Quantize
605            let mut quantized = [0i16; DCTSIZE2];
606            quantize_block(&dct_i32, &self.luma_qtable.values, &mut quantized);
607
608            // Encode DC coefficient (differential)
609            let dc = quantized[0] as i32;
610            let dc_diff = dc - self.prev_dc[0];
611            self.prev_dc[0] = dc;
612
613            // Clone tables to avoid borrow conflicts
614            let dc_table = self.dc_luma_table.clone();
615            let ac_table = self.ac_luma_table.clone();
616            self.encode_dc(dc_diff, &dc_table)?;
617            self.encode_ac(&quantized, &ac_table)?;
618
619            if self.restart_interval > 0 {
620                self.mcus_since_restart += 1;
621            }
622        }
623
624        Ok(())
625    }
626
627    /// Encode a color (YCbCr) MCU row.
628    fn encode_color_mcu_row(&mut self, width: usize, mcu_height: usize) -> Result<()> {
629        let mcus_per_row = self.mcus_per_row as usize;
630
631        // Temporary storage for Y, Cb, Cr planes
632        let mcu_width = self.mcu_width as usize;
633        let mut y_plane = vec![0i16; mcu_width * mcu_height];
634        let mut cb_plane = vec![0i16; mcu_width * mcu_height];
635        let mut cr_plane = vec![0i16; mcu_width * mcu_height];
636
637        for mcu_x in 0..mcus_per_row {
638            // Handle restart markers
639            if self.restart_interval > 0 && self.mcus_since_restart == self.restart_interval {
640                self.write_restart_marker()?;
641            }
642
643            let x_start = mcu_x * mcu_width;
644
645            // Convert RGB to YCbCr for this MCU
646            for y in 0..mcu_height {
647                let src_y = y.min(mcu_height - 1);
648                for x in 0..mcu_width {
649                    let src_x = (x_start + x).min(width - 1);
650                    let pixel_idx = (src_y * width + src_x) * 3;
651
652                    let r = self.scanline_buffer[pixel_idx] as i32;
653                    let g = self.scanline_buffer[pixel_idx + 1] as i32;
654                    let b = self.scanline_buffer[pixel_idx + 2] as i32;
655
656                    // RGB to YCbCr conversion (BT.601)
657                    let y_val = ((77 * r + 150 * g + 29 * b + 128) >> 8) - 128;
658                    let cb_val = (-43 * r - 85 * g + 128 * b + 128) >> 8;
659                    let cr_val = (128 * r - 107 * g - 21 * b + 128) >> 8;
660
661                    let idx = y * mcu_width + x;
662                    y_plane[idx] = y_val as i16;
663                    cb_plane[idx] = cb_val as i16;
664                    cr_plane[idx] = cr_val as i16;
665                }
666            }
667
668            // Encode Y blocks
669            match self.subsampling {
670                Subsampling::S444 | Subsampling::Gray => {
671                    // One 8x8 Y block
672                    self.encode_luma_block(&y_plane, mcu_width, 0, 0)?;
673                }
674                Subsampling::S422 => {
675                    // Two 8x8 Y blocks horizontally
676                    self.encode_luma_block(&y_plane, mcu_width, 0, 0)?;
677                    self.encode_luma_block(&y_plane, mcu_width, DCTSIZE, 0)?;
678                }
679                Subsampling::S420 => {
680                    // Four 8x8 Y blocks (2x2)
681                    self.encode_luma_block(&y_plane, mcu_width, 0, 0)?;
682                    self.encode_luma_block(&y_plane, mcu_width, DCTSIZE, 0)?;
683                    self.encode_luma_block(&y_plane, mcu_width, 0, DCTSIZE)?;
684                    self.encode_luma_block(&y_plane, mcu_width, DCTSIZE, DCTSIZE)?;
685                }
686                Subsampling::S440 => {
687                    // Two 8x8 Y blocks vertically
688                    self.encode_luma_block(&y_plane, mcu_width, 0, 0)?;
689                    self.encode_luma_block(&y_plane, mcu_width, 0, DCTSIZE)?;
690                }
691            }
692
693            // Downsample and encode Cb, Cr
694            self.encode_chroma_block(&cb_plane, mcu_width, 1)?;
695            self.encode_chroma_block(&cr_plane, mcu_width, 2)?;
696
697            if self.restart_interval > 0 {
698                self.mcus_since_restart += 1;
699            }
700        }
701
702        Ok(())
703    }
704
705    /// Encode a single 8x8 luma block from the Y plane.
706    fn encode_luma_block(
707        &mut self,
708        y_plane: &[i16],
709        plane_width: usize,
710        x_off: usize,
711        y_off: usize,
712    ) -> Result<()> {
713        let mut block = [0i16; DCTSIZE2];
714
715        for y in 0..DCTSIZE {
716            for x in 0..DCTSIZE {
717                block[y * DCTSIZE + x] = y_plane[(y_off + y) * plane_width + x_off + x];
718            }
719        }
720
721        let mut dct_block = [0i16; DCTSIZE2];
722        (self.simd.forward_dct)(&block, &mut dct_block);
723
724        // Convert to i32 for quantization
725        let mut dct_i32 = [0i32; DCTSIZE2];
726        for i in 0..DCTSIZE2 {
727            dct_i32[i] = dct_block[i] as i32;
728        }
729
730        let mut quantized = [0i16; DCTSIZE2];
731        quantize_block(&dct_i32, &self.luma_qtable.values, &mut quantized);
732
733        let dc = quantized[0] as i32;
734        let dc_diff = dc - self.prev_dc[0];
735        self.prev_dc[0] = dc;
736
737        // Clone tables to avoid borrow conflicts
738        let dc_table = self.dc_luma_table.clone();
739        let ac_table = self.ac_luma_table.clone();
740        self.encode_dc(dc_diff, &dc_table)?;
741        self.encode_ac(&quantized, &ac_table)?;
742
743        Ok(())
744    }
745
746    /// Encode a chroma block with downsampling.
747    fn encode_chroma_block(
748        &mut self,
749        chroma_plane: &[i16],
750        plane_width: usize,
751        comp_idx: usize,
752    ) -> Result<()> {
753        let mut block = [0i16; DCTSIZE2];
754
755        // Downsample based on subsampling mode
756        match self.subsampling {
757            Subsampling::S444 | Subsampling::Gray => {
758                // No downsampling
759                for y in 0..DCTSIZE {
760                    for x in 0..DCTSIZE {
761                        block[y * DCTSIZE + x] = chroma_plane[y * plane_width + x];
762                    }
763                }
764            }
765            Subsampling::S422 => {
766                // 2:1 horizontal downsampling
767                for y in 0..DCTSIZE {
768                    for x in 0..DCTSIZE {
769                        let x2 = x * 2;
770                        let val = (chroma_plane[y * plane_width + x2] as i32
771                            + chroma_plane[y * plane_width + x2 + 1] as i32)
772                            / 2;
773                        block[y * DCTSIZE + x] = val as i16;
774                    }
775                }
776            }
777            Subsampling::S420 => {
778                // 2:1 horizontal and vertical downsampling
779                for y in 0..DCTSIZE {
780                    for x in 0..DCTSIZE {
781                        let x2 = x * 2;
782                        let y2 = y * 2;
783                        let val = (chroma_plane[y2 * plane_width + x2] as i32
784                            + chroma_plane[y2 * plane_width + x2 + 1] as i32
785                            + chroma_plane[(y2 + 1) * plane_width + x2] as i32
786                            + chroma_plane[(y2 + 1) * plane_width + x2 + 1] as i32)
787                            / 4;
788                        block[y * DCTSIZE + x] = val as i16;
789                    }
790                }
791            }
792            Subsampling::S440 => {
793                // 2:1 vertical downsampling
794                for y in 0..DCTSIZE {
795                    for x in 0..DCTSIZE {
796                        let y2 = y * 2;
797                        let val = (chroma_plane[y2 * plane_width + x] as i32
798                            + chroma_plane[(y2 + 1) * plane_width + x] as i32)
799                            / 2;
800                        block[y * DCTSIZE + x] = val as i16;
801                    }
802                }
803            }
804        }
805
806        let mut dct_block = [0i16; DCTSIZE2];
807        (self.simd.forward_dct)(&block, &mut dct_block);
808
809        // Convert to i32 for quantization
810        let mut dct_i32 = [0i32; DCTSIZE2];
811        for i in 0..DCTSIZE2 {
812            dct_i32[i] = dct_block[i] as i32;
813        }
814
815        let mut quantized = [0i16; DCTSIZE2];
816        quantize_block(&dct_i32, &self.chroma_qtable.values, &mut quantized);
817
818        let dc = quantized[0] as i32;
819        let dc_diff = dc - self.prev_dc[comp_idx];
820        self.prev_dc[comp_idx] = dc;
821
822        // Clone tables to avoid borrow conflicts
823        let dc_table = self.dc_chroma_table.clone();
824        let ac_table = self.ac_chroma_table.clone();
825        self.encode_dc(dc_diff, &dc_table)?;
826        self.encode_ac(&quantized, &ac_table)?;
827
828        Ok(())
829    }
830
831    /// Encode a DC coefficient (differential).
832    fn encode_dc(&mut self, diff: i32, table: &DerivedTable) -> Result<()> {
833        let (size, bits) = if diff == 0 {
834            (0, 0)
835        } else {
836            let abs_diff = diff.unsigned_abs();
837            let size = 32 - abs_diff.leading_zeros();
838            let bits = if diff > 0 {
839                diff as u32
840            } else {
841                (diff - 1) as u32 & ((1 << size) - 1)
842            };
843            (size, bits)
844        };
845
846        // Write Huffman code for size
847        let (code, code_len) = table.get_code(size as u8);
848        self.write_bits(code as u64, code_len)?;
849
850        // Write magnitude bits
851        if size > 0 {
852            self.write_bits(bits as u64, size as u8)?;
853        }
854
855        Ok(())
856    }
857
858    /// Encode AC coefficients in zigzag order.
859    fn encode_ac(&mut self, quantized: &[i16; DCTSIZE2], table: &DerivedTable) -> Result<()> {
860        let mut run = 0;
861
862        for i in 1..DCTSIZE2 {
863            let val = quantized[JPEG_NATURAL_ORDER[i] as usize];
864
865            if val == 0 {
866                run += 1;
867            } else {
868                // Write ZRL (16 zeros) codes if needed
869                while run >= 16 {
870                    let (code, code_len) = table.get_code(0xF0); // ZRL
871                    self.write_bits(code as u64, code_len)?;
872                    run -= 16;
873                }
874
875                // Compute size and bits
876                let abs_val = val.unsigned_abs() as u32;
877                let size = 32 - abs_val.leading_zeros();
878                let bits = if val > 0 {
879                    val as u32
880                } else {
881                    (val - 1) as u32 & ((1 << size) - 1)
882                };
883
884                // Symbol is (run << 4) | size
885                let symbol = ((run as u8) << 4) | (size as u8);
886                let (code, code_len) = table.get_code(symbol);
887                self.write_bits(code as u64, code_len)?;
888                self.write_bits(bits as u64, size as u8)?;
889
890                run = 0;
891            }
892        }
893
894        // End of block
895        if run > 0 {
896            let (code, code_len) = table.get_code(0x00); // EOB
897            self.write_bits(code as u64, code_len)?;
898        }
899
900        Ok(())
901    }
902
903    /// Write bits to the output with byte stuffing.
904    fn write_bits(&mut self, bits: u64, count: u8) -> Result<()> {
905        self.bit_buffer |= bits << (64 - self.bits_in_buffer - count);
906        self.bits_in_buffer += count;
907
908        while self.bits_in_buffer >= 8 {
909            let byte = (self.bit_buffer >> 56) as u8;
910            self.writer.get_mut().write_all(&[byte])?;
911
912            // Byte stuffing: 0xFF must be followed by 0x00
913            if byte == 0xFF {
914                self.writer.get_mut().write_all(&[0x00])?;
915            }
916
917            self.bit_buffer <<= 8;
918            self.bits_in_buffer -= 8;
919        }
920
921        Ok(())
922    }
923
924    /// Write a restart marker and reset DC predictors.
925    fn write_restart_marker(&mut self) -> Result<()> {
926        // Flush remaining bits with 1s padding
927        if self.bits_in_buffer > 0 {
928            let padding = 8 - self.bits_in_buffer;
929            self.write_bits((1u64 << padding) - 1, padding)?;
930        }
931
932        // Write RST marker
933        let marker = 0xD0 + self.next_restart_num;
934        self.writer.get_mut().write_all(&[0xFF, marker])?;
935
936        // Reset state
937        self.prev_dc = [0; 4];
938        self.mcus_since_restart = 0;
939        self.next_restart_num = (self.next_restart_num + 1) & 7;
940
941        Ok(())
942    }
943
944    /// Finish encoding and write the EOI marker.
945    ///
946    /// This must be called after all scanlines have been written.
947    /// Consumes the stream and returns the underlying writer.
948    pub fn finish(mut self) -> Result<W> {
949        // Encode any remaining lines in the buffer (partial MCU row)
950        if self.lines_in_buffer > 0 {
951            // Pad the buffer with the last line
952            let bytes_per_line = self.width as usize * self.bytes_per_pixel as usize;
953            let last_line_start = (self.lines_in_buffer as usize - 1) * bytes_per_line;
954            let last_line =
955                self.scanline_buffer[last_line_start..last_line_start + bytes_per_line].to_vec();
956
957            while self.lines_in_buffer < self.mcu_height {
958                let buffer_offset = self.lines_in_buffer as usize * bytes_per_line;
959                self.scanline_buffer[buffer_offset..buffer_offset + bytes_per_line]
960                    .copy_from_slice(&last_line);
961                self.lines_in_buffer += 1;
962            }
963
964            self.encode_mcu_row()?;
965        }
966
967        // Flush remaining bits with 1s padding
968        if self.bits_in_buffer > 0 {
969            let padding = 8 - self.bits_in_buffer;
970            self.write_bits((1u64 << padding) - 1, padding)?;
971        }
972
973        // Write EOI marker
974        self.writer.write_eoi()?;
975
976        Ok(self.writer.into_inner())
977    }
978}