edgefirst-codec 0.23.0

Image codec for decoding JPEG/PNG into pre-allocated EdgeFirst tensors
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
// SPDX-FileCopyrightText: Copyright 2026 Au-Zone Technologies
// SPDX-License-Identifier: Apache-2.0

//! MCU (Minimum Coded Unit) decode loop.
//!
//! Orchestrates: Huffman decode → IDCT → chroma upsample → color convert →
//! strided output into the destination buffer.

use crate::error::CodecError;
use crate::jpeg::bitstream::BitStream;
use crate::jpeg::color::{self, ColorConvertFn};
use crate::jpeg::huffman::{self, HuffmanTable};
use crate::jpeg::idct::{self, IdctDcOnlyFn, IdctFn};
use crate::jpeg::markers::JpegHeaders;
use crate::jpeg::upsample;
use edgefirst_tensor::PixelFormat;

/// Scratch buffers reused across MCU decode iterations.
///
/// All buffers are plain byte vectors owned by this struct; they are sized
/// from a JPEG header by [`McuScratch::new`] and only ever grown by
/// [`McuScratch::ensure_capacity`].
pub struct McuScratch {
    /// Per-component IDCT output buffers for one MCU row.
    /// Indexed by component index. Each buffer holds a full MCU row of that
    /// component: `mcus_x * h * 8` samples wide × `v * 8` rows tall, where
    /// `h`/`v` are the component's horizontal/vertical sampling factors.
    component_bufs: Vec<Vec<u8>>,
    /// Upsampled Cb row buffer (image width plus 16 bytes of SIMD padding).
    cb_row: Vec<u8>,
    /// Upsampled Cr row buffer (image width plus 16 bytes of SIMD padding).
    cr_row: Vec<u8>,
    /// Output row buffer for color conversion before writing to tensor.
    /// Sized for the padded width at 4 bytes per pixel (RGBA, the widest
    /// output).
    output_row: Vec<u8>,
}

impl McuScratch {
    /// Extra bytes kept past the image width in every row buffer (padding
    /// for SIMD kernels).
    const ROW_PADDING: usize = 16;
    /// Bytes per pixel of the widest supported output (RGBA).
    const MAX_OUTPUT_CHANNELS: usize = 4;

    /// Allocate scratch buffers for the given image header.
    ///
    /// One zero-filled buffer is created per frame component, each large
    /// enough for a full MCU row of that component. The chroma row buffers
    /// are `width + 16` bytes and the output row buffer is four times that.
    pub fn new(headers: &JpegHeaders) -> Self {
        let hdr = &headers.header;
        let mcus_x = hdr.mcus_x();
        let padded_w = hdr.width as usize + Self::ROW_PADDING;

        let component_bufs = hdr
            .components
            .iter()
            .map(|comp| {
                let len = Self::component_buf_len(
                    mcus_x,
                    comp.sampling.h as usize,
                    comp.sampling.v as usize,
                );
                vec![0u8; len]
            })
            .collect();

        Self {
            component_bufs,
            cb_row: vec![0u8; padded_w],
            cr_row: vec![0u8; padded_w],
            output_row: vec![0u8; padded_w * Self::MAX_OUTPUT_CHANNELS],
        }
    }

    /// Grow buffers if needed (for a larger image than previously seen).
    ///
    /// Buffers are never shrunk and surplus component buffers left over from
    /// an earlier image with more components are kept, so the scratch can be
    /// reused across images without reallocating on every decode.
    pub fn ensure_capacity(&mut self, headers: &JpegHeaders) {
        let hdr = &headers.header;
        let mcus_x = hdr.mcus_x();

        for (i, comp) in hdr.components.iter().enumerate() {
            let needed = Self::component_buf_len(
                mcus_x,
                comp.sampling.h as usize,
                comp.sampling.v as usize,
            );
            match self.component_bufs.get_mut(i) {
                Some(buf) => Self::grow_to(buf, needed),
                // Components are visited in order, so a missing slot is
                // always the next one to append.
                None => self.component_bufs.push(vec![0u8; needed]),
            }
        }

        let padded_w = hdr.width as usize + Self::ROW_PADDING;
        Self::grow_to(&mut self.cb_row, padded_w);
        Self::grow_to(&mut self.cr_row, padded_w);
        Self::grow_to(&mut self.output_row, padded_w * Self::MAX_OUTPUT_CHANNELS);
    }

    /// Number of samples in one full MCU row of a component whose MCU is
    /// `blocks_h` × `blocks_v` 8×8 blocks, for an image `mcus_x` MCUs wide.
    fn component_buf_len(mcus_x: usize, blocks_h: usize, blocks_v: usize) -> usize {
        let row_samples = mcus_x * blocks_h * 8;
        row_samples * (blocks_v * 8)
    }

    /// Zero-extend `buf` to `len` bytes; leaves it untouched when already
    /// at least that long.
    fn grow_to(buf: &mut Vec<u8>, len: usize) {
        if buf.len() < len {
            buf.resize(len, 0);
        }
    }
}

/// Decode all MCUs and write output pixels into `dst` at `dst_stride` byte
/// offsets.
///
/// `dst` is the mapped tensor buffer (u8 pixels in the target format).
pub fn decode_image(
    data: &[u8],
    headers: &JpegHeaders,
    scratch: &mut McuScratch,
    dst: &mut [u8],
    dst_stride: usize,
    output_format: PixelFormat,
) -> crate::Result<()> {
    let hdr = &headers.header;
    let img_w = hdr.width as usize;
    let img_h = hdr.height as usize;
    let num_components = hdr.components.len();

    // Select kernel functions
    let idct_fn: IdctFn = idct::select_idct();
    let idct_dc_fn: IdctDcOnlyFn = idct::select_idct_dc_only();

    let is_greyscale = num_components == 1;

    // Validate Huffman tables
    let dc_tables: Vec<&HuffmanTable> = hdr
        .components
        .iter()
        .map(|c| {
            headers.dc_tables[c.dc_table_id as usize]
                .as_ref()
                .ok_or_else(|| {
                    CodecError::InvalidData(format!("missing DC Huffman table {}", c.dc_table_id))
                })
        })
        .collect::<crate::Result<Vec<_>>>()?;

    let ac_tables: Vec<&HuffmanTable> = hdr
        .components
        .iter()
        .map(|c| {
            headers.ac_tables[c.ac_table_id as usize]
                .as_ref()
                .ok_or_else(|| {
                    CodecError::InvalidData(format!("missing AC Huffman table {}", c.ac_table_id))
                })
        })
        .collect::<crate::Result<Vec<_>>>()?;

    // DC prediction values (one per component)
    let mut dc_pred = vec![0i32; num_components];

    // Create bit stream starting at the entropy data
    let mut bs = BitStream::new(data, headers.scan_data_offset);

    let mcus_x = hdr.mcus_x();
    let mcus_y = hdr.mcus_y();
    let max_v = hdr.max_v_samp as usize;
    let restart_interval = headers.restart_interval as usize;
    let mut mcu_count = 0usize;

    // Coefficient buffer for one 8×8 block
    let mut coeffs = [0i32; 64];

    // Process MCU rows
    for mcu_row in 0..mcus_y {
        // Decode all MCUs in this row
        for mcu_col in 0..mcus_x {
            // Check for restart marker
            if restart_interval > 0 && mcu_count > 0 && mcu_count.is_multiple_of(restart_interval) {
                bs.skip_restart_marker();
                dc_pred.fill(0);
            }

            // Decode all blocks in this MCU
            for (ci, comp) in hdr.components.iter().enumerate() {
                let blocks_h = comp.sampling.h as usize;
                let blocks_v = comp.sampling.v as usize;
                let comp_stride = mcus_x * blocks_h * 8;

                for bv in 0..blocks_v {
                    for bh in 0..blocks_h {
                        huffman::decode_block(
                            &mut bs,
                            dc_tables[ci],
                            ac_tables[ci],
                            &headers.quant_tables[comp.quant_table_id as usize],
                            &mut coeffs,
                            &mut dc_pred[ci],
                        )?;

                        // IDCT into component buffer
                        let x_offset = mcu_col * blocks_h * 8 + bh * 8;
                        let y_offset = bv * 8;
                        let buf_offset = y_offset * comp_stride + x_offset;
                        let buf = &mut scratch.component_bufs[ci];

                        // Check if DC-only (all AC coefficients are zero)
                        let is_dc_only = coeffs[1..].iter().all(|&c| c == 0);
                        if is_dc_only {
                            idct_dc_fn(coeffs[0], &mut buf[buf_offset..], comp_stride);
                        } else {
                            idct_fn(&coeffs, &mut buf[buf_offset..], comp_stride);
                        }
                    }
                }
            }

            mcu_count += 1;
        }

        // After decoding all MCUs in this row, perform upsampling + color
        // conversion and write to the output buffer.
        let mcu_pixel_h = max_v * 8;
        let y_start = mcu_row * mcu_pixel_h;

        if is_greyscale || output_format == PixelFormat::Grey {
            // The Y plane (component_bufs[0]) is stored at native pixel
            // resolution for both 1-component (greyscale) JPEGs and the
            // luma channel of multi-component JPEGs, so the same write
            // path covers both cases — chroma planes are simply skipped.
            let grey_fn = color::select_grey_copy();
            write_greyscale_rows(
                &scratch.component_bufs[0],
                mcus_x * hdr.components[0].sampling.h as usize * 8,
                dst,
                dst_stride,
                y_start,
                mcu_pixel_h.min(img_h - y_start),
                img_w,
                output_format,
                grey_fn,
                &mut scratch.output_row,
            );
        } else if output_format == PixelFormat::Nv12 {
            write_nv12_rows(
                hdr,
                &scratch.component_bufs,
                mcus_x,
                dst,
                dst_stride,
                y_start,
                mcu_pixel_h.min(img_h - y_start),
                img_w,
                img_h,
            );
        } else {
            let color_fn = color::select_color_convert(output_format)
                .ok_or(CodecError::UnsupportedFormat(output_format))?;
            let upsample_h_fn = upsample::select_upsample_h();

            write_color_rows(
                hdr,
                &scratch.component_bufs,
                mcus_x,
                dst,
                dst_stride,
                y_start,
                mcu_pixel_h.min(img_h - y_start),
                img_w,
                output_format,
                color_fn,
                upsample_h_fn,
                &mut scratch.cb_row,
                &mut scratch.cr_row,
                &mut scratch.output_row,
            );
        }
    }

    Ok(())
}

/// Copy `num_rows` luma rows out of `y_buf` into `dst`, starting at
/// destination row `y_start`.
///
/// For `PixelFormat::Grey` each row is handed to `grey_fn`, which writes
/// `img_w` samples directly into `dst`. For any other format the row is first
/// expanded into `output_row` — the luma value replicated into three channels,
/// with an opaque alpha of 255 appended for four-channel formats — and then
/// copied into `dst` as `img_w * channels` bytes.
///
/// `y_stride` and `dst_stride` are the distances between consecutive rows of
/// `y_buf` and `dst` respectively.
#[allow(clippy::too_many_arguments)]
fn write_greyscale_rows(
    y_buf: &[u8],
    y_stride: usize,
    dst: &mut [u8],
    dst_stride: usize,
    y_start: usize,
    num_rows: usize,
    img_w: usize,
    format: PixelFormat,
    grey_fn: color::GreyCopyFn,
    output_row: &mut [u8],
) {
    let bytes_per_pixel = format.channels();
    let packed_len = img_w * bytes_per_pixel;
    let direct_copy = format == PixelFormat::Grey;

    for line in 0..num_rows {
        let src_start = line * y_stride;
        let luma = &y_buf[src_start..src_start + img_w];
        let out_start = (y_start + line) * dst_stride;

        if direct_copy {
            grey_fn(luma, &mut dst[out_start..], img_w);
            continue;
        }

        // Expand grey into the packed staging row.
        match bytes_per_pixel {
            3 => {
                let pixels = output_row[..packed_len].chunks_exact_mut(3);
                for (px, &v) in pixels.zip(luma) {
                    px.fill(v);
                }
            }
            4 => {
                let pixels = output_row[..packed_len].chunks_exact_mut(4);
                for (px, &v) in pixels.zip(luma) {
                    px.copy_from_slice(&[v, v, v, 255]);
                }
            }
            // Any other channel count: one luma byte per pixel at the front
            // of the staging row.
            _ => output_row[..img_w].copy_from_slice(luma),
        }

        dst[out_start..out_start + packed_len].copy_from_slice(&output_row[..packed_len]);
    }
}

/// Convert one decoded MCU row from planar YCbCr to the packed output format.
///
/// For each of the `num_rows` luma rows this:
/// 1. picks the matching chroma row (`row / v_ratio`, so vertically
///    subsampled chroma rows are reused for consecutive luma rows),
/// 2. brings Cb/Cr to full width — via `upsample_h_fn` when chroma is
///    horizontally subsampled, otherwise by a plain copy — into
///    `cb_row_buf` / `cr_row_buf`,
/// 3. runs `color_fn` to fill `output_row`, and
/// 4. copies `img_w * channels` bytes into `dst` at row `y_start + row`.
///
/// Subsampling ratios are derived from components 0 (Y) and 1 (Cb); Cr is
/// processed with the same ratios.
#[allow(clippy::too_many_arguments)]
fn write_color_rows(
    hdr: &crate::jpeg::types::ImageHeader,
    comp_bufs: &[Vec<u8>],
    mcus_x: usize,
    dst: &mut [u8],
    dst_stride: usize,
    y_start: usize,
    num_rows: usize,
    img_w: usize,
    format: PixelFormat,
    color_fn: ColorConvertFn,
    upsample_h_fn: upsample::UpsampleHFn,
    cb_row_buf: &mut [u8],
    cr_row_buf: &mut [u8],
    output_row: &mut [u8],
) {
    // Row length, in samples, of a component's MCU-row buffer.
    let plane_stride = |idx: usize| mcus_x * hdr.components[idx].sampling.h as usize * 8;
    let luma_stride = plane_stride(0);
    let cb_stride = plane_stride(1);
    let cr_stride = plane_stride(2);

    let luma_sampling = &hdr.components[0].sampling;
    let chroma_sampling = &hdr.components[1].sampling;
    let h_ratio = (luma_sampling.h / chroma_sampling.h) as usize;
    let v_ratio = (luma_sampling.v / chroma_sampling.v) as usize;

    // Number of chroma samples that cover one image row.
    let chroma_w = img_w.div_ceil(h_ratio);
    let full_width_chroma = h_ratio <= 1;
    let packed_len = img_w * format.channels();

    for line in 0..num_rows {
        let luma = &comp_bufs[0][line * luma_stride..];

        // Chroma rows (may be subsampled vertically)
        let chroma_line = line / v_ratio;
        let cb = &comp_bufs[1][chroma_line * cb_stride..];
        let cr = &comp_bufs[2][chroma_line * cr_stride..];

        if full_width_chroma {
            cb_row_buf[..chroma_w].copy_from_slice(&cb[..chroma_w]);
            cr_row_buf[..chroma_w].copy_from_slice(&cr[..chroma_w]);
        } else {
            upsample_h_fn(cb, cb_row_buf, chroma_w);
            upsample_h_fn(cr, cr_row_buf, chroma_w);
        }

        color_fn(luma, cb_row_buf, cr_row_buf, output_row, img_w);

        // Place the packed row at its stride offset in the destination.
        let out_start = (y_start + line) * dst_stride;
        dst[out_start..out_start + packed_len].copy_from_slice(&output_row[..packed_len]);
    }
}

/// Write NV12 output: Y plane + interleaved UV plane.
///
/// NV12 layout in the destination buffer:
/// - Y plane: `img_h` rows of `img_w` bytes at offset 0
/// - UV plane: `img_h/2` rows of `img_w` bytes (Cb/Cr interleaved) at offset
///   `img_h * dst_stride`
///
/// This call handles one MCU row: `num_rows` luma rows starting at image row
/// `y_start`, and the `num_rows / 2` UV rows starting at UV row `y_start / 2`.
///
/// Each UV pair covers a 2×2 block of luma pixels. The chroma samples that
/// fall inside that block are combined according to the JPEG's subsampling
/// (derived from components 0 and 1; Cr is assumed to share Cb's sampling):
/// - 4:2:0 — chroma is already at half resolution, samples are copied as-is.
/// - 4:2:2 / 4:4:0 — the two samples in the block are averaged.
/// - 4:4:4 — the four samples in the block are averaged.
/// - chroma coarser than 2:1 — the nearest sample is used.
///
/// Averages are rounded to nearest.
///
/// NOTE(review): NV12 is only meaningful for even widths. This function does
/// not check that; it writes `img_w / 2` UV pairs and expects the caller to
/// have rejected odd-width NV12 output and unsupported subsampling layouts.
///
/// # Panics
///
/// Panics (slice indexing) if `dst` or the component buffers are too small
/// for the rows being written.
#[allow(clippy::too_many_arguments)]
fn write_nv12_rows(
    hdr: &crate::jpeg::types::ImageHeader,
    comp_bufs: &[Vec<u8>],
    mcus_x: usize,
    dst: &mut [u8],
    dst_stride: usize,
    y_start: usize,
    num_rows: usize,
    img_w: usize,
    img_h: usize,
) {
    /// Rounded mean of the `rows` × `cols` samples of `plane` whose top-left
    /// corner is at (`row`, `col`).
    fn box_average(
        plane: &[u8],
        stride: usize,
        row: usize,
        col: usize,
        rows: usize,
        cols: usize,
    ) -> u8 {
        let mut sum = 0u32;
        for r in row..row + rows {
            let start = r * stride + col;
            sum += plane[start..start + cols]
                .iter()
                .map(|&s| u32::from(s))
                .sum::<u32>();
        }
        let count = (rows * cols) as u32;
        ((sum + count / 2) / count) as u8
    }

    let y_comp = &hdr.components[0];
    let cb_comp = &hdr.components[1];

    let y_comp_stride = mcus_x * y_comp.sampling.h as usize * 8;
    let cb_comp_stride = mcus_x * cb_comp.sampling.h as usize * 8;
    // Index the Cr plane with its own stride; fall back to Cb's if the header
    // has no third component entry.
    let cr_comp_stride = hdr
        .components
        .get(2)
        .map_or(cb_comp_stride, |c| mcus_x * c.sampling.h as usize * 8);

    // Luma-to-chroma resolution ratios. Clamped to at least 1 so a malformed
    // header (zero or inverted sampling factors) cannot divide by zero.
    let ratio = |luma: usize, chroma: usize| luma.checked_div(chroma).unwrap_or(1).max(1);
    let h_ratio = ratio(y_comp.sampling.h as usize, cb_comp.sampling.h as usize);
    let v_ratio = ratio(y_comp.sampling.v as usize, cb_comp.sampling.v as usize);

    // Copy Y plane rows directly (never more than one source row's worth).
    let luma_len = img_w.min(y_comp_stride);
    for row in 0..num_rows {
        let src_offset = row * y_comp_stride;
        let dst_offset = (y_start + row) * dst_stride;
        dst[dst_offset..dst_offset + luma_len]
            .copy_from_slice(&comp_bufs[0][src_offset..src_offset + luma_len]);
    }

    // Write UV plane (interleaved Cb/Cr at half height, half width).
    let uv_rows = num_rows / 2;
    let uv_pairs = img_w / 2;
    if uv_rows == 0 || uv_pairs == 0 {
        return;
    }

    // Number of source chroma samples per UV pair in each direction.
    let x_span = (2 / h_ratio).max(1);
    let y_span = (2 / v_ratio).max(1);
    let uv_plane_offset = img_h * dst_stride;
    let first_uv_row = y_start / 2;

    for uv_row in 0..uv_rows {
        // Chroma row/column holding the top-left sample of the 2×2 luma block.
        let src_row = uv_row * 2 / v_ratio;
        let uv_offset = uv_plane_offset + (first_uv_row + uv_row) * dst_stride;
        let uv_line = &mut dst[uv_offset..uv_offset + uv_pairs * 2];

        if x_span == 1 && y_span == 1 {
            // Chroma is at (or below) NV12 resolution: plain interleave.
            let cb_src = &comp_bufs[1][src_row * cb_comp_stride..];
            let cr_src = &comp_bufs[2][src_row * cr_comp_stride..];
            for (x, pair) in uv_line.chunks_exact_mut(2).enumerate() {
                let src_col = x * 2 / h_ratio;
                pair[0] = cb_src[src_col];
                pair[1] = cr_src[src_col];
            }
        } else {
            // Chroma is finer than NV12 in at least one direction: average.
            for (x, pair) in uv_line.chunks_exact_mut(2).enumerate() {
                let src_col = x * 2 / h_ratio;
                pair[0] = box_average(
                    &comp_bufs[1],
                    cb_comp_stride,
                    src_row,
                    src_col,
                    y_span,
                    x_span,
                );
                pair[1] = box_average(
                    &comp_bufs[2],
                    cr_comp_stride,
                    src_row,
                    src_col,
                    y_span,
                    x_span,
                );
            }
        }
    }
}