Skip to main content

edgefirst_image/cpu/
mod.rs

1// SPDX-FileCopyrightText: Copyright 2025 Au-Zone Technologies
2// SPDX-License-Identifier: Apache-2.0
3
4use crate::{Crop, Error, Flip, FunctionTimer, ImageProcessorTrait, Rect, Result, Rotation};
5use edgefirst_decoder::{DetectBox, ProtoData, Segmentation};
6use edgefirst_tensor::{
7    DType, PixelFormat, Tensor, TensorDyn, TensorMapTrait, TensorMemory, TensorTrait,
8};
9
10mod convert;
11mod masks;
12mod resize;
13mod tests;
14
15// bilinear_dot removed — masks.rs now uses slice-native bilinear_dot_slice
16// closure-based kernel, invoked through the local dtype dispatch below.
17
/// `CPUProcessor` implements `ImageProcessorTrait` using the fallback CPU
/// implementation for image processing.
///
/// It bundles a `fast_image_resize` resizer, the resize options selected by
/// the constructor, and the class color table that `set_class_colors`
/// overwrites.
#[derive(Debug, Clone)]
pub struct CPUProcessor {
    // Resizer instance from `fast_image_resize`, created by the constructor.
    resizer: fast_image_resize::Resizer,
    // Resize options (algorithm, alpha handling) chosen by the constructor.
    options: fast_image_resize::ResizeOptions,
    // Class color table: 20 four-byte entries, initialised from
    // `crate::DEFAULT_COLORS_U8`.
    colors: [[u8; 4]; 20],
}
26
// NOTE(review): these manual impls assert that `CPUProcessor` may be sent to
// and shared between threads. No justification is recorded here — presumably
// one of the `fast_image_resize` fields is why they are needed; confirm and
// replace this note with a proper `SAFETY:` comment.
unsafe impl Send for CPUProcessor {}
unsafe impl Sync for CPUProcessor {}
29
30impl Default for CPUProcessor {
31    fn default() -> Self {
32        Self::new_bilinear()
33    }
34}
35
36/// Write the base layer of `dst` before mask rendering.
37///
38/// This is the terminal fallback: on CPU we have no 2D hardware, so a
39/// direct buffer write is the appropriate primitive. The invariant is that
40/// every call to the CPU draw_* entry points fully initialises dst — we
41/// never rely on "whatever was in the buffer" from the caller.
42///
43/// - `background == Some(bg)` → byte-for-byte copy bg → dst (after shape /
44///   format validation).
45/// - `background == None` → fill dst with 0x00 (transparent black).
46fn prepare_dst_base_cpu(dst: &mut TensorDyn, background: Option<&TensorDyn>) -> Result<()> {
47    match background {
48        Some(bg) => {
49            if bg.shape() != dst.shape() {
50                return Err(Error::InvalidShape(
51                    "background shape does not match dst".into(),
52                ));
53            }
54            if bg.format() != dst.format() {
55                return Err(Error::InvalidShape(
56                    "background pixel format does not match dst".into(),
57                ));
58            }
59            let bg_u8 = bg.as_u8().ok_or(Error::NotAnImage)?;
60            let dst_u8 = dst.as_u8_mut().ok_or(Error::NotAnImage)?;
61            let bg_map = bg_u8.map()?;
62            let mut dst_map = dst_u8.map()?;
63            let bg_slice = bg_map.as_slice();
64            let dst_slice = dst_map.as_mut_slice();
65            if bg_slice.len() != dst_slice.len() {
66                return Err(Error::InvalidShape(
67                    "background buffer size does not match dst".into(),
68                ));
69            }
70            dst_slice.copy_from_slice(bg_slice);
71        }
72        None => {
73            let dst_u8 = dst.as_u8_mut().ok_or(Error::NotAnImage)?;
74            let mut dst_map = dst_u8.map()?;
75            dst_map.as_mut_slice().fill(0);
76        }
77    }
78    Ok(())
79}
80
81/// Compute row stride for a packed-format Tensor<u8> image given its format.
82fn row_stride_for(width: usize, fmt: PixelFormat) -> usize {
83    use edgefirst_tensor::PixelLayout;
84    match fmt.layout() {
85        PixelLayout::Packed => width * fmt.channels(),
86        PixelLayout::Planar | PixelLayout::SemiPlanar => width,
87        _ => width, // fallback for non-exhaustive
88    }
89}
90
91/// Read the effective row stride from a tensor, falling back to the computed
92/// minimum stride if the tensor has no explicit stride set. This correctly
93/// handles tensors with GPU pitch-alignment padding (e.g., from
94/// `ImageProcessor::create_image()` or codec strided decode).
95fn tensor_row_stride(tensor: &Tensor<u8>) -> usize {
96    tensor.effective_row_stride().unwrap_or_else(|| {
97        let w = tensor.width().unwrap_or(0);
98        let fmt = tensor.format().unwrap_or(PixelFormat::Rgb);
99        row_stride_for(w, fmt)
100    })
101}
102
103/// Apply XOR 0x80 bias to color channels only, preserving alpha.
104///
105/// Matches GL int8 shader behavior: `vec4(int8_bias(c.rgb), c.a)`.
106/// For formats without alpha, XORs every byte (fast path).
107pub(crate) fn apply_int8_xor_bias(data: &mut [u8], fmt: PixelFormat) {
108    use edgefirst_tensor::PixelLayout;
109    if !fmt.has_alpha() {
110        for b in data.iter_mut() {
111            *b ^= 0x80;
112        }
113    } else if fmt.layout() == PixelLayout::Planar {
114        // Planar with alpha (e.g. PlanarRgba): XOR color planes, skip alpha plane.
115        let channels = fmt.channels();
116        let plane_size = data.len() / channels;
117        for b in data[..plane_size * (channels - 1)].iter_mut() {
118            *b ^= 0x80;
119        }
120    } else {
121        // Packed with alpha (Rgba, Bgra): XOR color bytes, skip alpha byte.
122        let channels = fmt.channels();
123        for pixel in data.chunks_exact_mut(channels) {
124            for b in &mut pixel[..channels - 1] {
125                *b ^= 0x80;
126            }
127        }
128    }
129}
130
131impl CPUProcessor {
132    /// Creates a new CPUConverter with bilinear resizing.
133    pub fn new() -> Self {
134        Self::new_bilinear()
135    }
136
137    /// Creates a new CPUConverter with bilinear resizing.
138    fn new_bilinear() -> Self {
139        let resizer = fast_image_resize::Resizer::new();
140        let options = fast_image_resize::ResizeOptions::new()
141            .resize_alg(fast_image_resize::ResizeAlg::Convolution(
142                fast_image_resize::FilterType::Bilinear,
143            ))
144            .use_alpha(false);
145
146        log::debug!("CPUConverter created");
147        Self {
148            resizer,
149            options,
150            colors: crate::DEFAULT_COLORS_U8,
151        }
152    }
153
154    /// Creates a new CPUConverter with nearest neighbor resizing.
155    pub fn new_nearest() -> Self {
156        let resizer = fast_image_resize::Resizer::new();
157        let options = fast_image_resize::ResizeOptions::new()
158            .resize_alg(fast_image_resize::ResizeAlg::Nearest)
159            .use_alpha(false);
160        log::debug!("CPUConverter created");
161        Self {
162            resizer,
163            options,
164            colors: crate::DEFAULT_COLORS_U8,
165        }
166    }
167
168    pub(crate) fn support_conversion_pf(src: PixelFormat, dst: PixelFormat) -> bool {
169        use PixelFormat::*;
170        matches!(
171            (src, dst),
172            (Nv12, Rgb)
173                | (Nv12, Rgba)
174                | (Nv12, Grey)
175                | (Nv16, Rgb)
176                | (Nv16, Rgba)
177                | (Nv16, Bgra)
178                | (Yuyv, Rgb)
179                | (Yuyv, Rgba)
180                | (Yuyv, Grey)
181                | (Yuyv, Yuyv)
182                | (Yuyv, PlanarRgb)
183                | (Yuyv, PlanarRgba)
184                | (Yuyv, Nv16)
185                | (Vyuy, Rgb)
186                | (Vyuy, Rgba)
187                | (Vyuy, Grey)
188                | (Vyuy, Vyuy)
189                | (Vyuy, PlanarRgb)
190                | (Vyuy, PlanarRgba)
191                | (Vyuy, Nv16)
192                | (Rgba, Rgb)
193                | (Rgba, Rgba)
194                | (Rgba, Grey)
195                | (Rgba, Yuyv)
196                | (Rgba, PlanarRgb)
197                | (Rgba, PlanarRgba)
198                | (Rgba, Nv16)
199                | (Rgb, Rgb)
200                | (Rgb, Rgba)
201                | (Rgb, Grey)
202                | (Rgb, Yuyv)
203                | (Rgb, PlanarRgb)
204                | (Rgb, PlanarRgba)
205                | (Rgb, Nv16)
206                | (Grey, Rgb)
207                | (Grey, Rgba)
208                | (Grey, Grey)
209                | (Grey, Yuyv)
210                | (Grey, PlanarRgb)
211                | (Grey, PlanarRgba)
212                | (Grey, Nv16)
213                | (Nv12, Bgra)
214                | (Yuyv, Bgra)
215                | (Vyuy, Bgra)
216                | (Rgba, Bgra)
217                | (Rgb, Bgra)
218                | (Grey, Bgra)
219                | (Bgra, Bgra)
220                | (PlanarRgb, Rgb)
221                | (PlanarRgb, Rgba)
222                | (PlanarRgba, Rgb)
223                | (PlanarRgba, Rgba)
224                | (PlanarRgb, Bgra)
225                | (PlanarRgba, Bgra)
226        )
227    }
228
229    /// Format conversion dispatch for Tensor<u8> with PixelFormat metadata.
230    pub(crate) fn convert_format_pf(
231        src: &Tensor<u8>,
232        dst: &mut Tensor<u8>,
233        src_fmt: PixelFormat,
234        dst_fmt: PixelFormat,
235    ) -> Result<()> {
236        let _timer = FunctionTimer::new(format!(
237            "ImageProcessor::convert_format {} to {}",
238            src_fmt, dst_fmt,
239        ));
240
241        use PixelFormat::*;
242        match (src_fmt, dst_fmt) {
243            (Nv12, Rgb) => Self::convert_nv12_to_rgb(src, dst),
244            (Nv12, Rgba) => Self::convert_nv12_to_rgba(src, dst),
245            (Nv12, Grey) => Self::convert_nv12_to_grey(src, dst),
246            (Yuyv, Rgb) => Self::convert_yuyv_to_rgb(src, dst),
247            (Yuyv, Rgba) => Self::convert_yuyv_to_rgba(src, dst),
248            (Yuyv, Grey) => Self::convert_yuyv_to_grey(src, dst),
249            (Yuyv, Yuyv) => Self::copy_image(src, dst),
250            (Yuyv, PlanarRgb) => Self::convert_yuyv_to_8bps(src, dst),
251            (Yuyv, PlanarRgba) => Self::convert_yuyv_to_prgba(src, dst),
252            (Yuyv, Nv16) => Self::convert_yuyv_to_nv16(src, dst),
253            (Vyuy, Rgb) => Self::convert_vyuy_to_rgb(src, dst),
254            (Vyuy, Rgba) => Self::convert_vyuy_to_rgba(src, dst),
255            (Vyuy, Grey) => Self::convert_vyuy_to_grey(src, dst),
256            (Vyuy, Vyuy) => Self::copy_image(src, dst),
257            (Vyuy, PlanarRgb) => Self::convert_vyuy_to_8bps(src, dst),
258            (Vyuy, PlanarRgba) => Self::convert_vyuy_to_prgba(src, dst),
259            (Vyuy, Nv16) => Self::convert_vyuy_to_nv16(src, dst),
260            (Rgba, Rgb) => Self::convert_rgba_to_rgb(src, dst),
261            (Rgba, Rgba) => Self::copy_image(src, dst),
262            (Rgba, Grey) => Self::convert_rgba_to_grey(src, dst),
263            (Rgba, Yuyv) => Self::convert_rgba_to_yuyv(src, dst),
264            (Rgba, PlanarRgb) => Self::convert_rgba_to_8bps(src, dst),
265            (Rgba, PlanarRgba) => Self::convert_rgba_to_prgba(src, dst),
266            (Rgba, Nv16) => Self::convert_rgba_to_nv16(src, dst),
267            (Rgb, Rgb) => Self::copy_image(src, dst),
268            (Rgb, Rgba) => Self::convert_rgb_to_rgba(src, dst),
269            (Rgb, Grey) => Self::convert_rgb_to_grey(src, dst),
270            (Rgb, Yuyv) => Self::convert_rgb_to_yuyv(src, dst),
271            (Rgb, PlanarRgb) => Self::convert_rgb_to_8bps(src, dst),
272            (Rgb, PlanarRgba) => Self::convert_rgb_to_prgba(src, dst),
273            (Rgb, Nv16) => Self::convert_rgb_to_nv16(src, dst),
274            (Grey, Rgb) => Self::convert_grey_to_rgb(src, dst),
275            (Grey, Rgba) => Self::convert_grey_to_rgba(src, dst),
276            (Grey, Grey) => Self::copy_image(src, dst),
277            (Grey, Yuyv) => Self::convert_grey_to_yuyv(src, dst),
278            (Grey, PlanarRgb) => Self::convert_grey_to_8bps(src, dst),
279            (Grey, PlanarRgba) => Self::convert_grey_to_prgba(src, dst),
280            (Grey, Nv16) => Self::convert_grey_to_nv16(src, dst),
281
282            // the following converts are added for use in testing
283            (Nv16, Rgb) => Self::convert_nv16_to_rgb(src, dst),
284            (Nv16, Rgba) => Self::convert_nv16_to_rgba(src, dst),
285            (PlanarRgb, Rgb) => Self::convert_8bps_to_rgb(src, dst),
286            (PlanarRgb, Rgba) => Self::convert_8bps_to_rgba(src, dst),
287            (PlanarRgba, Rgb) => Self::convert_prgba_to_rgb(src, dst),
288            (PlanarRgba, Rgba) => Self::convert_prgba_to_rgba(src, dst),
289
290            // BGRA destination: convert to RGBA layout, then swap R and B
291            (Bgra, Bgra) => Self::copy_image(src, dst),
292            (Nv12, Bgra) => {
293                Self::convert_nv12_to_rgba(src, dst)?;
294                Self::swizzle_rb_4chan(dst)
295            }
296            (Nv16, Bgra) => {
297                Self::convert_nv16_to_rgba(src, dst)?;
298                Self::swizzle_rb_4chan(dst)
299            }
300            (Yuyv, Bgra) => {
301                Self::convert_yuyv_to_rgba(src, dst)?;
302                Self::swizzle_rb_4chan(dst)
303            }
304            (Vyuy, Bgra) => {
305                Self::convert_vyuy_to_rgba(src, dst)?;
306                Self::swizzle_rb_4chan(dst)
307            }
308            (Rgba, Bgra) => {
309                dst.map()?.copy_from_slice(&src.map()?);
310                Self::swizzle_rb_4chan(dst)
311            }
312            (Rgb, Bgra) => {
313                Self::convert_rgb_to_rgba(src, dst)?;
314                Self::swizzle_rb_4chan(dst)
315            }
316            (Grey, Bgra) => {
317                Self::convert_grey_to_rgba(src, dst)?;
318                Self::swizzle_rb_4chan(dst)
319            }
320            (PlanarRgb, Bgra) => {
321                Self::convert_8bps_to_rgba(src, dst)?;
322                Self::swizzle_rb_4chan(dst)
323            }
324            (PlanarRgba, Bgra) => {
325                Self::convert_prgba_to_rgba(src, dst)?;
326                Self::swizzle_rb_4chan(dst)
327            }
328
329            (s, d) => Err(Error::NotSupported(format!("Conversion from {s} to {d}",))),
330        }
331    }
332
333    /// Tensor<u8>-based fill_image_outside_crop.
334    pub(crate) fn fill_image_outside_crop_u8(
335        dst: &mut Tensor<u8>,
336        rgba: [u8; 4],
337        crop: Rect,
338    ) -> Result<()> {
339        let dst_fmt = dst.format().unwrap();
340        let dst_w = dst.width().unwrap();
341        let dst_h = dst.height().unwrap();
342        let mut dst_map = dst.map()?;
343        let dst_tup = (dst_map.as_mut_slice(), dst_w, dst_h);
344        Self::fill_outside_crop_dispatch(dst_tup, dst_fmt, rgba, crop)
345    }
346
347    /// Common fill dispatch by format.
348    fn fill_outside_crop_dispatch(
349        dst: (&mut [u8], usize, usize),
350        fmt: PixelFormat,
351        rgba: [u8; 4],
352        crop: Rect,
353    ) -> Result<()> {
354        use PixelFormat::*;
355        match fmt {
356            Rgba | Bgra => Self::fill_image_outside_crop_(dst, rgba, crop),
357            Rgb => Self::fill_image_outside_crop_(dst, Self::rgba_to_rgb(rgba), crop),
358            Grey => Self::fill_image_outside_crop_(dst, Self::rgba_to_grey(rgba), crop),
359            Yuyv => Self::fill_image_outside_crop_(
360                (dst.0, dst.1 / 2, dst.2),
361                Self::rgba_to_yuyv(rgba),
362                Rect::new(crop.left / 2, crop.top, crop.width.div_ceil(2), crop.height),
363            ),
364            PlanarRgb => Self::fill_image_outside_crop_planar(dst, Self::rgba_to_rgb(rgba), crop),
365            PlanarRgba => Self::fill_image_outside_crop_planar(dst, rgba, crop),
366            Nv16 => {
367                let yuyv = Self::rgba_to_yuyv(rgba);
368                Self::fill_image_outside_crop_yuv_semiplanar(dst, yuyv[0], [yuyv[1], yuyv[3]], crop)
369            }
370            _ => Err(Error::Internal(format!(
371                "Found unexpected destination {fmt}",
372            ))),
373        }
374    }
375}
376
377impl ImageProcessorTrait for CPUProcessor {
378    fn convert(
379        &mut self,
380        src: &TensorDyn,
381        dst: &mut TensorDyn,
382        rotation: Rotation,
383        flip: Flip,
384        crop: Crop,
385    ) -> Result<()> {
386        self.convert_impl(src, dst, rotation, flip, crop)
387    }
388
389    fn draw_decoded_masks(
390        &mut self,
391        dst: &mut TensorDyn,
392        detect: &[DetectBox],
393        segmentation: &[Segmentation],
394        overlay: crate::MaskOverlay<'_>,
395    ) -> Result<()> {
396        // CPU is the terminal fallback — it must always produce the full
397        // output, never assume the caller cleared dst. Every call writes
398        // the base layer first (bg copy or zero fill) and then the masks.
399        prepare_dst_base_cpu(dst, overlay.background)?;
400        let dst = dst.as_u8_mut().ok_or(Error::NotAnImage)?;
401        self.draw_decoded_masks_impl(
402            dst,
403            detect,
404            segmentation,
405            overlay.opacity,
406            overlay.color_mode,
407        )
408    }
409
410    fn draw_proto_masks(
411        &mut self,
412        dst: &mut TensorDyn,
413        detect: &[DetectBox],
414        proto_data: &ProtoData,
415        overlay: crate::MaskOverlay<'_>,
416    ) -> Result<()> {
417        prepare_dst_base_cpu(dst, overlay.background)?;
418        let dst = dst.as_u8_mut().ok_or(Error::NotAnImage)?;
419        self.draw_proto_masks_impl(
420            dst,
421            detect,
422            proto_data,
423            overlay.opacity,
424            overlay.letterbox,
425            overlay.color_mode,
426        )
427    }
428
429    fn set_class_colors(&mut self, colors: &[[u8; 4]]) -> Result<()> {
430        for (c, new_c) in self.colors.iter_mut().zip(colors.iter()) {
431            *c = *new_c;
432        }
433        Ok(())
434    }
435}
436
437// Internal methods — dtype-aware dispatch layer.
438impl CPUProcessor {
439    /// Top-level conversion dispatcher: handles dtype combinations.
440    pub(crate) fn convert_impl(
441        &mut self,
442        src: &TensorDyn,
443        dst: &mut TensorDyn,
444        rotation: Rotation,
445        flip: Flip,
446        crop: Crop,
447    ) -> Result<()> {
448        let src_fmt = src.format().ok_or(Error::NotAnImage)?;
449        let dst_fmt = dst.format().ok_or(Error::NotAnImage)?;
450
451        match (src.dtype(), dst.dtype()) {
452            (DType::U8, DType::U8) => {
453                let src = src.as_u8().unwrap();
454                let dst = dst.as_u8_mut().unwrap();
455                self.convert_u8(src, dst, src_fmt, dst_fmt, rotation, flip, crop)
456            }
457            (DType::U8, DType::I8) => {
458                // Int8 output: reinterpret the i8 destination as u8 (layout-
459                // identical), convert directly into it, then XOR 0x80 in-place.
460                let src_u8 = src.as_u8().unwrap();
461                let dst_i8 = dst.as_i8_mut().unwrap();
462                // SAFETY: Tensor<i8> and Tensor<u8> are layout-identical
463                // (same element size, no T-dependent drop glue). Same
464                // rationale as gl::processor::tensor_i8_as_u8_mut.
465                let dst_u8 = unsafe { &mut *(dst_i8 as *mut Tensor<i8> as *mut Tensor<u8>) };
466                self.convert_u8(src_u8, dst_u8, src_fmt, dst_fmt, rotation, flip, crop)?;
467                // Apply XOR 0x80 bias in-place (u8 → i8 conversion)
468                let mut map = dst_u8.map()?;
469                apply_int8_xor_bias(map.as_mut_slice(), dst_fmt);
470                Ok(())
471            }
472            (s, d) => Err(Error::NotSupported(format!("dtype {s} -> {d}",))),
473        }
474    }
475
476    /// U8-to-U8 conversion: the full format conversion + resize pipeline.
477    #[allow(clippy::too_many_arguments)]
478    fn convert_u8(
479        &mut self,
480        src: &Tensor<u8>,
481        dst: &mut Tensor<u8>,
482        src_fmt: PixelFormat,
483        dst_fmt: PixelFormat,
484        rotation: Rotation,
485        flip: Flip,
486        crop: Crop,
487    ) -> Result<()> {
488        use PixelFormat::*;
489
490        let src_w = src.width().unwrap();
491        let src_h = src.height().unwrap();
492        let dst_w = dst.width().unwrap();
493        let dst_h = dst.height().unwrap();
494
495        crop.check_crop_dims(src_w, src_h, dst_w, dst_h)?;
496
497        // Determine intermediate format for the resize step
498        let intermediate = match (src_fmt, dst_fmt) {
499            (Nv12, Rgb) => Rgb,
500            (Nv12, Rgba) => Rgba,
501            (Nv12, Grey) => Grey,
502            (Nv12, Yuyv) => Rgba,
503            (Nv12, Nv16) => Rgba,
504            (Nv12, PlanarRgb) => Rgb,
505            (Nv12, PlanarRgba) => Rgba,
506            (Yuyv, Rgb) => Rgb,
507            (Yuyv, Rgba) => Rgba,
508            (Yuyv, Grey) => Grey,
509            (Yuyv, Yuyv) => Rgba,
510            (Yuyv, PlanarRgb) => Rgb,
511            (Yuyv, PlanarRgba) => Rgba,
512            (Yuyv, Nv16) => Rgba,
513            (Vyuy, Rgb) => Rgb,
514            (Vyuy, Rgba) => Rgba,
515            (Vyuy, Grey) => Grey,
516            (Vyuy, Vyuy) => Rgba,
517            (Vyuy, PlanarRgb) => Rgb,
518            (Vyuy, PlanarRgba) => Rgba,
519            (Vyuy, Nv16) => Rgba,
520            (Rgba, Rgb) => Rgba,
521            (Rgba, Rgba) => Rgba,
522            (Rgba, Grey) => Grey,
523            (Rgba, Yuyv) => Rgba,
524            (Rgba, PlanarRgb) => Rgba,
525            (Rgba, PlanarRgba) => Rgba,
526            (Rgba, Nv16) => Rgba,
527            (Rgb, Rgb) => Rgb,
528            (Rgb, Rgba) => Rgb,
529            (Rgb, Grey) => Grey,
530            (Rgb, Yuyv) => Rgb,
531            (Rgb, PlanarRgb) => Rgb,
532            (Rgb, PlanarRgba) => Rgb,
533            (Rgb, Nv16) => Rgb,
534            (Grey, Rgb) => Rgb,
535            (Grey, Rgba) => Rgba,
536            (Grey, Grey) => Grey,
537            (Grey, Yuyv) => Grey,
538            (Grey, PlanarRgb) => Grey,
539            (Grey, PlanarRgba) => Grey,
540            (Grey, Nv16) => Grey,
541            (Nv12, Bgra) => Rgba,
542            (Yuyv, Bgra) => Rgba,
543            (Vyuy, Bgra) => Rgba,
544            (Rgba, Bgra) => Rgba,
545            (Rgb, Bgra) => Rgb,
546            (Grey, Bgra) => Grey,
547            (Bgra, Bgra) => Bgra,
548            (Nv16, Rgb) => Rgb,
549            (Nv16, Rgba) => Rgba,
550            (Nv16, Bgra) => Rgba,
551            (PlanarRgb, Rgb) => Rgb,
552            (PlanarRgb, Rgba) => Rgb,
553            (PlanarRgb, Bgra) => Rgb,
554            (PlanarRgba, Rgb) => Rgba,
555            (PlanarRgba, Rgba) => Rgba,
556            (PlanarRgba, Bgra) => Rgba,
557            (s, d) => {
558                return Err(Error::NotSupported(format!("Conversion from {s} to {d}",)));
559            }
560        };
561
562        let need_resize_flip_rotation = rotation != Rotation::None
563            || flip != Flip::None
564            || src_w != dst_w
565            || src_h != dst_h
566            || crop.src_rect.is_some_and(|c| {
567                c != Rect {
568                    left: 0,
569                    top: 0,
570                    width: src_w,
571                    height: src_h,
572                }
573            })
574            || crop.dst_rect.is_some_and(|c| {
575                c != Rect {
576                    left: 0,
577                    top: 0,
578                    width: dst_w,
579                    height: dst_h,
580                }
581            });
582
583        // check if a direct conversion can be done
584        if !need_resize_flip_rotation && Self::support_conversion_pf(src_fmt, dst_fmt) {
585            return Self::convert_format_pf(src, dst, src_fmt, dst_fmt);
586        }
587
588        // any extra checks
589        if dst_fmt == Yuyv && !dst_w.is_multiple_of(2) {
590            return Err(Error::NotSupported(format!(
591                "{} destination must have width divisible by 2",
592                dst_fmt,
593            )));
594        }
595
596        // create tmp buffer
597        let mut tmp_buffer;
598        let tmp;
599        let tmp_fmt;
600        if intermediate != src_fmt {
601            let _s = tracing::trace_span!(
602                "image.convert.cpu.format_convert",
603                from = ?src_fmt,
604                to = ?intermediate,
605                pass = "pre_resize",
606            )
607            .entered();
608            tmp_buffer = Tensor::<u8>::image(src_w, src_h, intermediate, Some(TensorMemory::Mem))?;
609
610            Self::convert_format_pf(src, &mut tmp_buffer, src_fmt, intermediate)?;
611            tmp = &tmp_buffer;
612            tmp_fmt = intermediate;
613        } else {
614            tmp = src;
615            tmp_fmt = src_fmt;
616        }
617
618        // format must be RGB/RGBA/GREY
619        debug_assert!(matches!(tmp_fmt, Rgb | Rgba | Grey));
620        if tmp_fmt == dst_fmt {
621            let _s = tracing::trace_span!("image.convert.cpu.resize_flip_rotate").entered();
622            self.resize_flip_rotate_pf(tmp, dst, dst_fmt, rotation, flip, crop)?;
623        } else if !need_resize_flip_rotation {
624            let _s = tracing::trace_span!(
625                "image.convert.cpu.format_convert",
626                from = ?tmp_fmt,
627                to = ?dst_fmt,
628                pass = "direct",
629            )
630            .entered();
631            Self::convert_format_pf(tmp, dst, tmp_fmt, dst_fmt)?;
632        } else {
633            let mut tmp2 = Tensor::<u8>::image(dst_w, dst_h, tmp_fmt, Some(TensorMemory::Mem))?;
634            if crop.dst_rect.is_some_and(|c| {
635                c != Rect {
636                    left: 0,
637                    top: 0,
638                    width: dst_w,
639                    height: dst_h,
640                }
641            }) && crop.dst_color.is_none()
642            {
643                Self::convert_format_pf(dst, &mut tmp2, dst_fmt, tmp_fmt)?;
644            }
645            {
646                let _s = tracing::trace_span!("image.convert.cpu.resize_flip_rotate").entered();
647                self.resize_flip_rotate_pf(tmp, &mut tmp2, tmp_fmt, rotation, flip, crop)?;
648            }
649            {
650                let _s = tracing::trace_span!(
651                    "image.convert.cpu.format_convert",
652                    from = ?tmp_fmt,
653                    to = ?dst_fmt,
654                    pass = "post_resize",
655                )
656                .entered();
657                Self::convert_format_pf(&tmp2, dst, tmp_fmt, dst_fmt)?;
658            }
659        }
660        if let (Some(dst_rect), Some(dst_color)) = (crop.dst_rect, crop.dst_color) {
661            let full_rect = Rect {
662                left: 0,
663                top: 0,
664                width: dst_w,
665                height: dst_h,
666            };
667            if dst_rect != full_rect {
668                Self::fill_image_outside_crop_u8(dst, dst_color, dst_rect)?;
669            }
670        }
671
672        Ok(())
673    }
674
675    fn draw_decoded_masks_impl(
676        &mut self,
677        dst: &mut Tensor<u8>,
678        detect: &[DetectBox],
679        segmentation: &[Segmentation],
680        opacity: f32,
681        color_mode: crate::ColorMode,
682    ) -> Result<()> {
683        let dst_fmt = dst.format().ok_or(Error::NotAnImage)?;
684        if !matches!(dst_fmt, PixelFormat::Rgba | PixelFormat::Rgb) {
685            return Err(crate::Error::NotSupported(
686                "CPU image rendering only supports RGBA or RGB images".to_string(),
687            ));
688        }
689
690        let _timer = FunctionTimer::new("CPUProcessor::draw_decoded_masks");
691
692        let dst_w = dst.width().unwrap();
693        let dst_h = dst.height().unwrap();
694        let dst_rs = tensor_row_stride(dst);
695        let dst_c = dst_fmt.channels();
696
697        let mut map = dst.map()?;
698        let dst_slice = map.as_mut_slice();
699
700        self.render_box(dst_w, dst_h, dst_rs, dst_c, dst_slice, detect, color_mode)?;
701
702        if segmentation.is_empty() {
703            return Ok(());
704        }
705
706        // Semantic segmentation (e.g. ModelPack) has C > 1 (multi-class),
707        // instance segmentation (e.g. YOLO) has C = 1 (binary per-instance).
708        let is_semantic = segmentation[0].segmentation.shape()[2] > 1;
709
710        if is_semantic {
711            self.render_modelpack_segmentation(
712                dst_w,
713                dst_h,
714                dst_rs,
715                dst_c,
716                dst_slice,
717                &segmentation[0],
718                opacity,
719            )?;
720        } else {
721            for (idx, (seg, det)) in segmentation.iter().zip(detect).enumerate() {
722                let color_index = color_mode.index(idx, det.label);
723                self.render_yolo_segmentation(
724                    dst_w,
725                    dst_h,
726                    dst_rs,
727                    dst_c,
728                    dst_slice,
729                    seg,
730                    color_index,
731                    opacity,
732                )?;
733            }
734        }
735
736        Ok(())
737    }
738
739    fn draw_proto_masks_impl(
740        &mut self,
741        dst: &mut Tensor<u8>,
742        detect: &[DetectBox],
743        proto_data: &ProtoData,
744        opacity: f32,
745        letterbox: Option<[f32; 4]>,
746        color_mode: crate::ColorMode,
747    ) -> Result<()> {
748        let dst_fmt = dst.format().ok_or(Error::NotAnImage)?;
749        if !matches!(dst_fmt, PixelFormat::Rgba | PixelFormat::Rgb) {
750            return Err(crate::Error::NotSupported(
751                "CPU image rendering only supports RGBA or RGB images".to_string(),
752            ));
753        }
754
755        let _timer = FunctionTimer::new("CPUProcessor::draw_proto_masks");
756
757        let dst_w = dst.width().unwrap();
758        let dst_h = dst.height().unwrap();
759        let dst_rs = tensor_row_stride(dst);
760        let channels = dst_fmt.channels();
761
762        let mut map = dst.map()?;
763        let dst_slice = map.as_mut_slice();
764
765        self.render_box(
766            dst_w, dst_h, dst_rs, channels, dst_slice, detect, color_mode,
767        )?;
768
769        if detect.is_empty() {
770            return Ok(());
771        }
772        let proto_shape = proto_data.protos.shape();
773        if proto_shape.len() != 3 {
774            return Err(Error::InvalidShape(format!(
775                "protos tensor must be rank-3, got {proto_shape:?}"
776            )));
777        }
778        let proto_h = proto_shape[0];
779        let proto_w = proto_shape[1];
780        let num_protos = proto_shape[2];
781        let coeff_shape = proto_data.mask_coefficients.shape();
782        if coeff_shape.len() != 2 {
783            return Err(Error::InvalidShape(format!(
784                "mask_coefficients tensor must be rank-2, got {coeff_shape:?}"
785            )));
786        }
787        // Genuine "no detections this frame" → nothing to render.
788        if coeff_shape[0] == 0 {
789            return Ok(());
790        }
791        if coeff_shape[1] != num_protos {
792            return Err(Error::InvalidShape(format!(
793                "mask_coefficients second dimension must match num_protos \
794                 ({num_protos}), got {coeff_shape:?}"
795            )));
796        }
797
798        // Widen coefficients to f32 once; shape [N, num_protos].
799        let coeff_f32: Vec<f32> = match proto_data.mask_coefficients.dtype() {
800            DType::F32 => {
801                let t = proto_data.mask_coefficients.as_f32().expect("F32");
802                let m = t.map()?;
803                m.as_slice().to_vec()
804            }
805            DType::F16 => {
806                let t = proto_data.mask_coefficients.as_f16().expect("F16");
807                let m = t.map()?;
808                m.as_slice().iter().map(|v| v.to_f32()).collect()
809            }
810            DType::I8 => {
811                let t = proto_data.mask_coefficients.as_i8().expect("I8");
812                let m = t.map()?;
813                if let Some(q) = t.quantization() {
814                    use edgefirst_tensor::QuantMode;
815                    let (scale, zp) = match q.mode() {
816                        QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
817                        QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
818                        other => {
819                            return Err(Error::NotSupported(format!(
820                                "I8 mask_coefficients quantization mode {other:?} not supported"
821                            )));
822                        }
823                    };
824                    m.as_slice()
825                        .iter()
826                        .map(|&v| (v as f32 - zp) * scale)
827                        .collect()
828                } else {
829                    m.as_slice().iter().map(|&v| v as f32).collect()
830                }
831            }
832            DType::I16 => {
833                let t = proto_data.mask_coefficients.as_i16().expect("I16");
834                let m = t.map()?;
835                if let Some(q) = t.quantization() {
836                    use edgefirst_tensor::QuantMode;
837                    let (scale, zp) = match q.mode() {
838                        QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
839                        QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
840                        other => {
841                            return Err(Error::NotSupported(format!(
842                                "I16 mask_coefficients quantization mode {other:?} not supported"
843                            )));
844                        }
845                    };
846                    m.as_slice()
847                        .iter()
848                        .map(|&v| (v as f32 - zp) * scale)
849                        .collect()
850                } else {
851                    m.as_slice().iter().map(|&v| v as f32).collect()
852                }
853            }
854            other => {
855                return Err(Error::InvalidShape(format!(
856                    "mask_coefficients dtype {other:?} not supported"
857                )));
858            }
859        };
860
861        // Precompute letterbox scale/offset for output-pixel → proto-pixel mapping.
862        let (lx0, lx_range, ly0, ly_range) = match letterbox {
863            Some([lx0, ly0, lx1, ly1]) => (lx0, lx1 - lx0, ly0, ly1 - ly0),
864            None => (0.0_f32, 1.0_f32, 0.0_f32, 1.0_f32),
865        };
866
867        // Per-dtype dispatch. Map protos once, call the inner draw loop
868        // with a dtype-specialized loader closure.
869        match proto_data.protos.dtype() {
870            DType::F32 => {
871                let t = proto_data.protos.as_f32().expect("F32");
872                let m = t.map()?;
873                self.draw_proto_masks_inner(
874                    dst_slice,
875                    dst_w,
876                    dst_h,
877                    dst_rs,
878                    channels,
879                    detect,
880                    m.as_slice(),
881                    &coeff_f32,
882                    proto_h,
883                    proto_w,
884                    num_protos,
885                    opacity,
886                    (lx0, lx_range, ly0, ly_range),
887                    color_mode,
888                    0.0_f32,
889                    |p: &f32, _| *p,
890                );
891            }
892            DType::F16 => {
893                let t = proto_data.protos.as_f16().expect("F16");
894                let m = t.map()?;
895                self.draw_proto_masks_inner(
896                    dst_slice,
897                    dst_w,
898                    dst_h,
899                    dst_rs,
900                    channels,
901                    detect,
902                    m.as_slice(),
903                    &coeff_f32,
904                    proto_h,
905                    proto_w,
906                    num_protos,
907                    opacity,
908                    (lx0, lx_range, ly0, ly_range),
909                    color_mode,
910                    0.0_f32,
911                    |p: &half::f16, _| p.to_f32(),
912                );
913            }
914            DType::I8 => {
915                use edgefirst_tensor::QuantMode;
916                let t = proto_data.protos.as_i8().expect("I8");
917                let m = t.map()?;
918                let quant = t.quantization().ok_or_else(|| {
919                    Error::InvalidShape("I8 protos require quantization metadata".into())
920                })?;
921                let (scale, zp) = match quant.mode() {
922                    QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
923                    QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
924                    QuantMode::PerChannel { axis, .. }
925                    | QuantMode::PerChannelSymmetric { axis, .. } => {
926                        return Err(Error::NotSupported(format!(
927                            "per-channel quantization (axis={axis}) in draw_proto_masks \
928                             CPU path not yet supported"
929                        )));
930                    }
931                };
932                self.draw_proto_masks_inner(
933                    dst_slice,
934                    dst_w,
935                    dst_h,
936                    dst_rs,
937                    channels,
938                    detect,
939                    m.as_slice(),
940                    &coeff_f32,
941                    proto_h,
942                    proto_w,
943                    num_protos,
944                    opacity,
945                    (lx0, lx_range, ly0, ly_range),
946                    color_mode,
947                    scale,
948                    move |p: &i8, _| (*p as f32) - zp,
949                );
950            }
951            other => {
952                return Err(Error::InvalidShape(format!(
953                    "proto tensor dtype {other:?} not supported"
954                )));
955            }
956        }
957
958        Ok(())
959    }
960
961    #[allow(clippy::too_many_arguments)]
962    fn draw_proto_masks_inner<P: Copy>(
963        &self,
964        dst_slice: &mut [u8],
965        dst_w: usize,
966        dst_h: usize,
967        dst_rs: usize,
968        channels: usize,
969        detect: &[DetectBox],
970        protos: &[P],
971        coeff_all_f32: &[f32],
972        proto_h: usize,
973        proto_w: usize,
974        num_protos: usize,
975        opacity: f32,
976        letterbox_xy: (f32, f32, f32, f32),
977        color_mode: crate::ColorMode,
978        acc_scale: f32,
979        load_f32: impl Fn(&P, f32) -> f32 + Copy,
980    ) {
981        let (lx0, lx_range, ly0, ly_range) = letterbox_xy;
982        let stride_y = proto_w * num_protos;
983        for (idx, det) in detect.iter().enumerate() {
984            let coeff = &coeff_all_f32[idx * num_protos..(idx + 1) * num_protos];
985            let color_index = color_mode.index(idx, det.label);
986            let color = self.colors[color_index % self.colors.len()];
987            let alpha = if opacity == 1.0 {
988                color[3] as u16
989            } else {
990                (color[3] as f32 * opacity).round() as u16
991            };
992
993            let start_x = (dst_w as f32 * det.bbox.xmin).round() as usize;
994            let start_y = (dst_h as f32 * det.bbox.ymin).round() as usize;
995            let end_x = ((dst_w as f32 * det.bbox.xmax).round() as usize).min(dst_w);
996            let end_y = ((dst_h as f32 * det.bbox.ymax).round() as usize).min(dst_h);
997
998            for y in start_y..end_y {
999                for x in start_x..end_x {
1000                    let px = (lx0 + (x as f32 / dst_w as f32) * lx_range) * proto_w as f32 - 0.5;
1001                    let py = (ly0 + (y as f32 / dst_h as f32) * ly_range) * proto_h as f32 - 0.5;
1002
1003                    // Bilinear interpolation with per-load widening. Inline
1004                    // bilinear-sample since bilinear_dot_slice takes a
1005                    // different closure shape (no `zp` arg).
1006                    let x0 = (px.floor() as isize).clamp(0, proto_w as isize - 1) as usize;
1007                    let y0 = (py.floor() as isize).clamp(0, proto_h as isize - 1) as usize;
1008                    let x1 = (x0 + 1).min(proto_w - 1);
1009                    let y1 = (y0 + 1).min(proto_h - 1);
1010                    let fx = px - px.floor();
1011                    let fy = py - py.floor();
1012                    let w00 = (1.0 - fx) * (1.0 - fy);
1013                    let w10 = fx * (1.0 - fy);
1014                    let w01 = (1.0 - fx) * fy;
1015                    let w11 = fx * fy;
1016                    let b00 = y0 * stride_y + x0 * num_protos;
1017                    let b10 = y0 * stride_y + x1 * num_protos;
1018                    let b01 = y1 * stride_y + x0 * num_protos;
1019                    let b11 = y1 * stride_y + x1 * num_protos;
1020                    let mut acc = 0.0_f32;
1021                    for p in 0..num_protos {
1022                        let v00 = load_f32(&protos[b00 + p], 0.0);
1023                        let v10 = load_f32(&protos[b10 + p], 0.0);
1024                        let v01 = load_f32(&protos[b01 + p], 0.0);
1025                        let v11 = load_f32(&protos[b11 + p], 0.0);
1026                        let val = w00 * v00 + w10 * v10 + w01 * v01 + w11 * v11;
1027                        acc += coeff[p] * val;
1028                    }
1029                    let final_acc = if acc_scale == 0.0 {
1030                        acc
1031                    } else {
1032                        acc_scale * acc
1033                    };
1034                    // Pass-through: acc_scale=0.0 means "no scaling" (f32/f16
1035                    // native); non-zero means "apply scale once" (i8 with
1036                    // per-tensor quant).
1037                    let mask = 1.0 / (1.0 + (-final_acc).exp());
1038                    if mask < 0.5 {
1039                        continue;
1040                    }
1041                    let dst_index = y * dst_rs + x * channels;
1042                    for c in 0..3 {
1043                        dst_slice[dst_index + c] = ((color[c] as u16 * alpha
1044                            + dst_slice[dst_index + c] as u16 * (255 - alpha))
1045                            / 255) as u8;
1046                    }
1047                }
1048            }
1049        }
1050    }
1051}