// Source file: edgefirst_image/cpu/mod.rs
1// SPDX-FileCopyrightText: Copyright 2025 Au-Zone Technologies
2// SPDX-License-Identifier: Apache-2.0
3
4use crate::{Crop, Error, Flip, FunctionTimer, ImageProcessorTrait, Rect, Result, Rotation};
5use edgefirst_decoder::{DetectBox, ProtoData, Segmentation};
6use edgefirst_tensor::{
7    DType, PixelFormat, Tensor, TensorDyn, TensorMapTrait, TensorMemory, TensorTrait,
8};
9
10mod convert;
11mod masks;
12mod resize;
13mod tests;
14
15// bilinear_dot removed — masks.rs now uses slice-native bilinear_dot_slice
16// closure-based kernel, invoked through the local dtype dispatch below.
17
/// CPU fallback implementation of the `ImageProcessor` trait.
///
/// Used when no hardware image-processing path is available: format
/// conversion, resize/flip/rotate, and mask rendering are all done with
/// plain buffer operations. (Historically named `CPUConverter` in some
/// comments/log messages.)
#[derive(Debug, Clone)]
pub struct CPUProcessor {
    // Cached fast_image_resize state, reused across convert calls.
    resizer: fast_image_resize::Resizer,
    // Resize configuration (bilinear convolution or nearest neighbor).
    options: fast_image_resize::ResizeOptions,
    // Per-class RGBA palette used when rendering masks/boxes.
    colors: [[u8; 4]; 20],
}
26
// SAFETY(review): these impls assert that `fast_image_resize::Resizer` and
// `ResizeOptions` may be moved to and shared across threads. That is not
// proven here — TODO confirm against the fast_image_resize documentation;
// if those types are already `Send + Sync`, these manual impls are redundant
// and should be removed so the compiler derives thread-safety automatically.
unsafe impl Send for CPUProcessor {}
unsafe impl Sync for CPUProcessor {}
29
impl Default for CPUProcessor {
    /// Defaults to the bilinear-resize configuration, same as
    /// [`CPUProcessor::new`].
    fn default() -> Self {
        Self::new_bilinear()
    }
}
35
36/// Write the base layer of `dst` before mask rendering.
37///
38/// This is the terminal fallback: on CPU we have no 2D hardware, so a
39/// direct buffer write is the appropriate primitive. The invariant is that
40/// every call to the CPU draw_* entry points fully initialises dst — we
41/// never rely on "whatever was in the buffer" from the caller.
42///
43/// - `background == Some(bg)` → byte-for-byte copy bg → dst (after shape /
44///   format validation).
45/// - `background == None` → fill dst with 0x00 (transparent black).
46fn prepare_dst_base_cpu(dst: &mut TensorDyn, background: Option<&TensorDyn>) -> Result<()> {
47    match background {
48        Some(bg) => {
49            if bg.shape() != dst.shape() {
50                return Err(Error::InvalidShape(
51                    "background shape does not match dst".into(),
52                ));
53            }
54            if bg.format() != dst.format() {
55                return Err(Error::InvalidShape(
56                    "background pixel format does not match dst".into(),
57                ));
58            }
59            let bg_u8 = bg.as_u8().ok_or(Error::NotAnImage)?;
60            let dst_u8 = dst.as_u8_mut().ok_or(Error::NotAnImage)?;
61            let bg_map = bg_u8.map()?;
62            let mut dst_map = dst_u8.map()?;
63            let bg_slice = bg_map.as_slice();
64            let dst_slice = dst_map.as_mut_slice();
65            if bg_slice.len() != dst_slice.len() {
66                return Err(Error::InvalidShape(
67                    "background buffer size does not match dst".into(),
68                ));
69            }
70            dst_slice.copy_from_slice(bg_slice);
71        }
72        None => {
73            let dst_u8 = dst.as_u8_mut().ok_or(Error::NotAnImage)?;
74            let mut dst_map = dst_u8.map()?;
75            dst_map.as_mut_slice().fill(0);
76        }
77    }
78    Ok(())
79}
80
81/// Compute row stride for a packed-format Tensor<u8> image given its format.
82fn row_stride_for(width: usize, fmt: PixelFormat) -> usize {
83    use edgefirst_tensor::PixelLayout;
84    match fmt.layout() {
85        PixelLayout::Packed => width * fmt.channels(),
86        PixelLayout::Planar | PixelLayout::SemiPlanar => width,
87        _ => width, // fallback for non-exhaustive
88    }
89}
90
91/// Apply XOR 0x80 bias to color channels only, preserving alpha.
92///
93/// Matches GL int8 shader behavior: `vec4(int8_bias(c.rgb), c.a)`.
94/// For formats without alpha, XORs every byte (fast path).
95pub(crate) fn apply_int8_xor_bias(data: &mut [u8], fmt: PixelFormat) {
96    use edgefirst_tensor::PixelLayout;
97    if !fmt.has_alpha() {
98        for b in data.iter_mut() {
99            *b ^= 0x80;
100        }
101    } else if fmt.layout() == PixelLayout::Planar {
102        // Planar with alpha (e.g. PlanarRgba): XOR color planes, skip alpha plane.
103        let channels = fmt.channels();
104        let plane_size = data.len() / channels;
105        for b in data[..plane_size * (channels - 1)].iter_mut() {
106            *b ^= 0x80;
107        }
108    } else {
109        // Packed with alpha (Rgba, Bgra): XOR color bytes, skip alpha byte.
110        let channels = fmt.channels();
111        for pixel in data.chunks_exact_mut(channels) {
112            for b in &mut pixel[..channels - 1] {
113                *b ^= 0x80;
114            }
115        }
116    }
117}
118
119impl CPUProcessor {
    /// Creates a new processor with bilinear resizing (the default
    /// configuration; equivalent to [`CPUProcessor::default`]).
    pub fn new() -> Self {
        Self::new_bilinear()
    }
124
125    /// Creates a new CPUConverter with bilinear resizing.
126    fn new_bilinear() -> Self {
127        let resizer = fast_image_resize::Resizer::new();
128        let options = fast_image_resize::ResizeOptions::new()
129            .resize_alg(fast_image_resize::ResizeAlg::Convolution(
130                fast_image_resize::FilterType::Bilinear,
131            ))
132            .use_alpha(false);
133
134        log::debug!("CPUConverter created");
135        Self {
136            resizer,
137            options,
138            colors: crate::DEFAULT_COLORS_U8,
139        }
140    }
141
142    /// Creates a new CPUConverter with nearest neighbor resizing.
143    pub fn new_nearest() -> Self {
144        let resizer = fast_image_resize::Resizer::new();
145        let options = fast_image_resize::ResizeOptions::new()
146            .resize_alg(fast_image_resize::ResizeAlg::Nearest)
147            .use_alpha(false);
148        log::debug!("CPUConverter created");
149        Self {
150            resizer,
151            options,
152            colors: crate::DEFAULT_COLORS_U8,
153        }
154    }
155
    /// Returns true when a direct (same-size, no resize/flip/rotate)
    /// conversion kernel exists for the `src` → `dst` format pair.
    ///
    /// `convert_u8` uses this to take the one-step fast path; the pairs
    /// listed here must stay in sync with the match arms of
    /// `convert_format_pf`.
    pub(crate) fn support_conversion_pf(src: PixelFormat, dst: PixelFormat) -> bool {
        use PixelFormat::*;
        matches!(
            (src, dst),
            // YUV semi-planar sources
            (Nv12, Rgb)
                | (Nv12, Rgba)
                | (Nv12, Grey)
                | (Nv16, Rgb)
                | (Nv16, Rgba)
                | (Nv16, Bgra)
                // YUYV packed source
                | (Yuyv, Rgb)
                | (Yuyv, Rgba)
                | (Yuyv, Grey)
                | (Yuyv, Yuyv)
                | (Yuyv, PlanarRgb)
                | (Yuyv, PlanarRgba)
                | (Yuyv, Nv16)
                // VYUY packed source
                | (Vyuy, Rgb)
                | (Vyuy, Rgba)
                | (Vyuy, Grey)
                | (Vyuy, Vyuy)
                | (Vyuy, PlanarRgb)
                | (Vyuy, PlanarRgba)
                | (Vyuy, Nv16)
                // RGBA source
                | (Rgba, Rgb)
                | (Rgba, Rgba)
                | (Rgba, Grey)
                | (Rgba, Yuyv)
                | (Rgba, PlanarRgb)
                | (Rgba, PlanarRgba)
                | (Rgba, Nv16)
                // RGB source
                | (Rgb, Rgb)
                | (Rgb, Rgba)
                | (Rgb, Grey)
                | (Rgb, Yuyv)
                | (Rgb, PlanarRgb)
                | (Rgb, PlanarRgba)
                | (Rgb, Nv16)
                // Grey source
                | (Grey, Rgb)
                | (Grey, Rgba)
                | (Grey, Grey)
                | (Grey, Yuyv)
                | (Grey, PlanarRgb)
                | (Grey, PlanarRgba)
                | (Grey, Nv16)
                // BGRA destinations (convert-to-RGBA + channel swap paths)
                | (Nv12, Bgra)
                | (Yuyv, Bgra)
                | (Vyuy, Bgra)
                | (Rgba, Bgra)
                | (Rgb, Bgra)
                | (Grey, Bgra)
                | (Bgra, Bgra)
                // Planar sources
                | (PlanarRgb, Rgb)
                | (PlanarRgb, Rgba)
                | (PlanarRgba, Rgb)
                | (PlanarRgba, Rgba)
                | (PlanarRgb, Bgra)
                | (PlanarRgba, Bgra)
        )
    }
216
    /// Format conversion dispatch for Tensor<u8> with PixelFormat metadata.
    ///
    /// Direct, same-dimension conversion only — no resize/flip/rotate.
    /// The supported pairs must stay in sync with `support_conversion_pf`;
    /// any unlisted pair returns `Error::NotSupported`.
    ///
    /// BGRA destinations are produced by first converting into RGBA layout
    /// inside `dst`, then swapping the R and B channels in place.
    pub(crate) fn convert_format_pf(
        src: &Tensor<u8>,
        dst: &mut Tensor<u8>,
        src_fmt: PixelFormat,
        dst_fmt: PixelFormat,
    ) -> Result<()> {
        let _timer = FunctionTimer::new(format!(
            "ImageProcessor::convert_format {} to {}",
            src_fmt, dst_fmt,
        ));

        use PixelFormat::*;
        match (src_fmt, dst_fmt) {
            (Nv12, Rgb) => Self::convert_nv12_to_rgb(src, dst),
            (Nv12, Rgba) => Self::convert_nv12_to_rgba(src, dst),
            (Nv12, Grey) => Self::convert_nv12_to_grey(src, dst),
            (Yuyv, Rgb) => Self::convert_yuyv_to_rgb(src, dst),
            (Yuyv, Rgba) => Self::convert_yuyv_to_rgba(src, dst),
            (Yuyv, Grey) => Self::convert_yuyv_to_grey(src, dst),
            (Yuyv, Yuyv) => Self::copy_image(src, dst),
            (Yuyv, PlanarRgb) => Self::convert_yuyv_to_8bps(src, dst),
            (Yuyv, PlanarRgba) => Self::convert_yuyv_to_prgba(src, dst),
            (Yuyv, Nv16) => Self::convert_yuyv_to_nv16(src, dst),
            (Vyuy, Rgb) => Self::convert_vyuy_to_rgb(src, dst),
            (Vyuy, Rgba) => Self::convert_vyuy_to_rgba(src, dst),
            (Vyuy, Grey) => Self::convert_vyuy_to_grey(src, dst),
            (Vyuy, Vyuy) => Self::copy_image(src, dst),
            (Vyuy, PlanarRgb) => Self::convert_vyuy_to_8bps(src, dst),
            (Vyuy, PlanarRgba) => Self::convert_vyuy_to_prgba(src, dst),
            (Vyuy, Nv16) => Self::convert_vyuy_to_nv16(src, dst),
            (Rgba, Rgb) => Self::convert_rgba_to_rgb(src, dst),
            (Rgba, Rgba) => Self::copy_image(src, dst),
            (Rgba, Grey) => Self::convert_rgba_to_grey(src, dst),
            (Rgba, Yuyv) => Self::convert_rgba_to_yuyv(src, dst),
            (Rgba, PlanarRgb) => Self::convert_rgba_to_8bps(src, dst),
            (Rgba, PlanarRgba) => Self::convert_rgba_to_prgba(src, dst),
            (Rgba, Nv16) => Self::convert_rgba_to_nv16(src, dst),
            (Rgb, Rgb) => Self::copy_image(src, dst),
            (Rgb, Rgba) => Self::convert_rgb_to_rgba(src, dst),
            (Rgb, Grey) => Self::convert_rgb_to_grey(src, dst),
            (Rgb, Yuyv) => Self::convert_rgb_to_yuyv(src, dst),
            (Rgb, PlanarRgb) => Self::convert_rgb_to_8bps(src, dst),
            (Rgb, PlanarRgba) => Self::convert_rgb_to_prgba(src, dst),
            (Rgb, Nv16) => Self::convert_rgb_to_nv16(src, dst),
            (Grey, Rgb) => Self::convert_grey_to_rgb(src, dst),
            (Grey, Rgba) => Self::convert_grey_to_rgba(src, dst),
            (Grey, Grey) => Self::copy_image(src, dst),
            (Grey, Yuyv) => Self::convert_grey_to_yuyv(src, dst),
            (Grey, PlanarRgb) => Self::convert_grey_to_8bps(src, dst),
            (Grey, PlanarRgba) => Self::convert_grey_to_prgba(src, dst),
            (Grey, Nv16) => Self::convert_grey_to_nv16(src, dst),

            // the following converts are added for use in testing
            (Nv16, Rgb) => Self::convert_nv16_to_rgb(src, dst),
            (Nv16, Rgba) => Self::convert_nv16_to_rgba(src, dst),
            (PlanarRgb, Rgb) => Self::convert_8bps_to_rgb(src, dst),
            (PlanarRgb, Rgba) => Self::convert_8bps_to_rgba(src, dst),
            (PlanarRgba, Rgb) => Self::convert_prgba_to_rgb(src, dst),
            (PlanarRgba, Rgba) => Self::convert_prgba_to_rgba(src, dst),

            // BGRA destination: convert to RGBA layout, then swap R and B
            (Bgra, Bgra) => Self::copy_image(src, dst),
            (Nv12, Bgra) => {
                Self::convert_nv12_to_rgba(src, dst)?;
                Self::swizzle_rb_4chan(dst)
            }
            (Nv16, Bgra) => {
                Self::convert_nv16_to_rgba(src, dst)?;
                Self::swizzle_rb_4chan(dst)
            }
            (Yuyv, Bgra) => {
                Self::convert_yuyv_to_rgba(src, dst)?;
                Self::swizzle_rb_4chan(dst)
            }
            (Vyuy, Bgra) => {
                Self::convert_vyuy_to_rgba(src, dst)?;
                Self::swizzle_rb_4chan(dst)
            }
            (Rgba, Bgra) => {
                // Same layout width: raw copy, then swap R/B in place.
                dst.map()?.copy_from_slice(&src.map()?);
                Self::swizzle_rb_4chan(dst)
            }
            (Rgb, Bgra) => {
                Self::convert_rgb_to_rgba(src, dst)?;
                Self::swizzle_rb_4chan(dst)
            }
            (Grey, Bgra) => {
                Self::convert_grey_to_rgba(src, dst)?;
                Self::swizzle_rb_4chan(dst)
            }
            (PlanarRgb, Bgra) => {
                Self::convert_8bps_to_rgba(src, dst)?;
                Self::swizzle_rb_4chan(dst)
            }
            (PlanarRgba, Bgra) => {
                Self::convert_prgba_to_rgba(src, dst)?;
                Self::swizzle_rb_4chan(dst)
            }

            (s, d) => Err(Error::NotSupported(format!("Conversion from {s} to {d}",))),
        }
    }
320
321    /// Tensor<u8>-based fill_image_outside_crop.
322    pub(crate) fn fill_image_outside_crop_u8(
323        dst: &mut Tensor<u8>,
324        rgba: [u8; 4],
325        crop: Rect,
326    ) -> Result<()> {
327        let dst_fmt = dst.format().unwrap();
328        let dst_w = dst.width().unwrap();
329        let dst_h = dst.height().unwrap();
330        let mut dst_map = dst.map()?;
331        let dst_tup = (dst_map.as_mut_slice(), dst_w, dst_h);
332        Self::fill_outside_crop_dispatch(dst_tup, dst_fmt, rgba, crop)
333    }
334
    /// Common fill dispatch by format.
    ///
    /// `dst` is `(buffer, width, height)`. The RGBA fill color is converted
    /// to the destination pixel format before filling. For Yuyv, one 4-byte
    /// YUYV unit covers two pixels, so the width and the crop's horizontal
    /// coordinates are halved (width rounded up) to work in macropixel
    /// units. Unsupported destination formats are an internal error — the
    /// caller is expected to have validated the format already.
    fn fill_outside_crop_dispatch(
        dst: (&mut [u8], usize, usize),
        fmt: PixelFormat,
        rgba: [u8; 4],
        crop: Rect,
    ) -> Result<()> {
        use PixelFormat::*;
        match fmt {
            Rgba | Bgra => Self::fill_image_outside_crop_(dst, rgba, crop),
            Rgb => Self::fill_image_outside_crop_(dst, Self::rgba_to_rgb(rgba), crop),
            Grey => Self::fill_image_outside_crop_(dst, Self::rgba_to_grey(rgba), crop),
            Yuyv => Self::fill_image_outside_crop_(
                // Work in 2-pixel macropixel units (4 bytes each).
                (dst.0, dst.1 / 2, dst.2),
                Self::rgba_to_yuyv(rgba),
                Rect::new(crop.left / 2, crop.top, crop.width.div_ceil(2), crop.height),
            ),
            PlanarRgb => Self::fill_image_outside_crop_planar(dst, Self::rgba_to_rgb(rgba), crop),
            PlanarRgba => Self::fill_image_outside_crop_planar(dst, rgba, crop),
            Nv16 => {
                // Split the YUYV color into the Y value and the [U, V] pair
                // for the semi-planar layout.
                let yuyv = Self::rgba_to_yuyv(rgba);
                Self::fill_image_outside_crop_yuv_semiplanar(dst, yuyv[0], [yuyv[1], yuyv[3]], crop)
            }
            _ => Err(Error::Internal(format!(
                "Found unexpected destination {fmt}",
            ))),
        }
    }
363}
364
365impl ImageProcessorTrait for CPUProcessor {
    /// Full conversion pipeline (format conversion + resize/flip/rotate/
    /// crop). Thin wrapper that delegates to the dtype-aware dispatcher
    /// `convert_impl`.
    fn convert(
        &mut self,
        src: &TensorDyn,
        dst: &mut TensorDyn,
        rotation: Rotation,
        flip: Flip,
        crop: Crop,
    ) -> Result<()> {
        self.convert_impl(src, dst, rotation, flip, crop)
    }
376
377    fn draw_decoded_masks(
378        &mut self,
379        dst: &mut TensorDyn,
380        detect: &[DetectBox],
381        segmentation: &[Segmentation],
382        overlay: crate::MaskOverlay<'_>,
383    ) -> Result<()> {
384        // CPU is the terminal fallback — it must always produce the full
385        // output, never assume the caller cleared dst. Every call writes
386        // the base layer first (bg copy or zero fill) and then the masks.
387        prepare_dst_base_cpu(dst, overlay.background)?;
388        let dst = dst.as_u8_mut().ok_or(Error::NotAnImage)?;
389        self.draw_decoded_masks_impl(
390            dst,
391            detect,
392            segmentation,
393            overlay.opacity,
394            overlay.color_mode,
395        )
396    }
397
398    fn draw_proto_masks(
399        &mut self,
400        dst: &mut TensorDyn,
401        detect: &[DetectBox],
402        proto_data: &ProtoData,
403        overlay: crate::MaskOverlay<'_>,
404    ) -> Result<()> {
405        prepare_dst_base_cpu(dst, overlay.background)?;
406        let dst = dst.as_u8_mut().ok_or(Error::NotAnImage)?;
407        self.draw_proto_masks_impl(
408            dst,
409            detect,
410            proto_data,
411            overlay.opacity,
412            overlay.letterbox,
413            overlay.color_mode,
414        )
415    }
416
417    fn set_class_colors(&mut self, colors: &[[u8; 4]]) -> Result<()> {
418        for (c, new_c) in self.colors.iter_mut().zip(colors.iter()) {
419            *c = *new_c;
420        }
421        Ok(())
422    }
423}
424
425// Internal methods — dtype-aware dispatch layer.
426impl CPUProcessor {
427    /// Top-level conversion dispatcher: handles dtype combinations.
428    pub(crate) fn convert_impl(
429        &mut self,
430        src: &TensorDyn,
431        dst: &mut TensorDyn,
432        rotation: Rotation,
433        flip: Flip,
434        crop: Crop,
435    ) -> Result<()> {
436        let src_fmt = src.format().ok_or(Error::NotAnImage)?;
437        let dst_fmt = dst.format().ok_or(Error::NotAnImage)?;
438
439        match (src.dtype(), dst.dtype()) {
440            (DType::U8, DType::U8) => {
441                let src = src.as_u8().unwrap();
442                let dst = dst.as_u8_mut().unwrap();
443                self.convert_u8(src, dst, src_fmt, dst_fmt, rotation, flip, crop)
444            }
445            (DType::U8, DType::I8) => {
446                // Int8 output: reinterpret the i8 destination as u8 (layout-
447                // identical), convert directly into it, then XOR 0x80 in-place.
448                let src_u8 = src.as_u8().unwrap();
449                let dst_i8 = dst.as_i8_mut().unwrap();
450                // SAFETY: Tensor<i8> and Tensor<u8> are layout-identical
451                // (same element size, no T-dependent drop glue). Same
452                // rationale as gl::processor::tensor_i8_as_u8_mut.
453                let dst_u8 = unsafe { &mut *(dst_i8 as *mut Tensor<i8> as *mut Tensor<u8>) };
454                self.convert_u8(src_u8, dst_u8, src_fmt, dst_fmt, rotation, flip, crop)?;
455                // Apply XOR 0x80 bias in-place (u8 → i8 conversion)
456                let mut map = dst_u8.map()?;
457                apply_int8_xor_bias(map.as_mut_slice(), dst_fmt);
458                Ok(())
459            }
460            (s, d) => Err(Error::NotSupported(format!("dtype {s} -> {d}",))),
461        }
462    }
463
    /// U8-to-U8 conversion: the full format conversion + resize pipeline.
    ///
    /// Stages (any may be skipped when unnecessary):
    ///   1. choose an intermediate format for the resize step;
    ///   2. fast path: one-step conversion when no resize/flip/rotation/crop
    ///      is needed and a direct kernel exists;
    ///   3. otherwise: src → intermediate, resize/flip/rotate, then
    ///      intermediate → dst format;
    ///   4. optionally fill the region outside `crop.dst_rect` with
    ///      `crop.dst_color`.
    #[allow(clippy::too_many_arguments)]
    fn convert_u8(
        &mut self,
        src: &Tensor<u8>,
        dst: &mut Tensor<u8>,
        src_fmt: PixelFormat,
        dst_fmt: PixelFormat,
        rotation: Rotation,
        flip: Flip,
        crop: Crop,
    ) -> Result<()> {
        use PixelFormat::*;

        // unwrap: callers (convert_impl) only pass image tensors here.
        let src_w = src.width().unwrap();
        let src_h = src.height().unwrap();
        let dst_w = dst.width().unwrap();
        let dst_h = dst.height().unwrap();

        crop.check_crop_dims(src_w, src_h, dst_w, dst_h)?;

        // Determine intermediate format for the resize step
        let intermediate = match (src_fmt, dst_fmt) {
            (Nv12, Rgb) => Rgb,
            (Nv12, Rgba) => Rgba,
            (Nv12, Grey) => Grey,
            (Nv12, Yuyv) => Rgba,
            (Nv12, Nv16) => Rgba,
            (Nv12, PlanarRgb) => Rgb,
            (Nv12, PlanarRgba) => Rgba,
            (Yuyv, Rgb) => Rgb,
            (Yuyv, Rgba) => Rgba,
            (Yuyv, Grey) => Grey,
            (Yuyv, Yuyv) => Rgba,
            (Yuyv, PlanarRgb) => Rgb,
            (Yuyv, PlanarRgba) => Rgba,
            (Yuyv, Nv16) => Rgba,
            (Vyuy, Rgb) => Rgb,
            (Vyuy, Rgba) => Rgba,
            (Vyuy, Grey) => Grey,
            (Vyuy, Vyuy) => Rgba,
            (Vyuy, PlanarRgb) => Rgb,
            (Vyuy, PlanarRgba) => Rgba,
            (Vyuy, Nv16) => Rgba,
            (Rgba, Rgb) => Rgba,
            (Rgba, Rgba) => Rgba,
            (Rgba, Grey) => Grey,
            (Rgba, Yuyv) => Rgba,
            (Rgba, PlanarRgb) => Rgba,
            (Rgba, PlanarRgba) => Rgba,
            (Rgba, Nv16) => Rgba,
            (Rgb, Rgb) => Rgb,
            (Rgb, Rgba) => Rgb,
            (Rgb, Grey) => Grey,
            (Rgb, Yuyv) => Rgb,
            (Rgb, PlanarRgb) => Rgb,
            (Rgb, PlanarRgba) => Rgb,
            (Rgb, Nv16) => Rgb,
            (Grey, Rgb) => Rgb,
            (Grey, Rgba) => Rgba,
            (Grey, Grey) => Grey,
            (Grey, Yuyv) => Grey,
            (Grey, PlanarRgb) => Grey,
            (Grey, PlanarRgba) => Grey,
            (Grey, Nv16) => Grey,
            (Nv12, Bgra) => Rgba,
            (Yuyv, Bgra) => Rgba,
            (Vyuy, Bgra) => Rgba,
            (Rgba, Bgra) => Rgba,
            (Rgb, Bgra) => Rgb,
            (Grey, Bgra) => Grey,
            (Bgra, Bgra) => Bgra,
            (Nv16, Rgb) => Rgb,
            (Nv16, Rgba) => Rgba,
            (Nv16, Bgra) => Rgba,
            (PlanarRgb, Rgb) => Rgb,
            (PlanarRgb, Rgba) => Rgb,
            (PlanarRgb, Bgra) => Rgb,
            (PlanarRgba, Rgb) => Rgba,
            (PlanarRgba, Rgba) => Rgba,
            (PlanarRgba, Bgra) => Rgba,
            (s, d) => {
                return Err(Error::NotSupported(format!("Conversion from {s} to {d}",)));
            }
        };

        // True when any geometric transform is required: rotation, flip,
        // a size change, or a crop rect that is not the full frame.
        let need_resize_flip_rotation = rotation != Rotation::None
            || flip != Flip::None
            || src_w != dst_w
            || src_h != dst_h
            || crop.src_rect.is_some_and(|c| {
                c != Rect {
                    left: 0,
                    top: 0,
                    width: src_w,
                    height: src_h,
                }
            })
            || crop.dst_rect.is_some_and(|c| {
                c != Rect {
                    left: 0,
                    top: 0,
                    width: dst_w,
                    height: dst_h,
                }
            });

        // check if a direct conversion can be done
        if !need_resize_flip_rotation && Self::support_conversion_pf(src_fmt, dst_fmt) {
            return Self::convert_format_pf(src, dst, src_fmt, dst_fmt);
        }

        // any extra checks
        if dst_fmt == Yuyv && !dst_w.is_multiple_of(2) {
            return Err(Error::NotSupported(format!(
                "{} destination must have width divisible by 2",
                dst_fmt,
            )));
        }

        // create tmp buffer
        // `tmp` holds the source converted to the intermediate format (or the
        // source itself when it is already in that format).
        let mut tmp_buffer;
        let tmp;
        let tmp_fmt;
        if intermediate != src_fmt {
            let _s = tracing::trace_span!(
                "cpu_format_convert",
                from = ?src_fmt,
                to = ?intermediate,
                pass = "pre_resize",
            )
            .entered();
            tmp_buffer = Tensor::<u8>::image(src_w, src_h, intermediate, Some(TensorMemory::Mem))?;

            Self::convert_format_pf(src, &mut tmp_buffer, src_fmt, intermediate)?;
            tmp = &tmp_buffer;
            tmp_fmt = intermediate;
        } else {
            tmp = src;
            tmp_fmt = src_fmt;
        }

        // format must be RGB/RGBA/GREY
        // NOTE(review): (Bgra, Bgra) maps to intermediate == Bgra above, so a
        // Bgra→Bgra call that needs resize/flip/rotation would trip this
        // debug_assert in debug builds — confirm whether Bgra should be
        // included here or handled before this point.
        debug_assert!(matches!(tmp_fmt, Rgb | Rgba | Grey));
        if tmp_fmt == dst_fmt {
            // Resize writes directly into dst; no post-conversion needed.
            let _s = tracing::trace_span!("cpu_resize").entered();
            self.resize_flip_rotate_pf(tmp, dst, dst_fmt, rotation, flip, crop)?;
        } else if !need_resize_flip_rotation {
            // Same size, formats differ: one conversion, no resize.
            let _s = tracing::trace_span!(
                "cpu_format_convert",
                from = ?tmp_fmt,
                to = ?dst_fmt,
                pass = "direct",
            )
            .entered();
            Self::convert_format_pf(tmp, dst, tmp_fmt, dst_fmt)?;
        } else {
            // Resize into a second temp buffer, then convert into dst.
            let mut tmp2 = Tensor::<u8>::image(dst_w, dst_h, tmp_fmt, Some(TensorMemory::Mem))?;
            // Partial dst rect with no fill color: preserve the existing dst
            // content outside the rect by converting dst back into tmp2 first.
            if crop.dst_rect.is_some_and(|c| {
                c != Rect {
                    left: 0,
                    top: 0,
                    width: dst_w,
                    height: dst_h,
                }
            }) && crop.dst_color.is_none()
            {
                Self::convert_format_pf(dst, &mut tmp2, dst_fmt, tmp_fmt)?;
            }
            {
                let _s = tracing::trace_span!("cpu_resize").entered();
                self.resize_flip_rotate_pf(tmp, &mut tmp2, tmp_fmt, rotation, flip, crop)?;
            }
            {
                let _s = tracing::trace_span!(
                    "cpu_format_convert",
                    from = ?tmp_fmt,
                    to = ?dst_fmt,
                    pass = "post_resize",
                )
                .entered();
                Self::convert_format_pf(&tmp2, dst, tmp_fmt, dst_fmt)?;
            }
        }
        // When both a partial dst rect and a fill color are given, paint the
        // area outside the rect with that color.
        if let (Some(dst_rect), Some(dst_color)) = (crop.dst_rect, crop.dst_color) {
            let full_rect = Rect {
                left: 0,
                top: 0,
                width: dst_w,
                height: dst_h,
            };
            if dst_rect != full_rect {
                Self::fill_image_outside_crop_u8(dst, dst_color, dst_rect)?;
            }
        }

        Ok(())
    }
662
    /// Render detection boxes plus (optionally) segmentation masks into an
    /// RGB/RGBA `dst` whose base layer has already been written.
    ///
    /// The rendering style is inferred from the segmentation tensor's
    /// channel count: C > 1 → semantic (ModelPack) rendering of
    /// `segmentation[0]` only; C == 1 → per-instance (YOLO) masks, paired
    /// 1:1 with `detect`.
    fn draw_decoded_masks_impl(
        &mut self,
        dst: &mut Tensor<u8>,
        detect: &[DetectBox],
        segmentation: &[Segmentation],
        opacity: f32,
        color_mode: crate::ColorMode,
    ) -> Result<()> {
        let dst_fmt = dst.format().ok_or(Error::NotAnImage)?;
        if !matches!(dst_fmt, PixelFormat::Rgba | PixelFormat::Rgb) {
            return Err(crate::Error::NotSupported(
                "CPU image rendering only supports RGBA or RGB images".to_string(),
            ));
        }

        let _timer = FunctionTimer::new("CPUProcessor::draw_decoded_masks");

        // unwrap: format check above guarantees this is an image tensor.
        let dst_w = dst.width().unwrap();
        let dst_h = dst.height().unwrap();
        let dst_rs = row_stride_for(dst_w, dst_fmt);
        let dst_c = dst_fmt.channels();

        let mut map = dst.map()?;
        let dst_slice = map.as_mut_slice();

        // Boxes are always drawn, with or without segmentation masks.
        self.render_box(dst_w, dst_h, dst_rs, dst_c, dst_slice, detect, color_mode)?;

        if segmentation.is_empty() {
            return Ok(());
        }

        // Semantic segmentation (e.g. ModelPack) has C > 1 (multi-class),
        // instance segmentation (e.g. YOLO) has C = 1 (binary per-instance).
        let is_semantic = segmentation[0].segmentation.shape()[2] > 1;

        if is_semantic {
            self.render_modelpack_segmentation(
                dst_w,
                dst_h,
                dst_rs,
                dst_c,
                dst_slice,
                &segmentation[0],
                opacity,
            )?;
        } else {
            // One mask per detection; colored by index/label per color_mode.
            for (idx, (seg, det)) in segmentation.iter().zip(detect).enumerate() {
                let color_index = color_mode.index(idx, det.label);
                self.render_yolo_segmentation(
                    dst_w,
                    dst_h,
                    dst_rs,
                    dst_c,
                    dst_slice,
                    seg,
                    color_index,
                    opacity,
                )?;
            }
        }

        Ok(())
    }
726
727    fn draw_proto_masks_impl(
728        &mut self,
729        dst: &mut Tensor<u8>,
730        detect: &[DetectBox],
731        proto_data: &ProtoData,
732        opacity: f32,
733        letterbox: Option<[f32; 4]>,
734        color_mode: crate::ColorMode,
735    ) -> Result<()> {
736        let dst_fmt = dst.format().ok_or(Error::NotAnImage)?;
737        if !matches!(dst_fmt, PixelFormat::Rgba | PixelFormat::Rgb) {
738            return Err(crate::Error::NotSupported(
739                "CPU image rendering only supports RGBA or RGB images".to_string(),
740            ));
741        }
742
743        let _timer = FunctionTimer::new("CPUProcessor::draw_proto_masks");
744
745        let dst_w = dst.width().unwrap();
746        let dst_h = dst.height().unwrap();
747        let dst_rs = row_stride_for(dst_w, dst_fmt);
748        let channels = dst_fmt.channels();
749
750        let mut map = dst.map()?;
751        let dst_slice = map.as_mut_slice();
752
753        self.render_box(
754            dst_w, dst_h, dst_rs, channels, dst_slice, detect, color_mode,
755        )?;
756
757        if detect.is_empty() {
758            return Ok(());
759        }
760        let proto_shape = proto_data.protos.shape();
761        if proto_shape.len() != 3 {
762            return Err(Error::InvalidShape(format!(
763                "protos tensor must be rank-3, got {proto_shape:?}"
764            )));
765        }
766        let proto_h = proto_shape[0];
767        let proto_w = proto_shape[1];
768        let num_protos = proto_shape[2];
769        let coeff_shape = proto_data.mask_coefficients.shape();
770        if coeff_shape.len() != 2 {
771            return Err(Error::InvalidShape(format!(
772                "mask_coefficients tensor must be rank-2, got {coeff_shape:?}"
773            )));
774        }
775        // Genuine "no detections this frame" → nothing to render.
776        if coeff_shape[0] == 0 {
777            return Ok(());
778        }
779        if coeff_shape[1] != num_protos {
780            return Err(Error::InvalidShape(format!(
781                "mask_coefficients second dimension must match num_protos \
782                 ({num_protos}), got {coeff_shape:?}"
783            )));
784        }
785
786        // Widen coefficients to f32 once; shape [N, num_protos].
787        let coeff_f32: Vec<f32> = match proto_data.mask_coefficients.dtype() {
788            DType::F32 => {
789                let t = proto_data.mask_coefficients.as_f32().expect("F32");
790                let m = t.map()?;
791                m.as_slice().to_vec()
792            }
793            DType::F16 => {
794                let t = proto_data.mask_coefficients.as_f16().expect("F16");
795                let m = t.map()?;
796                m.as_slice().iter().map(|v| v.to_f32()).collect()
797            }
798            DType::I8 => {
799                let t = proto_data.mask_coefficients.as_i8().expect("I8");
800                let m = t.map()?;
801                if let Some(q) = t.quantization() {
802                    use edgefirst_tensor::QuantMode;
803                    let (scale, zp) = match q.mode() {
804                        QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
805                        QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
806                        other => {
807                            return Err(Error::NotSupported(format!(
808                                "I8 mask_coefficients quantization mode {other:?} not supported"
809                            )));
810                        }
811                    };
812                    m.as_slice()
813                        .iter()
814                        .map(|&v| (v as f32 - zp) * scale)
815                        .collect()
816                } else {
817                    m.as_slice().iter().map(|&v| v as f32).collect()
818                }
819            }
820            other => {
821                return Err(Error::InvalidShape(format!(
822                    "mask_coefficients dtype {other:?} not supported"
823                )));
824            }
825        };
826
827        // Precompute letterbox scale/offset for output-pixel → proto-pixel mapping.
828        let (lx0, lx_range, ly0, ly_range) = match letterbox {
829            Some([lx0, ly0, lx1, ly1]) => (lx0, lx1 - lx0, ly0, ly1 - ly0),
830            None => (0.0_f32, 1.0_f32, 0.0_f32, 1.0_f32),
831        };
832
833        // Per-dtype dispatch. Map protos once, call the inner draw loop
834        // with a dtype-specialized loader closure.
835        match proto_data.protos.dtype() {
836            DType::F32 => {
837                let t = proto_data.protos.as_f32().expect("F32");
838                let m = t.map()?;
839                self.draw_proto_masks_inner(
840                    dst_slice,
841                    dst_w,
842                    dst_h,
843                    dst_rs,
844                    channels,
845                    detect,
846                    m.as_slice(),
847                    &coeff_f32,
848                    proto_h,
849                    proto_w,
850                    num_protos,
851                    opacity,
852                    (lx0, lx_range, ly0, ly_range),
853                    color_mode,
854                    0.0_f32,
855                    |p: &f32, _| *p,
856                );
857            }
858            DType::F16 => {
859                let t = proto_data.protos.as_f16().expect("F16");
860                let m = t.map()?;
861                self.draw_proto_masks_inner(
862                    dst_slice,
863                    dst_w,
864                    dst_h,
865                    dst_rs,
866                    channels,
867                    detect,
868                    m.as_slice(),
869                    &coeff_f32,
870                    proto_h,
871                    proto_w,
872                    num_protos,
873                    opacity,
874                    (lx0, lx_range, ly0, ly_range),
875                    color_mode,
876                    0.0_f32,
877                    |p: &half::f16, _| p.to_f32(),
878                );
879            }
880            DType::I8 => {
881                use edgefirst_tensor::QuantMode;
882                let t = proto_data.protos.as_i8().expect("I8");
883                let m = t.map()?;
884                let quant = t.quantization().ok_or_else(|| {
885                    Error::InvalidShape("I8 protos require quantization metadata".into())
886                })?;
887                let (scale, zp) = match quant.mode() {
888                    QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
889                    QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
890                    QuantMode::PerChannel { axis, .. }
891                    | QuantMode::PerChannelSymmetric { axis, .. } => {
892                        return Err(Error::NotSupported(format!(
893                            "per-channel quantization (axis={axis}) in draw_proto_masks \
894                             CPU path not yet supported"
895                        )));
896                    }
897                };
898                self.draw_proto_masks_inner(
899                    dst_slice,
900                    dst_w,
901                    dst_h,
902                    dst_rs,
903                    channels,
904                    detect,
905                    m.as_slice(),
906                    &coeff_f32,
907                    proto_h,
908                    proto_w,
909                    num_protos,
910                    opacity,
911                    (lx0, lx_range, ly0, ly_range),
912                    color_mode,
913                    scale,
914                    move |p: &i8, _| (*p as f32) - zp,
915                );
916            }
917            other => {
918                return Err(Error::InvalidShape(format!(
919                    "proto tensor dtype {other:?} not supported"
920                )));
921            }
922        }
923
924        Ok(())
925    }
926
927    #[allow(clippy::too_many_arguments)]
928    fn draw_proto_masks_inner<P: Copy>(
929        &self,
930        dst_slice: &mut [u8],
931        dst_w: usize,
932        dst_h: usize,
933        dst_rs: usize,
934        channels: usize,
935        detect: &[DetectBox],
936        protos: &[P],
937        coeff_all_f32: &[f32],
938        proto_h: usize,
939        proto_w: usize,
940        num_protos: usize,
941        opacity: f32,
942        letterbox_xy: (f32, f32, f32, f32),
943        color_mode: crate::ColorMode,
944        acc_scale: f32,
945        load_f32: impl Fn(&P, f32) -> f32 + Copy,
946    ) {
947        let (lx0, lx_range, ly0, ly_range) = letterbox_xy;
948        let stride_y = proto_w * num_protos;
949        for (idx, det) in detect.iter().enumerate() {
950            let coeff = &coeff_all_f32[idx * num_protos..(idx + 1) * num_protos];
951            let color_index = color_mode.index(idx, det.label);
952            let color = self.colors[color_index % self.colors.len()];
953            let alpha = if opacity == 1.0 {
954                color[3] as u16
955            } else {
956                (color[3] as f32 * opacity).round() as u16
957            };
958
959            let start_x = (dst_w as f32 * det.bbox.xmin).round() as usize;
960            let start_y = (dst_h as f32 * det.bbox.ymin).round() as usize;
961            let end_x = ((dst_w as f32 * det.bbox.xmax).round() as usize).min(dst_w);
962            let end_y = ((dst_h as f32 * det.bbox.ymax).round() as usize).min(dst_h);
963
964            for y in start_y..end_y {
965                for x in start_x..end_x {
966                    let px = (lx0 + (x as f32 / dst_w as f32) * lx_range) * proto_w as f32 - 0.5;
967                    let py = (ly0 + (y as f32 / dst_h as f32) * ly_range) * proto_h as f32 - 0.5;
968
969                    // Bilinear interpolation with per-load widening. Inline
970                    // bilinear-sample since bilinear_dot_slice takes a
971                    // different closure shape (no `zp` arg).
972                    let x0 = (px.floor() as isize).clamp(0, proto_w as isize - 1) as usize;
973                    let y0 = (py.floor() as isize).clamp(0, proto_h as isize - 1) as usize;
974                    let x1 = (x0 + 1).min(proto_w - 1);
975                    let y1 = (y0 + 1).min(proto_h - 1);
976                    let fx = px - px.floor();
977                    let fy = py - py.floor();
978                    let w00 = (1.0 - fx) * (1.0 - fy);
979                    let w10 = fx * (1.0 - fy);
980                    let w01 = (1.0 - fx) * fy;
981                    let w11 = fx * fy;
982                    let b00 = y0 * stride_y + x0 * num_protos;
983                    let b10 = y0 * stride_y + x1 * num_protos;
984                    let b01 = y1 * stride_y + x0 * num_protos;
985                    let b11 = y1 * stride_y + x1 * num_protos;
986                    let mut acc = 0.0_f32;
987                    for p in 0..num_protos {
988                        let v00 = load_f32(&protos[b00 + p], 0.0);
989                        let v10 = load_f32(&protos[b10 + p], 0.0);
990                        let v01 = load_f32(&protos[b01 + p], 0.0);
991                        let v11 = load_f32(&protos[b11 + p], 0.0);
992                        let val = w00 * v00 + w10 * v10 + w01 * v01 + w11 * v11;
993                        acc += coeff[p] * val;
994                    }
995                    let final_acc = if acc_scale == 0.0 {
996                        acc
997                    } else {
998                        acc_scale * acc
999                    };
1000                    // Pass-through: acc_scale=0.0 means "no scaling" (f32/f16
1001                    // native); non-zero means "apply scale once" (i8 with
1002                    // per-tensor quant).
1003                    let mask = 1.0 / (1.0 + (-final_acc).exp());
1004                    if mask < 0.5 {
1005                        continue;
1006                    }
1007                    let dst_index = y * dst_rs + x * channels;
1008                    for c in 0..3 {
1009                        dst_slice[dst_index + c] = ((color[c] as u16 * alpha
1010                            + dst_slice[dst_index + c] as u16 * (255 - alpha))
1011                            / 255) as u8;
1012                    }
1013                }
1014            }
1015        }
1016    }
1017}