Skip to main content

edgefirst_image/
opengl_headless.rs

1// SPDX-FileCopyrightText: Copyright 2025 Au-Zone Technologies
2// SPDX-License-Identifier: Apache-2.0
3
4#![cfg(target_os = "linux")]
5#![cfg(feature = "opengl")]
6
7use edgefirst_decoder::{DetectBox, ProtoData, ProtoTensor, Segmentation};
8use edgefirst_tensor::{TensorMemory, TensorTrait};
9use four_char_code::FourCharCode;
10use gbm::{
11    drm::{buffer::DrmFourcc, control::Device as DrmControlDevice, Device as DrmDevice},
12    AsRaw, Device,
13};
14use khronos_egl::{self as egl, Attrib, Display, Dynamic, Instance, EGL1_4};
15use log::{debug, error};
16use std::{
17    collections::BTreeSet,
18    ffi::{c_char, c_void, CStr, CString},
19    mem::ManuallyDrop,
20    os::fd::AsRawFd,
21    ptr::{null, null_mut, NonNull},
22    rc::Rc,
23    str::FromStr,
24    sync::OnceLock,
25    thread::JoinHandle,
26    time::Instant,
27};
28use tokio::sync::mpsc::{Sender, WeakSender};
29
/// Expands to the bare name of the function in which the macro is invoked
/// (module path stripped), for use in log/trace messages.
macro_rules! function {
    () => {{
        fn f() {}
        fn type_name_of<T>(_: T) -> &'static str {
            std::any::type_name::<T>()
        }
        // `type_name_of(f)` yields "path::to::current_fn::f"; drop the
        // trailing "::f", then everything up to the last path separator.
        let full = type_name_of(f);
        let trimmed = &full[..full.len() - 3];
        match trimmed.rfind(':') {
            Some(pos) => &trimmed[pos + 1..],
            None => trimmed,
        }
    }};
}
45
46use crate::{
47    fourcc_is_int8, fourcc_is_packed_rgb, CPUProcessor, Crop, Error, Flip, ImageProcessorTrait,
48    MaskRegion, Rect, Rotation, TensorImage, TensorImageRef, BGRA, DEFAULT_COLORS, GREY, NV12,
49    PLANAR_RGB, PLANAR_RGBA, PLANAR_RGB_INT8, RGB, RGBA, RGB_INT8, VYUY, YUYV,
50};
51
52/// Identifies the type of EGL display used for headless OpenGL ES rendering.
53///
54/// The HAL creates a surfaceless GLES 3.0 context
55/// (`EGL_KHR_surfaceless_context` + `EGL_KHR_no_config_context`) and
56/// renders exclusively through FBOs backed by EGLImages imported from
57/// DMA-buf file descriptors. No window or PBuffer surface is created.
58///
59/// Displays are probed in priority order: PlatformDevice first (zero
60/// external dependencies), then GBM, then Default. Use
61/// [`probe_egl_displays`] to discover which are available and
62/// [`ImageProcessorConfig::egl_display`](crate::ImageProcessorConfig::egl_display)
63/// to override the auto-detection.
64///
65/// # Display Types
66///
67/// - **`PlatformDevice`** — Uses `EGL_EXT_device_enumeration` to query
68///   available EGL devices via `eglQueryDevicesEXT`, then selects the first
69///   device with `eglGetPlatformDisplay(EGL_EXT_platform_device, ...)`.
70///   Headless and compositor-free with zero external library dependencies.
71///   Works on NVIDIA GPUs and newer Vivante drivers.
72///
73/// - **`Gbm`** — Opens a DRM render node (e.g. `/dev/dri/renderD128`) and
74///   creates a GBM (Generic Buffer Manager) device, then calls
75///   `eglGetPlatformDisplay(EGL_PLATFORM_GBM_KHR, gbm_device)`. Requires
76///   `libgbm` and a DRM render node. Needed on ARM Mali (i.MX95) and older
77///   Vivante drivers that do not expose `EGL_EXT_platform_device`.
78///
79/// - **`Default`** — Calls `eglGetDisplay(EGL_DEFAULT_DISPLAY)`, letting the
80///   EGL implementation choose the display. On Wayland systems this connects
81///   to the compositor; on X11 it connects to the X server. May block on
82///   headless systems where a compositor is expected but not running.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EglDisplayKind {
    /// GBM device opened on a DRM render node.
    Gbm,
    /// Display selected through `EGL_EXT_platform_device`.
    PlatformDevice,
    /// `eglGetDisplay(EGL_DEFAULT_DISPLAY)`.
    Default,
}

impl std::fmt::Display for EglDisplayKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // One stable label per variant, matching the names used in logs.
        let label = match self {
            Self::Gbm => "GBM",
            Self::PlatformDevice => "PlatformDevice",
            Self::Default => "Default",
        };
        f.write_str(label)
    }
}
99
/// A validated, available EGL display discovered by [`probe_egl_displays`].
///
/// Entries are produced in probe priority order, so the first element is
/// the display the auto-detection would select.
#[derive(Debug, Clone)]
pub struct EglDisplayInfo {
    /// The type of EGL display.
    pub kind: EglDisplayKind,
    /// Human-readable description for logging/diagnostics
    /// (e.g. "GBM via /dev/dri/renderD128").
    pub description: String,
}
109
/// EGL library handle. Intentionally leaked (never dlclose'd) to avoid SIGBUS
/// on process exit: GPU drivers may keep internal state that outlives explicit
/// EGL cleanup, and dlclose can unmap memory still referenced by the driver.
/// Populated lazily by [`get_egl_lib`] on first use.
static EGL_LIB: OnceLock<&'static libloading::Library> = OnceLock::new();
114
115fn get_egl_lib() -> Result<&'static libloading::Library, crate::Error> {
116    if let Some(egl) = EGL_LIB.get() {
117        Ok(egl)
118    } else {
119        let egl = unsafe { libloading::Library::new("libEGL.so.1")? };
120        // Leak the library to prevent dlclose on process exit
121        let egl: &'static libloading::Library = Box::leak(Box::new(egl));
122        Ok(EGL_LIB.get_or_init(|| egl))
123    }
124}
125
126type Egl = Instance<Dynamic<&'static libloading::Library, EGL1_4>>;
127
128/// Check whether an EGL display supports the surfaceless + no-config context
129/// extensions required by the HAL's FBO-based rendering pipeline.
130///
131/// Queries `eglQueryString(display, EGL_EXTENSIONS)` and checks for
132/// `EGL_KHR_surfaceless_context` and `EGL_KHR_no_config_context`.
133fn probe_display_extensions(egl: &Egl, display: egl::Display) -> bool {
134    let Ok(ext_str) = egl.query_string(Some(display), egl::EXTENSIONS) else {
135        return false;
136    };
137    let exts = ext_str.to_string_lossy();
138
139    let required = ["EGL_KHR_surfaceless_context", "EGL_KHR_no_config_context"];
140
141    for r in &required {
142        if !exts.contains(r) {
143            log::debug!("Display missing required extension: {r}");
144            return false;
145        }
146    }
147
148    egl.bind_api(egl::OPENGL_ES_API).is_ok()
149}
150
151/// Probe for available EGL displays supporting headless OpenGL ES 3.0.
152///
153/// Returns validated displays in priority order (PlatformDevice, GBM,
154/// Default). Each display is validated with `eglInitialize` + extension
155/// checks for `EGL_KHR_surfaceless_context` and `EGL_KHR_no_config_context`.
156/// Probed state is cleaned up with `eglTerminate` — no EGL resources are
157/// left alive.
158///
159/// An empty list means OpenGL is not available on this system.
160///
161/// # Errors
162///
163/// Returns an error only if `libEGL.so.1` cannot be loaded. Individual
164/// display probe failures are silently skipped.
165pub fn probe_egl_displays() -> Result<Vec<EglDisplayInfo>, Error> {
166    let egl: Egl = unsafe { Instance::<Dynamic<_, EGL1_4>>::load_required_from(get_egl_lib()?)? };
167
168    let mut results = Vec::new();
169
170    // PlatformDevice first (zero external deps, works on NVIDIA + newer Vivante)
171    if let Ok(display_type) = GlContext::egl_get_platform_display_from_device(&egl) {
172        let display = display_type.as_display();
173        if egl.initialize(display).is_ok() {
174            if probe_display_extensions(&egl, display) {
175                results.push(EglDisplayInfo {
176                    kind: EglDisplayKind::PlatformDevice,
177                    description: "EGL platform device via EGL_EXT_device_enumeration".to_string(),
178                });
179            }
180            let _ = egl.terminate(display);
181        }
182    }
183
184    // GBM second (needed for Mali + old Vivante)
185    if let Ok(display_type) = GlContext::egl_get_gbm_display(&egl) {
186        let display = display_type.as_display();
187        if egl.initialize(display).is_ok() {
188            if probe_display_extensions(&egl, display) {
189                results.push(EglDisplayInfo {
190                    kind: EglDisplayKind::Gbm,
191                    description: "GBM via /dev/dri/renderD128".to_string(),
192                });
193            }
194            let _ = egl.terminate(display);
195        }
196    }
197
198    // Default last (needs compositor)
199    if let Ok(display_type) = GlContext::egl_get_default_display(&egl) {
200        let display = display_type.as_display();
201        if egl.initialize(display).is_ok() {
202            if probe_display_extensions(&egl, display) {
203                results.push(EglDisplayInfo {
204                    kind: EglDisplayKind::Default,
205                    description: "EGL default display".to_string(),
206                });
207            }
208            let _ = egl.terminate(display);
209        }
210    }
211
212    Ok(results)
213}
214
/// Tracks which data-transfer method is active for moving pixels
/// between CPU memory and GPU textures/framebuffers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum TransferBackend {
    /// Zero-copy via EGLImage imported from DMA-buf file descriptors.
    /// Available on i.MX8 (Vivante), i.MX95 (Mali), Jetson, and any
    /// platform where `EGL_EXT_image_dma_buf_import` is present AND
    /// the GPU can actually render through DMA-buf-backed textures.
    DmaBuf,

    /// GPU buffer via Pixel Buffer Object. Used when DMA-buf is unavailable
    /// but OpenGL is present. Data stays in GPU-accessible memory.
    Pbo,

    /// Synchronous `glTexSubImage2D` upload + `glReadnPixels` readback.
    /// Used when DMA-buf is unavailable or when the DMA-buf verification
    /// probe fails (e.g. NVIDIA discrete GPUs where EGLImage creation
    /// succeeds but rendered data is all zeros).
    Sync,
}

impl TransferBackend {
    /// Returns `true` if DMA-buf zero-copy is available.
    pub(crate) fn is_dma(self) -> bool {
        matches!(self, Self::DmaBuf)
    }

    /// Returns `true` if PBO transfer is active.
    #[allow(dead_code)]
    pub(crate) fn is_pbo(self) -> bool {
        matches!(self, Self::Pbo)
    }
}
248
/// Owns the EGL display, context, and dynamically-loaded EGL API used by
/// the GL rendering thread.
pub(crate) struct GlContext {
    /// Pixel-transfer method negotiated when the context was created.
    pub(crate) transfer_backend: TransferBackend,
    /// The EGL display, together with any backing device (e.g. GBM) that
    /// must be kept alive for it.
    pub(crate) display: EglDisplayType,
    /// The surfaceless GLES context.
    pub(crate) ctx: egl::Context,
    /// Wrapped in ManuallyDrop because the khronos-egl Dynamic instance's
    /// Drop calls eglReleaseThread() which can panic during process shutdown
    /// if the EGL library has been partially unloaded. It is intentionally
    /// never dropped — see the `Drop` impl for `GlContext`.
    pub(crate) egl: ManuallyDrop<Rc<Egl>>,
}
259
260pub(crate) enum EglDisplayType {
261    Default(egl::Display),
262    Gbm(egl::Display, #[allow(dead_code)] Device<Card>),
263    PlatformDisplay(egl::Display),
264}
265
266impl EglDisplayType {
267    fn as_display(&self) -> egl::Display {
268        match self {
269            EglDisplayType::Default(disp) => *disp,
270            EglDisplayType::Gbm(disp, _) => *disp,
271            EglDisplayType::PlatformDisplay(disp) => *disp,
272        }
273    }
274}
275
276impl GlContext {
277    pub(crate) fn new(kind: Option<EglDisplayKind>) -> Result<GlContext, crate::Error> {
278        // Create an EGL API instance.
279        let egl: Rc<Egl> =
280            Rc::new(unsafe { Instance::<Dynamic<_, EGL1_4>>::load_required_from(get_egl_lib()?)? });
281
282        if let Some(kind) = kind {
283            // Specific display type requested — try only that one.
284            let display_fn = match kind {
285                EglDisplayKind::Gbm => Self::egl_get_gbm_display as fn(&Egl) -> _,
286                EglDisplayKind::PlatformDevice => Self::egl_get_platform_display_from_device,
287                EglDisplayKind::Default => Self::egl_get_default_display,
288            };
289            return Self::try_initialize_egl(egl, display_fn).map_err(|e| {
290                log::debug!("Failed to initialize EGL with {kind} display: {e:?}");
291                e
292            });
293        }
294
295        // Try PlatformDevice first (zero external deps, works on NVIDIA + newer Vivante)
296        if let Ok(headless) =
297            Self::try_initialize_egl(egl.clone(), Self::egl_get_platform_display_from_device)
298        {
299            return Ok(headless);
300        } else {
301            log::debug!("Didn't initialize EGL with platform display from device enumeration");
302        }
303
304        // GBM second (needed for Mali + old Vivante that lack EGL_EXT_platform_device)
305        if let Ok(headless) = Self::try_initialize_egl(egl.clone(), Self::egl_get_gbm_display) {
306            return Ok(headless);
307        } else {
308            log::debug!("Didn't initialize EGL with GBM Display");
309        }
310
311        // Default display last (needs compositor)
312        if let Ok(headless) = Self::try_initialize_egl(egl.clone(), Self::egl_get_default_display) {
313            return Ok(headless);
314        } else {
315            log::debug!("Didn't initialize EGL with Default Display");
316        }
317
318        Err(Error::OpenGl(
319            "Could not initialize EGL with any known method".to_string(),
320        ))
321    }
322
323    fn try_initialize_egl(
324        egl: Rc<Egl>,
325        display_fn: impl Fn(&Egl) -> Result<EglDisplayType, crate::Error>,
326    ) -> Result<GlContext, crate::Error> {
327        let display = display_fn(&egl)?;
328        log::debug!("egl initialize with display: {:x?}", display.as_display());
329        egl.initialize(display.as_display())?;
330
331        // Verify required extensions for surfaceless + no-config context
332        let ext_str = egl.query_string(Some(display.as_display()), egl::EXTENSIONS)?;
333        let exts = ext_str.to_string_lossy();
334
335        if !exts.contains("EGL_KHR_surfaceless_context") {
336            return Err(crate::Error::GLVersion(
337                "EGL display does not support EGL_KHR_surfaceless_context".to_string(),
338            ));
339        }
340
341        if !exts.contains("EGL_KHR_no_config_context") {
342            return Err(crate::Error::GLVersion(
343                "EGL display does not support EGL_KHR_no_config_context".to_string(),
344            ));
345        }
346
347        egl.bind_api(egl::OPENGL_ES_API)?;
348
349        // No-config context: pass EGL_NO_CONFIG_KHR (null) instead of a
350        // real config. The context is not bound to any specific framebuffer
351        // format — it works with any FBO attachment format.
352        let context_attributes = [egl::CONTEXT_MAJOR_VERSION, 3, egl::NONE, egl::NONE];
353        let ctx = egl.create_context(
354            display.as_display(),
355            egl_ext::NO_CONFIG_KHR,
356            None,
357            &context_attributes,
358        )?;
359        debug!("ctx: {ctx:?}");
360
361        // Surfaceless context: no PBuffer surface needed. All rendering
362        // goes through FBOs backed by EGLImages.
363        egl.make_current(display.as_display(), None, None, Some(ctx))?;
364
365        let has_dma_extensions = Self::egl_check_support_dma(&egl).is_ok();
366        let transfer_backend = if has_dma_extensions {
367            TransferBackend::DmaBuf
368        } else {
369            TransferBackend::Sync
370        };
371        Ok(GlContext {
372            display,
373            ctx,
374            egl: ManuallyDrop::new(egl),
375            transfer_backend,
376        })
377    }
378
379    fn egl_get_default_display(egl: &Egl) -> Result<EglDisplayType, crate::Error> {
380        // get the default display
381        if let Some(display) = unsafe { egl.get_display(egl::DEFAULT_DISPLAY) } {
382            debug!("default display: {display:?}");
383            return Ok(EglDisplayType::Default(display));
384        }
385
386        Err(Error::OpenGl(
387            "Could not obtain EGL Default Display".to_string(),
388        ))
389    }
390
391    fn egl_get_gbm_display(egl: &Egl) -> Result<EglDisplayType, crate::Error> {
392        // init a GBM device
393        let gbm = Device::new(Card::open_global()?)?;
394
395        debug!("gbm: {gbm:?}");
396        let display = Self::egl_get_platform_display_with_fallback(
397            egl,
398            egl_ext::PLATFORM_GBM_KHR,
399            gbm.as_raw() as *mut c_void,
400            &[egl::ATTRIB_NONE],
401        )?;
402
403        Ok(EglDisplayType::Gbm(display, gbm))
404    }
405
406    fn egl_get_platform_display_from_device(egl: &Egl) -> Result<EglDisplayType, crate::Error> {
407        let extensions = egl.query_string(None, egl::EXTENSIONS)?;
408        let extensions = extensions.to_string_lossy();
409        log::debug!("EGL Extensions: {}", extensions);
410
411        if !extensions.contains("EGL_EXT_device_enumeration") {
412            return Err(Error::GLVersion(
413                "EGL doesn't supported EGL_EXT_device_enumeration extension".to_string(),
414            ));
415        }
416
417        type EGLDeviceEXT = *mut c_void;
418        let devices = if let Some(ext) = egl.get_proc_address("eglQueryDevicesEXT") {
419            let func: unsafe extern "system" fn(
420                max_devices: egl::Int,
421                devices: *mut EGLDeviceEXT,
422                num_devices: *mut egl::Int,
423            ) -> *const c_char = unsafe { std::mem::transmute(ext) };
424            let mut devices = [std::ptr::null_mut(); 10];
425            let mut num_devices = 0;
426            unsafe { func(devices.len() as i32, devices.as_mut_ptr(), &mut num_devices) };
427            for i in 0..num_devices {
428                log::debug!("EGL device: {:?}", devices[i as usize]);
429            }
430            devices[0..num_devices as usize].to_vec()
431        } else {
432            return Err(Error::GLVersion(
433                "EGL doesn't supported eglQueryDevicesEXT function".to_string(),
434            ));
435        };
436
437        if !extensions.contains("EGL_EXT_platform_device") {
438            return Err(Error::GLVersion(
439                "EGL doesn't supported EGL_EXT_platform_device extension".to_string(),
440            ));
441        }
442
443        if devices.is_empty() {
444            return Err(Error::GLVersion(
445                "EGL_EXT_device_enumeration returned 0 devices".to_string(),
446            ));
447        }
448        let disp = Self::egl_get_platform_display_with_fallback(
449            egl,
450            egl_ext::PLATFORM_DEVICE_EXT,
451            devices[0],
452            &[egl::ATTRIB_NONE],
453        )?;
454        Ok(EglDisplayType::PlatformDisplay(disp))
455    }
456
457    fn egl_check_support_dma(egl: &Egl) -> Result<(), crate::Error> {
458        let extensions = egl.query_string(None, egl::EXTENSIONS)?;
459        let extensions = extensions.to_string_lossy();
460        log::debug!("EGL Extensions: {}", extensions);
461
462        if egl.upcast::<egl::EGL1_5>().is_some() {
463            return Ok(());
464        }
465
466        if !extensions.contains("EGL_EXT_image_dma_buf_import") {
467            return Err(crate::Error::GLVersion(
468                "EGL does not support EGL_EXT_image_dma_buf_import extension".to_string(),
469            ));
470        }
471
472        if egl.get_proc_address("eglCreateImageKHR").is_none() {
473            return Err(crate::Error::GLVersion(
474                "EGL does not support eglCreateImageKHR function".to_string(),
475            ));
476        }
477
478        if egl.get_proc_address("eglDestroyImageKHR").is_none() {
479            return Err(crate::Error::GLVersion(
480                "EGL does not support eglDestroyImageKHR function".to_string(),
481            ));
482        }
483        Ok(())
484    }
485
486    fn egl_get_platform_display_with_fallback(
487        egl: &Egl,
488        platform: egl::Enum,
489        native_display: *mut c_void,
490        attrib_list: &[Attrib],
491    ) -> Result<Display, Error> {
492        if let Some(egl) = egl.upcast::<egl::EGL1_5>() {
493            unsafe { egl.get_platform_display(platform, native_display, attrib_list) }
494                .map_err(|e| e.into())
495        } else if let Some(ext) = egl.get_proc_address("eglGetPlatformDisplayEXT") {
496            let func: unsafe extern "system" fn(
497                platform: egl::Enum,
498                native_display: *mut c_void,
499                attrib_list: *const Attrib,
500            ) -> egl::EGLDisplay = unsafe { std::mem::transmute(ext) };
501            let disp = unsafe { func(platform, native_display, attrib_list.as_ptr()) };
502            if disp != egl::NO_DISPLAY {
503                Ok(unsafe { Display::from_ptr(disp) })
504            } else {
505                Err(egl.get_error().map(|e| e.into()).unwrap_or(Error::Internal(
506                    "EGL failed but no error was reported".to_owned(),
507                )))
508            }
509        } else {
510            Err(Error::EGLLoad(egl::LoadError::InvalidVersion {
511                provided: egl.version(),
512                required: khronos_egl::Version::EGL1_5,
513            }))
514        }
515    }
516
517    fn egl_create_image_with_fallback(
518        egl: &Egl,
519        display: Display,
520        ctx: egl::Context,
521        target: egl::Enum,
522        buffer: egl::ClientBuffer,
523        attrib_list: &[Attrib],
524    ) -> Result<egl::Image, Error> {
525        if let Some(egl) = egl.upcast::<egl::EGL1_5>() {
526            egl.create_image(display, ctx, target, buffer, attrib_list)
527                .map_err(|e| e.into())
528        } else if let Some(ext) = egl.get_proc_address("eglCreateImageKHR") {
529            log::trace!("eglCreateImageKHR addr: {:?}", ext);
530            let func: unsafe extern "system" fn(
531                display: egl::EGLDisplay,
532                ctx: egl::EGLContext,
533                target: egl::Enum,
534                buffer: egl::EGLClientBuffer,
535                attrib_list: *const egl::Int,
536            ) -> egl::EGLImage = unsafe { std::mem::transmute(ext) };
537            let new_attrib_list = attrib_list
538                .iter()
539                .map(|x| *x as egl::Int)
540                .collect::<Vec<_>>();
541
542            let image = unsafe {
543                func(
544                    display.as_ptr(),
545                    ctx.as_ptr(),
546                    target,
547                    buffer.as_ptr(),
548                    new_attrib_list.as_ptr(),
549                )
550            };
551            if image != egl::NO_IMAGE {
552                Ok(unsafe { egl::Image::from_ptr(image) })
553            } else {
554                Err(egl.get_error().map(|e| e.into()).unwrap_or(Error::Internal(
555                    "EGL failed but no error was reported".to_owned(),
556                )))
557            }
558        } else {
559            Err(Error::EGLLoad(egl::LoadError::InvalidVersion {
560                provided: egl.version(),
561                required: khronos_egl::Version::EGL1_5,
562            }))
563        }
564    }
565
566    fn egl_destroy_image_with_fallback(
567        egl: &Egl,
568        display: Display,
569        image: egl::Image,
570    ) -> Result<(), Error> {
571        if let Some(egl) = egl.upcast::<egl::EGL1_5>() {
572            egl.destroy_image(display, image).map_err(|e| e.into())
573        } else if let Some(ext) = egl.get_proc_address("eglDestroyImageKHR") {
574            let func: unsafe extern "system" fn(
575                display: egl::EGLDisplay,
576                image: egl::EGLImage,
577            ) -> egl::Boolean = unsafe { std::mem::transmute(ext) };
578            let res = unsafe { func(display.as_ptr(), image.as_ptr()) };
579            if res == egl::TRUE {
580                Ok(())
581            } else {
582                Err(egl.get_error().map(|e| e.into()).unwrap_or(Error::Internal(
583                    "EGL failed but no error was reported".to_owned(),
584                )))
585            }
586        } else {
587            Err(Error::EGLLoad(egl::LoadError::InvalidVersion {
588                provided: egl.version(),
589                required: khronos_egl::Version::EGL1_5,
590            }))
591        }
592    }
593}
594
impl Drop for GlContext {
    fn drop(&mut self) {
        // During process shutdown (e.g. Python interpreter exit), the EGL/GL
        // shared libraries may already be partially unloaded, causing panics
        // or heap corruption when calling cleanup functions. We suppress
        // panic output and catch panics to prevent propagation.
        //
        // NOTE(review): take_hook/set_hook swap the process-global panic
        // hook, so a panic on another thread during this window would also
        // be silenced — assumed acceptable for a teardown path; confirm.
        let prev_hook = std::panic::take_hook();
        std::panic::set_hook(Box::new(|_| {}));
        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            // Release the current-thread binding before destroying the context.
            let _ = self
                .egl
                .make_current(self.display.as_display(), None, None, None);

            let _ = self
                .egl
                .destroy_context(self.display.as_display(), self.ctx);

            // eglTerminate is ref-counted per the EGL spec: each eglInitialize
            // increments a counter and each eglTerminate decrements it. The
            // display is only truly torn down when the last reference is
            // released. catch_unwind absorbs any driver-side misbehaviour.
            let _ = self.egl.terminate(self.display.as_display());
        }));
        std::panic::set_hook(prev_hook);

        // The Rc<Egl> (ManuallyDrop) is intentionally NOT dropped. The
        // khronos-egl Dynamic instance's Drop calls eglReleaseThread() which
        // panics if the EGL library has been unloaded (local/x86_64) or
        // causes heap corruption by calling into invalid memory (ARM).
    }
}
626
#[derive(Debug)]
/// A simple wrapper for a device node. Holds the opened file so the
/// descriptor stays valid for the wrapper's lifetime.
pub(crate) struct Card(std::fs::File);
630
/// Implementing `AsFd` is a prerequisite to implementing the traits found
/// in this crate. Here, we are just calling `as_fd()` on the inner File.
impl std::os::unix::io::AsFd for Card {
    /// Borrow the underlying device-node file descriptor.
    fn as_fd(&self) -> std::os::unix::io::BorrowedFd<'_> {
        self.0.as_fd()
    }
}
638
/// With `AsFd` implemented, we can now implement `drm::Device`. Both are
/// marker traits whose provided methods operate through the descriptor.
impl DrmDevice for Card {}
impl DrmControlDevice for Card {}
642
643/// Simple helper methods for opening a `Card`.
644impl Card {
645    pub fn open(path: &str) -> Result<Self, crate::Error> {
646        let mut options = std::fs::OpenOptions::new();
647        options.read(true);
648        options.write(true);
649        let c = options.open(path);
650        match c {
651            Ok(c) => Ok(Card(c)),
652            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
653                Err(Error::NotFound(format!("File not found: {path}")))
654            }
655            Err(e) => Err(e.into()),
656        }
657    }
658
659    pub fn open_global() -> Result<Self, crate::Error> {
660        let targets = ["/dev/dri/renderD128", "/dev/dri/card0", "/dev/dri/card1"];
661        let e = Self::open(targets[0]);
662        if let Ok(t) = e {
663            return Ok(t);
664        }
665        for t in &targets[1..] {
666            if let Ok(t) = Self::open(t) {
667                return Ok(t);
668            }
669        }
670        e
671    }
672}
673
#[derive(Debug, Clone, Copy)]
/// A rectangular region of interest.
///
/// NOTE(review): the f32 fields suggest normalized (0.0–1.0) coordinates
/// rather than pixels — confirm against the use sites before relying on it.
struct RegionOfInterest {
    left: f32,
    top: f32,
    right: f32,
    bottom: f32,
}
681
/// Commands processed by the dedicated GL rendering thread.
///
/// Each request variant carries a oneshot `Sender` used to deliver the
/// result back to the caller; `PboDelete` is the only fire-and-forget one.
#[allow(clippy::type_complexity)]
enum GLProcessorMessage {
    /// Convert `src` into `dst` applying the given rotation, flip, and crop.
    ImageConvert(
        SendablePtr<TensorImage>,
        SendablePtr<TensorImage>,
        Rotation,
        Flip,
        Crop,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Install a new RGBA color table.
    SetColors(
        Vec<[u8; 4]>,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Draw segmentation masks for the given boxes onto the image.
    DrawMasks(
        SendablePtr<TensorImage>,
        SendablePtr<DetectBox>,
        SendablePtr<Segmentation>,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Draw prototype-based masks for the given boxes onto the image.
    DrawMasksProto(
        SendablePtr<TensorImage>,
        SendablePtr<DetectBox>,
        Box<ProtoData>,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Select the interpolation mode used for int8 processing.
    SetInt8Interpolation(
        Int8InterpolationMode,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Decode proto masks into a byte atlas of the requested output size,
    /// replying with the atlas and the per-box regions inside it.
    DecodeMasksAtlas(
        SendablePtr<DetectBox>,
        Box<ProtoData>,
        usize, // output_width
        usize, // output_height
        tokio::sync::oneshot::Sender<Result<(Vec<u8>, Vec<MaskRegion>), Error>>,
    ),
    /// Allocate a PBO of the given size, replying with its buffer id.
    PboCreate(
        usize, // buffer size in bytes
        tokio::sync::oneshot::Sender<Result<u32, Error>>,
    ),
    /// Map a PBO into CPU-visible memory.
    PboMap(
        u32,   // buffer_id
        usize, // size
        tokio::sync::oneshot::Sender<Result<edgefirst_tensor::PboMapping, Error>>,
    ),
    /// Unmap a previously mapped PBO.
    PboUnmap(
        u32, // buffer_id
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Delete a PBO.
    PboDelete(u32), // fire-and-forget, no reply
}
734
/// Implements PboOps by sending commands to the GL thread.
///
/// Uses a `WeakSender` so that PBO images don't keep the GL thread's channel
/// alive. When the `GLProcessorThreaded` is dropped, its `Sender` is the last
/// strong reference — dropping it closes the channel and lets the GL thread
/// exit. PBO operations after that return `PboDisconnected`.
struct GlPboOps {
    // Weak handle to the GL thread's command channel; upgraded per call.
    sender: WeakSender<GLProcessorMessage>,
}
744
745// SAFETY: GlPboOps sends all GL operations to the dedicated GL thread via a
746// channel. `map_buffer` returns a CPU-visible pointer from `glMapBufferRange`
747// that remains valid until `unmap_buffer` calls `glUnmapBuffer` on the GL thread.
748// `delete_buffer` sends a fire-and-forget deletion command to the GL thread.
749unsafe impl edgefirst_tensor::PboOps for GlPboOps {
750    fn map_buffer(
751        &self,
752        buffer_id: u32,
753        size: usize,
754    ) -> edgefirst_tensor::Result<edgefirst_tensor::PboMapping> {
755        let sender = self
756            .sender
757            .upgrade()
758            .ok_or(edgefirst_tensor::Error::PboDisconnected)?;
759        let (tx, rx) = tokio::sync::oneshot::channel();
760        sender
761            .blocking_send(GLProcessorMessage::PboMap(buffer_id, size, tx))
762            .map_err(|_| edgefirst_tensor::Error::PboDisconnected)?;
763        rx.blocking_recv()
764            .map_err(|_| edgefirst_tensor::Error::PboDisconnected)?
765            .map_err(|e| {
766                edgefirst_tensor::Error::NotImplemented(format!("GL PBO map failed: {e:?}"))
767            })
768    }
769
770    fn unmap_buffer(&self, buffer_id: u32) -> edgefirst_tensor::Result<()> {
771        let sender = self
772            .sender
773            .upgrade()
774            .ok_or(edgefirst_tensor::Error::PboDisconnected)?;
775        let (tx, rx) = tokio::sync::oneshot::channel();
776        sender
777            .blocking_send(GLProcessorMessage::PboUnmap(buffer_id, tx))
778            .map_err(|_| edgefirst_tensor::Error::PboDisconnected)?;
779        rx.blocking_recv()
780            .map_err(|_| edgefirst_tensor::Error::PboDisconnected)?
781            .map_err(|e| {
782                edgefirst_tensor::Error::NotImplemented(format!("GL PBO unmap failed: {e:?}"))
783            })
784    }
785
786    fn delete_buffer(&self, buffer_id: u32) {
787        if let Some(sender) = self.sender.upgrade() {
788            let _ = sender.blocking_send(GLProcessorMessage::PboDelete(buffer_id));
789        }
790    }
791}
792
/// OpenGL multi-threaded image converter. The actual conversion is done in a
/// separate rendering thread, as OpenGL contexts are not thread-safe. This can
/// be safely sent between threads. The `convert()` call sends the conversion
/// request to the rendering thread and waits for the result.
#[derive(Debug)]
pub struct GLProcessorThreaded {
    // This is only None when the converter is being dropped.
    handle: Option<JoinHandle<()>>,

    // This is only None when the converter is being dropped.
    sender: Option<Sender<GLProcessorMessage>>,
    // Transfer method reported by the GL thread when it started up.
    transfer_backend: TransferBackend,
    // Whether the GL context reported BGRA support at startup.
    has_bgra: bool,
}
807
// SAFETY: all GL state lives on the render thread; this type only holds the
// JoinHandle and the mpsc Sender, and every request blocks on a oneshot reply
// before releasing its borrows. NOTE(review): assumes GLProcessorMessage
// carries no thread-affine state besides the SendablePtr payloads — confirm.
unsafe impl Send for GLProcessorThreaded {}
unsafe impl Sync for GLProcessorThreaded {}
810
/// A raw pointer + element count that can cross the channel to the GL render
/// thread. Used to pass borrowed references/slices without cloning; the
/// sending side must keep the pointee alive until the render thread signals
/// completion on the request's oneshot channel (see the SAFETY comments at
/// the use sites in `GLProcessorThreaded::new`).
struct SendablePtr<T: Send> {
    // Non-null pointer to the value (len == 1) or first slice element.
    ptr: NonNull<T>,
    // Number of elements reachable from `ptr`.
    len: usize,
}

// SAFETY: T is Send, and the messaging protocol serializes access — the
// sender blocks on the oneshot reply while the render thread dereferences
// the pointer, so the two threads never access the pointee concurrently.
unsafe impl<T> Send for SendablePtr<T> where T: Send {}
817
impl GLProcessorThreaded {
    /// Creates a new OpenGL multi-threaded image converter.
    ///
    /// Spawns a dedicated OS thread that owns the (thread-affine) GL context
    /// and services `GLProcessorMessage`s until the request channel closes.
    /// Blocks until the thread reports whether context creation succeeded,
    /// and records the negotiated transfer backend and BGRA capability.
    pub fn new(kind: Option<EglDisplayKind>) -> Result<Self, Error> {
        // Capacity 1 is enough: callers block on a oneshot reply before
        // issuing the next request, so messages are strictly serialized.
        let (send, mut recv) = tokio::sync::mpsc::channel::<GLProcessorMessage>(1);

        // Oneshot used to report context-creation success/failure back here.
        let (create_ctx_send, create_ctx_recv) = tokio::sync::oneshot::channel();

        let func = move || {
            let mut gl_converter = match GLProcessorST::new(kind) {
                Ok(gl) => gl,
                Err(e) => {
                    // Context creation failed: report and exit the thread.
                    let _ = create_ctx_send.send(Err(e));
                    return;
                }
            };
            // Report the negotiated capabilities to the constructor.
            let _ = create_ctx_send.send(Ok((
                gl_converter.gl_context.transfer_backend,
                gl_converter.has_bgra,
            )));
            // Service requests until every Sender is dropped.
            while let Some(msg) = recv.blocking_recv() {
                match msg {
                    GLProcessorMessage::ImageConvert(src, mut dst, rotation, flip, crop, resp) => {
                        // SAFETY: This is safe because the convert() function waits for the resp to
                        // be sent before dropping the borrow for src and dst
                        let src = unsafe { src.ptr.as_ref() };
                        let dst = unsafe { dst.ptr.as_mut() };
                        let res = gl_converter.convert(src, dst, rotation, flip, crop);
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::DrawMasks(mut dst, det, seg, resp) => {
                        // SAFETY: This is safe because the draw_masks() function waits for the
                        // resp to be sent before dropping the borrow for dst, detect, and
                        // segmentation
                        let dst = unsafe { dst.ptr.as_mut() };
                        let det = unsafe { std::slice::from_raw_parts(det.ptr.as_ptr(), det.len) };
                        let seg = unsafe { std::slice::from_raw_parts(seg.ptr.as_ptr(), seg.len) };
                        let res = gl_converter.draw_masks(dst, det, seg);
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::DrawMasksProto(mut dst, det, proto_data, resp) => {
                        // SAFETY: Same safety invariant as DrawMasks — caller
                        // blocks on resp before dropping borrows.
                        let dst = unsafe { dst.ptr.as_mut() };
                        let det = unsafe { std::slice::from_raw_parts(det.ptr.as_ptr(), det.len) };
                        let res = gl_converter.draw_masks_proto(dst, det, &proto_data);
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::SetColors(colors, resp) => {
                        let res = gl_converter.set_class_colors(&colors);
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::SetInt8Interpolation(mode, resp) => {
                        // Infallible setter; reply Ok so the caller unblocks.
                        gl_converter.set_int8_interpolation_mode(mode);
                        let _ = resp.send(Ok(()));
                    }
                    GLProcessorMessage::DecodeMasksAtlas(
                        det,
                        proto_data,
                        output_width,
                        output_height,
                        resp,
                    ) => {
                        // SAFETY: caller blocks on resp before dropping the
                        // detect-slice borrow (same invariant as DrawMasks).
                        let det = unsafe { std::slice::from_raw_parts(det.ptr.as_ptr(), det.len) };
                        let res = gl_converter.decode_masks_atlas(
                            det,
                            &proto_data,
                            output_width,
                            output_height,
                        );
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::PboCreate(size, resp) => {
                        // SAFETY: plain GL calls on the thread owning the
                        // context; the buffer is deleted again if creation
                        // left a pending GL error.
                        let result = unsafe {
                            let mut id: u32 = 0;
                            gls::gl::GenBuffers(1, &mut id);
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, id);
                            gls::gl::BufferData(
                                gls::gl::PIXEL_PACK_BUFFER,
                                size as isize,
                                std::ptr::null(),
                                gls::gl::STREAM_COPY,
                            );
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                            match check_gl_error("PboCreate", 0) {
                                Ok(()) => Ok(id),
                                Err(e) => {
                                    gls::gl::DeleteBuffers(1, &id);
                                    Err(e)
                                }
                            }
                        };
                        let _ = resp.send(result);
                    }
                    GLProcessorMessage::PboMap(buffer_id, size, resp) => {
                        // SAFETY: maps the PBO for CPU access; the mapping
                        // stays valid after unbinding the buffer and until
                        // the matching PboUnmap request.
                        let result = unsafe {
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, buffer_id);
                            let ptr = gls::gl::MapBufferRange(
                                gls::gl::PIXEL_PACK_BUFFER,
                                0,
                                size as isize,
                                gls::gl::MAP_READ_BIT | gls::gl::MAP_WRITE_BIT,
                            );
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                            if ptr.is_null() {
                                Err(crate::Error::OpenGl(
                                    "glMapBufferRange returned null".to_string(),
                                ))
                            } else {
                                Ok(edgefirst_tensor::PboMapping {
                                    ptr: ptr as *mut u8,
                                    size,
                                })
                            }
                        };
                        let _ = resp.send(result);
                    }
                    GLProcessorMessage::PboUnmap(buffer_id, resp) => {
                        // SAFETY: glUnmapBuffer returning FALSE means the
                        // data store was lost/corrupted while mapped.
                        let result = unsafe {
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, buffer_id);
                            let ok = gls::gl::UnmapBuffer(gls::gl::PIXEL_PACK_BUFFER);
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                            if ok == gls::gl::FALSE {
                                Err(Error::OpenGl(
                                    "PBO data was corrupted during mapping".into(),
                                ))
                            } else {
                                check_gl_error("PboUnmap", 0)
                            }
                        };
                        let _ = resp.send(result);
                    }
                    // Fire-and-forget: no reply channel for deletes.
                    GLProcessorMessage::PboDelete(buffer_id) => unsafe {
                        gls::gl::DeleteBuffers(1, &buffer_id);
                    },
                }
            }
        };

        // Must be a dedicated OS thread (not a tokio task): the loop uses
        // blocking_recv and the GL context is bound to the creating thread.
        let handle = std::thread::spawn(func);

        let (transfer_backend, has_bgra) = match create_ctx_recv.blocking_recv() {
            Ok(Err(e)) => return Err(e),
            Err(_) => {
                return Err(Error::Internal(
                    "GL converter error messaging closed without update".to_string(),
                ));
            }
            Ok(Ok(tb)) => tb,
        };

        Ok(Self {
            handle: Some(handle),
            sender: Some(send),
            transfer_backend,
            has_bgra,
        })
    }
}
977
978impl ImageProcessorTrait for GLProcessorThreaded {
979    fn convert(
980        &mut self,
981        src: &TensorImage,
982        dst: &mut TensorImage,
983        rotation: crate::Rotation,
984        flip: Flip,
985        crop: Crop,
986    ) -> crate::Result<()> {
987        crop.check_crop(src, dst)?;
988        if !GLProcessorST::check_src_format_supported(self.transfer_backend, src) {
989            return Err(crate::Error::NotSupported(format!(
990                "Opengl doesn't support {} source texture",
991                src.fourcc().display()
992            )));
993        }
994
995        if !GLProcessorST::check_dst_format_supported(self.transfer_backend, dst, self.has_bgra) {
996            return Err(crate::Error::NotSupported(format!(
997                "Opengl doesn't support {} destination texture",
998                dst.fourcc().display()
999            )));
1000        }
1001
1002        let (err_send, err_recv) = tokio::sync::oneshot::channel();
1003        self.sender
1004            .as_ref()
1005            .unwrap()
1006            .blocking_send(GLProcessorMessage::ImageConvert(
1007                SendablePtr {
1008                    ptr: src.into(),
1009                    len: 1,
1010                },
1011                SendablePtr {
1012                    ptr: dst.into(),
1013                    len: 1,
1014                },
1015                rotation,
1016                flip,
1017                crop,
1018                err_send,
1019            ))
1020            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
1021        err_recv.blocking_recv().map_err(|_| {
1022            Error::Internal("GL converter error messaging closed without update".to_string())
1023        })?
1024    }
1025
1026    fn convert_ref(
1027        &mut self,
1028        src: &TensorImage,
1029        dst: &mut TensorImageRef<'_>,
1030        rotation: Rotation,
1031        flip: Flip,
1032        crop: Crop,
1033    ) -> crate::Result<()> {
1034        // OpenGL doesn't support PLANAR_RGB output, delegate to CPU
1035        let mut cpu = CPUProcessor::new();
1036        cpu.convert_ref(src, dst, rotation, flip, crop)
1037    }
1038
1039    fn draw_masks(
1040        &mut self,
1041        dst: &mut TensorImage,
1042        detect: &[crate::DetectBox],
1043        segmentation: &[crate::Segmentation],
1044    ) -> crate::Result<()> {
1045        let (err_send, err_recv) = tokio::sync::oneshot::channel();
1046        self.sender
1047            .as_ref()
1048            .unwrap()
1049            .blocking_send(GLProcessorMessage::DrawMasks(
1050                SendablePtr {
1051                    ptr: dst.into(),
1052                    len: 1,
1053                },
1054                SendablePtr {
1055                    ptr: NonNull::new(detect.as_ptr() as *mut DetectBox).unwrap(),
1056                    len: detect.len(),
1057                },
1058                SendablePtr {
1059                    ptr: NonNull::new(segmentation.as_ptr() as *mut Segmentation).unwrap(),
1060                    len: segmentation.len(),
1061                },
1062                err_send,
1063            ))
1064            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
1065        err_recv.blocking_recv().map_err(|_| {
1066            Error::Internal("GL converter error messaging closed without update".to_string())
1067        })?
1068    }
1069
1070    fn draw_masks_proto(
1071        &mut self,
1072        dst: &mut TensorImage,
1073        detect: &[DetectBox],
1074        proto_data: &ProtoData,
1075    ) -> crate::Result<()> {
1076        let (err_send, err_recv) = tokio::sync::oneshot::channel();
1077        self.sender
1078            .as_ref()
1079            .unwrap()
1080            .blocking_send(GLProcessorMessage::DrawMasksProto(
1081                SendablePtr {
1082                    ptr: NonNull::new(dst as *mut TensorImage).unwrap(),
1083                    len: 1,
1084                },
1085                SendablePtr {
1086                    ptr: NonNull::new(detect.as_ptr() as *mut DetectBox).unwrap(),
1087                    len: detect.len(),
1088                },
1089                Box::new(proto_data.clone()),
1090                err_send,
1091            ))
1092            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
1093        err_recv.blocking_recv().map_err(|_| {
1094            Error::Internal("GL converter error messaging closed without update".to_string())
1095        })?
1096    }
1097
1098    fn decode_masks_atlas(
1099        &mut self,
1100        detect: &[DetectBox],
1101        proto_data: ProtoData,
1102        output_width: usize,
1103        output_height: usize,
1104    ) -> crate::Result<(Vec<u8>, Vec<MaskRegion>)> {
1105        GLProcessorThreaded::decode_masks_atlas(
1106            self,
1107            detect,
1108            proto_data,
1109            output_width,
1110            output_height,
1111        )
1112    }
1113
1114    fn set_class_colors(&mut self, colors: &[[u8; 4]]) -> Result<(), crate::Error> {
1115        let (err_send, err_recv) = tokio::sync::oneshot::channel();
1116        self.sender
1117            .as_ref()
1118            .unwrap()
1119            .blocking_send(GLProcessorMessage::SetColors(colors.to_vec(), err_send))
1120            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
1121        err_recv.blocking_recv().map_err(|_| {
1122            Error::Internal("GL converter error messaging closed without update".to_string())
1123        })?
1124    }
1125}
1126
1127impl GLProcessorThreaded {
1128    /// Sets the interpolation mode for int8 proto textures.
1129    pub fn set_int8_interpolation_mode(
1130        &mut self,
1131        mode: Int8InterpolationMode,
1132    ) -> Result<(), crate::Error> {
1133        let (err_send, err_recv) = tokio::sync::oneshot::channel();
1134        self.sender
1135            .as_ref()
1136            .unwrap()
1137            .blocking_send(GLProcessorMessage::SetInt8Interpolation(mode, err_send))
1138            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
1139        err_recv.blocking_recv().map_err(|_| {
1140            Error::Internal("GL converter error messaging closed without update".to_string())
1141        })?
1142    }
1143
1144    /// Decode all detection masks into a compact atlas via the GL thread.
1145    ///
1146    /// Returns `(atlas_pixels, regions)` where `atlas_pixels` is a contiguous
1147    /// `Vec<u8>` of shape `[atlas_h, output_width]` (compact, bbox-sized strips)
1148    /// and `regions` describes each detection's location within the atlas.
1149    pub fn decode_masks_atlas(
1150        &mut self,
1151        detect: &[DetectBox],
1152        proto_data: ProtoData,
1153        output_width: usize,
1154        output_height: usize,
1155    ) -> Result<(Vec<u8>, Vec<MaskRegion>), crate::Error> {
1156        let (resp_send, resp_recv) = tokio::sync::oneshot::channel();
1157        self.sender
1158            .as_ref()
1159            .unwrap()
1160            .blocking_send(GLProcessorMessage::DecodeMasksAtlas(
1161                SendablePtr {
1162                    ptr: NonNull::new(detect.as_ptr() as *mut DetectBox).unwrap(),
1163                    len: detect.len(),
1164                },
1165                Box::new(proto_data),
1166                output_width,
1167                output_height,
1168                resp_send,
1169            ))
1170            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
1171        resp_recv.blocking_recv().map_err(|_| {
1172            Error::Internal("GL converter error messaging closed without update".to_string())
1173        })?
1174    }
1175
1176    /// Create a PBO-backed TensorImage on the GL thread.
1177    pub fn create_pbo_image(
1178        &self,
1179        width: usize,
1180        height: usize,
1181        fourcc: four_char_code::FourCharCode,
1182    ) -> Result<crate::TensorImage, Error> {
1183        let sender = self
1184            .sender
1185            .as_ref()
1186            .ok_or(Error::OpenGl("GL processor is shutting down".to_string()))?;
1187
1188        let channels = crate::fourcc_channels(fourcc)?;
1189        let size = width * height * channels;
1190        if size == 0 {
1191            return Err(Error::OpenGl("Invalid image dimensions".to_string()));
1192        }
1193
1194        // Allocate PBO on the GL thread
1195        let (tx, rx) = tokio::sync::oneshot::channel();
1196        sender
1197            .blocking_send(GLProcessorMessage::PboCreate(size, tx))
1198            .map_err(|_| Error::OpenGl("GL thread channel closed".to_string()))?;
1199        let buffer_id = rx
1200            .blocking_recv()
1201            .map_err(|_| Error::OpenGl("GL thread did not respond".to_string()))??;
1202
1203        let ops: std::sync::Arc<dyn edgefirst_tensor::PboOps> = std::sync::Arc::new(GlPboOps {
1204            sender: sender.downgrade(),
1205        });
1206
1207        let shape = if crate::fourcc_planar(fourcc)? {
1208            vec![channels, height, width]
1209        } else {
1210            vec![height, width, channels]
1211        };
1212
1213        let pbo_tensor =
1214            edgefirst_tensor::PboTensor::<u8>::from_pbo(buffer_id, size, &shape, None, ops)
1215                .map_err(|e| Error::OpenGl(format!("PBO tensor creation failed: {e:?}")))?;
1216        let tensor = edgefirst_tensor::Tensor::Pbo(pbo_tensor);
1217        crate::TensorImage::from_tensor(tensor, fourcc)
1218            .map_err(|e| Error::OpenGl(format!("Failed to wrap PBO tensor as image: {e:?}")))
1219    }
1220
1221    /// Returns the active transfer backend.
1222    #[allow(dead_code)]
1223    pub(crate) fn transfer_backend(&self) -> TransferBackend {
1224        self.transfer_backend
1225    }
1226}
1227
1228impl Drop for GLProcessorThreaded {
1229    fn drop(&mut self) {
1230        drop(self.sender.take());
1231        let _ = self.handle.take().and_then(|h| h.join().ok());
1232    }
1233}
1234
/// Interpolation mode for int8 proto textures (GL_R8I cannot use GL_LINEAR).
///
/// Selected at runtime via `set_int8_interpolation_mode`; trades GPU cost
/// against sampling quality for the int8 prototype-mask path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Int8InterpolationMode {
    /// texelFetch at nearest texel — simplest, fastest GPU execution.
    Nearest,
    /// texelFetch × 4 neighbors with shader-computed bilinear weights (default).
    Bilinear,
    /// Two-pass: dequant int8→f16 FBO, then existing f16 shader with GL_LINEAR.
    TwoPass,
}
1245
/// Selects which EGLImage cache to use.
///
/// `GLProcessorST` keeps separate caches for source and destination DMA
/// buffers (`src_egl_cache` / `dst_egl_cache`); this picks between them.
#[derive(Debug)]
enum CacheKind {
    /// The source-image cache (`src_egl_cache`).
    Src,
    /// The destination-image cache (`dst_egl_cache`).
    Dst,
}
1252
/// A cached EGLImage with a weak reference to the source tensor's guard.
///
/// When `guard` can no longer be upgraded, the owning tensor has been
/// dropped and the entry is dead (reclaimed by `EglImageCache::sweep`).
struct CachedEglImage {
    /// The imported EGLImage handle kept alive for reuse across frames.
    egl_image: EglImage,
    /// Weak reference to the source Tensor's BufferIdentity guard.
    guard: std::sync::Weak<()>,
    /// Optional GL renderbuffer backed by this EGLImage (used by direct RGB path).
    renderbuffer: Option<u32>,
    /// Monotonic access counter for LRU eviction.
    last_used: u64,
}
1263
/// EGLImage cache owned by GLProcessorST.
///
/// Uses a HashMap with a monotonic counter for LRU eviction: each access
/// updates the entry's `last_used` timestamp, and eviction removes the entry
/// with the smallest `last_used` value.
struct EglImageCache {
    // Entries keyed by a 64-bit buffer identity (NOTE(review): presumably
    // the tensor's BufferIdentity id — confirm at the insert sites).
    entries: std::collections::HashMap<u64, CachedEglImage>,
    // Target maximum entry count; enforcement (calling `evict_lru`) happens
    // at the insert sites, not inside this struct's visible methods.
    capacity: usize,
    // Cache statistics, logged when the cache is dropped.
    hits: u64,
    misses: u64,
    /// Monotonic counter incremented on each access for LRU tracking.
    access_counter: u64,
}
1277
1278impl EglImageCache {
1279    fn new(capacity: usize) -> Self {
1280        Self {
1281            entries: std::collections::HashMap::with_capacity(capacity),
1282            capacity,
1283            hits: 0,
1284            misses: 0,
1285            access_counter: 0,
1286        }
1287    }
1288
1289    /// Allocate a new LRU timestamp.
1290    fn next_timestamp(&mut self) -> u64 {
1291        self.access_counter += 1;
1292        self.access_counter
1293    }
1294
1295    /// Evict the least recently used entry.
1296    fn evict_lru(&mut self) {
1297        if let Some((&evict_id, _)) = self.entries.iter().min_by_key(|(_, entry)| entry.last_used) {
1298            if let Some(evicted) = self.entries.remove(&evict_id) {
1299                if let Some(rbo) = evicted.renderbuffer {
1300                    unsafe { gls::gl::DeleteRenderbuffers(1, &rbo) };
1301                }
1302            }
1303        }
1304    }
1305
1306    /// Sweep dead entries (tensor dropped, Weak is dead).
1307    fn sweep(&mut self) {
1308        let before = self.entries.len();
1309        self.entries.retain(|_id, entry| {
1310            let alive = entry.guard.upgrade().is_some();
1311            if !alive {
1312                if let Some(rbo) = entry.renderbuffer {
1313                    unsafe { gls::gl::DeleteRenderbuffers(1, &rbo) };
1314                }
1315            }
1316            alive
1317        });
1318        let swept = before - self.entries.len();
1319        if swept > 0 {
1320            log::debug!("EglImageCache: swept {swept} dead entries");
1321        }
1322    }
1323}
1324
1325impl Drop for EglImageCache {
1326    fn drop(&mut self) {
1327        for entry in self.entries.values() {
1328            if let Some(rbo) = entry.renderbuffer {
1329                unsafe { gls::gl::DeleteRenderbuffers(1, &rbo) };
1330            }
1331        }
1332        log::debug!(
1333            "EglImageCache stats: {} hits, {} misses, {} entries remaining",
1334            self.hits,
1335            self.misses,
1336            self.entries.len()
1337        );
1338    }
1339}
1340
/// OpenGL single-threaded image converter.
///
/// Owns the GL context and every GL object used by the conversion and mask
/// pipelines; used from a single thread only (see [`GLProcessorThreaded`]
/// for the thread-safe wrapper).
pub struct GLProcessorST {
    // Camera source textures: one for EGLImage (DMA) imports, one for normal
    // CPU uploads (NOTE(review): inferred from names — confirm in convert()).
    camera_eglimage_texture: Texture,
    camera_normal_texture: Texture,
    // Render target and segmentation-overlay resources.
    render_texture: Texture,
    segmentation_texture: Texture,
    segmentation_program: GlProgram,
    instanced_segmentation_program: GlProgram,
    // Prototype-mask ("proto") segmentation: texture plus shader variants per
    // dtype / int8 interpolation mode.
    proto_texture: Texture,
    proto_segmentation_program: GlProgram,
    proto_segmentation_int8_nearest_program: GlProgram,
    proto_segmentation_int8_bilinear_program: GlProgram,
    proto_dequant_int8_program: GlProgram,
    proto_segmentation_f32_program: GlProgram,
    // Flat-color draw program (presumably for detection boxes — confirm at
    // render_box()).
    color_program: GlProgram,
    /// Whether GL_OES_texture_float_linear is available (allows GL_LINEAR on R32F textures).
    has_float_linear: bool,
    /// Whether GL_EXT_texture_format_BGRA8888 is available (allows BGRA destinations).
    has_bgra: bool,
    /// Interpolation mode for int8 proto textures.
    int8_interpolation_mode: Int8InterpolationMode,
    /// Intermediate FBO texture for two-pass int8 dequant path.
    proto_dequant_texture: Texture,
    // Mask-logit shader variants (int8 bilinear/nearest and f32).
    proto_mask_logit_int8_bilinear_program: GlProgram,
    proto_mask_logit_int8_nearest_program: GlProgram,
    proto_mask_logit_f32_program: GlProgram,
    /// Dedicated FBO for mask rendering.
    mask_fbo: u32,
    /// R8 texture attached to mask_fbo.
    mask_fbo_texture: u32,
    /// Current allocated width of mask FBO texture.
    mask_fbo_width: usize,
    /// Current allocated height of mask FBO texture.
    mask_fbo_height: usize,
    /// PBO buffer ID for atlas readback (0 = not allocated).
    mask_atlas_pbo: u32,
    // Vertex/texcoord buffers shared by the draw passes.
    vertex_buffer: Buffer,
    texture_buffer: Buffer,
    /// Persistent FBO for the convert() render path.
    /// Created once, reused by re-attaching textures each frame.
    convert_fbo: FrameBuffer,
    /// EGLImage cache for source DMA buffers.
    src_egl_cache: EglImageCache,
    /// EGLImage cache for destination DMA buffers.
    dst_egl_cache: EglImageCache,
    /// Intermediate RGBA texture for two-pass packed RGB conversion.
    /// Pass 1 renders YUYV/NV12→RGBA here; Pass 2 packs RGBA→RGB to DMA dest.
    packed_rgb_intermediate_tex: Texture,
    /// FBO for pass 1 of packed RGB conversion (renders to intermediate texture).
    packed_rgb_fbo: FrameBuffer,
    /// Current allocated size of the intermediate texture (0,0 = unallocated).
    packed_rgb_intermediate_size: (usize, usize),
    // Main conversion shaders: 2D source, external-OES (YUV) source, and the
    // planar-output variants.
    texture_program: GlProgram,
    texture_program_yuv: GlProgram,
    texture_program_planar: GlProgram,
    /// Shader: existing planar RGB with int8 bias (XOR 0x80) applied to output.
    texture_program_planar_int8: GlProgram,
    /// Shader: packed RGB -> RGBA8 packing (2D texture source, pass 2).
    packed_rgba8_program_2d: GlProgram,
    /// Shader: packed RGB int8 -> RGBA8 packing with XOR 0x80 (2D texture source, pass 2).
    packed_rgba8_int8_program_2d: GlProgram,
    /// Shader: direct RGB render with int8 XOR 0x80 bias (2D texture source).
    texture_int8_program: GlProgram,
    /// Shader: direct RGB render with int8 XOR 0x80 bias (external OES source).
    texture_int8_program_yuv: GlProgram,
    /// Whether the GPU supports direct RGB rendering via BGR888 renderbuffer.
    support_rgb_direct: bool,
    // The owning EGL display/context; dropped last (declaration order).
    gl_context: GlContext,
}
1410
1411impl Drop for GLProcessorST {
1412    fn drop(&mut self) {
1413        unsafe {
1414            {
1415                if self.mask_fbo != 0 {
1416                    gls::gl::DeleteFramebuffers(1, &self.mask_fbo);
1417                }
1418                if self.mask_fbo_texture != 0 {
1419                    gls::gl::DeleteTextures(1, &self.mask_fbo_texture);
1420                }
1421                if self.mask_atlas_pbo != 0 {
1422                    gls::gl::DeleteBuffers(1, &self.mask_atlas_pbo);
1423                }
1424            }
1425        }
1426    }
1427}
1428
1429impl ImageProcessorTrait for GLProcessorST {
1430    fn convert(
1431        &mut self,
1432        src: &TensorImage,
1433        dst: &mut TensorImage,
1434        rotation: crate::Rotation,
1435        flip: Flip,
1436        crop: Crop,
1437    ) -> crate::Result<()> {
1438        crop.check_crop(src, dst)?;
1439        if !Self::check_src_format_supported(self.gl_context.transfer_backend, src) {
1440            return Err(crate::Error::NotSupported(format!(
1441                "Opengl doesn't support {} source texture",
1442                src.fourcc().display()
1443            )));
1444        }
1445
1446        if !Self::check_dst_format_supported(self.gl_context.transfer_backend, dst, self.has_bgra) {
1447            return Err(crate::Error::NotSupported(format!(
1448                "Opengl doesn't support {} destination texture",
1449                dst.fourcc().display()
1450            )));
1451        }
1452        log::debug!(
1453            "dst tensor: {:?} src tensor :{:?}",
1454            dst.tensor().memory(),
1455            src.tensor().memory()
1456        );
1457        check_gl_error(function!(), line!())?;
1458        if self.gl_context.transfer_backend.is_dma() && dst.tensor().memory() == TensorMemory::Dma {
1459            // Packed RGB is now supported via DMA with buffer reinterpretation
1460            let res = self.convert_dest_dma(dst, src, rotation, flip, crop);
1461            return res;
1462        }
1463        // PBO-to-PBO: both tensors are PBO-backed, use GL buffer bindings for
1464        // both upload and readback (zero CPU copy for both directions)
1465        if src.tensor().memory() == TensorMemory::Pbo && dst.tensor().memory() == TensorMemory::Pbo
1466        {
1467            return self.convert_pbo_to_pbo(dst, src, rotation, flip, crop);
1468        }
1469        // PBO dst with non-PBO src: use normal texture upload for src (which
1470        // maps the Mem/DMA tensor), but PBO PACK readback for dst.
1471        // This avoids the deadlock that would occur if convert_dest_non_dma
1472        // tried to map() the PBO dst on the GL thread.
1473        if dst.tensor().memory() == TensorMemory::Pbo {
1474            return self.convert_any_to_pbo(dst, src, rotation, flip, crop);
1475        }
1476        // PBO src with non-PBO dst: the src tensor's map() would deadlock on
1477        // the GL thread, so use PBO UNPACK upload. Readback goes to Mem dst
1478        // via normal ReadnPixels into mapped memory.
1479        if src.tensor().memory() == TensorMemory::Pbo {
1480            return self.convert_pbo_to_mem(dst, src, rotation, flip, crop);
1481        }
1482        let start = Instant::now();
1483        let res = self.convert_dest_non_dma(dst, src, rotation, flip, crop);
1484        log::debug!("convert_dest_non_dma takes {:?}", start.elapsed());
1485        res
1486    }
1487
1488    fn convert_ref(
1489        &mut self,
1490        src: &TensorImage,
1491        dst: &mut TensorImageRef<'_>,
1492        rotation: Rotation,
1493        flip: Flip,
1494        crop: Crop,
1495    ) -> crate::Result<()> {
1496        // OpenGL doesn't support PLANAR_RGB output, delegate to CPU
1497        let mut cpu = CPUProcessor::new();
1498        cpu.convert_ref(src, dst, rotation, flip, crop)
1499    }
1500
    /// Draws detection boxes and class segmentation masks into `dst` on the
    /// GPU, alpha-blending over the existing image content.
    ///
    /// Supports only packed RGBA, BGRA, or RGB destinations. The framebuffer
    /// setup depends on the destination tensor's memory backend: DMA-buf
    /// destinations are rendered in place, PBO-backed tensors are read back
    /// through a GL pixel-pack buffer, and anything else is read back through
    /// a CPU map of the tensor.
    ///
    /// # Errors
    /// Returns `Error::NotSupported` for other pixel formats and propagates
    /// framebuffer-setup, render, or GL readback errors.
    fn draw_masks(
        &mut self,
        dst: &mut TensorImage,
        detect: &[DetectBox],
        segmentation: &[Segmentation],
    ) -> Result<(), crate::Error> {
        use crate::FunctionTimer;

        let _timer = FunctionTimer::new("GLProcessorST::draw_masks");
        if !matches!(dst.fourcc(), RGBA | BGRA | RGB) {
            return Err(crate::Error::NotSupported(
                "Opengl image rendering only supports RGBA, BGRA, or RGB images".to_string(),
            ));
        }

        // Determine memory backend and set up the framebuffer.
        // PBO tensors need special handling: calling tensor.map() on the GL
        // thread would deadlock because PboOps sends a message back to this
        // thread. Instead, bind the PBO as GL_PIXEL_UNPACK_BUFFER so
        // TexImage2D reads directly from GPU memory (zero CPU copy).
        let memory = dst.tensor.memory();
        let pbo_buffer_id = if memory == edgefirst_tensor::TensorMemory::Pbo {
            match &dst.tensor {
                // Only take the PBO fast path when the buffer is not currently
                // mapped on the CPU; otherwise fall through to the generic path.
                edgefirst_tensor::Tensor::Pbo(p) if !p.is_mapped() => Some(p.buffer_id()),
                _ => None,
            }
        } else {
            None
        };

        // Select the framebuffer path. `is_dma` records whether rendering
        // lands directly in the destination buffer (no readback needed below).
        let is_dma = match memory {
            edgefirst_tensor::TensorMemory::Dma if self.setup_renderbuffer_dma(dst).is_ok() => true,
            _ if pbo_buffer_id.is_some() => {
                self.setup_renderbuffer_from_pbo(dst, pbo_buffer_id.unwrap())?;
                false
            }
            _ => {
                // Add dest rect to make sure dst is rendered fully
                self.setup_renderbuffer_non_dma(
                    dst,
                    Crop::new().with_dst_rect(Some(Rect::new(0, 0, 0, 0))),
                )?;
                false
            }
        };

        // Standard alpha blending for the color channels; the ZERO/ONE pair
        // on the alpha channel keeps the destination alpha untouched.
        gls::enable(gls::gl::BLEND);
        gls::blend_func_separate(
            gls::gl::SRC_ALPHA,
            gls::gl::ONE_MINUS_SRC_ALPHA,
            gls::gl::ZERO,
            gls::gl::ONE,
        );

        self.render_box(dst, detect)?;
        self.render_segmentation(detect, segmentation)?;

        // Make sure all GPU work has completed before any readback or return.
        gls::finish();
        if !is_dma {
            let format = match dst.fourcc() {
                RGB => gls::gl::RGB,
                RGBA => gls::gl::RGBA,
                BGRA => 0x80E1, // GL_BGRA (GL_EXT_texture_format_BGRA8888)
                _ => unreachable!(),
            };
            if let Some(buffer_id) = pbo_buffer_id {
                // PBO readback: bind as PACK buffer, ReadnPixels writes
                // directly into PBO memory (zero CPU copy).
                unsafe {
                    gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, buffer_id);
                    gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
                    gls::gl::ReadnPixels(
                        0,
                        0,
                        dst.width() as i32,
                        dst.height() as i32,
                        format,
                        gls::gl::UNSIGNED_BYTE,
                        dst.tensor.len() as i32,
                        std::ptr::null_mut(),
                    );
                    gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                    gls::gl::Finish();
                }
                check_gl_error(function!(), line!())?;
            } else {
                // CPU readback: map the tensor and let ReadnPixels copy the
                // framebuffer contents straight into the mapped memory.
                let mut dst_map = dst.tensor().map()?;
                unsafe {
                    gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
                    gls::gl::ReadnPixels(
                        0,
                        0,
                        dst.width() as i32,
                        dst.height() as i32,
                        format,
                        gls::gl::UNSIGNED_BYTE,
                        dst.tensor.len() as i32,
                        dst_map.as_mut_ptr() as *mut c_void,
                    );
                }
            }
        }

        Ok(())
    }
1606
    /// Draws detection boxes and proto-decoded segmentation masks into `dst`
    /// on the GPU, alpha-blending over the existing image content.
    ///
    /// Same pipeline as `draw_masks`, except the masks are decoded on the GPU
    /// from the prototype tensors in `proto_data` rather than taken from
    /// precomputed `Segmentation` values.
    ///
    /// # Errors
    /// Returns `Error::NotSupported` for formats other than RGBA/BGRA/RGB and
    /// propagates framebuffer-setup, render, or GL readback errors.
    fn draw_masks_proto(
        &mut self,
        dst: &mut TensorImage,
        detect: &[DetectBox],
        proto_data: &ProtoData,
    ) -> crate::Result<()> {
        use crate::FunctionTimer;

        let _timer = FunctionTimer::new("GLProcessorST::draw_masks_proto");
        if !matches!(dst.fourcc(), RGBA | BGRA | RGB) {
            return Err(crate::Error::NotSupported(
                "Opengl image rendering only supports RGBA, BGRA, or RGB images".to_string(),
            ));
        }

        // PBO detection — same rationale as draw_masks.
        let memory = dst.tensor.memory();
        let pbo_buffer_id = if memory == edgefirst_tensor::TensorMemory::Pbo {
            match &dst.tensor {
                // Only take the PBO fast path when the buffer is not mapped.
                edgefirst_tensor::Tensor::Pbo(p) if !p.is_mapped() => Some(p.buffer_id()),
                _ => None,
            }
        } else {
            None
        };

        // Select the framebuffer path; `is_dma` means no readback is needed.
        let is_dma = match memory {
            edgefirst_tensor::TensorMemory::Dma if self.setup_renderbuffer_dma(dst).is_ok() => true,
            _ if pbo_buffer_id.is_some() => {
                self.setup_renderbuffer_from_pbo(dst, pbo_buffer_id.unwrap())?;
                false
            }
            _ => {
                // Add dest rect so the full destination is rendered.
                self.setup_renderbuffer_non_dma(
                    dst,
                    Crop::new().with_dst_rect(Some(Rect::new(0, 0, 0, 0))),
                )?;
                false
            }
        };

        // Alpha-blend mask colors; keep destination alpha untouched.
        gls::enable(gls::gl::BLEND);
        gls::blend_func_separate(
            gls::gl::SRC_ALPHA,
            gls::gl::ONE_MINUS_SRC_ALPHA,
            gls::gl::ZERO,
            gls::gl::ONE,
        );

        self.render_box(dst, detect)?;
        self.render_proto_segmentation(detect, proto_data)?;

        // Ensure all GPU work has completed before readback or return.
        gls::finish();
        if !is_dma {
            let format = match dst.fourcc() {
                RGB => gls::gl::RGB,
                RGBA => gls::gl::RGBA,
                BGRA => 0x80E1, // GL_BGRA (GL_EXT_texture_format_BGRA8888)
                _ => unreachable!(),
            };
            if let Some(buffer_id) = pbo_buffer_id {
                // PBO readback: ReadnPixels writes directly into PBO memory.
                unsafe {
                    gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, buffer_id);
                    gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
                    gls::gl::ReadnPixels(
                        0,
                        0,
                        dst.width() as i32,
                        dst.height() as i32,
                        format,
                        gls::gl::UNSIGNED_BYTE,
                        dst.tensor.len() as i32,
                        std::ptr::null_mut(),
                    );
                    gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                    gls::gl::Finish();
                }
                check_gl_error(function!(), line!())?;
            } else {
                // CPU readback into the mapped destination tensor.
                let mut dst_map = dst.tensor().map()?;
                unsafe {
                    gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
                    gls::gl::ReadnPixels(
                        0,
                        0,
                        dst.width() as i32,
                        dst.height() as i32,
                        format,
                        gls::gl::UNSIGNED_BYTE,
                        dst.tensor.len() as i32,
                        dst_map.as_mut_ptr() as *mut c_void,
                    );
                }
            }
        }

        Ok(())
    }
1705
1706    fn decode_masks_atlas(
1707        &mut self,
1708        detect: &[DetectBox],
1709        proto_data: ProtoData,
1710        output_width: usize,
1711        output_height: usize,
1712    ) -> crate::Result<(Vec<u8>, Vec<MaskRegion>)> {
1713        GLProcessorST::decode_masks_atlas(self, detect, &proto_data, output_width, output_height)
1714    }
1715
1716    fn set_class_colors(&mut self, colors: &[[u8; 4]]) -> crate::Result<()> {
1717        if colors.is_empty() {
1718            return Ok(());
1719        }
1720        let mut colors_f32 = colors
1721            .iter()
1722            .map(|c| {
1723                [
1724                    c[0] as f32 / 255.0,
1725                    c[1] as f32 / 255.0,
1726                    c[2] as f32 / 255.0,
1727                    c[3] as f32 / 255.0,
1728                ]
1729            })
1730            .take(20)
1731            .collect::<Vec<[f32; 4]>>();
1732
1733        self.segmentation_program
1734            .load_uniform_4fv(c"colors", &colors_f32)?;
1735        self.instanced_segmentation_program
1736            .load_uniform_4fv(c"colors", &colors_f32)?;
1737        self.proto_segmentation_program
1738            .load_uniform_4fv(c"colors", &colors_f32)?;
1739        self.proto_segmentation_int8_nearest_program
1740            .load_uniform_4fv(c"colors", &colors_f32)?;
1741        self.proto_segmentation_int8_bilinear_program
1742            .load_uniform_4fv(c"colors", &colors_f32)?;
1743        self.proto_segmentation_f32_program
1744            .load_uniform_4fv(c"colors", &colors_f32)?;
1745
1746        colors_f32.iter_mut().for_each(|c| {
1747            c[3] = 1.0; // set alpha to 1.0 for color rendering
1748        });
1749        self.color_program
1750            .load_uniform_4fv(c"colors", &colors_f32)?;
1751
1752        Ok(())
1753    }
1754}
1755
1756impl GLProcessorST {
    /// Creates a new single-threaded GL processor.
    ///
    /// Establishes an EGL/GLES context for the requested display kind, loads
    /// the GL symbol table, compiles every shader program used by the
    /// conversion and mask-rendering paths, allocates the shared textures and
    /// vertex/texcoord buffers, then probes actual GPU capabilities (direct
    /// RGB rendering, DMA-buf round-trip) to choose the transfer backend.
    /// The `EDGEFIRST_FORCE_TRANSFER` env var can override that choice for
    /// benchmarking.
    ///
    /// # Errors
    /// Returns an error if context creation, capability checks, shader
    /// compilation, uniform upload, or any GL call during setup fails.
    pub fn new(kind: Option<EglDisplayKind>) -> Result<GLProcessorST, crate::Error> {
        let gl_context = GlContext::new(kind)?;
        // Resolve GL entry points through EGL; unknown symbols become null.
        gls::load_with(|s| {
            gl_context
                .egl
                .get_proc_address(s)
                .map_or(std::ptr::null(), |p| p as *const _)
        });

        let (has_float_linear, has_bgra) = Self::gl_check_support()?;

        // Uploads and downloads are all packed with no alignment requirements
        unsafe {
            gls::gl::PixelStorei(gls::gl::PACK_ALIGNMENT, 1);
            gls::gl::PixelStorei(gls::gl::UNPACK_ALIGNMENT, 1);
        }

        // --- Shader program compilation (all share the same vertex shader) ---
        let texture_program_planar =
            GlProgram::new(generate_vertex_shader(), generate_planar_rgb_shader())?;

        let texture_program =
            GlProgram::new(generate_vertex_shader(), generate_texture_fragment_shader())?;

        let texture_program_yuv = GlProgram::new(
            generate_vertex_shader(),
            generate_texture_fragment_shader_yuv(),
        )?;

        let segmentation_program =
            GlProgram::new(generate_vertex_shader(), generate_segmentation_shader())?;
        segmentation_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;
        let instanced_segmentation_program = GlProgram::new(
            generate_vertex_shader(),
            generate_instanced_segmentation_shader(),
        )?;
        instanced_segmentation_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        // Existing f16 proto shader (RGBA16F, 4 protos per layer)
        let proto_segmentation_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_segmentation_shader(),
        )?;
        proto_segmentation_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        // Int8 proto shaders (R8I, 1 proto per layer, 32 layers)
        let proto_segmentation_int8_nearest_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_segmentation_shader_int8_nearest(),
        )?;
        proto_segmentation_int8_nearest_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        let proto_segmentation_int8_bilinear_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_segmentation_shader_int8_bilinear(),
        )?;
        proto_segmentation_int8_bilinear_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        let proto_dequant_int8_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_dequant_shader_int8(),
        )?;

        // F32 proto shader (R32F, 1 proto per layer, 32 layers)
        let proto_segmentation_f32_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_segmentation_shader_f32(),
        )?;
        proto_segmentation_f32_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        let color_program = GlProgram::new(generate_vertex_shader(), generate_color_shader())?;
        color_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        // Binary logit-threshold mask shaders (atlas path — skip sigmoid)
        let proto_mask_logit_int8_nearest_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_mask_logit_shader_int8_nearest(),
        )?;
        let proto_mask_logit_int8_bilinear_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_mask_logit_shader_int8_bilinear(),
        )?;
        let proto_mask_logit_f32_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_mask_logit_shader_f32(),
        )?;

        // Int8 variant of the existing planar RGB shader (for PLANAR_RGB_INT8 destinations).
        let texture_program_planar_int8 =
            GlProgram::new(generate_vertex_shader(), generate_planar_rgb_int8_shader())?;

        // RGB packing shaders (2D only — used in pass 2 of two-pass pipeline)
        let packed_rgba8_program_2d =
            GlProgram::new(generate_vertex_shader(), generate_packed_rgba8_shader_2d())?;
        let packed_rgba8_int8_program_2d = GlProgram::new(
            generate_vertex_shader(),
            generate_packed_rgba8_int8_shader_2d(),
        )?;

        // Int8 direct-render shaders (for RGB_INT8 destinations via direct path)
        let texture_int8_program =
            GlProgram::new(generate_vertex_shader(), generate_texture_int8_shader())?;
        let texture_int8_program_yuv =
            GlProgram::new(generate_vertex_shader(), generate_texture_int8_shader_yuv())?;

        // --- Shared GL objects used across the conversion/render paths ---
        let camera_eglimage_texture = Texture::new();
        let camera_normal_texture = Texture::new();
        let render_texture = Texture::new();
        let segmentation_texture = Texture::new();
        let proto_texture = Texture::new();
        let proto_dequant_texture = Texture::new();
        let vertex_buffer = Buffer::new(0, 3, 100);
        let texture_buffer = Buffer::new(1, 2, 100);

        let mut converter = GLProcessorST {
            gl_context,
            texture_program,
            texture_program_yuv,
            texture_program_planar,
            texture_program_planar_int8,
            packed_rgba8_program_2d,
            packed_rgba8_int8_program_2d,
            texture_int8_program,
            texture_int8_program_yuv,
            support_rgb_direct: false, // will be probed in Task 3
            camera_eglimage_texture,
            camera_normal_texture,
            segmentation_texture,
            proto_texture,
            proto_segmentation_int8_nearest_program,
            proto_segmentation_int8_bilinear_program,
            proto_dequant_int8_program,
            proto_segmentation_f32_program,
            has_float_linear,
            has_bgra,
            int8_interpolation_mode: Int8InterpolationMode::Bilinear,
            proto_dequant_texture,
            proto_mask_logit_int8_bilinear_program,
            proto_mask_logit_int8_nearest_program,
            proto_mask_logit_f32_program,
            mask_fbo: 0,
            mask_fbo_texture: 0,
            mask_fbo_width: 0,
            mask_fbo_height: 0,
            mask_atlas_pbo: 0,
            vertex_buffer,
            texture_buffer,
            convert_fbo: FrameBuffer::new(),
            src_egl_cache: EglImageCache::new(8),
            dst_egl_cache: EglImageCache::new(8),
            packed_rgb_intermediate_tex: Texture::new(),
            packed_rgb_fbo: FrameBuffer::new(),
            packed_rgb_intermediate_size: (0, 0),
            render_texture,
            segmentation_program,
            instanced_segmentation_program,
            proto_segmentation_program,
            color_program,
        };
        check_gl_error(function!(), line!())?;

        // Probe GPU capability for direct RGB rendering
        converter.support_rgb_direct = converter.probe_rgb_direct_support();

        // Verify DMA-buf actually works (catches NVIDIA discrete GPUs where
        // EGLImage creation succeeds but rendered data is all zeros)
        if converter.gl_context.transfer_backend.is_dma() && !converter.verify_dma_buf_roundtrip() {
            log::info!("DMA-buf verification failed — falling back to PBO transfers");
            converter.gl_context.transfer_backend = TransferBackend::Pbo;
            // RGB direct rendering also requires DMA, so disable it
            converter.support_rgb_direct = false;
        }

        // If DMA-buf failed/unavailable but GL is alive, use PBO transfers
        if converter.gl_context.transfer_backend == TransferBackend::Sync {
            log::info!("Upgrading transfer backend from Sync to Pbo (GL context available)");
            converter.gl_context.transfer_backend = TransferBackend::Pbo;
        }

        // Allow env-var override for benchmarking specific transfer paths.
        // Values: "dmabuf", "pbo", "sync" (case-insensitive).
        if let Ok(val) = std::env::var("EDGEFIRST_FORCE_TRANSFER") {
            let forced = match val.to_ascii_lowercase().as_str() {
                "dmabuf" | "dma" => Some(TransferBackend::DmaBuf),
                "pbo" => Some(TransferBackend::Pbo),
                "sync" => Some(TransferBackend::Sync),
                other => {
                    log::warn!(
                        "EDGEFIRST_FORCE_TRANSFER={other:?} not recognised \
                         (expected dmabuf|pbo|sync), ignoring"
                    );
                    None
                }
            };
            if let Some(backend) = forced {
                log::info!(
                    "EDGEFIRST_FORCE_TRANSFER override: {:?} → {backend:?}",
                    converter.gl_context.transfer_backend
                );
                converter.gl_context.transfer_backend = backend;
                if !backend.is_dma() {
                    converter.support_rgb_direct = false;
                }
            }
        }

        log::debug!(
            "GLConverter created (transfer={:?}, rgb_direct={})",
            converter.gl_context.transfer_backend,
            converter.support_rgb_direct
        );
        Ok(converter)
    }
1969
    /// Probe whether the GPU supports direct RGB rendering via BGR888 DMA-buf
    /// backed renderbuffer. Creates a small test FBO and checks completeness.
    /// Returns `false` on any failure (DMA unavailable, EGLImage rejected, FBO incomplete).
    fn probe_rgb_direct_support(&self) -> bool {
        // Direct RGB rendering only makes sense on the DMA transfer path.
        if !self.gl_context.transfer_backend.is_dma() {
            log::debug!("probe_rgb_direct: no DMA support");
            return false;
        }

        // Check glEGLImageTargetRenderbufferStorageOES is available
        // (the OES_EGL_image entry point needed to attach an EGLImage to a
        // renderbuffer).
        if self
            .gl_context
            .egl
            .get_proc_address("glEGLImageTargetRenderbufferStorageOES")
            .is_none()
        {
            log::debug!("probe_rgb_direct: glEGLImageTargetRenderbufferStorageOES not available");
            return false;
        }

        // Allocate a small test DMA buffer (64x64 RGB = 12288 bytes)
        let test_img = match TensorImage::new(64, 64, RGB, Some(TensorMemory::Dma)) {
            Ok(img) => img,
            Err(e) => {
                log::debug!("probe_rgb_direct: failed to allocate test DMA buffer: {e}");
                return false;
            }
        };

        // Create EGLImage from the test DMA buffer
        let egl_image =
            match self.create_egl_image_with_dims(&test_img, 64, 64, DrmFourcc::Bgr888, 3) {
                Ok(img) => img,
                Err(e) => {
                    log::debug!("probe_rgb_direct: EGLImage creation failed: {e}");
                    return false;
                }
            };

        // Create renderbuffer, bind EGLImage, create FBO, check completeness
        let result = unsafe {
            let mut rbo = 0u32;
            gls::gl::GenRenderbuffers(1, &mut rbo);
            gls::gl::BindRenderbuffer(gls::gl::RENDERBUFFER, rbo);
            gls::gl::EGLImageTargetRenderbufferStorageOES(
                gls::gl::RENDERBUFFER,
                egl_image.egl_image.as_ptr(),
            );

            let gl_err = gls::gl::GetError();
            if gl_err != gls::gl::NO_ERROR {
                log::debug!(
                    "probe_rgb_direct: EGLImageTargetRenderbufferStorageOES failed: {gl_err:#X}"
                );
                // Note: this `return false` exits the whole function (after
                // releasing the renderbuffer), not just the unsafe block.
                gls::gl::BindRenderbuffer(gls::gl::RENDERBUFFER, 0);
                gls::gl::DeleteRenderbuffers(1, &rbo);
                return false;
            }

            let mut fbo = 0u32;
            gls::gl::GenFramebuffers(1, &mut fbo);
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, fbo);
            gls::gl::FramebufferRenderbuffer(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::RENDERBUFFER,
                rbo,
            );

            // Completeness is the actual capability signal: some drivers
            // accept the EGLImage but refuse to render to it.
            let status = gls::gl::CheckFramebufferStatus(gls::gl::FRAMEBUFFER);
            let complete = status == gls::gl::FRAMEBUFFER_COMPLETE;

            // Cleanup
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, 0);
            gls::gl::DeleteFramebuffers(1, &fbo);
            gls::gl::BindRenderbuffer(gls::gl::RENDERBUFFER, 0);
            gls::gl::DeleteRenderbuffers(1, &rbo);

            complete
        };
        // egl_image and test_img drop automatically here

        log::info!("probe_rgb_direct: BGR888 renderbuffer FBO support = {result}");
        result
    }
2055
2056    /// Verify that DMA-buf EGLImage round-trip actually works on this GPU.
2057    ///
2058    /// Renders a solid red quad to a 64x64 DMA-buf-backed RGBA texture via
2059    /// EGLImage, then reads it back and checks that the center pixel is red.
2060    /// Returns `true` if the data round-trips correctly.
2061    ///
2062    /// This catches GPUs like NVIDIA discrete where `eglCreateImage` from
2063    /// `dma_heap` fds succeeds but the rendered data is all zeros.
2064    fn verify_dma_buf_roundtrip(&mut self) -> bool {
2065        // Allocate a 64x64 RGBA DMA source tensor and fill it with solid red
2066        let src = match TensorImage::new(64, 64, RGBA, Some(TensorMemory::Dma)) {
2067            Ok(img) => img,
2068            Err(e) => {
2069                log::info!("verify_dma_buf_roundtrip: failed to allocate DMA source: {e}");
2070                return false;
2071            }
2072        };
2073
2074        {
2075            let mut map = match src.tensor().map() {
2076                Ok(m) => m,
2077                Err(e) => {
2078                    log::info!("verify_dma_buf_roundtrip: failed to map DMA source: {e}");
2079                    return false;
2080                }
2081            };
2082            for pixel in map.chunks_exact_mut(4) {
2083                pixel[0] = 255; // R
2084                pixel[1] = 0; // G
2085                pixel[2] = 0; // B
2086                pixel[3] = 255; // A
2087            }
2088        }
2089
2090        // Allocate a 64x64 RGBA DMA destination tensor
2091        let mut dst = match TensorImage::new(64, 64, RGBA, Some(TensorMemory::Dma)) {
2092            Ok(img) => img,
2093            Err(e) => {
2094                log::info!("verify_dma_buf_roundtrip: failed to allocate DMA destination: {e}");
2095                return false;
2096            }
2097        };
2098
2099        // Run the full DMA-buf EGLImage render pipeline
2100        if let Err(e) =
2101            self.convert_dest_dma(&mut dst, &src, Rotation::None, Flip::None, Crop::no_crop())
2102        {
2103            log::info!("verify_dma_buf_roundtrip: convert_dest_dma failed: {e}");
2104            return false;
2105        }
2106
2107        // Read back the center pixel at (32, 32) from the destination
2108        let map = match dst.tensor().map() {
2109            Ok(m) => m,
2110            Err(e) => {
2111                log::info!("verify_dma_buf_roundtrip: failed to map DMA destination: {e}");
2112                return false;
2113            }
2114        };
2115
2116        let offset = (32 * 64 + 32) * 4;
2117        if map.len() < offset + 4 {
2118            log::info!("verify_dma_buf_roundtrip: destination buffer too small");
2119            return false;
2120        }
2121
2122        let r = map[offset];
2123        let g = map[offset + 1];
2124        let b = map[offset + 2];
2125        let a = map[offset + 3];
2126
2127        let pass = r > 250 && g < 5 && b < 5 && a > 250;
2128
2129        if pass {
2130            log::info!("verify_dma_buf_roundtrip: PASSED (center pixel RGBA={r},{g},{b},{a})");
2131        } else {
2132            log::info!(
2133                "verify_dma_buf_roundtrip: FAILED (center pixel RGBA={r},{g},{b},{a}, \
2134                 expected ~255,0,0,255)"
2135            );
2136        }
2137
2138        pass
2139    }
2140
2141    /// Compute padded bbox regions and atlas offsets for a set of detections.
2142    ///
2143    /// Returns the vector of `MaskRegion` with stacked atlas_y_offset values
2144    /// and the total compact atlas height.
2145    fn compute_atlas_regions(
2146        detect: &[DetectBox],
2147        output_width: usize,
2148        output_height: usize,
2149        padding: usize,
2150    ) -> (Vec<MaskRegion>, usize) {
2151        let ow = output_width as i32;
2152        let oh = output_height as i32;
2153        let owf = output_width as f32;
2154        let ohf = output_height as f32;
2155        let pad = padding as i32;
2156
2157        let mut regions = Vec::with_capacity(detect.len());
2158        let mut atlas_y = 0usize;
2159        for det in detect.iter() {
2160            let bbox_x = (det.bbox.xmin * owf).round() as i32;
2161            let bbox_y = (det.bbox.ymin * ohf).round() as i32;
2162            let bbox_w = ((det.bbox.xmax - det.bbox.xmin) * owf).round() as i32;
2163            let bbox_h = ((det.bbox.ymax - det.bbox.ymin) * ohf).round() as i32;
2164            let bbox_x = bbox_x.max(0).min(ow);
2165            let bbox_y = bbox_y.max(0).min(oh);
2166            let bbox_w = bbox_w.max(1).min(ow - bbox_x);
2167            let bbox_h = bbox_h.max(1).min(oh - bbox_y);
2168
2169            let padded_x = (bbox_x - pad).max(0);
2170            let padded_y = (bbox_y - pad).max(0);
2171            let padded_w = ((bbox_x + bbox_w + pad).min(ow) - padded_x).max(1);
2172            let padded_h = ((bbox_y + bbox_h + pad).min(oh) - padded_y).max(1);
2173
2174            regions.push(MaskRegion {
2175                atlas_y_offset: atlas_y,
2176                padded_x: padded_x as usize,
2177                padded_y: padded_y as usize,
2178                padded_w: padded_w as usize,
2179                padded_h: padded_h as usize,
2180                bbox_x: bbox_x as usize,
2181                bbox_y: bbox_y as usize,
2182                bbox_w: bbox_w as usize,
2183                bbox_h: bbox_h as usize,
2184            });
2185            atlas_y += padded_h as usize;
2186        }
2187        (regions, atlas_y)
2188    }
2189
2190    /// Sets the interpolation mode for int8 proto textures.
2191    pub fn set_int8_interpolation_mode(&mut self, mode: Int8InterpolationMode) {
2192        self.int8_interpolation_mode = mode;
2193        log::debug!("Int8 interpolation mode set to {:?}", mode);
2194    }
2195
    /// Ensures the mask FBO + R8 texture are allocated at the given dimensions.
    /// Creates or resizes the FBO and texture as needed.
    ///
    /// # Errors
    /// Returns `Error::OpenGl` if the framebuffer is incomplete after
    /// attaching the texture. On error the cached dimensions are not updated,
    /// so a later call will retry the allocation.
    fn ensure_mask_fbo(&mut self, width: usize, height: usize) -> crate::Result<()> {
        // Fast path: already allocated at exactly this size.
        if self.mask_fbo_width == width && self.mask_fbo_height == height && self.mask_fbo != 0 {
            return Ok(());
        }

        // Create FBO if needed
        if self.mask_fbo == 0 {
            unsafe {
                gls::gl::GenFramebuffers(1, &mut self.mask_fbo);
            }
        }
        // Create texture if needed
        if self.mask_fbo_texture == 0 {
            unsafe {
                gls::gl::GenTextures(1, &mut self.mask_fbo_texture);
            }
        }

        // Allocate R8 texture (one byte per mask pixel). TexImage2D with a
        // null pointer (re)allocates storage without uploading data, so this
        // also handles resizing an existing texture.
        unsafe {
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.mask_fbo_texture);
            gls::gl::TexImage2D(
                gls::gl::TEXTURE_2D,
                0,
                gls::gl::R8 as i32,
                width as i32,
                height as i32,
                0,
                gls::gl::RED,
                gls::gl::UNSIGNED_BYTE,
                std::ptr::null(),
            );
            // NEAREST filtering: the texture is only a render target here,
            // and it is not mipmapped.
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::NEAREST as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::NEAREST as i32,
            );
        }

        // Attach to FBO
        unsafe {
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, self.mask_fbo);
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.mask_fbo_texture,
                0,
            );
            let status = gls::gl::CheckFramebufferStatus(gls::gl::FRAMEBUFFER);
            if status != gls::gl::FRAMEBUFFER_COMPLETE {
                return Err(crate::Error::OpenGl(format!(
                    "Mask FBO incomplete: status=0x{status:X}"
                )));
            }
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, 0);
        }

        // Record the dimensions only after a fully successful setup.
        self.mask_fbo_width = width;
        self.mask_fbo_height = height;
        log::debug!("Mask FBO allocated at {width}x{height}");
        Ok(())
    }
2266
    /// Ensures the mask atlas FBO and PBO are allocated for the given total
    /// atlas dimensions.  Unlike `ensure_mask_atlas`, the caller provides
    /// the exact atlas height (e.g. sum of padded bbox heights).
    ///
    /// # Errors
    /// Propagates `ensure_mask_fbo` failures (incomplete framebuffer).
    fn ensure_mask_atlas_size(&mut self, width: usize, atlas_height: usize) -> crate::Result<()> {
        // Fast path: the existing FBO is wide enough and at least as tall as
        // requested, and the readback PBO already exists.
        if self.mask_fbo_width == width
            && self.mask_fbo_height >= atlas_height
            && self.mask_fbo != 0
            && self.mask_atlas_pbo != 0
        {
            return Ok(());
        }
        self.ensure_mask_fbo(width, atlas_height)?;
        // One byte per pixel: the atlas texture is R8.
        let pbo_size = width * atlas_height;
        unsafe {
            if self.mask_atlas_pbo == 0 {
                gls::gl::GenBuffers(1, &mut self.mask_atlas_pbo);
            }
            // (Re)allocate PBO storage; DYNAMIC_READ hints GPU-write /
            // CPU-read usage for the atlas readback.
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, self.mask_atlas_pbo);
            gls::gl::BufferData(
                gls::gl::PIXEL_PACK_BUFFER,
                pbo_size as isize,
                std::ptr::null(),
                gls::gl::DYNAMIC_READ,
            );
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
        }
        Ok(())
    }
2295
    /// Decode all detection masks into a single atlas texture and read back
    /// as a contiguous buffer, with one PBO readback for all masks.
    ///
    /// Returns `(atlas_pixels, metadata)` where `atlas_pixels` is a contiguous
    /// `Vec<u8>` of size `output_width * compact_atlas_height` (where
    /// `compact_atlas_height` is the sum of padded bbox heights) and `metadata`
    /// contains per-detection bbox info (with empty pixel vecs).
    pub fn decode_masks_atlas(
        &mut self,
        detect: &[DetectBox],
        proto_data: &ProtoData,
        output_width: usize,
        output_height: usize,
    ) -> crate::Result<(Vec<u8>, Vec<MaskRegion>)> {
        use crate::FunctionTimer;

        let _timer = FunctionTimer::new("GLProcessorST::decode_masks_atlas");

        // Nothing to decode: no detections, or no per-detection coefficients.
        if detect.is_empty() || proto_data.mask_coefficients.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }

        // Pixels of slack added around each bbox strip in the atlas.
        let padding = 4usize;

        let (height, width, num_protos) = proto_data.protos.dim();
        let texture_target = gls::gl::TEXTURE_2D_ARRAY;

        // Pre-compute atlas regions and total height to size the FBO/PBO
        let (regions, compact_atlas_height) =
            Self::compute_atlas_regions(detect, output_width, output_height, padding);

        // Save current FBO and viewport so they can be restored afterwards
        let (saved_fbo, saved_viewport) = unsafe {
            let mut fbo: i32 = 0;
            gls::gl::GetIntegerv(gls::gl::FRAMEBUFFER_BINDING, &mut fbo);
            let mut vp = [0i32; 4];
            gls::gl::GetIntegerv(gls::gl::VIEWPORT, vp.as_mut_ptr());
            (fbo as u32, vp)
        };

        // Ensure atlas FBO and PBO are allocated for the compact size
        self.ensure_mask_atlas_size(output_width, compact_atlas_height)?;

        // Upload proto texture array and select the logit-threshold shader.
        // NEAREST filtering by default; the float path upgrades to LINEAR
        // below when the driver supports filtering float textures.
        gls::active_texture(gls::gl::TEXTURE0);
        gls::bind_texture(texture_target, self.proto_texture.id);
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::NEAREST as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::NEAREST as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        let atlas_result = match &proto_data.protos {
            ProtoTensor::Quantized {
                protos,
                quantization,
            } => {
                // Repack HWC protos into layer-major order: one
                // TEXTURE_2D_ARRAY layer per proto channel.
                let mut tex_data = vec![0i8; height * width * num_protos];
                for k in 0..num_protos {
                    for y in 0..height {
                        for x in 0..width {
                            tex_data[k * height * width + y * width + x] = protos[[y, x, k]];
                        }
                    }
                }
                gls::tex_image3d(
                    texture_target,
                    0,
                    gls::gl::R8I as i32,
                    width as i32,
                    height as i32,
                    num_protos as i32,
                    0,
                    gls::gl::RED_INTEGER,
                    gls::gl::BYTE,
                    Some(&tex_data),
                );

                // Fold the zero-point into a single constant:
                // dequant(q) = scale * (q - zp) = scale*q + (-zp * scale).
                let proto_scale = quantization.scale;
                let proto_scaled_zp = -(quantization.zero_point as f32) * quantization.scale;

                // Integer textures cannot be LINEAR-filtered by the sampler,
                // so bilinear interpolation (if requested) happens in-shader.
                let program = match self.int8_interpolation_mode {
                    Int8InterpolationMode::Nearest => &self.proto_mask_logit_int8_nearest_program,
                    _ => &self.proto_mask_logit_int8_bilinear_program,
                };
                gls::use_program(program.id);
                program.load_uniform_1i(c"num_protos", num_protos as i32)?;
                program.load_uniform_1f(c"proto_scale", proto_scale)?;

                self.render_mask_atlas_compact(
                    program,
                    regions,
                    &proto_data.mask_coefficients,
                    output_width,
                    output_height,
                    Some(proto_scaled_zp),
                )
            }
            ProtoTensor::Float(protos_f32) => {
                // Same layer-major repack as the quantized path, but f32.
                let mut tex_data = vec![0.0f32; height * width * num_protos];
                for k in 0..num_protos {
                    for y in 0..height {
                        for x in 0..width {
                            tex_data[k * height * width + y * width + x] = protos_f32[[y, x, k]];
                        }
                    }
                }
                gls::tex_image3d(
                    texture_target,
                    0,
                    gls::gl::R32F as i32,
                    width as i32,
                    height as i32,
                    num_protos as i32,
                    0,
                    gls::gl::RED,
                    gls::gl::FLOAT,
                    Some(&tex_data),
                );
                // Hardware bilinear filtering of float textures requires the
                // GL_OES_texture_float_linear extension (probed at startup).
                if self.has_float_linear {
                    gls::tex_parameteri(
                        texture_target,
                        gls::gl::TEXTURE_MIN_FILTER,
                        gls::gl::LINEAR as i32,
                    );
                    gls::tex_parameteri(
                        texture_target,
                        gls::gl::TEXTURE_MAG_FILTER,
                        gls::gl::LINEAR as i32,
                    );
                }

                let program = &self.proto_mask_logit_f32_program;
                gls::use_program(program.id);
                program.load_uniform_1i(c"num_protos", num_protos as i32)?;

                self.render_mask_atlas_compact(
                    program,
                    regions,
                    &proto_data.mask_coefficients,
                    output_width,
                    output_height,
                    None,
                )
            }
        };

        // Restore previous FBO + viewport before propagating any render
        // error, so GL state stays consistent even on failure.
        unsafe {
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, saved_fbo);
            gls::gl::Viewport(
                saved_viewport[0],
                saved_viewport[1],
                saved_viewport[2],
                saved_viewport[3],
            );
        }

        let (atlas_pixels, regions) = atlas_result?;
        Ok((atlas_pixels, regions))
    }
2472
    /// Render all detection masks into a compact atlas where each strip is
    /// sized to the padded bounding box, not the full output resolution.
    ///
    /// The atlas width equals `output_width`; each detection occupies a
    /// horizontal strip whose height is the padded bbox height.  Strips are
    /// stacked vertically.  A single PBO readback retrieves the entire atlas.
    ///
    /// Returns `(atlas_pixels, regions)` where `regions` describes each
    /// detection's location within the atlas.
    #[allow(clippy::too_many_arguments)]
    fn render_mask_atlas_compact(
        &self,
        program: &GlProgram,
        regions: Vec<MaskRegion>,
        mask_coefficients: &[Vec<f32>],
        output_width: usize,
        output_height: usize,
        proto_scaled_zp: Option<f32>,
    ) -> crate::Result<(Vec<u8>, Vec<MaskRegion>)> {
        if regions.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }

        let owf = output_width as f32;
        let ohf = output_height as f32;

        // Strips are stacked in order, so the last region's offset + height
        // gives the total atlas height.
        let atlas_height = regions.last().map_or(0, |r| r.atlas_y_offset + r.padded_h);
        let ahf = atlas_height as f32;

        unsafe {
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, self.mask_fbo);
            gls::gl::Viewport(0, 0, output_width as i32, atlas_height as i32);
            gls::gl::Disable(gls::gl::BLEND);
            gls::gl::ClearColor(0.0, 0.0, 0.0, 0.0);
            gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
        }

        // The shader holds mask coefficients in a fixed array of 8 vec4s
        // (32 floats); warn if the model produces more than fit.
        if let Some(first_coeff) = mask_coefficients.first() {
            if first_coeff.len() > 32 {
                log::warn!(
                    "render_mask_atlas_compact: {} mask coefficients exceeds shader \
                     limit of 32 — coefficients will be truncated",
                    first_coeff.len()
                );
            }
        }

        // zip() pairs each region with its coefficient vector; draw one quad
        // per detection into its strip of the atlas.
        for (region, coeff) in regions.iter().zip(mask_coefficients.iter()) {
            // Pack up to 32 coefficients into 8 vec4 uniforms (row = j / 4,
            // component = j % 4); excess coefficients are truncated.
            let mut packed_coeff = [[0.0f32; 4]; 8];
            for (j, val) in coeff.iter().enumerate().take(32) {
                packed_coeff[j / 4][j % 4] = *val;
            }
            program.load_uniform_4fv(c"mask_coeff", &packed_coeff)?;

            // For int8 paths: upload precomputed coeff_sum * scaled_zp
            // (the zero-point term of the dot product, hoisted out of the
            // shader's per-pixel loop).
            if let Some(szp) = proto_scaled_zp {
                let coeff_sum: f32 = coeff.iter().take(32).sum();
                program.load_uniform_1f(c"coeff_sum_x_szp", coeff_sum * szp)?;
            }

            // The bbox quad position in the atlas:
            // - X: the padded bbox horizontal position (same as in output coords)
            // - Y: the strip's vertical offset in the atlas
            // Pixel coordinates are mapped to NDC via x/size * 2 - 1.
            let dst_left = region.padded_x as f32 / owf * 2.0 - 1.0;
            let dst_right = (region.padded_x + region.padded_w) as f32 / owf * 2.0 - 1.0;
            let dst_bottom = region.atlas_y_offset as f32 / ahf * 2.0 - 1.0;
            let dst_top = (region.atlas_y_offset + region.padded_h) as f32 / ahf * 2.0 - 1.0;

            // Proto texture coords map the padded bbox to proto space
            // (normalized [0, 1] against the full output resolution).
            let src_left = region.padded_x as f32 / owf;
            let src_right = (region.padded_x + region.padded_w) as f32 / owf;
            let src_bottom = region.padded_y as f32 / ohf;
            let src_top = (region.padded_y + region.padded_h) as f32 / ohf;

            unsafe {
                // Upload the quad corners (x, y, z) and draw as a 4-vertex
                // triangle fan: TL, TR, BR, BL.
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
                let verts: [f32; 12] = [
                    dst_left, dst_top, 0.0, dst_right, dst_top, 0.0, dst_right, dst_bottom, 0.0,
                    dst_left, dst_bottom, 0.0,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 12) as isize,
                    verts.as_ptr() as *const c_void,
                );

                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
                gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
                let tc: [f32; 8] = [
                    src_left, src_top, src_right, src_top, src_right, src_bottom, src_left,
                    src_bottom,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 8) as isize,
                    tc.as_ptr() as *const c_void,
                );

                let idx: [u32; 4] = [0, 1, 2, 3];
                gls::gl::DrawElements(
                    gls::gl::TRIANGLE_FAN,
                    4,
                    gls::gl::UNSIGNED_INT,
                    idx.as_ptr() as *const c_void,
                );
            }
        }

        // Single readback for the compact atlas (one byte per pixel, RED).
        let atlas_bytes = output_width * atlas_height;
        let mut pixels = vec![0u8; atlas_bytes];

        unsafe {
            // With a PIXEL_PACK_BUFFER bound, the NULL data pointer makes
            // ReadnPixels write into the PBO at offset 0; Finish() guarantees
            // the transfer has completed before the buffer is mapped.
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, self.mask_atlas_pbo);
            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
            gls::gl::ReadnPixels(
                0,
                0,
                output_width as i32,
                atlas_height as i32,
                gls::gl::RED,
                gls::gl::UNSIGNED_BYTE,
                atlas_bytes as i32,
                std::ptr::null_mut(),
            );
            gls::gl::Finish();

            let ptr = gls::gl::MapBufferRange(
                gls::gl::PIXEL_PACK_BUFFER,
                0,
                atlas_bytes as isize,
                gls::gl::MAP_READ_BIT,
            );
            if ptr.is_null() {
                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                return Err(crate::Error::OpenGl(
                    "Failed to map compact atlas PBO for readback".to_string(),
                ));
            }
            std::ptr::copy_nonoverlapping(ptr as *const u8, pixels.as_mut_ptr(), atlas_bytes);
            gls::gl::UnmapBuffer(gls::gl::PIXEL_PACK_BUFFER);
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
        }

        Ok((pixels, regions))
    }
2622
2623    fn check_src_format_supported(backend: TransferBackend, img: &TensorImage) -> bool {
2624        if backend.is_dma() && img.tensor().memory() == TensorMemory::Dma {
2625            // EGLImage supports RGBA, GREY, YUYV, and NV12 for DMA buffers.
2626            // VYUY excluded: Vivante GPU accepts the DRM fourcc but produces
2627            // incorrect output (similarity ~0.28 vs reference).
2628            matches!(img.fourcc(), RGBA | GREY | YUYV | NV12)
2629        } else {
2630            matches!(img.fourcc(), RGB | RGBA | GREY)
2631        }
2632    }
2633
2634    fn check_dst_format_supported(
2635        backend: TransferBackend,
2636        img: &TensorImage,
2637        has_bgra: bool,
2638    ) -> bool {
2639        if img.fourcc() == BGRA && !has_bgra {
2640            return false;
2641        }
2642        if backend.is_dma() && img.tensor().memory() == TensorMemory::Dma {
2643            matches!(
2644                img.fourcc(),
2645                RGBA | BGRA | GREY | PLANAR_RGB | RGB | RGB_INT8 | PLANAR_RGB_INT8
2646            )
2647        } else {
2648            matches!(img.fourcc(), RGB | RGBA | BGRA | GREY | RGB_INT8)
2649        }
2650    }
2651
2652    /// Checks required GL extensions and returns optional capability flags:
2653    /// `(has_float_linear, has_bgra)`.
2654    fn gl_check_support() -> Result<(bool, bool), crate::Error> {
2655        if let Ok(version) = gls::get_string(gls::gl::SHADING_LANGUAGE_VERSION) {
2656            log::debug!("GL Shading Language Version: {version:?}");
2657        } else {
2658            log::warn!("Could not get GL Shading Language Version");
2659        }
2660
2661        let extensions = unsafe {
2662            let str = gls::gl::GetString(gls::gl::EXTENSIONS);
2663            if str.is_null() {
2664                return Err(crate::Error::GLVersion(
2665                    "GL returned no supported extensions".to_string(),
2666                ));
2667            }
2668            CStr::from_ptr(str as *const c_char)
2669                .to_string_lossy()
2670                .to_string()
2671        };
2672        log::debug!("GL Extensions: {extensions}");
2673        let required_ext = ["GL_OES_EGL_image_external_essl3"];
2674        let extensions = extensions.split_ascii_whitespace().collect::<BTreeSet<_>>();
2675        for required in required_ext {
2676            if !extensions.contains(required) {
2677                return Err(crate::Error::GLVersion(format!(
2678                    "GL does not support {required} extension",
2679                )));
2680            }
2681        }
2682
2683        let has_float_linear = extensions.contains("GL_OES_texture_float_linear");
2684        log::debug!("GL_OES_texture_float_linear: {has_float_linear}");
2685
2686        let has_bgra = extensions.contains("GL_EXT_texture_format_BGRA8888");
2687        log::debug!("GL_EXT_texture_format_BGRA8888: {has_bgra}");
2688
2689        Ok((has_float_linear, has_bgra))
2690    }
2691
    /// Attaches the destination DMA-buf (imported as an EGLImage) to the
    /// conversion FBO and sets the viewport to the destination size.
    fn setup_renderbuffer_dma(&mut self, dst: &TensorImage) -> crate::Result<()> {
        self.convert_fbo.bind();

        // Planar RGB destinations render as one tall image with the three
        // planes stacked vertically, hence height * 3.
        let (width, height) = if matches!(dst.fourcc(), PLANAR_RGB | PLANAR_RGB_INT8) {
            let width = dst.width();
            let height = dst.height() * 3;
            (width as i32, height as i32)
        } else {
            (dst.width() as i32, dst.height() as i32)
        };
        // EGLImage wrapping the destination DMA-buf (cached per destination).
        let dest_egl = self.get_or_create_egl_image(CacheKind::Dst, dst)?;
        unsafe {
            gls::gl::UseProgram(self.texture_program_yuv.id);
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.render_texture.id);
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::LINEAR as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::LINEAR as i32,
            );
            // Bind the EGLImage as the texture's storage, then attach that
            // texture to the FBO so rendering writes straight into the
            // destination DMA-buf (zero-copy).
            gls::gl::EGLImageTargetTexture2DOES(gls::gl::TEXTURE_2D, dest_egl.as_ptr());
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.render_texture.id,
                0,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::Viewport(0, 0, width, height);
        }
        Ok(())
    }
2730
2731    fn convert_dest_dma(
2732        &mut self,
2733        dst: &mut TensorImage,
2734        src: &TensorImage,
2735        rotation: crate::Rotation,
2736        flip: Flip,
2737        crop: Crop,
2738    ) -> crate::Result<()> {
2739        assert!(self.gl_context.transfer_backend.is_dma());
2740        if fourcc_is_packed_rgb(dst.fourcc()) {
2741            if self.support_rgb_direct {
2742                self.convert_to_rgb_direct(src, dst, rotation, flip, crop)
2743            } else {
2744                // Two-pass packed RGB is slower than G2D/CPU; decline so
2745                // ImageProcessor falls through to a faster backend.
2746                Err(crate::Error::NotSupported(
2747                    "OpenGL two-pass packed RGB disabled (no direct RGB support)".into(),
2748                ))
2749            }
2750        } else if dst.is_planar() {
2751            self.setup_renderbuffer_dma(dst)?;
2752            self.convert_to_planar(src, dst, rotation, flip, crop)
2753        } else {
2754            self.setup_renderbuffer_dma(dst)?;
2755            self.convert_to(src, dst, rotation, flip, crop)
2756        }
2757    }
2758
    /// Sets up the conversion FBO with a CPU-backed render texture sized for
    /// `dst`, optionally seeding the texture with the destination's current
    /// pixels when only a sub-rectangle will be overwritten.
    fn setup_renderbuffer_non_dma(&mut self, dst: &TensorImage, crop: Crop) -> crate::Result<()> {
        debug_assert!(matches!(
            dst.fourcc(),
            RGB | RGBA | BGRA | GREY | PLANAR_RGB | RGB_INT8
        ));
        // NOTE(review): the planar branch below matches RGBA/RGB/GREY, while
        // the debug_assert above admits PLANAR_RGB for planar destinations.
        // If `is_planar()` implies a PLANAR_* fourcc this would hit
        // unreachable!() — confirm which fourccs report is_planar() == true.
        let (width, height) = if dst.is_planar() {
            let width = dst.width() / 4;
            let height = match dst.fourcc() {
                RGBA => dst.height() * 4,
                RGB => dst.height() * 3,
                GREY => dst.height(),
                _ => unreachable!(),
            };
            (width as i32, height as i32)
        } else {
            (dst.width() as i32, dst.height() as i32)
        };

        // Planar destinations use a single-channel texture; interleaved ones
        // map directly to their GL format.
        let format = if dst.is_planar() {
            gls::gl::RED
        } else {
            match dst.fourcc() {
                RGB | RGB_INT8 => gls::gl::RGB,
                RGBA => gls::gl::RGBA,
                BGRA => 0x80E1, // GL_BGRA (GL_EXT_texture_format_BGRA8888)
                GREY => gls::gl::RED,
                _ => unreachable!(),
            }
        };

        let start = Instant::now();
        self.convert_fbo.bind();

        // Keeps the tensor mapping alive for the duration of TexImage2D when
        // the destination's current contents must be uploaded.
        let map;

        // Full-destination crop (or no crop): upload NULL so GL just
        // allocates storage. Partial crop: seed the texture with the
        // destination's existing pixels so the un-cropped area is preserved.
        let pixels = if crop.dst_rect.is_none_or(|crop| {
            crop.top == 0
                && crop.left == 0
                && crop.height == dst.height()
                && crop.width == dst.width()
        }) {
            std::ptr::null()
        } else {
            map = dst.tensor().map()?;
            map.as_ptr() as *const c_void
        };
        unsafe {
            gls::gl::UseProgram(self.texture_program.id);
            // NOTE(review): BindTexture is issued before ActiveTexture here,
            // unlike setup_renderbuffer_dma (ActiveTexture first).
            // glBindTexture targets the unit active at call time, so this
            // relies on TEXTURE0 already being active — confirm or reorder.
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.render_texture.id);
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::LINEAR as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::LINEAR as i32,
            );

            gls::gl::TexImage2D(
                gls::gl::TEXTURE_2D,
                0,
                format as i32,
                width,
                height,
                0,
                format,
                gls::gl::UNSIGNED_BYTE,
                pixels,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.render_texture.id,
                0,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::Viewport(0, 0, width, height);
        }
        log::debug!("Set up framebuffer takes {:?}", start.elapsed());
        Ok(())
    }
2845
2846    /// Set up a framebuffer for overlay rendering on a PBO-backed destination.
2847    ///
2848    /// Binds the PBO as `GL_PIXEL_UNPACK_BUFFER` and uploads its contents to
2849    /// the render texture via `TexImage2D` with a NULL pointer — GL reads
2850    /// directly from PBO memory without any CPU-side `map()` call. This avoids
2851    /// the deadlock that occurs when `setup_renderbuffer_non_dma` tries to
2852    /// `tensor.map()` a PBO on the GL thread.
2853    fn setup_renderbuffer_from_pbo(
2854        &mut self,
2855        dst: &TensorImage,
2856        buffer_id: u32,
2857    ) -> crate::Result<()> {
2858        let (width, height) = (dst.width() as i32, dst.height() as i32);
2859        let format = match dst.fourcc() {
2860            RGB => gls::gl::RGB,
2861            RGBA => gls::gl::RGBA,
2862            BGRA => 0x80E1, // GL_BGRA (GL_EXT_texture_format_BGRA8888)
2863            _ => {
2864                return Err(crate::Error::NotSupported(format!(
2865                    "PBO renderbuffer not supported for {}",
2866                    dst.fourcc().display()
2867                )))
2868            }
2869        };
2870        self.convert_fbo.bind();
2871        unsafe {
2872            gls::gl::UseProgram(self.texture_program.id);
2873            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.render_texture.id);
2874            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
2875            gls::gl::TexParameteri(
2876                gls::gl::TEXTURE_2D,
2877                gls::gl::TEXTURE_MIN_FILTER,
2878                gls::gl::LINEAR as i32,
2879            );
2880            gls::gl::TexParameteri(
2881                gls::gl::TEXTURE_2D,
2882                gls::gl::TEXTURE_MAG_FILTER,
2883                gls::gl::LINEAR as i32,
2884            );
2885
2886            // Upload existing PBO content to the render texture.
2887            // Binding PBO as UNPACK buffer makes TexImage2D read from it.
2888            gls::gl::BindBuffer(gls::gl::PIXEL_UNPACK_BUFFER, buffer_id);
2889            gls::gl::TexImage2D(
2890                gls::gl::TEXTURE_2D,
2891                0,
2892                format as i32,
2893                width,
2894                height,
2895                0,
2896                format,
2897                gls::gl::UNSIGNED_BYTE,
2898                std::ptr::null(),
2899            );
2900            gls::gl::BindBuffer(gls::gl::PIXEL_UNPACK_BUFFER, 0);
2901
2902            check_gl_error(function!(), line!())?;
2903            gls::gl::FramebufferTexture2D(
2904                gls::gl::FRAMEBUFFER,
2905                gls::gl::COLOR_ATTACHMENT0,
2906                gls::gl::TEXTURE_2D,
2907                self.render_texture.id,
2908                0,
2909            );
2910            check_gl_error(function!(), line!())?;
2911            gls::gl::Viewport(0, 0, width, height);
2912        }
2913        Ok(())
2914    }
2915
    /// Converts `src` into a CPU-backed destination: renders through the
    /// conversion FBO, then reads the pixels back into the mapped tensor.
    fn convert_dest_non_dma(
        &mut self,
        dst: &mut TensorImage,
        src: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        self.setup_renderbuffer_non_dma(dst, crop)?;
        let start = Instant::now();
        if dst.is_planar() {
            self.convert_to_planar(src, dst, rotation, flip, crop)?;
        } else {
            self.convert_to(src, dst, rotation, flip, crop)?;
        }
        log::debug!("Draw to framebuffer takes {:?}", start.elapsed());
        let start = Instant::now();
        // Map the destination fourcc to the GL readback format.
        let dest_format = match dst.fourcc() {
            RGB | RGB_INT8 => gls::gl::RGB,
            RGBA => gls::gl::RGBA,
            BGRA => 0x80E1, // GL_BGRA (GL_EXT_texture_format_BGRA8888)
            GREY => gls::gl::RED,
            _ => unreachable!(),
        };

        unsafe {
            let mut dst_map = dst.tensor().map()?;
            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
            // ReadnPixels bounds the write by the tensor's byte length,
            // reading the framebuffer directly into the mapped tensor.
            gls::gl::ReadnPixels(
                0,
                0,
                dst.width() as i32,
                dst.height() as i32,
                dest_format,
                gls::gl::UNSIGNED_BYTE,
                dst.tensor.len() as i32,
                dst_map.as_mut_ptr() as *mut c_void,
            );
            // Apply XOR 0x80 for int8 formats (convert uint8 → int8 representation)
            if fourcc_is_int8(dst.fourcc()) {
                for byte in dst_map.iter_mut() {
                    *byte ^= 0x80;
                }
            }
        }
        log::debug!("Read from framebuffer takes {:?}", start.elapsed());
        Ok(())
    }
2964
    /// Convert between two PBO-backed images.
    ///
    /// Source PBO is bound as `GL_PIXEL_UNPACK_BUFFER` for zero-copy texture upload
    /// (avoids `tensor.map()` to prevent GL-thread deadlocks). Destination uses
    /// `GL_PIXEL_PACK_BUFFER` for zero-copy readback into the PBO.
    fn convert_pbo_to_pbo(
        &mut self,
        dst: &mut TensorImage,
        src: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        // Safety check: neither PBO must be mapped; extract buffer IDs before releasing borrows
        let (src_buffer_id, dst_buffer_id) = {
            let src_pbo = match &src.tensor {
                edgefirst_tensor::Tensor::Pbo(p) => p,
                _ => {
                    return Err(crate::Error::OpenGl(
                        "convert_pbo_to_pbo: src is not a PBO tensor".to_string(),
                    ))
                }
            };
            let dst_pbo = match &dst.tensor {
                edgefirst_tensor::Tensor::Pbo(p) => p,
                _ => {
                    return Err(crate::Error::OpenGl(
                        "convert_pbo_to_pbo: dst is not a PBO tensor".to_string(),
                    ))
                }
            };

            // GL may not read/write a buffer while it is mapped by the CPU.
            if src_pbo.is_mapped() || dst_pbo.is_mapped() {
                return Err(crate::Error::OpenGl(
                    "Cannot convert PBO tensors while they are mapped".to_string(),
                ));
            }

            (src_pbo.buffer_id(), dst_pbo.buffer_id())
        };

        // Setup renderbuffer (same as non-DMA path)
        self.setup_renderbuffer_non_dma(dst, crop)?;

        // Upload source from PBO and render.
        // We cannot call convert_to/draw_src_texture directly because they
        // call src.tensor().map() which sends a message back to THIS thread,
        // causing a deadlock. Instead, bind the source PBO as UNPACK buffer
        // and upload to the texture with a NULL pointer — GL reads directly
        // from the PBO, zero CPU copy.
        let start = Instant::now();
        self.draw_src_texture_from_pbo(src, src_buffer_id, dst, rotation, flip, crop)?;
        log::debug!("PBO render takes {:?}", start.elapsed());

        // Readback into destination PBO instead of CPU memory
        let start_read = Instant::now();
        let dest_format = match dst.fourcc() {
            crate::RGB | crate::RGB_INT8 => gls::gl::RGB,
            crate::RGBA => gls::gl::RGBA,
            crate::BGRA => 0x80E1, // GL_BGRA (GL_EXT_texture_format_BGRA8888)
            crate::GREY => gls::gl::RED,
            _ => {
                return Err(crate::Error::NotSupported(format!(
                    "PBO readback not supported for {}",
                    dst.fourcc().display()
                )))
            }
        };

        unsafe {
            // Bind destination PBO as PACK buffer — glReadnPixels will write into it
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, dst_buffer_id);
            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
            gls::gl::ReadnPixels(
                0,
                0,
                dst.width() as i32,
                dst.height() as i32,
                dest_format,
                gls::gl::UNSIGNED_BYTE,
                dst.tensor.len() as i32,
                std::ptr::null_mut(), // NULL pointer = write to bound PACK buffer
            );
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
            // Block until the readback has landed in the PBO before any CPU
            // consumer maps it.
            gls::gl::Finish();
        }

        check_gl_error(function!(), line!())?;

        // Handle int8 XOR if needed (must map PBO to do this on the GL thread
        // directly, since we're already on the GL thread).
        // GL rendered unsigned bytes; int8 formats store the same values with
        // the sign bit flipped, so XOR 0x80 converts uint8 → int8 in place.
        if fourcc_is_int8(dst.fourcc()) {
            unsafe {
                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, dst_buffer_id);
                let ptr = gls::gl::MapBufferRange(
                    gls::gl::PIXEL_PACK_BUFFER,
                    0,
                    dst.tensor.len() as isize,
                    gls::gl::MAP_READ_BIT | gls::gl::MAP_WRITE_BIT,
                );
                // NOTE(review): a NULL map silently skips the XOR pass here —
                // the subsequent check_gl_error should surface the failure,
                // but confirm a failed map cannot slip through unreported.
                if !ptr.is_null() {
                    let slice = std::slice::from_raw_parts_mut(ptr as *mut u8, dst.tensor.len());
                    for byte in slice.iter_mut() {
                        *byte ^= 0x80;
                    }
                    gls::gl::UnmapBuffer(gls::gl::PIXEL_PACK_BUFFER);
                }
                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
            }
            check_gl_error(function!(), line!())?;
        }

        log::debug!("PBO readback takes {:?}", start_read.elapsed());
        Ok(())
    }
3080
    /// Upload source image from a PBO and render to the current framebuffer.
    /// This is the PBO equivalent of draw_src_texture — instead of mapping
    /// the tensor to CPU and calling glTexImage2D with a data pointer, we
    /// bind the source PBO as GL_PIXEL_UNPACK_BUFFER and pass NULL, causing
    /// GL to read directly from the PBO (zero CPU copy).
    ///
    /// * `src` — describes the source image (width/height/fourcc); its pixel
    ///   data is NOT read here, it lives in the PBO named by `src_buffer_id`.
    /// * `src_buffer_id` — GL buffer object id holding the source pixels.
    /// * `dst` — only consulted for width/height in the ROI coordinate math;
    ///   the actual render target is whatever framebuffer is bound.
    /// * `rotation` / `flip` / `crop` — geometric transforms, realized via
    ///   the texture-coordinate table and destination quad below.
    fn draw_src_texture_from_pbo(
        &mut self,
        src: &TensorImage,
        src_buffer_id: u32,
        dst: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> Result<(), Error> {
        let texture_target = gls::gl::TEXTURE_2D;
        // Only tightly packed single-plane 8-bit formats can be uploaded
        // straight out of a PBO with one glTexImage2D call.
        let texture_format = match src.fourcc() {
            crate::RGB | crate::RGB_INT8 => gls::gl::RGB,
            crate::RGBA => gls::gl::RGBA,
            crate::GREY => gls::gl::RED,
            _ => {
                return Err(Error::NotSupported(format!(
                    "PBO upload not supported for {:?}",
                    src.fourcc()
                )));
            }
        };

        // A dst rect that does not cover the whole destination means we are
        // letterboxing and may need to clear the border color first.
        let has_crop = crop.dst_rect.is_some_and(|x| {
            x.left != 0 || x.top != 0 || x.width != dst.width() || x.height != dst.height()
        });

        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
        let src_roi = if let Some(crop) = crop.src_rect {
            RegionOfInterest {
                left: crop.left as f32 / src.width() as f32,
                top: (crop.top + crop.height) as f32 / src.height() as f32,
                right: (crop.left + crop.width) as f32 / src.width() as f32,
                bottom: crop.top as f32 / src.height() as f32,
            }
        } else {
            RegionOfInterest {
                left: 0.,
                top: 1.,
                right: 1.,
                bottom: 0.,
            }
        };

        // Map normalized [0,1] coordinates to NDC [-1,1] for the dst quad.
        let cvt_screen_coord = |normalized| normalized * 2.0 - 1.0;
        let mut dst_roi = if let Some(crop) = crop.dst_rect {
            RegionOfInterest {
                left: cvt_screen_coord(crop.left as f32 / dst.width() as f32),
                top: cvt_screen_coord((crop.top + crop.height) as f32 / dst.height() as f32),
                right: cvt_screen_coord((crop.left + crop.width) as f32 / dst.width() as f32),
                bottom: cvt_screen_coord(crop.top as f32 / dst.height() as f32),
            }
        } else {
            RegionOfInterest {
                left: -1.,
                top: 1.,
                right: 1.,
                bottom: -1.,
            }
        };

        // Index into the texture_coords table below: 0/1/2/3 = 0°/90°/180°/270°.
        let rotation_offset = match rotation {
            crate::Rotation::None => 0,
            crate::Rotation::Clockwise90 => 1,
            crate::Rotation::Rotate180 => 2,
            crate::Rotation::CounterClockwise90 => 3,
        };

        unsafe {
            if has_crop {
                // Fill the letterbox border first; the image quad drawn below
                // then overwrites the dst_roi region.
                if let Some(dst_color) = crop.dst_color {
                    gls::gl::ClearColor(
                        dst_color[0] as f32 / 255.0,
                        dst_color[1] as f32 / 255.0,
                        dst_color[2] as f32 / 255.0,
                        dst_color[3] as f32 / 255.0,
                    );
                    gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
                }
            }

            gls::gl::UseProgram(self.texture_program.id);
            // NOTE(review): BindTexture is issued before ActiveTexture, so the
            // bind lands on whichever unit was previously active — this appears
            // to rely on TEXTURE0 already being current; confirm intended.
            gls::gl::BindTexture(texture_target, self.camera_normal_texture.id);
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::TexParameteri(
                texture_target,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::LINEAR as i32,
            );
            gls::gl::TexParameteri(
                texture_target,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::LINEAR as i32,
            );
            if src.fourcc() == crate::GREY {
                // Replicate the single red channel into G and B so grayscale
                // samples as (Y, Y, Y) in the shader.
                for swizzle in [
                    gls::gl::TEXTURE_SWIZZLE_R,
                    gls::gl::TEXTURE_SWIZZLE_G,
                    gls::gl::TEXTURE_SWIZZLE_B,
                ] {
                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, gls::gl::RED as i32);
                }
            } else {
                // Reset swizzles to identity in case a previous GREY upload
                // left the replicate-red state behind on this texture.
                for (swizzle, src_component) in [
                    (gls::gl::TEXTURE_SWIZZLE_R, gls::gl::RED),
                    (gls::gl::TEXTURE_SWIZZLE_G, gls::gl::GREEN),
                    (gls::gl::TEXTURE_SWIZZLE_B, gls::gl::BLUE),
                ] {
                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, src_component as i32);
                }
            }

            // Bind source PBO as UNPACK buffer — glTexImage2D reads from it
            gls::gl::BindBuffer(gls::gl::PIXEL_UNPACK_BUFFER, src_buffer_id);
            gls::gl::TexImage2D(
                texture_target,
                0,
                texture_format as i32,
                src.width() as i32,
                src.height() as i32,
                0,
                texture_format,
                gls::gl::UNSIGNED_BYTE,
                std::ptr::null(), // NULL = read from bound UNPACK buffer
            );
            gls::gl::BindBuffer(gls::gl::PIXEL_UNPACK_BUFFER, 0);

            // Force texture cache state to be rebuilt next call
            self.camera_normal_texture.width = 0;

            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);

            // Flips are applied by mirroring the destination quad rather than
            // the texture coordinates.
            match flip {
                crate::Flip::None => {}
                crate::Flip::Vertical => {
                    std::mem::swap(&mut dst_roi.top, &mut dst_roi.bottom);
                }
                crate::Flip::Horizontal => {
                    std::mem::swap(&mut dst_roi.left, &mut dst_roi.right);
                }
            }

            // Destination quad in NDC, wound for a TRIANGLE_FAN draw.
            let camera_vertices: [f32; 12] = [
                dst_roi.left,
                dst_roi.top,
                0., // left top
                dst_roi.right,
                dst_roi.top,
                0., // right top
                dst_roi.right,
                dst_roi.bottom,
                0., // right bottom
                dst_roi.left,
                dst_roi.bottom,
                0., // left bottom
            ];
            gls::gl::BufferData(
                gls::gl::ARRAY_BUFFER,
                (camera_vertices.len() * std::mem::size_of::<f32>()) as isize,
                camera_vertices.as_ptr() as *const c_void,
                gls::gl::STATIC_DRAW,
            );
            gls::gl::VertexAttribPointer(
                self.vertex_buffer.buffer_index,
                3,
                gls::gl::FLOAT,
                gls::gl::FALSE,
                0,
                std::ptr::null(),
            );

            // One set of texture coordinates per rotation (0°, 90°, 180°,
            // 270°); rotation_offset selects the row.
            let texture_coords: [[f32; 8]; 4] = [
                [
                    src_roi.left,
                    src_roi.top,
                    src_roi.right,
                    src_roi.top,
                    src_roi.right,
                    src_roi.bottom,
                    src_roi.left,
                    src_roi.bottom,
                ],
                [
                    src_roi.left,
                    src_roi.bottom,
                    src_roi.left,
                    src_roi.top,
                    src_roi.right,
                    src_roi.top,
                    src_roi.right,
                    src_roi.bottom,
                ],
                [
                    src_roi.right,
                    src_roi.bottom,
                    src_roi.left,
                    src_roi.bottom,
                    src_roi.left,
                    src_roi.top,
                    src_roi.right,
                    src_roi.top,
                ],
                [
                    src_roi.right,
                    src_roi.top,
                    src_roi.right,
                    src_roi.bottom,
                    src_roi.left,
                    src_roi.bottom,
                    src_roi.left,
                    src_roi.top,
                ],
            ];
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
            gls::gl::BufferData(
                gls::gl::ARRAY_BUFFER,
                (texture_coords[0].len() * std::mem::size_of::<f32>()) as isize,
                texture_coords[rotation_offset].as_ptr() as *const c_void,
                gls::gl::STATIC_DRAW,
            );
            gls::gl::VertexAttribPointer(
                self.texture_buffer.buffer_index,
                2,
                gls::gl::FLOAT,
                gls::gl::FALSE,
                0,
                std::ptr::null(),
            );
            gls::gl::DrawArrays(gls::gl::TRIANGLE_FAN, 0, 4);
            gls::gl::DisableVertexAttribArray(self.vertex_buffer.buffer_index);
            gls::gl::DisableVertexAttribArray(self.texture_buffer.buffer_index);

            // Wait for the GPU to finish before returning.
            gls::gl::Finish();
        }

        check_gl_error(function!(), line!())?;
        Ok(())
    }
3325
    /// Convert any source (Mem/DMA) to a PBO destination.
    /// Source is uploaded via normal texture path (maps tensor for CPU upload).
    /// Destination readback uses PBO PACK binding — no CPU copy of the pixel
    /// data — except for int8 destinations, where the PBO is briefly mapped
    /// on the GL thread to apply the signed-int8 bias (XOR 0x80) in place.
    ///
    /// Errors if `dst` is not a PBO tensor, or if the PBO is currently
    /// mapped (GL rejects buffer operations on a mapped buffer).
    fn convert_any_to_pbo(
        &mut self,
        dst: &mut TensorImage,
        src: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        // Destination must be an unmapped PBO; grab its GL buffer id.
        let dst_buffer_id = match &dst.tensor {
            edgefirst_tensor::Tensor::Pbo(p) => {
                if p.is_mapped() {
                    return Err(crate::Error::OpenGl(
                        "Cannot convert to a mapped PBO tensor".to_string(),
                    ));
                }
                p.buffer_id()
            }
            _ => {
                return Err(crate::Error::OpenGl(
                    "convert_any_to_pbo: dst is not a PBO tensor".to_string(),
                ))
            }
        };

        self.setup_renderbuffer_non_dma(dst, crop)?;
        let start = Instant::now();
        if dst.is_planar() {
            self.convert_to_planar(src, dst, rotation, flip, crop)?;
        } else {
            self.convert_to(src, dst, rotation, flip, crop)?;
        }
        log::debug!("any-to-PBO render takes {:?}", start.elapsed());

        // PBO readback
        let start_read = Instant::now();
        // NOTE(review): planar fourccs (PLANAR_RGB*) are not listed here, so
        // a planar dst renders above and then fails this match — confirm
        // whether planar PBO destinations are expected to take this path.
        let dest_format = match dst.fourcc() {
            crate::RGB | crate::RGB_INT8 => gls::gl::RGB,
            crate::RGBA => gls::gl::RGBA,
            crate::BGRA => 0x80E1, // GL_BGRA (GL_EXT_texture_format_BGRA8888)
            crate::GREY => gls::gl::RED,
            _ => {
                return Err(crate::Error::NotSupported(format!(
                    "PBO readback not supported for {}",
                    dst.fourcc().display()
                )))
            }
        };
        unsafe {
            // Bind destination PBO as PACK buffer — glReadnPixels writes into
            // it (NULL data pointer = offset 0 into the bound buffer).
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, dst_buffer_id);
            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
            gls::gl::ReadnPixels(
                0,
                0,
                dst.width() as i32,
                dst.height() as i32,
                dest_format,
                gls::gl::UNSIGNED_BYTE,
                dst.tensor.len() as i32,
                std::ptr::null_mut(),
            );
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
            gls::gl::Finish();
        }
        check_gl_error(function!(), line!())?;

        // int8 destinations: map the PBO on this (GL) thread and XOR-bias
        // every byte in place, since the readback produced unsigned bytes.
        if fourcc_is_int8(dst.fourcc()) {
            unsafe {
                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, dst_buffer_id);
                let ptr = gls::gl::MapBufferRange(
                    gls::gl::PIXEL_PACK_BUFFER,
                    0,
                    dst.tensor.len() as isize,
                    gls::gl::MAP_READ_BIT | gls::gl::MAP_WRITE_BIT,
                );
                if !ptr.is_null() {
                    let slice = std::slice::from_raw_parts_mut(ptr as *mut u8, dst.tensor.len());
                    for byte in slice.iter_mut() {
                        *byte ^= 0x80;
                    }
                    gls::gl::UnmapBuffer(gls::gl::PIXEL_PACK_BUFFER);
                }
                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
            }
            check_gl_error(function!(), line!())?;
        }

        log::debug!("any-to-PBO readback takes {:?}", start_read.elapsed());
        Ok(())
    }
3418
3419    /// Convert a PBO source to a non-PBO (Mem) destination.
3420    /// Source is uploaded via PBO UNPACK binding (no map on GL thread).
3421    /// Destination readback uses normal ReadnPixels into mapped Mem tensor.
3422    fn convert_pbo_to_mem(
3423        &mut self,
3424        dst: &mut TensorImage,
3425        src: &TensorImage,
3426        rotation: crate::Rotation,
3427        flip: Flip,
3428        crop: Crop,
3429    ) -> crate::Result<()> {
3430        let src_buffer_id = match &src.tensor {
3431            edgefirst_tensor::Tensor::Pbo(p) => {
3432                if p.is_mapped() {
3433                    return Err(crate::Error::OpenGl(
3434                        "Cannot convert from a mapped PBO tensor".to_string(),
3435                    ));
3436                }
3437                p.buffer_id()
3438            }
3439            _ => {
3440                return Err(crate::Error::OpenGl(
3441                    "convert_pbo_to_mem: src is not a PBO tensor".to_string(),
3442                ))
3443            }
3444        };
3445
3446        self.setup_renderbuffer_non_dma(dst, crop)?;
3447        let start = Instant::now();
3448        self.draw_src_texture_from_pbo(src, src_buffer_id, dst, rotation, flip, crop)?;
3449        log::debug!("PBO-to-mem render takes {:?}", start.elapsed());
3450
3451        // Normal readback into Mem dst
3452        let start = Instant::now();
3453        let dest_format = match dst.fourcc() {
3454            crate::RGB | crate::RGB_INT8 => gls::gl::RGB,
3455            crate::RGBA => gls::gl::RGBA,
3456            crate::BGRA => 0x80E1, // GL_BGRA (GL_EXT_texture_format_BGRA8888)
3457            crate::GREY => gls::gl::RED,
3458            _ => {
3459                return Err(crate::Error::NotSupported(format!(
3460                    "PBO readback not supported for {}",
3461                    dst.fourcc().display()
3462                )))
3463            }
3464        };
3465        unsafe {
3466            let mut dst_map = dst.tensor().map()?;
3467            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
3468            gls::gl::ReadnPixels(
3469                0,
3470                0,
3471                dst.width() as i32,
3472                dst.height() as i32,
3473                dest_format,
3474                gls::gl::UNSIGNED_BYTE,
3475                dst.tensor.len() as i32,
3476                dst_map.as_mut_ptr() as *mut c_void,
3477            );
3478            if fourcc_is_int8(dst.fourcc()) {
3479                for byte in dst_map.iter_mut() {
3480                    *byte ^= 0x80;
3481                }
3482            }
3483        }
3484        log::debug!("PBO-to-mem readback takes {:?}", start.elapsed());
3485        Ok(())
3486    }
3487
    /// Render `src` into whatever framebuffer is currently bound, applying
    /// crop/letterbox, rotation, and flip. `dst` is only consulted for its
    /// width/height when computing ROI coordinates — callers such as
    /// convert_to_packed_rgb rely on this to render into an intermediate
    /// target sized like `dst`.
    ///
    /// Source binding prefers the zero-copy DMA/EGLImage path when both the
    /// transfer backend and the source tensor are DMA; otherwise (or if EGL
    /// image creation fails) it falls back to a CPU texture upload.
    fn convert_to(
        &mut self,
        src: &TensorImage,
        dst: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> Result<(), crate::Error> {
        // Fail fast on any stale GL error left behind by a previous call so
        // later checks are attributable to this conversion.
        check_gl_error(function!(), line!())?;

        // A dst rect that does not cover the whole destination means we are
        // letterboxing and may need to clear the border color first.
        let has_crop = crop.dst_rect.is_some_and(|x| {
            x.left != 0 || x.top != 0 || x.width != dst.width() || x.height != dst.height()
        });
        if has_crop {
            if let Some(dst_color) = crop.dst_color {
                unsafe {
                    gls::gl::ClearColor(
                        dst_color[0] as f32 / 255.0,
                        dst_color[1] as f32 / 255.0,
                        dst_color[2] as f32 / 255.0,
                        dst_color[3] as f32 / 255.0,
                    );
                    gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
                };
            }
        }

        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
        let src_roi = if let Some(crop) = crop.src_rect {
            RegionOfInterest {
                left: crop.left as f32 / src.width() as f32,
                top: (crop.top + crop.height) as f32 / src.height() as f32,
                right: (crop.left + crop.width) as f32 / src.width() as f32,
                bottom: crop.top as f32 / src.height() as f32,
            }
        } else {
            RegionOfInterest {
                left: 0.,
                top: 1.,
                right: 1.,
                bottom: 0.,
            }
        };

        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
        // (cvt_screen_coord maps normalized [0,1] to NDC [-1,1])
        let cvt_screen_coord = |normalized| normalized * 2.0 - 1.0;
        let dst_roi = if let Some(crop) = crop.dst_rect {
            RegionOfInterest {
                left: cvt_screen_coord(crop.left as f32 / dst.width() as f32),
                top: cvt_screen_coord((crop.top + crop.height) as f32 / dst.height() as f32),
                right: cvt_screen_coord((crop.left + crop.width) as f32 / dst.width() as f32),
                bottom: cvt_screen_coord(crop.top as f32 / dst.height() as f32),
            }
        } else {
            RegionOfInterest {
                left: -1.,
                top: 1.,
                right: 1.,
                bottom: -1.,
            }
        };
        // Index into the per-rotation texture-coordinate tables used by the
        // draw helpers: 0/1/2/3 = 0°/90°/180°/270°.
        let rotation_offset = match rotation {
            crate::Rotation::None => 0,
            crate::Rotation::Clockwise90 => 1,
            crate::Rotation::Rotate180 => 2,
            crate::Rotation::CounterClockwise90 => 3,
        };
        // Prefer the zero-copy path: import the source DMA-buf as an
        // EGLImage-backed texture. If import fails (e.g. unsupported fourcc),
        // log and fall back to a CPU texture upload.
        if self.gl_context.transfer_backend.is_dma() && src.tensor().memory() == TensorMemory::Dma {
            match self.get_or_create_egl_image(CacheKind::Src, src) {
                Ok(src_egl) => self.draw_camera_texture_eglimage(
                    src,
                    src_egl,
                    src_roi,
                    dst_roi,
                    rotation_offset,
                    flip,
                )?,
                Err(e) => {
                    log::warn!("EGL image creation failed for {:?}: {:?}", src.fourcc(), e);
                    let start = Instant::now();
                    self.draw_src_texture(src, src_roi, dst_roi, rotation_offset, flip)?;
                    log::debug!("draw_src_texture takes {:?}", start.elapsed());
                }
            }
        } else {
            let start = Instant::now();
            self.draw_src_texture(src, src_roi, dst_roi, rotation_offset, flip)?;
            log::debug!("draw_src_texture takes {:?}", start.elapsed());
        }

        // Wait for the GPU to finish before returning (timed for diagnostics).
        let start = Instant::now();
        unsafe { gls::gl::Finish() };
        log::debug!("gl_Finish takes {:?}", start.elapsed());
        check_gl_error(function!(), line!())?;
        Ok(())
    }
3584
3585    fn convert_to_planar(
3586        &mut self,
3587        src: &TensorImage,
3588        dst: &TensorImage,
3589        rotation: crate::Rotation,
3590        flip: Flip,
3591        crop: Crop,
3592    ) -> Result<(), crate::Error> {
3593        // if let Some(crop) = crop.src_rect
3594        //     && (crop.left > 0
3595        //         || crop.top > 0
3596        //         || crop.height < src.height()
3597        //         || crop.width < src.width())
3598        // {
3599        //     return Err(crate::Error::NotSupported(
3600        //         "Cropping in planar RGB mode is not supported".to_string(),
3601        //     ));
3602        // }
3603
3604        // if let Some(crop) = crop.dst_rect
3605        //     && (crop.left > 0
3606        //         || crop.top > 0
3607        //         || crop.height < src.height()
3608        //         || crop.width < src.width())
3609        // {
3610        //     return Err(crate::Error::NotSupported(
3611        //         "Cropping in planar RGB mode is not supported".to_string(),
3612        //     ));
3613        // }
3614
3615        let alpha = match dst.fourcc() {
3616            PLANAR_RGB | PLANAR_RGB_INT8 => false,
3617            PLANAR_RGBA => true,
3618            _ => {
3619                return Err(crate::Error::NotSupported(
3620                    "Destination format must be PLANAR_RGB, PLANAR_RGB_INT8, or PLANAR_RGBA"
3621                        .to_string(),
3622                ));
3623            }
3624        };
3625        let is_int8 = fourcc_is_int8(dst.fourcc());
3626
3627        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
3628        let src_roi = if let Some(crop) = crop.src_rect {
3629            RegionOfInterest {
3630                left: crop.left as f32 / src.width() as f32,
3631                top: (crop.top + crop.height) as f32 / src.height() as f32,
3632                right: (crop.left + crop.width) as f32 / src.width() as f32,
3633                bottom: crop.top as f32 / src.height() as f32,
3634            }
3635        } else {
3636            RegionOfInterest {
3637                left: 0.,
3638                top: 1.,
3639                right: 1.,
3640                bottom: 0.,
3641            }
3642        };
3643
3644        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
3645        let cvt_screen_coord = |normalized| normalized * 2.0 - 1.0;
3646        let dst_roi = if let Some(crop) = crop.dst_rect {
3647            RegionOfInterest {
3648                left: cvt_screen_coord(crop.left as f32 / dst.width() as f32),
3649                top: cvt_screen_coord((crop.top + crop.height) as f32 / dst.height() as f32),
3650                right: cvt_screen_coord((crop.left + crop.width) as f32 / dst.width() as f32),
3651                bottom: cvt_screen_coord(crop.top as f32 / dst.height() as f32),
3652            }
3653        } else {
3654            RegionOfInterest {
3655                left: -1.,
3656                top: 1.,
3657                right: 1.,
3658                bottom: -1.,
3659            }
3660        };
3661        let rotation_offset = match rotation {
3662            crate::Rotation::None => 0,
3663            crate::Rotation::Clockwise90 => 1,
3664            crate::Rotation::Rotate180 => 2,
3665            crate::Rotation::CounterClockwise90 => 3,
3666        };
3667
3668        let has_crop = crop.dst_rect.is_some_and(|x| {
3669            x.left != 0 || x.top != 0 || x.width != dst.width() || x.height != dst.height()
3670        });
3671        if has_crop {
3672            if let Some(dst_color) = crop.dst_color {
3673                self.clear_rect_planar(
3674                    dst.width(),
3675                    dst.height(),
3676                    dst_roi,
3677                    [
3678                        dst_color[0] as f32 / 255.0,
3679                        dst_color[1] as f32 / 255.0,
3680                        dst_color[2] as f32 / 255.0,
3681                        dst_color[3] as f32 / 255.0,
3682                    ],
3683                    alpha,
3684                )?;
3685            }
3686        }
3687
3688        let src_egl = self.get_or_create_egl_image(CacheKind::Src, src)?;
3689
3690        self.draw_camera_texture_to_rgb_planar(
3691            src_egl,
3692            src_roi,
3693            dst_roi,
3694            rotation_offset,
3695            flip,
3696            alpha,
3697            is_int8,
3698        )?;
3699        unsafe { gls::gl::Finish() };
3700
3701        Ok(())
3702    }
3703
    /// Render packed RGB (or RGB_INT8) to a DMA destination buffer using a
    /// two-pass architecture:
    ///
    /// **Pass 1:** Render source → intermediate RGBA texture via `convert_to()`
    /// (reuses the battle-tested RGBA path with full crop/letterbox/rotation/flip).
    ///
    /// **Pass 2:** Pack intermediate RGBA → RGB DMA destination using a simple
    /// packing shader with 2D sampler. The destination DMA buffer is reinterpreted
    /// as RGBA8 at (W*3/4) x H dimensions.
    fn convert_to_packed_rgb(
        &mut self,
        src: &TensorImage,
        dst: &mut TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        let dst_w = dst.width();
        let dst_h = dst.height();
        let is_int8 = fourcc_is_int8(dst.fourcc());

        // Width must satisfy PackedRgba8 constraint: W*3 divisible by 4
        if !(dst_w * 3).is_multiple_of(4) {
            return Err(crate::Error::NotSupported(format!(
                "Packed RGB requires width*3 divisible by 4, got width={dst_w}"
            )));
        }

        // Each RGBA texel of the reinterpreted view carries 4 bytes = 4/3 RGB
        // pixels, so one row of W*3 RGB bytes becomes W*3/4 RGBA texels.
        let render_w = dst_w * 3 / 4;
        let render_h = dst_h;

        log::debug!(
            "convert_to_packed_rgb: {dst_w}x{dst_h} -> {render_w}x{render_h} two-pass int8={is_int8}",
        );

        // --- Pass 1: Render source → intermediate RGBA texture ---
        self.ensure_packed_rgb_intermediate(dst_w, dst_h)?;
        self.packed_rgb_fbo.bind();
        unsafe {
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.packed_rgb_intermediate_tex.id,
                0,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::Viewport(0, 0, dst_w as i32, dst_h as i32);
        }
        // convert_to() renders to the currently-bound FBO (packed_rgb_fbo → intermediate).
        // It uses dst only for width/height in ROI coordinate math.
        // Handles: source binding (DMA EGLImage or upload), crop, letterbox, rotation, flip.
        self.convert_to(src, dst, rotation, flip, crop)?;

        // --- Pass 2: Pack intermediate RGBA → RGB DMA destination ---
        // Import the destination DMA-buf as an Abgr8888 EGLImage sized
        // render_w x render_h (the RGBA reinterpretation of the RGB bytes).
        self.convert_fbo.bind();
        let dest_egl =
            self.get_or_create_egl_image_rgb(dst, render_w, render_h, DrmFourcc::Abgr8888, 4)?;
        unsafe {
            // TEXTURE0: render target texture, backed by the destination
            // EGLImage and attached to the FBO below.
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.render_texture.id);
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::NEAREST as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::NEAREST as i32,
            );
            gls::gl::EGLImageTargetTexture2DOES(gls::gl::TEXTURE_2D, dest_egl.as_ptr());
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.render_texture.id,
                0,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::Viewport(0, 0, render_w as i32, render_h as i32);
        }

        // Bind intermediate RGBA texture as source for the packing shader
        // (int8 variant applies the signed-int8 bias inside the shader).
        let program = if is_int8 {
            &self.packed_rgba8_int8_program_2d
        } else {
            &self.packed_rgba8_program_2d
        };
        unsafe {
            gls::gl::UseProgram(program.id);
            // TEXTURE1: intermediate RGBA texture produced by pass 1; NEAREST
            // filtering since the pack shader reads exact texels.
            gls::gl::ActiveTexture(gls::gl::TEXTURE1);
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.packed_rgb_intermediate_tex.id);
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::NEAREST as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::NEAREST as i32,
            );
        }

        // Set uniform: tex = TEXTURE1 (intermediate RGBA texture)
        // NOTE(review): the uniform location is queried every call; it could
        // be cached alongside the program if this shows up in profiles.
        unsafe {
            let loc_tex = gls::gl::GetUniformLocation(program.id, c"tex".as_ptr());
            gls::gl::Uniform1i(loc_tex, 1);
        }

        // Draw full-viewport quad to pack RGBA→RGB
        self.draw_fullscreen_quad()?;

        // Wait for the GPU so the DMA destination is complete on return.
        unsafe { gls::gl::Finish() };
        Ok(())
    }
3821
3822    /// Render directly to an RGB8 renderbuffer backed by BGR888 DMA-buf.
3823    /// Single-pass: no intermediate texture, no packing shader.
3824    fn convert_to_rgb_direct(
3825        &mut self,
3826        src: &TensorImage,
3827        dst: &mut TensorImage,
3828        rotation: crate::Rotation,
3829        flip: Flip,
3830        crop: Crop,
3831    ) -> crate::Result<()> {
3832        let is_int8 = fourcc_is_int8(dst.fourcc());
3833
3834        log::debug!(
3835            "convert_to_rgb_direct: {}x{} single-pass int8={is_int8}",
3836            dst.width(),
3837            dst.height(),
3838        );
3839
3840        // Get or create cached renderbuffer
3841        let (rbo, width, height) = self.get_or_create_rgb_direct_rbo(dst)?;
3842
3843        // Bind FBO with renderbuffer attachment
3844        self.convert_fbo.bind();
3845        unsafe {
3846            gls::gl::FramebufferRenderbuffer(
3847                gls::gl::FRAMEBUFFER,
3848                gls::gl::COLOR_ATTACHMENT0,
3849                gls::gl::RENDERBUFFER,
3850                rbo,
3851            );
3852            check_gl_error(function!(), line!())?;
3853
3854            let status = gls::gl::CheckFramebufferStatus(gls::gl::FRAMEBUFFER);
3855            if status != gls::gl::FRAMEBUFFER_COMPLETE {
3856                log::warn!("convert_to_rgb_direct: FBO incomplete (0x{status:x}), falling back");
3857                return self.convert_to_packed_rgb(src, dst, rotation, flip, crop);
3858            }
3859
3860            gls::gl::Viewport(0, 0, width, height);
3861        }
3862
3863        // For int8, temporarily swap to int8 shader programs and bias the clear color
3864        let crop = if is_int8 {
3865            std::mem::swap(&mut self.texture_program, &mut self.texture_int8_program);
3866            std::mem::swap(
3867                &mut self.texture_program_yuv,
3868                &mut self.texture_int8_program_yuv,
3869            );
3870            // Bias the letterbox clear color with XOR 0x80 since glClear bypasses
3871            // the fragment shader — the int8 bias must be applied to the color directly.
3872            let mut crop = crop;
3873            if let Some(ref mut color) = crop.dst_color {
3874                color[0] ^= 0x80;
3875                color[1] ^= 0x80;
3876                color[2] ^= 0x80;
3877            }
3878            crop
3879        } else {
3880            crop
3881        };
3882
3883        let result = self.convert_to(src, dst, rotation, flip, crop);
3884
3885        // Swap back
3886        if is_int8 {
3887            std::mem::swap(&mut self.texture_program, &mut self.texture_int8_program);
3888            std::mem::swap(
3889                &mut self.texture_program_yuv,
3890                &mut self.texture_int8_program_yuv,
3891            );
3892        }
3893
3894        result
3895    }
3896
3897    /// Allocates or resizes the intermediate RGBA texture for two-pass packed RGB.
3898    fn ensure_packed_rgb_intermediate(&mut self, width: usize, height: usize) -> crate::Result<()> {
3899        if self.packed_rgb_intermediate_size == (width, height) {
3900            return Ok(());
3901        }
3902        unsafe {
3903            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.packed_rgb_intermediate_tex.id);
3904            gls::gl::TexParameteri(
3905                gls::gl::TEXTURE_2D,
3906                gls::gl::TEXTURE_MIN_FILTER,
3907                gls::gl::NEAREST as i32,
3908            );
3909            gls::gl::TexParameteri(
3910                gls::gl::TEXTURE_2D,
3911                gls::gl::TEXTURE_MAG_FILTER,
3912                gls::gl::NEAREST as i32,
3913            );
3914            gls::gl::TexImage2D(
3915                gls::gl::TEXTURE_2D,
3916                0,
3917                gls::gl::RGBA as i32,
3918                width as i32,
3919                height as i32,
3920                0,
3921                gls::gl::RGBA,
3922                gls::gl::UNSIGNED_BYTE,
3923                std::ptr::null(),
3924            );
3925            check_gl_error(function!(), line!())?;
3926        }
3927        self.packed_rgb_intermediate_size = (width, height);
3928        Ok(())
3929    }
3930
3931    /// Draw a fullscreen quad for the currently-bound shader program.
3932    /// Used by the pass-2 packing shader in the two-pass packed RGB pipeline.
3933    fn draw_fullscreen_quad(&self) -> Result<(), crate::Error> {
3934        unsafe {
3935            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
3936            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
3937
3938            let vertices: [f32; 12] = [
3939                -1.0, 1.0, 0.0, // top-left
3940                1.0, 1.0, 0.0, // top-right
3941                1.0, -1.0, 0.0, // bottom-right
3942                -1.0, -1.0, 0.0, // bottom-left
3943            ];
3944            gls::gl::BufferSubData(
3945                gls::gl::ARRAY_BUFFER,
3946                0,
3947                (size_of::<f32>() * vertices.len()) as isize,
3948                vertices.as_ptr() as *const c_void,
3949            );
3950
3951            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
3952            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
3953
3954            // Texture coordinates (the packed shader uses gl_FragCoord, not tc,
3955            // but we still need valid buffers for the vertex attribute layout)
3956            let tex_coords: [f32; 8] = [0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0];
3957            gls::gl::BufferSubData(
3958                gls::gl::ARRAY_BUFFER,
3959                0,
3960                (size_of::<f32>() * tex_coords.len()) as isize,
3961                tex_coords.as_ptr() as *const c_void,
3962            );
3963
3964            let indices: [u32; 4] = [0, 1, 2, 3];
3965            gls::gl::DrawElements(
3966                gls::gl::TRIANGLE_FAN,
3967                indices.len() as i32,
3968                gls::gl::UNSIGNED_INT,
3969                indices.as_ptr() as *const c_void,
3970            );
3971        }
3972        check_gl_error(function!(), line!())?;
3973        Ok(())
3974    }
3975
    /// Clear a destination rectangle of a planar render target, one plane band
    /// at a time, using scissored `glClear` calls.
    ///
    /// `dst_roi` is in NDC ([-1, 1]); `width`/`height` are the surface size in
    /// pixels. `color` carries one clear value per channel/plane; `alpha`
    /// selects a 4-plane layout instead of 3.
    fn clear_rect_planar(
        &self,
        width: usize,
        height: usize,
        dst_roi: RegionOfInterest,
        color: [f32; 4],
        alpha: bool,
    ) -> Result<(), Error> {
        // Grey fast path: when all three channels are equal (and there is no
        // alpha plane), clear the entire surface in one call.
        // NOTE(review): there is no early return, so the scissored per-plane
        // clears below still run afterwards — presumably the full clear covers
        // rounding gaps between the plane rectangles; confirm the intent.
        if !alpha && color[0] == color[1] && color[1] == color[2] {
            unsafe {
                gls::gl::ClearColor(color[0], color[0], color[0], 1.0);
                gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
            };
        }

        // Number of vertically stacked plane bands in the target surface.
        let split = if alpha { 4 } else { 3 };

        unsafe {
            gls::gl::Enable(gls::gl::SCISSOR_TEST);
            // Map the NDC ROI to pixels; the ROI height is split across the
            // plane bands, and each band is cleared with its channel's value
            // at a vertical offset of one band height.
            let x = (((dst_roi.left + 1.0) / 2.0) * width as f32).round() as i32;
            let y = (((dst_roi.bottom + 1.0) / 2.0) * height as f32).round() as i32;
            let width = (((dst_roi.right - dst_roi.left) / 2.0) * width as f32).round() as i32;
            let height = (((dst_roi.top - dst_roi.bottom) / 2.0) * height as f32 / split as f32)
                .round() as i32;
            for (i, c) in color.iter().enumerate().take(split) {
                gls::gl::Scissor(x, y + i as i32 * height, width, height);
                gls::gl::ClearColor(*c, *c, *c, 1.0);
                gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
            }
            gls::gl::Disable(gls::gl::SCISSOR_TEST);
        }
        Ok(())
    }
4009
4010    #[allow(clippy::too_many_arguments)]
4011    fn draw_camera_texture_to_rgb_planar(
4012        &self,
4013        egl_img: egl::Image,
4014        src_roi: RegionOfInterest,
4015        mut dst_roi: RegionOfInterest,
4016        rotation_offset: usize,
4017        flip: Flip,
4018        alpha: bool,
4019        int8: bool,
4020    ) -> Result<(), Error> {
4021        let texture_target = gls::gl::TEXTURE_EXTERNAL_OES;
4022        match flip {
4023            Flip::None => {}
4024            Flip::Vertical => {
4025                std::mem::swap(&mut dst_roi.top, &mut dst_roi.bottom);
4026            }
4027            Flip::Horizontal => {
4028                std::mem::swap(&mut dst_roi.left, &mut dst_roi.right);
4029            }
4030        }
4031        unsafe {
4032            let program = if int8 {
4033                &self.texture_program_planar_int8
4034            } else {
4035                &self.texture_program_planar
4036            };
4037            gls::gl::UseProgram(program.id);
4038            gls::gl::BindTexture(texture_target, self.camera_eglimage_texture.id);
4039            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
4040            gls::gl::TexParameteri(
4041                texture_target,
4042                gls::gl::TEXTURE_MIN_FILTER,
4043                gls::gl::LINEAR as i32,
4044            );
4045            gls::gl::TexParameteri(
4046                texture_target,
4047                gls::gl::TEXTURE_MAG_FILTER,
4048                gls::gl::LINEAR as i32,
4049            );
4050            gls::gl::TexParameteri(
4051                texture_target,
4052                gls::gl::TEXTURE_WRAP_S,
4053                gls::gl::CLAMP_TO_EDGE as i32,
4054            );
4055
4056            gls::gl::TexParameteri(
4057                texture_target,
4058                gls::gl::TEXTURE_WRAP_T,
4059                gls::gl::CLAMP_TO_EDGE as i32,
4060            );
4061
4062            gls::egl_image_target_texture_2d_oes(texture_target, egl_img.as_ptr());
4063            check_gl_error(function!(), line!())?;
4064            let y_centers = if alpha {
4065                vec![-3.0 / 4.0, -1.0 / 4.0, 1.0 / 4.0, 3.0 / 4.0]
4066            } else {
4067                vec![-2.0 / 3.0, 0.0, 2.0 / 3.0]
4068            };
4069            let swizzles = [gls::gl::RED, gls::gl::GREEN, gls::gl::BLUE, gls::gl::ALPHA];
4070            // starts from bottom
4071            for (i, y_center) in y_centers.iter().enumerate() {
4072                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
4073                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
4074                let camera_vertices: [f32; 12] = [
4075                    dst_roi.left,
4076                    dst_roi.top / 3.0 + y_center,
4077                    0., // left top
4078                    dst_roi.right,
4079                    dst_roi.top / 3.0 + y_center,
4080                    0., // right top
4081                    dst_roi.right,
4082                    dst_roi.bottom / 3.0 + y_center,
4083                    0., // right bottom
4084                    dst_roi.left,
4085                    dst_roi.bottom / 3.0 + y_center,
4086                    0., // left bottom
4087                ];
4088                gls::gl::BufferData(
4089                    gls::gl::ARRAY_BUFFER,
4090                    (size_of::<f32>() * camera_vertices.len()) as isize,
4091                    camera_vertices.as_ptr() as *const c_void,
4092                    gls::gl::DYNAMIC_DRAW,
4093                );
4094
4095                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
4096                gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
4097                let texture_vertices: [f32; 16] = [
4098                    src_roi.left,
4099                    src_roi.top,
4100                    src_roi.right,
4101                    src_roi.top,
4102                    src_roi.right,
4103                    src_roi.bottom,
4104                    src_roi.left,
4105                    src_roi.bottom,
4106                    src_roi.left,
4107                    src_roi.top,
4108                    src_roi.right,
4109                    src_roi.top,
4110                    src_roi.right,
4111                    src_roi.bottom,
4112                    src_roi.left,
4113                    src_roi.bottom,
4114                ];
4115
4116                gls::gl::BufferData(
4117                    gls::gl::ARRAY_BUFFER,
4118                    (size_of::<f32>() * 8) as isize,
4119                    (texture_vertices[(rotation_offset * 2)..]).as_ptr() as *const c_void,
4120                    gls::gl::DYNAMIC_DRAW,
4121                );
4122                let vertices_index: [u32; 4] = [0, 1, 2, 3];
4123                // self.texture_program_planar
4124                //     .load_uniform_1i(c"color_index", 2 - i as i32);
4125
4126                gls::gl::TexParameteri(
4127                    texture_target,
4128                    gls::gl::TEXTURE_SWIZZLE_R,
4129                    swizzles[i] as i32,
4130                );
4131
4132                gls::gl::DrawElements(
4133                    gls::gl::TRIANGLE_FAN,
4134                    vertices_index.len() as i32,
4135                    gls::gl::UNSIGNED_INT,
4136                    vertices_index.as_ptr() as *const c_void,
4137                );
4138            }
4139            check_gl_error(function!(), line!())?;
4140        }
4141        Ok(())
4142    }
4143
4144    fn draw_src_texture(
4145        &mut self,
4146        src: &TensorImage,
4147        src_roi: RegionOfInterest,
4148        mut dst_roi: RegionOfInterest,
4149        rotation_offset: usize,
4150        flip: Flip,
4151    ) -> Result<(), Error> {
4152        let texture_target = gls::gl::TEXTURE_2D;
4153        let texture_format = match src.fourcc() {
4154            RGB => gls::gl::RGB,
4155            RGBA => gls::gl::RGBA,
4156            GREY => gls::gl::RED,
4157            _ => {
4158                return Err(Error::NotSupported(format!(
4159                    "draw_src_texture does not support {:?} (use DMA-BUF path for YUV)",
4160                    src.fourcc()
4161                )));
4162            }
4163        };
4164        unsafe {
4165            gls::gl::UseProgram(self.texture_program.id);
4166            gls::gl::BindTexture(texture_target, self.camera_normal_texture.id);
4167            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
4168            gls::gl::TexParameteri(
4169                texture_target,
4170                gls::gl::TEXTURE_MIN_FILTER,
4171                gls::gl::LINEAR as i32,
4172            );
4173            gls::gl::TexParameteri(
4174                texture_target,
4175                gls::gl::TEXTURE_MAG_FILTER,
4176                gls::gl::LINEAR as i32,
4177            );
4178            if src.fourcc() == GREY {
4179                for swizzle in [
4180                    gls::gl::TEXTURE_SWIZZLE_R,
4181                    gls::gl::TEXTURE_SWIZZLE_G,
4182                    gls::gl::TEXTURE_SWIZZLE_B,
4183                ] {
4184                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, gls::gl::RED as i32);
4185                }
4186            } else {
4187                for (swizzle, src) in [
4188                    (gls::gl::TEXTURE_SWIZZLE_R, gls::gl::RED),
4189                    (gls::gl::TEXTURE_SWIZZLE_G, gls::gl::GREEN),
4190                    (gls::gl::TEXTURE_SWIZZLE_B, gls::gl::BLUE),
4191                ] {
4192                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, src as i32);
4193                }
4194            }
4195            self.camera_normal_texture.update_texture(
4196                texture_target,
4197                src.width(),
4198                src.height(),
4199                texture_format,
4200                &src.tensor().map()?,
4201            );
4202
4203            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
4204            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
4205
4206            match flip {
4207                Flip::None => {}
4208                Flip::Vertical => {
4209                    std::mem::swap(&mut dst_roi.top, &mut dst_roi.bottom);
4210                }
4211                Flip::Horizontal => {
4212                    std::mem::swap(&mut dst_roi.left, &mut dst_roi.right);
4213                }
4214            }
4215
4216            let camera_vertices: [f32; 12] = [
4217                dst_roi.left,
4218                dst_roi.top,
4219                0., // left top
4220                dst_roi.right,
4221                dst_roi.top,
4222                0., // right top
4223                dst_roi.right,
4224                dst_roi.bottom,
4225                0., // right bottom
4226                dst_roi.left,
4227                dst_roi.bottom,
4228                0., // left bottom
4229            ];
4230            gls::gl::BufferData(
4231                gls::gl::ARRAY_BUFFER,
4232                (size_of::<f32>() * camera_vertices.len()) as isize,
4233                camera_vertices.as_ptr() as *const c_void,
4234                gls::gl::DYNAMIC_DRAW,
4235            );
4236            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
4237            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
4238            let texture_vertices: [f32; 16] = [
4239                src_roi.left,
4240                src_roi.top,
4241                src_roi.right,
4242                src_roi.top,
4243                src_roi.right,
4244                src_roi.bottom,
4245                src_roi.left,
4246                src_roi.bottom,
4247                src_roi.left,
4248                src_roi.top,
4249                src_roi.right,
4250                src_roi.top,
4251                src_roi.right,
4252                src_roi.bottom,
4253                src_roi.left,
4254                src_roi.bottom,
4255            ];
4256
4257            gls::gl::BufferData(
4258                gls::gl::ARRAY_BUFFER,
4259                (size_of::<f32>() * 8) as isize,
4260                (texture_vertices[(rotation_offset * 2)..]).as_ptr() as *const c_void,
4261                gls::gl::DYNAMIC_DRAW,
4262            );
4263            let vertices_index: [u32; 4] = [0, 1, 2, 3];
4264            gls::gl::DrawElements(
4265                gls::gl::TRIANGLE_FAN,
4266                vertices_index.len() as i32,
4267                gls::gl::UNSIGNED_INT,
4268                vertices_index.as_ptr() as *const c_void,
4269            );
4270            check_gl_error(function!(), line!())?;
4271
4272            Ok(())
4273        }
4274    }
4275
4276    fn draw_camera_texture_eglimage(
4277        &self,
4278        src: &TensorImage,
4279        egl_img: egl::Image,
4280        src_roi: RegionOfInterest,
4281        mut dst_roi: RegionOfInterest,
4282        rotation_offset: usize,
4283        flip: Flip,
4284    ) -> Result<(), Error> {
4285        // let texture_target = gls::gl::TEXTURE_2D;
4286        let texture_target = gls::gl::TEXTURE_EXTERNAL_OES;
4287        unsafe {
4288            gls::gl::UseProgram(self.texture_program_yuv.id);
4289            gls::gl::BindTexture(texture_target, self.camera_eglimage_texture.id);
4290            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
4291            gls::gl::TexParameteri(
4292                texture_target,
4293                gls::gl::TEXTURE_MIN_FILTER,
4294                gls::gl::LINEAR as i32,
4295            );
4296            gls::gl::TexParameteri(
4297                texture_target,
4298                gls::gl::TEXTURE_MAG_FILTER,
4299                gls::gl::LINEAR as i32,
4300            );
4301
4302            if src.fourcc() == GREY {
4303                for swizzle in [
4304                    gls::gl::TEXTURE_SWIZZLE_R,
4305                    gls::gl::TEXTURE_SWIZZLE_G,
4306                    gls::gl::TEXTURE_SWIZZLE_B,
4307                ] {
4308                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, gls::gl::RED as i32);
4309                }
4310            } else {
4311                for (swizzle, src) in [
4312                    (gls::gl::TEXTURE_SWIZZLE_R, gls::gl::RED),
4313                    (gls::gl::TEXTURE_SWIZZLE_G, gls::gl::GREEN),
4314                    (gls::gl::TEXTURE_SWIZZLE_B, gls::gl::BLUE),
4315                ] {
4316                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, src as i32);
4317                }
4318            }
4319
4320            gls::egl_image_target_texture_2d_oes(texture_target, egl_img.as_ptr());
4321            check_gl_error(function!(), line!())?;
4322            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
4323            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
4324
4325            match flip {
4326                Flip::None => {}
4327                Flip::Vertical => {
4328                    std::mem::swap(&mut dst_roi.top, &mut dst_roi.bottom);
4329                }
4330                Flip::Horizontal => {
4331                    std::mem::swap(&mut dst_roi.left, &mut dst_roi.right);
4332                }
4333            }
4334
4335            let camera_vertices: [f32; 12] = [
4336                dst_roi.left,
4337                dst_roi.top,
4338                0., // left top
4339                dst_roi.right,
4340                dst_roi.top,
4341                0., // right top
4342                dst_roi.right,
4343                dst_roi.bottom,
4344                0., // right bottom
4345                dst_roi.left,
4346                dst_roi.bottom,
4347                0., // left bottom
4348            ];
4349            gls::gl::BufferSubData(
4350                gls::gl::ARRAY_BUFFER,
4351                0,
4352                (size_of::<f32>() * camera_vertices.len()) as isize,
4353                camera_vertices.as_ptr() as *const c_void,
4354            );
4355
4356            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
4357            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
4358
4359            let texture_vertices: [f32; 16] = [
4360                src_roi.left,
4361                src_roi.top,
4362                src_roi.right,
4363                src_roi.top,
4364                src_roi.right,
4365                src_roi.bottom,
4366                src_roi.left,
4367                src_roi.bottom,
4368                src_roi.left,
4369                src_roi.top,
4370                src_roi.right,
4371                src_roi.top,
4372                src_roi.right,
4373                src_roi.bottom,
4374                src_roi.left,
4375                src_roi.bottom,
4376            ];
4377            gls::gl::BufferSubData(
4378                gls::gl::ARRAY_BUFFER,
4379                0,
4380                (size_of::<f32>() * 8) as isize,
4381                (texture_vertices[(rotation_offset * 2)..]).as_ptr() as *const c_void,
4382            );
4383
4384            let vertices_index: [u32; 4] = [0, 1, 2, 3];
4385            gls::gl::DrawElements(
4386                gls::gl::TRIANGLE_FAN,
4387                vertices_index.len() as i32,
4388                gls::gl::UNSIGNED_INT,
4389                vertices_index.as_ptr() as *const c_void,
4390            );
4391        }
4392        check_gl_error(function!(), line!())?;
4393        Ok(())
4394    }
4395
4396    fn create_image_from_dma2(&self, src: &TensorImage) -> Result<EglImage, crate::Error> {
4397        let width;
4398        let height;
4399        let format;
4400        let channels;
4401
4402        // NV12 is semi-planar but handled specially via EGL multi-plane import
4403        if src.fourcc() == NV12 {
4404            if !src.width().is_multiple_of(4) {
4405                return Err(Error::NotSupported(
4406                    "OpenGL EGLImage doesn't support image widths which are not multiples of 4"
4407                        .to_string(),
4408                ));
4409            }
4410            width = src.width();
4411            height = src.height();
4412            format = fourcc_to_drm(NV12)?;
4413            channels = 1; // Y plane pitch is 1 byte per pixel
4414        } else if src.is_planar() {
4415            if !src.width().is_multiple_of(16) {
4416                return Err(Error::NotSupported(
4417                    "OpenGL Planar RGB EGLImage doesn't support image widths which are not multiples of 16"
4418                        .to_string(),
4419                ));
4420            }
4421            match src.fourcc() {
4422                PLANAR_RGB | PLANAR_RGB_INT8 => {
4423                    format = DrmFourcc::R8;
4424                    width = src.width();
4425                    height = src.height() * 3;
4426                    channels = 1;
4427                }
4428                fourcc => {
4429                    return Err(crate::Error::NotSupported(format!(
4430                        "Unsupported Planar FourCC {fourcc:?}"
4431                    )));
4432                }
4433            };
4434        } else {
4435            if !src.width().is_multiple_of(4) {
4436                return Err(Error::NotSupported(
4437                    "OpenGL EGLImage doesn't support image widths which are not multiples of 4"
4438                        .to_string(),
4439                ));
4440            }
4441            width = src.width();
4442            height = src.height();
4443            format = fourcc_to_drm(src.fourcc())?;
4444            channels = src.channels();
4445        }
4446
4447        let fd = match &src.tensor {
4448            edgefirst_tensor::Tensor::Dma(dma_tensor) => dma_tensor.fd.as_raw_fd(),
4449            edgefirst_tensor::Tensor::Shm(_) => {
4450                return Err(Error::NotImplemented(
4451                    "OpenGL EGLImage doesn't support SHM".to_string(),
4452                ));
4453            }
4454            edgefirst_tensor::Tensor::Mem(_) => {
4455                return Err(Error::NotImplemented(
4456                    "OpenGL EGLImage doesn't support MEM".to_string(),
4457                ));
4458            }
4459            edgefirst_tensor::Tensor::Pbo(_) => {
4460                return Err(Error::NotImplemented(
4461                    "OpenGL EGLImage doesn't support PBO".to_string(),
4462                ));
4463            }
4464        };
4465
4466        // For NV12, plane0 pitch is width (Y is 1 byte/pixel)
4467        // For other formats, pitch is width * channels
4468        let plane0_pitch = if src.fourcc() == NV12 {
4469            width
4470        } else {
4471            width * channels
4472        };
4473
4474        let mut egl_img_attr = vec![
4475            egl_ext::LINUX_DRM_FOURCC as Attrib,
4476            format as Attrib,
4477            khronos_egl::WIDTH as Attrib,
4478            width as Attrib,
4479            khronos_egl::HEIGHT as Attrib,
4480            height as Attrib,
4481            egl_ext::DMA_BUF_PLANE0_PITCH as Attrib,
4482            plane0_pitch as Attrib,
4483            egl_ext::DMA_BUF_PLANE0_OFFSET as Attrib,
4484            0 as Attrib,
4485            egl_ext::DMA_BUF_PLANE0_FD as Attrib,
4486            fd as Attrib,
4487            egl::IMAGE_PRESERVED as Attrib,
4488            egl::TRUE as Attrib,
4489        ];
4490
4491        // NV12 requires a second plane for UV data
4492        if src.fourcc() == NV12 {
4493            let uv_offset = width * height; // Y plane size
4494            egl_img_attr.append(&mut vec![
4495                egl_ext::DMA_BUF_PLANE1_FD as Attrib,
4496                fd as Attrib,
4497                egl_ext::DMA_BUF_PLANE1_OFFSET as Attrib,
4498                uv_offset as Attrib,
4499                egl_ext::DMA_BUF_PLANE1_PITCH as Attrib,
4500                width as Attrib, // UV plane has same width as Y plane
4501            ]);
4502        }
4503
4504        if matches!(src.fourcc(), YUYV | VYUY | NV12) {
4505            egl_img_attr.append(&mut vec![
4506                egl_ext::YUV_COLOR_SPACE_HINT as Attrib,
4507                egl_ext::ITU_REC709 as Attrib,
4508                egl_ext::SAMPLE_RANGE_HINT as Attrib,
4509                egl_ext::YUV_NARROW_RANGE as Attrib,
4510            ]);
4511        }
4512
4513        egl_img_attr.push(khronos_egl::NONE as Attrib);
4514
4515        match self.new_egl_image_owned(egl_ext::LINUX_DMA_BUF, &egl_img_attr) {
4516            Ok(v) => Ok(v),
4517            Err(e) => Err(e),
4518        }
4519    }
4520
4521    fn new_egl_image_owned(
4522        &'_ self,
4523        target: egl::Enum,
4524        attrib_list: &[Attrib],
4525    ) -> Result<EglImage, Error> {
4526        let image = GlContext::egl_create_image_with_fallback(
4527            &self.gl_context.egl,
4528            self.gl_context.display.as_display(),
4529            unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) },
4530            target,
4531            unsafe { egl::ClientBuffer::from_ptr(null_mut()) },
4532            attrib_list,
4533        )?;
4534        Ok(EglImage {
4535            egl_image: image,
4536            display: self.gl_context.display.as_display(),
4537            egl: Rc::clone(&self.gl_context.egl),
4538        })
4539    }
4540
    /// Look up or create an EGLImage for a DMA tensor, returning the EGL image handle.
    ///
    /// Returns `egl::Image` (a `Copy` type wrapping `*const c_void`) to avoid borrow
    /// conflicts with the caller. The cache retains ownership of the `EglImage` value;
    /// the handle remains valid as long as the entry lives in the cache.
    fn get_or_create_egl_image(
        &mut self,
        cache: CacheKind,
        img: &TensorImage,
    ) -> Result<egl::Image, crate::Error> {
        // Cache key: the identity of the underlying buffer, not the TensorImage.
        let id = img.buffer_identity().id();

        // Sweep dead entries opportunistically before looking up.
        match cache {
            CacheKind::Src => self.src_egl_cache.sweep(),
            CacheKind::Dst => self.dst_egl_cache.sweep(),
        }

        // This block scopes the mutable borrow of the selected cache so that
        // `self.create_image_from_dma2(img)` below can re-borrow `self`.
        {
            let egl_cache = match cache {
                CacheKind::Src => &mut self.src_egl_cache,
                CacheKind::Dst => &mut self.dst_egl_cache,
            };
            let ts = egl_cache.next_timestamp();
            if let Some(cached) = egl_cache.entries.get_mut(&id) {
                // Hit: refresh the LRU timestamp and hand out the Copy handle.
                egl_cache.hits += 1;
                cached.last_used = ts;
                log::trace!("EglImageCache {:?} hit: id={id:#x}", cache);
                return Ok(cached.egl_image.egl_image);
            }
            egl_cache.misses += 1;
            log::trace!("EglImageCache {:?} miss: id={id:#x}", cache);
            // Evict least-recently-used entry if at capacity.
            if egl_cache.entries.len() >= egl_cache.capacity {
                egl_cache.evict_lru();
            }
        }

        // Miss: import the DMA buffer as a fresh EGLImage and cache it.
        // NOTE(review): `guard` (a weak handle to the buffer identity)
        // presumably lets `sweep()` drop entries whose backing buffer is
        // gone — confirm against the cache's sweep implementation.
        let egl_image_obj = self.create_image_from_dma2(img)?;
        let handle = egl_image_obj.egl_image;
        let guard = img.buffer_identity().weak();
        let egl_cache = match cache {
            CacheKind::Src => &mut self.src_egl_cache,
            CacheKind::Dst => &mut self.dst_egl_cache,
        };
        let ts = egl_cache.next_timestamp();
        egl_cache.entries.insert(
            id,
            CachedEglImage {
                egl_image: egl_image_obj,
                guard,
                renderbuffer: None,
                last_used: ts,
            },
        );
        Ok(handle)
    }
4598
4599    /// Create an EGLImage from a DMA buffer with explicitly specified internal
4600    /// dimensions and format. Used when the GL render surface differs from the
4601    /// logical TensorImage dimensions (e.g., packed RGB reinterpretation).
4602    fn create_egl_image_with_dims(
4603        &self,
4604        img: &TensorImage,
4605        width: usize,
4606        height: usize,
4607        drm_format: DrmFourcc,
4608        bpp: usize,
4609    ) -> Result<EglImage, crate::Error> {
4610        let fd = match &img.tensor {
4611            edgefirst_tensor::Tensor::Dma(dma_tensor) => dma_tensor.fd.as_raw_fd(),
4612            _ => {
4613                return Err(Error::NotImplemented(
4614                    "create_egl_image_with_dims requires DMA tensor".to_string(),
4615                ));
4616            }
4617        };
4618
4619        let pitch = width * bpp;
4620        let egl_img_attr = vec![
4621            egl_ext::LINUX_DRM_FOURCC as Attrib,
4622            drm_format as u32 as Attrib,
4623            khronos_egl::WIDTH as Attrib,
4624            width as Attrib,
4625            khronos_egl::HEIGHT as Attrib,
4626            height as Attrib,
4627            egl_ext::DMA_BUF_PLANE0_PITCH as Attrib,
4628            pitch as Attrib,
4629            egl_ext::DMA_BUF_PLANE0_OFFSET as Attrib,
4630            0 as Attrib,
4631            egl_ext::DMA_BUF_PLANE0_FD as Attrib,
4632            fd as Attrib,
4633            egl::IMAGE_PRESERVED as Attrib,
4634            egl::TRUE as Attrib,
4635            khronos_egl::NONE as Attrib,
4636        ];
4637
4638        self.new_egl_image_owned(egl_ext::LINUX_DMA_BUF, &egl_img_attr)
4639    }
4640
4641    /// Get or create an EGLImage for a packed RGB DMA destination with
4642    /// reinterpreted dimensions. Uses the dst cache keyed by buffer identity.
4643    fn get_or_create_egl_image_rgb(
4644        &mut self,
4645        img: &TensorImage,
4646        width: usize,
4647        height: usize,
4648        drm_format: DrmFourcc,
4649        bpp: usize,
4650    ) -> Result<egl::Image, crate::Error> {
4651        let id = img.buffer_identity().id();
4652        self.dst_egl_cache.sweep();
4653
4654        let ts = self.dst_egl_cache.next_timestamp();
4655        if let Some(cached) = self.dst_egl_cache.entries.get_mut(&id) {
4656            self.dst_egl_cache.hits += 1;
4657            cached.last_used = ts;
4658            log::trace!("EglImageCache dst (RGB) hit: id={id:#x}");
4659            return Ok(cached.egl_image.egl_image);
4660        }
4661        self.dst_egl_cache.misses += 1;
4662        log::trace!("EglImageCache dst (RGB) miss: id={id:#x}");
4663
4664        if self.dst_egl_cache.entries.len() >= self.dst_egl_cache.capacity {
4665            self.dst_egl_cache.evict_lru();
4666        }
4667
4668        let egl_image_obj = self.create_egl_image_with_dims(img, width, height, drm_format, bpp)?;
4669        let handle = egl_image_obj.egl_image;
4670        let guard = img.buffer_identity().weak();
4671        let ts = self.dst_egl_cache.next_timestamp();
4672        self.dst_egl_cache.entries.insert(
4673            id,
4674            CachedEglImage {
4675                egl_image: egl_image_obj,
4676                guard,
4677                renderbuffer: None,
4678                last_used: ts,
4679            },
4680        );
4681        Ok(handle)
4682    }
4683
    /// Get or create an EGLImage + renderbuffer for direct RGB rendering.
    /// Both are cached in dst_egl_cache keyed by buffer identity.
    /// Returns (renderbuffer_id, width, height).
    ///
    /// The renderbuffer storage is an EGLImage imported from the
    /// destination's DMA-buf as BGR888, so drawing into the renderbuffer
    /// writes directly into the destination buffer. On success the new
    /// renderbuffer is left bound to GL_RENDERBUFFER.
    fn get_or_create_rgb_direct_rbo(
        &mut self,
        dst: &TensorImage,
    ) -> crate::Result<(u32, i32, i32)> {
        let id = dst.buffer_identity().id();
        let width = dst.width() as i32;
        let height = dst.height() as i32;

        // Drop cache entries whose underlying buffers have gone away.
        self.dst_egl_cache.sweep();

        // Check cache for existing entry with renderbuffer
        let ts = self.dst_egl_cache.next_timestamp();
        if let Some(cached) = self.dst_egl_cache.entries.get_mut(&id) {
            // An entry may exist without a renderbuffer (inserted by the
            // plain EGLImage path); that counts as a miss and the insert
            // below replaces it with a renderbuffer-bearing entry.
            if let Some(rbo) = cached.renderbuffer {
                self.dst_egl_cache.hits += 1;
                cached.last_used = ts;
                log::trace!("EglImageCache dst (rgb_direct) hit: id={id:#x}");
                return Ok((rbo, width, height));
            }
        }
        self.dst_egl_cache.misses += 1;
        log::trace!("EglImageCache dst (rgb_direct) miss: id={id:#x}");

        // Evict least-recently-used entry if at capacity
        if self.dst_egl_cache.entries.len() >= self.dst_egl_cache.capacity {
            self.dst_egl_cache.evict_lru();
        }

        // Create EGLImage from BGR888 DMA-buf
        let egl_image_obj =
            self.create_egl_image_with_dims(dst, dst.width(), dst.height(), DrmFourcc::Bgr888, 3)?;

        // Create renderbuffer and bind EGLImage to it
        let rbo = unsafe {
            let mut rbo = 0u32;
            gls::gl::GenRenderbuffers(1, &mut rbo);
            gls::gl::BindRenderbuffer(gls::gl::RENDERBUFFER, rbo);
            gls::gl::EGLImageTargetRenderbufferStorageOES(
                gls::gl::RENDERBUFFER,
                egl_image_obj.egl_image.as_ptr(),
            );
            // On GL error, free the renderbuffer before bailing out;
            // `egl_image_obj` is dropped by the early return — presumably
            // its Drop releases the EGLImage (NOTE(review): confirm).
            if let Err(e) = check_gl_error(function!(), line!()) {
                gls::gl::DeleteRenderbuffers(1, &rbo);
                return Err(e);
            }
            rbo
        };

        // Cache both
        let guard = dst.buffer_identity().weak();
        let ts = self.dst_egl_cache.next_timestamp();
        self.dst_egl_cache.entries.insert(
            id,
            CachedEglImage {
                egl_image: egl_image_obj,
                guard,
                renderbuffer: Some(rbo),
                last_used: ts,
            },
        );

        Ok((rbo, width, height))
    }
4750
4751    // Reshapes the segmentation to be compatible with RGBA texture array rendering.
4752    fn reshape_segmentation_to_rgba(&self, segmentation: &[u8], shape: [usize; 3]) -> Vec<u8> {
4753        let [height, width, classes] = shape;
4754
4755        let n_layer_stride = height * width * 4;
4756        let n_row_stride = width * 4;
4757        let n_col_stride = 4;
4758        let row_stride = width * classes;
4759        let col_stride = classes;
4760
4761        let mut new_segmentation = vec![0u8; n_layer_stride * classes.div_ceil(4)];
4762
4763        for i in 0..height {
4764            for j in 0..width {
4765                for k in 0..classes.div_ceil(4) * 4 {
4766                    if k >= classes {
4767                        new_segmentation[n_layer_stride * (k / 4)
4768                            + i * n_row_stride
4769                            + j * n_col_stride
4770                            + k % 4] = 0;
4771                    } else {
4772                        new_segmentation[n_layer_stride * (k / 4)
4773                            + i * n_row_stride
4774                            + j * n_col_stride
4775                            + k % 4] = segmentation[i * row_stride + j * col_stride + k];
4776                    }
4777                }
4778            }
4779        }
4780
4781        new_segmentation
4782    }
4783
4784    fn render_modelpack_segmentation(
4785        &mut self,
4786        dst_roi: RegionOfInterest,
4787        segmentation: &[u8],
4788        shape: [usize; 3],
4789    ) -> Result<(), crate::Error> {
4790        log::debug!("start render_segmentation_to_image");
4791
4792        // TODO: Implement specialization for 2 classes and 4 classes which shouldn't
4793        // need rearranging the data
4794        let new_segmentation = self.reshape_segmentation_to_rgba(segmentation, shape);
4795
4796        let [height, width, classes] = shape;
4797
4798        let format = gls::gl::RGBA;
4799        let texture_target = gls::gl::TEXTURE_2D_ARRAY;
4800        self.segmentation_program
4801            .load_uniform_1i(c"background_index", shape[2] as i32 - 1)?;
4802
4803        gls::use_program(self.segmentation_program.id);
4804
4805        gls::bind_texture(texture_target, self.segmentation_texture.id);
4806        gls::active_texture(gls::gl::TEXTURE0);
4807        gls::tex_parameteri(
4808            texture_target,
4809            gls::gl::TEXTURE_MIN_FILTER,
4810            gls::gl::LINEAR as i32,
4811        );
4812        gls::tex_parameteri(
4813            texture_target,
4814            gls::gl::TEXTURE_MAG_FILTER,
4815            gls::gl::LINEAR as i32,
4816        );
4817        gls::tex_parameteri(
4818            texture_target,
4819            gls::gl::TEXTURE_WRAP_S,
4820            gls::gl::CLAMP_TO_EDGE as i32,
4821        );
4822
4823        gls::tex_parameteri(
4824            texture_target,
4825            gls::gl::TEXTURE_WRAP_T,
4826            gls::gl::CLAMP_TO_EDGE as i32,
4827        );
4828
4829        gls::tex_image3d(
4830            texture_target,
4831            0,
4832            format as i32,
4833            width as i32,
4834            height as i32,
4835            classes.div_ceil(4) as i32,
4836            0,
4837            format,
4838            gls::gl::UNSIGNED_BYTE,
4839            Some(&new_segmentation),
4840        );
4841
4842        let src_roi = RegionOfInterest {
4843            left: 0.,
4844            top: 1.,
4845            right: 1.,
4846            bottom: 0.,
4847        };
4848
4849        unsafe {
4850            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
4851            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
4852
4853            let camera_vertices: [f32; 12] = [
4854                dst_roi.left,
4855                dst_roi.top,
4856                0., // left top
4857                dst_roi.right,
4858                dst_roi.top,
4859                0., // right top
4860                dst_roi.right,
4861                dst_roi.bottom,
4862                0., // right bottom
4863                dst_roi.left,
4864                dst_roi.bottom,
4865                0., // left bottom
4866            ];
4867            gls::gl::BufferSubData(
4868                gls::gl::ARRAY_BUFFER,
4869                0,
4870                (size_of::<f32>() * camera_vertices.len()) as isize,
4871                camera_vertices.as_ptr() as *const c_void,
4872            );
4873
4874            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
4875            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
4876
4877            let texture_vertices: [f32; 8] = [
4878                src_roi.left,
4879                src_roi.top,
4880                src_roi.right,
4881                src_roi.top,
4882                src_roi.right,
4883                src_roi.bottom,
4884                src_roi.left,
4885                src_roi.bottom,
4886            ];
4887            gls::gl::BufferSubData(
4888                gls::gl::ARRAY_BUFFER,
4889                0,
4890                (size_of::<f32>() * 8) as isize,
4891                (texture_vertices[0..]).as_ptr() as *const c_void,
4892            );
4893
4894            let vertices_index: [u32; 4] = [0, 1, 2, 3];
4895            gls::gl::DrawElements(
4896                gls::gl::TRIANGLE_FAN,
4897                vertices_index.len() as i32,
4898                gls::gl::UNSIGNED_INT,
4899                vertices_index.as_ptr() as *const c_void,
4900            );
4901        }
4902
4903        Ok(())
4904    }
4905
    /// Render a single-class YOLO segmentation mask as a textured quad.
    ///
    /// * `dst_roi` - destination quad in normalized device coordinates.
    /// * `segmentation` - `height * width` mask bytes, row-major.
    /// * `shape` - `[height, width]` of the mask.
    /// * `class` - class index, passed to the shader as `class_index`.
    ///
    /// The mask is uploaded as a single-channel GL_RED TEXTURE_2D with
    /// linear filtering and drawn by `instanced_segmentation_program`.
    /// Ends with glFinish, so the draw has completed when this returns.
    fn render_yolo_segmentation(
        &mut self,
        dst_roi: RegionOfInterest,
        segmentation: &[u8],
        shape: [usize; 2],
        class: usize,
    ) -> Result<(), crate::Error> {
        log::debug!("start render_yolo_segmentation");

        let [height, width] = shape;

        let format = gls::gl::RED;
        let texture_target = gls::gl::TEXTURE_2D;
        gls::use_program(self.instanced_segmentation_program.id);
        self.instanced_segmentation_program
            .load_uniform_1i(c"class_index", class as i32)?;
        gls::bind_texture(texture_target, self.segmentation_texture.id);
        gls::active_texture(gls::gl::TEXTURE0);
        // Linear filtering with edge clamping for the mask texture.
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        // One byte per texel, single channel.
        gls::tex_image2d(
            texture_target,
            0,
            format as i32,
            width as i32,
            height as i32,
            0,
            format,
            gls::gl::UNSIGNED_BYTE,
            Some(segmentation),
        );

        // Source ROI is vertically inverted (top=1, bottom=0) relative to
        // the destination quad.
        let src_roi = RegionOfInterest {
            left: 0.,
            top: 1.,
            right: 1.,
            bottom: 0.,
        };

        unsafe {
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);

            // Quad corners in triangle-fan order: LT, RT, RB, LB.
            let camera_vertices: [f32; 12] = [
                dst_roi.left,
                dst_roi.top,
                0., // left top
                dst_roi.right,
                dst_roi.top,
                0., // right top
                dst_roi.right,
                dst_roi.bottom,
                0., // right bottom
                dst_roi.left,
                dst_roi.bottom,
                0., // left bottom
            ];
            gls::gl::BufferSubData(
                gls::gl::ARRAY_BUFFER,
                0,
                (size_of::<f32>() * camera_vertices.len()) as isize,
                camera_vertices.as_ptr() as *const c_void,
            );

            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);

            // Texture coordinates in the same corner order as the positions.
            let texture_vertices: [f32; 8] = [
                src_roi.left,
                src_roi.top,
                src_roi.right,
                src_roi.top,
                src_roi.right,
                src_roi.bottom,
                src_roi.left,
                src_roi.bottom,
            ];
            gls::gl::BufferSubData(
                gls::gl::ARRAY_BUFFER,
                0,
                (size_of::<f32>() * 8) as isize,
                (texture_vertices).as_ptr() as *const c_void,
            );

            let vertices_index: [u32; 4] = [0, 1, 2, 3];
            gls::gl::DrawElements(
                gls::gl::TRIANGLE_FAN,
                vertices_index.len() as i32,
                gls::gl::UNSIGNED_INT,
                vertices_index.as_ptr() as *const c_void,
            );
            // Block until the draw is complete before returning.
            gls::gl::Finish();
        }

        Ok(())
    }
5022
5023    /// Repack proto tensor `(H, W, num_protos)` as f32 into RGBA f16 layers
5024    /// suitable for upload to a GL_TEXTURE_2D_ARRAY with GL_RGBA16F.
5025    ///
5026    /// Returns `(repacked_bytes, num_layers)` where each layer is H*W*4 half-floats.
5027    fn repack_protos_to_rgba_f16(protos: &ndarray::Array3<f32>) -> (Vec<u8>, usize) {
5028        let (height, width, num_protos) = protos.dim();
5029        let num_layers = num_protos.div_ceil(4);
5030        // Each layer is H*W*4 half-floats, each half-float is 2 bytes
5031        let layer_stride = height * width * 4;
5032        let mut buf = vec![0u16; layer_stride * num_layers];
5033
5034        for y in 0..height {
5035            for x in 0..width {
5036                for k in 0..num_layers * 4 {
5037                    let val = if k < num_protos {
5038                        half::f16::from_f32(protos[[y, x, k]])
5039                    } else {
5040                        half::f16::ZERO
5041                    };
5042                    let layer = k / 4;
5043                    let channel = k % 4;
5044                    buf[layer * layer_stride + y * width * 4 + x * 4 + channel] = val.to_bits();
5045                }
5046            }
5047        }
5048
5049        // Reinterpret u16 buffer as bytes
5050        let byte_buf = unsafe {
5051            std::slice::from_raw_parts(buf.as_ptr() as *const u8, buf.len() * 2).to_vec()
5052        };
5053        (byte_buf, num_layers)
5054    }
5055
5056    /// Render YOLO proto segmentation masks using the fused GPU pipeline.
5057    ///
5058    /// Dispatches to the appropriate shader based on `ProtoTensor` variant:
5059    /// - `Quantized`: uploads raw int8 as `GL_R8I`, dequantizes in shader
5060    /// - `Float`: uploads as `GL_R32F` with hardware bilinear (if available),
5061    ///   or falls back to f16 repack path
5062    fn render_proto_segmentation(
5063        &mut self,
5064        detect: &[DetectBox],
5065        proto_data: &ProtoData,
5066    ) -> crate::Result<()> {
5067        if detect.is_empty() || proto_data.mask_coefficients.is_empty() {
5068            return Ok(());
5069        }
5070
5071        let (height, width, num_protos) = proto_data.protos.dim();
5072        let texture_target = gls::gl::TEXTURE_2D_ARRAY;
5073
5074        match &proto_data.protos {
5075            ProtoTensor::Quantized {
5076                protos,
5077                quantization,
5078            } => {
5079                self.render_proto_segmentation_int8(
5080                    detect,
5081                    &proto_data.mask_coefficients,
5082                    protos,
5083                    quantization,
5084                    height,
5085                    width,
5086                    num_protos,
5087                    texture_target,
5088                )?;
5089            }
5090            ProtoTensor::Float(protos_f32) => {
5091                if self.has_float_linear {
5092                    self.render_proto_segmentation_f32(
5093                        detect,
5094                        &proto_data.mask_coefficients,
5095                        protos_f32,
5096                        height,
5097                        width,
5098                        num_protos,
5099                        texture_target,
5100                    )?;
5101                } else {
5102                    // Fallback: repack to RGBA16F and use existing f16 shader
5103                    self.render_proto_segmentation_f16(
5104                        detect,
5105                        &proto_data.mask_coefficients,
5106                        protos_f32,
5107                        height,
5108                        width,
5109                        num_protos,
5110                        texture_target,
5111                    )?;
5112                }
5113            }
5114        }
5115
5116        unsafe { gls::gl::Finish() };
5117        Ok(())
5118    }
5119
    /// Render detection quads using the active program. Shared by all proto
    /// shader paths.
    ///
    /// Detections are paired with `mask_coefficients` via `zip`, so any
    /// surplus entries on either side are skipped. Up to 32 coefficients
    /// per detection are packed into eight vec4 uniforms (`mask_coeff`);
    /// unused slots stay 0.0 and extra coefficients are ignored.
    fn render_proto_detection_quads(
        &self,
        program: &GlProgram,
        detect: &[DetectBox],
        mask_coefficients: &[Vec<f32>],
    ) -> crate::Result<()> {
        // Map normalized [0, 1] coordinates to NDC [-1, 1].
        let cvt_screen_coord = |normalized: f32| normalized * 2.0 - 1.0;

        for (det, coeff) in detect.iter().zip(mask_coefficients.iter()) {
            let mut packed_coeff = [[0.0f32; 4]; 8];
            for (i, val) in coeff.iter().enumerate().take(32) {
                packed_coeff[i / 4][i % 4] = *val;
            }

            program.load_uniform_4fv(c"mask_coeff", &packed_coeff)?;
            program.load_uniform_1i(c"class_index", det.label as i32)?;

            // Destination quad: the detection bbox converted to NDC.
            let dst_roi = RegionOfInterest {
                left: cvt_screen_coord(det.bbox.xmin),
                top: cvt_screen_coord(det.bbox.ymax),
                right: cvt_screen_coord(det.bbox.xmax),
                bottom: cvt_screen_coord(det.bbox.ymin),
            };

            // Proto texture coords: tex row 0 = image top (data uploaded in
            // row-major order where y=0 is top of image, and GL treats the
            // first row of pixel data as the bottom of the texture — but
            // texelFetch(y=0) returns that bottom row, which is our image top).
            // So tc.y=0 → image top, tc.y=1 → image bottom.
            // At NDC top (higher Y = image bottom = ymax), we want tc.y = ymax.
            // At NDC bottom (lower Y = image top = ymin), we want tc.y = ymin.
            let src_roi = RegionOfInterest {
                left: det.bbox.xmin,
                top: det.bbox.ymax,
                right: det.bbox.xmax,
                bottom: det.bbox.ymin,
            };

            unsafe {
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);

                // Quad corners in triangle-fan order: LT, RT, RB, LB.
                let camera_vertices: [f32; 12] = [
                    dst_roi.left,
                    dst_roi.top,
                    0.,
                    dst_roi.right,
                    dst_roi.top,
                    0.,
                    dst_roi.right,
                    dst_roi.bottom,
                    0.,
                    dst_roi.left,
                    dst_roi.bottom,
                    0.,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * camera_vertices.len()) as isize,
                    camera_vertices.as_ptr() as *const c_void,
                );

                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
                gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);

                // Texture coordinates in the same corner order.
                let texture_vertices: [f32; 8] = [
                    src_roi.left,
                    src_roi.top,
                    src_roi.right,
                    src_roi.top,
                    src_roi.right,
                    src_roi.bottom,
                    src_roi.left,
                    src_roi.bottom,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 8) as isize,
                    texture_vertices.as_ptr() as *const c_void,
                );

                let vertices_index: [u32; 4] = [0, 1, 2, 3];
                gls::gl::DrawElements(
                    gls::gl::TRIANGLE_FAN,
                    vertices_index.len() as i32,
                    gls::gl::UNSIGNED_INT,
                    vertices_index.as_ptr() as *const c_void,
                );
            }
        }
        Ok(())
    }
5216
5217    /// Int8 proto path: upload raw i8 protos as `GL_R8I`, dispatch by
5218    /// interpolation mode.
5219    #[allow(clippy::too_many_arguments)]
5220    fn render_proto_segmentation_int8(
5221        &mut self,
5222        detect: &[DetectBox],
5223        mask_coefficients: &[Vec<f32>],
5224        protos: &ndarray::Array3<i8>,
5225        quantization: &edgefirst_decoder::Quantization,
5226        height: usize,
5227        width: usize,
5228        num_protos: usize,
5229        texture_target: u32,
5230    ) -> crate::Result<()> {
5231        // Upload raw int8 protos as R8I texture array (1 proto per layer)
5232        gls::bind_texture(texture_target, self.proto_texture.id);
5233        gls::active_texture(gls::gl::TEXTURE0);
5234        gls::tex_parameteri(
5235            texture_target,
5236            gls::gl::TEXTURE_MIN_FILTER,
5237            gls::gl::NEAREST as i32,
5238        );
5239        gls::tex_parameteri(
5240            texture_target,
5241            gls::gl::TEXTURE_MAG_FILTER,
5242            gls::gl::NEAREST as i32,
5243        );
5244        gls::tex_parameteri(
5245            texture_target,
5246            gls::gl::TEXTURE_WRAP_S,
5247            gls::gl::CLAMP_TO_EDGE as i32,
5248        );
5249        gls::tex_parameteri(
5250            texture_target,
5251            gls::gl::TEXTURE_WRAP_T,
5252            gls::gl::CLAMP_TO_EDGE as i32,
5253        );
5254
5255        // Protos are (H, W, num_protos) in row-major. We need to repack to
5256        // layer-first layout: layer k = all (H, W) texels for proto k.
5257        let mut tex_data = vec![0i8; height * width * num_protos];
5258        for k in 0..num_protos {
5259            for y in 0..height {
5260                for x in 0..width {
5261                    tex_data[k * height * width + y * width + x] = protos[[y, x, k]];
5262                }
5263            }
5264        }
5265
5266        gls::tex_image3d(
5267            texture_target,
5268            0,
5269            gls::gl::R8I as i32,
5270            width as i32,
5271            height as i32,
5272            num_protos as i32,
5273            0,
5274            gls::gl::RED_INTEGER,
5275            gls::gl::BYTE,
5276            Some(&tex_data),
5277        );
5278
5279        let proto_scale = quantization.scale;
5280        let proto_scaled_zp = -(quantization.zero_point as f32) * quantization.scale;
5281
5282        match self.int8_interpolation_mode {
5283            Int8InterpolationMode::Nearest => {
5284                let program = &self.proto_segmentation_int8_nearest_program;
5285                gls::use_program(program.id);
5286                program.load_uniform_1i(c"num_protos", num_protos as i32)?;
5287                program.load_uniform_1f(c"proto_scale", proto_scale)?;
5288                program.load_uniform_1f(c"proto_scaled_zp", proto_scaled_zp)?;
5289                self.render_proto_detection_quads(program, detect, mask_coefficients)?;
5290            }
5291            Int8InterpolationMode::Bilinear => {
5292                let program = &self.proto_segmentation_int8_bilinear_program;
5293                gls::use_program(program.id);
5294                program.load_uniform_1i(c"num_protos", num_protos as i32)?;
5295                program.load_uniform_1f(c"proto_scale", proto_scale)?;
5296                program.load_uniform_1f(c"proto_scaled_zp", proto_scaled_zp)?;
5297                self.render_proto_detection_quads(program, detect, mask_coefficients)?;
5298            }
5299            Int8InterpolationMode::TwoPass => {
5300                self.render_proto_int8_two_pass(
5301                    detect,
5302                    mask_coefficients,
5303                    quantization,
5304                    height,
5305                    width,
5306                    num_protos,
5307                    texture_target,
5308                )?;
5309            }
5310        }
5311
5312        Ok(())
5313    }
5314
    /// Two-pass int8 path: dequant int8→RGBA16F FBO, then render with
    /// existing f16 shader using GL_LINEAR.
    ///
    /// Pass 1 runs `proto_dequant_int8_program` once per RGBA16F layer
    /// (four protos per layer) into `proto_dequant_texture` via a temporary
    /// FBO. Pass 2 binds that texture and reuses `proto_segmentation_program`
    /// with linear filtering. The caller's framebuffer binding and viewport
    /// are saved up front and restored before pass 2. Assumes the int8
    /// protos were already uploaded to `proto_texture` by the caller.
    #[allow(clippy::too_many_arguments)]
    fn render_proto_int8_two_pass(
        &self,
        detect: &[DetectBox],
        mask_coefficients: &[Vec<f32>],
        quantization: &edgefirst_decoder::Quantization,
        height: usize,
        width: usize,
        num_protos: usize,
        texture_target: u32,
    ) -> crate::Result<()> {
        // Four protos are packed into each RGBA layer.
        let num_layers = num_protos.div_ceil(4);

        // Save the caller's FBO and viewport so we can restore after dequant.
        let (saved_fbo, saved_viewport) = unsafe {
            let mut fbo: i32 = 0;
            gls::gl::GetIntegerv(gls::gl::FRAMEBUFFER_BINDING, &mut fbo);
            let mut vp = [0i32; 4];
            gls::gl::GetIntegerv(gls::gl::VIEWPORT, vp.as_mut_ptr());
            (fbo as u32, vp)
        };

        // Pass 1: Dequantize int8 → RGBA16F texture via framebuffer
        let dequant_fbo = FrameBuffer::new();
        gls::bind_texture(texture_target, self.proto_dequant_texture.id);
        // Allocate RGBA16F storage only (no data upload).
        gls::tex_image3d::<u8>(
            texture_target,
            0,
            gls::gl::RGBA16F as i32,
            width as i32,
            height as i32,
            num_layers as i32,
            0,
            gls::gl::RGBA,
            gls::gl::HALF_FLOAT,
            None,
        );
        // Linear filtering is the point of this path: the float texture can
        // be sampled bilinearly in pass 2.
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        // Dequantization constants: real = scale * q + (-zero_point * scale).
        let proto_scale = quantization.scale;
        let proto_scaled_zp = -(quantization.zero_point as f32) * quantization.scale;

        let dequant_program = &self.proto_dequant_int8_program;
        gls::use_program(dequant_program.id);
        dequant_program.load_uniform_1f(c"proto_scale", proto_scale)?;
        dequant_program.load_uniform_1f(c"proto_scaled_zp", proto_scaled_zp)?;

        // Bind the int8 proto texture to TEXTURE0 for the dequant shader
        gls::active_texture(gls::gl::TEXTURE0);
        gls::bind_texture(texture_target, self.proto_texture.id);

        // Render each RGBA16F layer (4 protos per layer)
        for layer in 0..num_layers {
            dequant_fbo.bind();
            unsafe {
                // Attach this layer of the dequant texture as the color
                // target and size the viewport to the proto grid.
                gls::gl::FramebufferTextureLayer(
                    gls::gl::FRAMEBUFFER,
                    gls::gl::COLOR_ATTACHMENT0,
                    self.proto_dequant_texture.id,
                    0,
                    layer as i32,
                );
                gls::gl::Viewport(0, 0, width as i32, height as i32);
            }
            // First proto index covered by this layer.
            dequant_program.load_uniform_1i(c"base_layer", (layer * 4) as i32)?;

            // Full-screen quad
            unsafe {
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
                let verts: [f32; 12] = [
                    -1.0, -1.0, 0.0, 1.0, -1.0, 0.0, 1.0, 1.0, 0.0, -1.0, 1.0, 0.0,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 12) as isize,
                    verts.as_ptr() as *const c_void,
                );
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
                gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
                let tc: [f32; 8] = [0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 8) as isize,
                    tc.as_ptr() as *const c_void,
                );
                let idx: [u32; 4] = [0, 1, 2, 3];
                gls::gl::DrawElements(
                    gls::gl::TRIANGLE_FAN,
                    4,
                    gls::gl::UNSIGNED_INT,
                    idx.as_ptr() as *const c_void,
                );
            }
        }

        // Drop the dequant FBO (its Drop unbinds to 0) and restore the caller's.
        drop(dequant_fbo);
        unsafe {
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, saved_fbo);
            gls::gl::Viewport(
                saved_viewport[0],
                saved_viewport[1],
                saved_viewport[2],
                saved_viewport[3],
            );
        }

        // Pass 2: render with existing f16 shader reading from dequant texture
        let program = &self.proto_segmentation_program;
        gls::use_program(program.id);
        gls::active_texture(gls::gl::TEXTURE0);
        gls::bind_texture(texture_target, self.proto_dequant_texture.id);
        program.load_uniform_1i(c"num_layers", num_layers as i32)?;
        self.render_proto_detection_quads(program, detect, mask_coefficients)?;

        Ok(())
    }
5456
5457    /// F32 proto path: upload as `GL_R32F` with `GL_LINEAR` filtering.
5458    #[allow(clippy::too_many_arguments)]
5459    fn render_proto_segmentation_f32(
5460        &self,
5461        detect: &[DetectBox],
5462        mask_coefficients: &[Vec<f32>],
5463        protos_f32: &ndarray::Array3<f32>,
5464        height: usize,
5465        width: usize,
5466        num_protos: usize,
5467        texture_target: u32,
5468    ) -> crate::Result<()> {
5469        let program = &self.proto_segmentation_f32_program;
5470        gls::use_program(program.id);
5471        gls::bind_texture(texture_target, self.proto_texture.id);
5472        gls::active_texture(gls::gl::TEXTURE0);
5473        gls::tex_parameteri(
5474            texture_target,
5475            gls::gl::TEXTURE_MIN_FILTER,
5476            gls::gl::LINEAR as i32,
5477        );
5478        gls::tex_parameteri(
5479            texture_target,
5480            gls::gl::TEXTURE_MAG_FILTER,
5481            gls::gl::LINEAR as i32,
5482        );
5483        gls::tex_parameteri(
5484            texture_target,
5485            gls::gl::TEXTURE_WRAP_S,
5486            gls::gl::CLAMP_TO_EDGE as i32,
5487        );
5488        gls::tex_parameteri(
5489            texture_target,
5490            gls::gl::TEXTURE_WRAP_T,
5491            gls::gl::CLAMP_TO_EDGE as i32,
5492        );
5493
5494        // Repack protos to layer-first layout: (num_protos, H, W)
5495        let mut tex_data = vec![0.0f32; height * width * num_protos];
5496        for k in 0..num_protos {
5497            for y in 0..height {
5498                for x in 0..width {
5499                    tex_data[k * height * width + y * width + x] = protos_f32[[y, x, k]];
5500                }
5501            }
5502        }
5503
5504        gls::tex_image3d(
5505            texture_target,
5506            0,
5507            gls::gl::R32F as i32,
5508            width as i32,
5509            height as i32,
5510            num_protos as i32,
5511            0,
5512            gls::gl::RED,
5513            gls::gl::FLOAT,
5514            Some(&tex_data),
5515        );
5516
5517        program.load_uniform_1i(c"num_protos", num_protos as i32)?;
5518        self.render_proto_detection_quads(program, detect, mask_coefficients)?;
5519
5520        Ok(())
5521    }
5522
5523    /// F16 fallback path: repack f32 protos to RGBA16F and use existing
5524    /// f16 shader with GL_LINEAR. Used when GL_OES_texture_float_linear
5525    /// is not available.
5526    #[allow(clippy::too_many_arguments)]
5527    fn render_proto_segmentation_f16(
5528        &self,
5529        detect: &[DetectBox],
5530        mask_coefficients: &[Vec<f32>],
5531        protos_f32: &ndarray::Array3<f32>,
5532        height: usize,
5533        width: usize,
5534        num_protos: usize,
5535        texture_target: u32,
5536    ) -> crate::Result<()> {
5537        let num_layers = num_protos.div_ceil(4);
5538        let (tex_data, _) = Self::repack_protos_to_rgba_f16(protos_f32);
5539
5540        let program = &self.proto_segmentation_program;
5541        gls::use_program(program.id);
5542        gls::bind_texture(texture_target, self.proto_texture.id);
5543        gls::active_texture(gls::gl::TEXTURE0);
5544        gls::tex_parameteri(
5545            texture_target,
5546            gls::gl::TEXTURE_MIN_FILTER,
5547            gls::gl::LINEAR as i32,
5548        );
5549        gls::tex_parameteri(
5550            texture_target,
5551            gls::gl::TEXTURE_MAG_FILTER,
5552            gls::gl::LINEAR as i32,
5553        );
5554        gls::tex_parameteri(
5555            texture_target,
5556            gls::gl::TEXTURE_WRAP_S,
5557            gls::gl::CLAMP_TO_EDGE as i32,
5558        );
5559        gls::tex_parameteri(
5560            texture_target,
5561            gls::gl::TEXTURE_WRAP_T,
5562            gls::gl::CLAMP_TO_EDGE as i32,
5563        );
5564
5565        gls::tex_image3d(
5566            texture_target,
5567            0,
5568            gls::gl::RGBA16F as i32,
5569            width as i32,
5570            height as i32,
5571            num_layers as i32,
5572            0,
5573            gls::gl::RGBA,
5574            gls::gl::HALF_FLOAT,
5575            Some(&tex_data),
5576        );
5577
5578        program.load_uniform_1i(c"num_layers", num_layers as i32)?;
5579        self.render_proto_detection_quads(program, detect, mask_coefficients)?;
5580
5581        Ok(())
5582    }
5583
5584    fn render_segmentation(
5585        &mut self,
5586        detect: &[DetectBox],
5587        segmentation: &[Segmentation],
5588    ) -> crate::Result<()> {
5589        if segmentation.is_empty() {
5590            return Ok(());
5591        }
5592
5593        let is_modelpack = segmentation[0].segmentation.shape()[2] > 1;
5594        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
5595        let cvt_screen_coord = |normalized| normalized * 2.0 - 1.0;
5596        if is_modelpack {
5597            let seg = &segmentation[0];
5598            let dst_roi = RegionOfInterest {
5599                left: cvt_screen_coord(seg.xmin),
5600                top: cvt_screen_coord(seg.ymax),
5601                right: cvt_screen_coord(seg.xmax),
5602                bottom: cvt_screen_coord(seg.ymin),
5603            };
5604            let segment = seg.segmentation.as_standard_layout();
5605            let slice = segment.as_slice().ok_or(Error::Internal(
5606                "Cannot get slice of segmentation".to_owned(),
5607            ))?;
5608
5609            self.render_modelpack_segmentation(
5610                dst_roi,
5611                slice,
5612                [
5613                    seg.segmentation.shape()[0],
5614                    seg.segmentation.shape()[1],
5615                    seg.segmentation.shape()[2],
5616                ],
5617            )?;
5618        } else {
5619            for (seg, det) in segmentation.iter().zip(detect) {
5620                let dst_roi = RegionOfInterest {
5621                    left: cvt_screen_coord(seg.xmin),
5622                    top: cvt_screen_coord(seg.ymax),
5623                    right: cvt_screen_coord(seg.xmax),
5624                    bottom: cvt_screen_coord(seg.ymin),
5625                };
5626
5627                let segment = seg.segmentation.as_standard_layout();
5628                let slice = segment.as_slice().ok_or(Error::Internal(
5629                    "Cannot get slice of segmentation".to_owned(),
5630                ))?;
5631
5632                self.render_yolo_segmentation(
5633                    dst_roi,
5634                    slice,
5635                    [seg.segmentation.shape()[0], seg.segmentation.shape()[1]],
5636                    det.label,
5637                )?;
5638            }
5639        }
5640
5641        gls::disable(gls::gl::BLEND);
5642        Ok(())
5643    }
5644
5645    fn render_box(&mut self, dst: &TensorImage, detect: &[DetectBox]) -> Result<(), Error> {
5646        unsafe {
5647            gls::gl::UseProgram(self.color_program.id);
5648            let rescale = |x: f32| x * 2.0 - 1.0;
5649            let thickness = 3.0;
5650            for d in detect {
5651                self.color_program
5652                    .load_uniform_1i(c"class_index", d.label as i32)?;
5653                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
5654                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
5655                let bbox: [f32; 4] = d.bbox.into();
5656                let outer_box = [
5657                    bbox[0] - thickness / dst.width() as f32,
5658                    bbox[1] - thickness / dst.height() as f32,
5659                    bbox[2] + thickness / dst.width() as f32,
5660                    bbox[3] + thickness / dst.height() as f32,
5661                ];
5662                let camera_vertices: [f32; 24] = [
5663                    rescale(bbox[0]),
5664                    rescale(bbox[3]),
5665                    0., // bottom left
5666                    rescale(bbox[2]),
5667                    rescale(bbox[3]),
5668                    0., // bottom right
5669                    rescale(bbox[2]),
5670                    rescale(bbox[1]),
5671                    0., // top right
5672                    rescale(bbox[0]),
5673                    rescale(bbox[1]),
5674                    0., // top left
5675                    rescale(outer_box[0]),
5676                    rescale(outer_box[3]),
5677                    0., // bottom left
5678                    rescale(outer_box[2]),
5679                    rescale(outer_box[3]),
5680                    0., // bottom right
5681                    rescale(outer_box[2]),
5682                    rescale(outer_box[1]),
5683                    0., // top right
5684                    rescale(outer_box[0]),
5685                    rescale(outer_box[1]),
5686                    0., // top left
5687                ];
5688                gls::gl::BufferData(
5689                    gls::gl::ARRAY_BUFFER,
5690                    (size_of::<f32>() * camera_vertices.len()) as isize,
5691                    camera_vertices.as_ptr() as *const c_void,
5692                    gls::gl::DYNAMIC_DRAW,
5693                );
5694
5695                let vertices_index: [u32; 10] = [0, 1, 5, 2, 6, 3, 7, 0, 4, 5];
5696                gls::gl::DrawElements(
5697                    gls::gl::TRIANGLE_STRIP,
5698                    vertices_index.len() as i32,
5699                    gls::gl::UNSIGNED_INT,
5700                    vertices_index.as_ptr() as *const c_void,
5701                );
5702            }
5703        }
5704        check_gl_error(function!(), line!())?;
5705        Ok(())
5706    }
5707}
/// Owned EGLImage handle bundled with the EGL instance and display needed
/// to destroy it on drop.
struct EglImage {
    // Raw EGLImage handle; may hold `egl::NO_IMAGE` (checked in Drop).
    egl_image: egl::Image,
    // Shared EGL instance used for the destroy call in Drop.
    egl: Rc<Egl>,
    // Display the image was created on.
    display: egl::Display,
}
5713
5714impl Drop for EglImage {
5715    fn drop(&mut self) {
5716        if self.egl_image.as_ptr() == egl::NO_IMAGE {
5717            return;
5718        }
5719
5720        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
5721            let e =
5722                GlContext::egl_destroy_image_with_fallback(&self.egl, self.display, self.egl_image);
5723            if let Err(e) = e {
5724                error!("Could not destroy EGL image: {e:?}");
5725            }
5726        }));
5727    }
5728}
5729
/// A GL texture object together with the layout of its last upload, used to
/// decide between re-allocating storage and a cheaper sub-image update.
struct Texture {
    // GL texture object name.
    id: u32,
    // Target of the last upload (0 until the first upload).
    target: gls::gl::types::GLenum,
    width: usize,
    height: usize,
    // Pixel format of the last upload (0 until the first upload).
    format: gls::gl::types::GLenum,
}
5737
5738impl Default for Texture {
5739    fn default() -> Self {
5740        Self::new()
5741    }
5742}
5743
5744impl Texture {
5745    fn new() -> Self {
5746        let mut id = 0;
5747        unsafe { gls::gl::GenTextures(1, &raw mut id) };
5748        Self {
5749            id,
5750            target: 0,
5751            width: 0,
5752            height: 0,
5753            format: 0,
5754        }
5755    }
5756
5757    fn update_texture(
5758        &mut self,
5759        target: gls::gl::types::GLenum,
5760        width: usize,
5761        height: usize,
5762        format: gls::gl::types::GLenum,
5763        data: &[u8],
5764    ) {
5765        if target != self.target
5766            || width != self.width
5767            || height != self.height
5768            || format != self.format
5769        {
5770            unsafe {
5771                gls::gl::TexImage2D(
5772                    target,
5773                    0,
5774                    format as i32,
5775                    width as i32,
5776                    height as i32,
5777                    0,
5778                    format,
5779                    gls::gl::UNSIGNED_BYTE,
5780                    data.as_ptr() as *const c_void,
5781                );
5782            }
5783            self.target = target;
5784            self.format = format;
5785            self.width = width;
5786            self.height = height;
5787        } else {
5788            unsafe {
5789                gls::gl::TexSubImage2D(
5790                    target,
5791                    0,
5792                    0,
5793                    0,
5794                    width as i32,
5795                    height as i32,
5796                    format,
5797                    gls::gl::UNSIGNED_BYTE,
5798                    data.as_ptr() as *const c_void,
5799                );
5800            }
5801        }
5802    }
5803}
5804
5805impl Drop for Texture {
5806    fn drop(&mut self) {
5807        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| unsafe {
5808            gls::gl::DeleteTextures(1, &raw mut self.id)
5809        }));
5810    }
5811}
5812
/// A GL array buffer wired to a fixed vertex-attribute slot.
struct Buffer {
    // GL buffer object name.
    id: u32,
    // Vertex attribute index this buffer feeds.
    buffer_index: u32,
}
5817
5818impl Buffer {
5819    fn new(buffer_index: u32, size_per_point: usize, max_points: usize) -> Buffer {
5820        let mut id = 0;
5821        unsafe {
5822            gls::gl::EnableVertexAttribArray(buffer_index);
5823            gls::gl::GenBuffers(1, &raw mut id);
5824            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, id);
5825            gls::gl::VertexAttribPointer(
5826                buffer_index,
5827                size_per_point as i32,
5828                gls::gl::FLOAT,
5829                gls::gl::FALSE,
5830                0,
5831                null(),
5832            );
5833            gls::gl::BufferData(
5834                gls::gl::ARRAY_BUFFER,
5835                (size_of::<f32>() * size_per_point * max_points) as isize,
5836                null(),
5837                gls::gl::DYNAMIC_DRAW,
5838            );
5839        }
5840
5841        Buffer { id, buffer_index }
5842    }
5843}
5844
5845impl Drop for Buffer {
5846    fn drop(&mut self) {
5847        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| unsafe {
5848            gls::gl::DeleteBuffers(1, &raw mut self.id)
5849        }));
5850    }
5851}
5852
/// RAII wrapper around a GL framebuffer object name; deleted (and unbound)
/// on drop.
struct FrameBuffer {
    id: u32,
}
5856
5857impl FrameBuffer {
5858    fn new() -> FrameBuffer {
5859        let mut id = 0;
5860        unsafe {
5861            gls::gl::GenFramebuffers(1, &raw mut id);
5862        }
5863
5864        FrameBuffer { id }
5865    }
5866
5867    fn bind(&self) {
5868        unsafe { gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, self.id) };
5869    }
5870
5871    fn unbind(&self) {
5872        unsafe { gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, 0) };
5873    }
5874}
5875
5876impl Drop for FrameBuffer {
5877    fn drop(&mut self) {
5878        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
5879            self.unbind();
5880            unsafe {
5881                gls::gl::DeleteFramebuffers(1, &raw mut self.id);
5882            }
5883        }));
5884    }
5885}
5886
/// A linked GL program plus the shader objects it was built from, so all
/// three can be released together on drop.
pub struct GlProgram {
    // Program object name.
    id: u32,
    // Vertex shader object name.
    vertex_id: u32,
    // Fragment shader object name.
    fragment_id: u32,
}
5892
5893impl GlProgram {
5894    fn new(vertex_shader: &str, fragment_shader: &str) -> Result<Self, crate::Error> {
5895        let id = unsafe { gls::gl::CreateProgram() };
5896        let vertex_id = unsafe { gls::gl::CreateShader(gls::gl::VERTEX_SHADER) };
5897        if compile_shader_from_str(vertex_id, vertex_shader, "shader_vert").is_err() {
5898            log::debug!("Vertex shader source:\n{}", vertex_shader);
5899            return Err(crate::Error::OpenGl(format!(
5900                "Shader compile error: {vertex_shader}"
5901            )));
5902        }
5903        unsafe {
5904            gls::gl::AttachShader(id, vertex_id);
5905        }
5906
5907        let fragment_id = unsafe { gls::gl::CreateShader(gls::gl::FRAGMENT_SHADER) };
5908        if compile_shader_from_str(fragment_id, fragment_shader, "shader_frag").is_err() {
5909            log::debug!("Fragment shader source:\n{}", fragment_shader);
5910            return Err(crate::Error::OpenGl(format!(
5911                "Shader compile error: {fragment_shader}"
5912            )));
5913        }
5914
5915        unsafe {
5916            gls::gl::AttachShader(id, fragment_id);
5917            gls::gl::LinkProgram(id);
5918            gls::gl::UseProgram(id);
5919        }
5920
5921        Ok(Self {
5922            id,
5923            vertex_id,
5924            fragment_id,
5925        })
5926    }
5927
5928    #[allow(dead_code)]
5929    fn load_uniform_1f(&self, name: &CStr, value: f32) -> Result<(), crate::Error> {
5930        unsafe {
5931            gls::gl::UseProgram(self.id);
5932            let location = gls::gl::GetUniformLocation(self.id, name.as_ptr());
5933            gls::gl::Uniform1f(location, value);
5934        }
5935        Ok(())
5936    }
5937
5938    #[allow(dead_code)]
5939    fn load_uniform_1i(&self, name: &CStr, value: i32) -> Result<(), crate::Error> {
5940        unsafe {
5941            gls::gl::UseProgram(self.id);
5942            let location = gls::gl::GetUniformLocation(self.id, name.as_ptr());
5943            gls::gl::Uniform1i(location, value);
5944        }
5945        Ok(())
5946    }
5947
5948    fn load_uniform_4fv(&self, name: &CStr, value: &[[f32; 4]]) -> Result<(), crate::Error> {
5949        unsafe {
5950            gls::gl::UseProgram(self.id);
5951            let location = gls::gl::GetUniformLocation(self.id, name.as_ptr());
5952            if location == -1 {
5953                return Err(crate::Error::OpenGl(format!(
5954                    "Could not find uniform location for '{}'",
5955                    name.to_string_lossy().into_owned()
5956                )));
5957            }
5958            gls::gl::Uniform4fv(location, value.len() as i32, value.as_flattened().as_ptr());
5959        }
5960        check_gl_error(function!(), line!())?;
5961        Ok(())
5962    }
5963}
5964
5965impl Drop for GlProgram {
5966    fn drop(&mut self) {
5967        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| unsafe {
5968            gls::gl::DeleteProgram(self.id);
5969            gls::gl::DeleteShader(self.fragment_id);
5970            gls::gl::DeleteShader(self.vertex_id);
5971        }));
5972    }
5973}
5974
5975fn compile_shader_from_str(shader: u32, shader_source: &str, shader_name: &str) -> Result<(), ()> {
5976    let src = match CString::from_str(shader_source) {
5977        Ok(v) => v,
5978        Err(_) => return Err(()),
5979    };
5980    let src_ptr = src.as_ptr();
5981    unsafe {
5982        gls::gl::ShaderSource(shader, 1, &raw const src_ptr, null());
5983        gls::gl::CompileShader(shader);
5984        let mut is_compiled = 0;
5985        gls::gl::GetShaderiv(shader, gls::gl::COMPILE_STATUS, &raw mut is_compiled);
5986        if is_compiled == 0 {
5987            let mut max_length = 0;
5988            gls::gl::GetShaderiv(shader, gls::gl::INFO_LOG_LENGTH, &raw mut max_length);
5989            let mut error_log: Vec<u8> = vec![0; max_length as usize];
5990            gls::gl::GetShaderInfoLog(
5991                shader,
5992                max_length,
5993                &raw mut max_length,
5994                error_log.as_mut_ptr() as *mut c_char,
5995            );
5996            error!(
5997                "Shader '{}' failed: {:?}\n",
5998                shader_name,
5999                CString::from_vec_with_nul(error_log)
6000                    .unwrap()
6001                    .into_string()
6002                    .unwrap()
6003            );
6004            gls::gl::DeleteShader(shader);
6005            return Err(());
6006        }
6007        Ok(())
6008    }
6009}
6010
6011fn check_gl_error(name: &str, line: u32) -> Result<(), Error> {
6012    unsafe {
6013        let err = gls::gl::GetError();
6014        if err != gls::gl::NO_ERROR {
6015            error!("GL Error: {name}:{line}: {err:#X}");
6016            // panic!("GL Error: {err}");
6017            return Err(Error::OpenGl(format!("{err:#X}")));
6018        }
6019    }
6020    Ok(())
6021}
6022
/// Maps an in-crate FourCC pixel-format code to the DRM fourcc used for
/// DMA-buf import.
///
/// The apparent channel reversal (e.g. `RGBA` → `Abgr8888`) reflects DRM
/// fourcc naming, which lists components in little-endian packed order —
/// verify against `drm_fourcc.h` when adding new formats.
///
/// # Errors
/// Returns [`Error::NotSupported`] for any FourCC without a DRM mapping.
fn fourcc_to_drm(fourcc: FourCharCode) -> Result<DrmFourcc, Error> {
    match fourcc {
        RGBA => Ok(DrmFourcc::Abgr8888),
        BGRA => Ok(DrmFourcc::Argb8888),
        YUYV => Ok(DrmFourcc::Yuyv),
        VYUY => Ok(DrmFourcc::Vyuy),
        // The int8 variant reuses the RGB mapping: the DRM code only
        // describes the memory layout, not the sample interpretation.
        RGB | RGB_INT8 => Ok(DrmFourcc::Bgr888),
        GREY => Ok(DrmFourcc::R8),
        NV12 => Ok(DrmFourcc::Nv12),
        // Planar formats are imported one 8-bit plane at a time.
        PLANAR_RGB | PLANAR_RGB_INT8 => Ok(DrmFourcc::R8),
        _ => Err(Error::NotSupported(format!(
            "FourCC {fourcc:?} has no DRM format mapping"
        ))),
    }
}
6038
/// EGL extension tokens not exposed by the `khronos_egl` crate.
mod egl_ext {
    #![allow(dead_code)]
    // EGL_EXT_image_dma_buf_import: attributes for wrapping a DMA-buf as an
    // EGLImage — format plus per-plane fd/offset/pitch.
    pub(crate) const LINUX_DMA_BUF: u32 = 0x3270;
    pub(crate) const LINUX_DRM_FOURCC: u32 = 0x3271;
    pub(crate) const DMA_BUF_PLANE0_FD: u32 = 0x3272;
    pub(crate) const DMA_BUF_PLANE0_OFFSET: u32 = 0x3273;
    pub(crate) const DMA_BUF_PLANE0_PITCH: u32 = 0x3274;
    pub(crate) const DMA_BUF_PLANE1_FD: u32 = 0x3275;
    pub(crate) const DMA_BUF_PLANE1_OFFSET: u32 = 0x3276;
    pub(crate) const DMA_BUF_PLANE1_PITCH: u32 = 0x3277;
    pub(crate) const DMA_BUF_PLANE2_FD: u32 = 0x3278;
    pub(crate) const DMA_BUF_PLANE2_OFFSET: u32 = 0x3279;
    pub(crate) const DMA_BUF_PLANE2_PITCH: u32 = 0x327A;
    // Optional hints controlling how YUV imports are sampled.
    pub(crate) const YUV_COLOR_SPACE_HINT: u32 = 0x327B;
    pub(crate) const SAMPLE_RANGE_HINT: u32 = 0x327C;
    pub(crate) const YUV_CHROMA_HORIZONTAL_SITING_HINT: u32 = 0x327D;
    pub(crate) const YUV_CHROMA_VERTICAL_SITING_HINT: u32 = 0x327E;

    // Values for YUV_COLOR_SPACE_HINT.
    pub(crate) const ITU_REC601: u32 = 0x327F;
    pub(crate) const ITU_REC709: u32 = 0x3280;
    pub(crate) const ITU_REC2020: u32 = 0x3281;

    // Values for SAMPLE_RANGE_HINT.
    pub(crate) const YUV_FULL_RANGE: u32 = 0x3282;
    pub(crate) const YUV_NARROW_RANGE: u32 = 0x3283;

    // Values for the chroma siting hints.
    pub(crate) const YUV_CHROMA_SITING_0: u32 = 0x3284;
    pub(crate) const YUV_CHROMA_SITING_0_5: u32 = 0x3285;

    // EGL_KHR_platform_gbm: platform token for eglGetPlatformDisplay.
    pub(crate) const PLATFORM_GBM_KHR: u32 = 0x31D7;

    // EGL_EXT_platform_device: platform token for eglGetPlatformDisplay.
    pub(crate) const PLATFORM_DEVICE_EXT: u32 = 0x313F;

    /// EGL_KHR_no_config_context: null config for eglCreateContext.
    /// Defined as ((EGLConfig)0) in the EGL spec.
    ///
    /// # Safety
    /// The EGL spec defines EGL_NO_CONFIG_KHR as a null pointer. This is
    /// a safe transmute since `Config` is a newtype wrapper around `*mut c_void`.
    pub(crate) const NO_CONFIG_KHR: khronos_egl::Config =
        unsafe { std::mem::transmute(std::ptr::null_mut::<std::ffi::c_void>()) };
}
6080
/// Shared pass-through vertex shader: forwards the vertex position and
/// texture coordinate to the fragment stage and emits the position as
/// `gl_Position` unchanged.
fn generate_vertex_shader() -> &'static str {
    "\
#version 300 es
precision mediump float;
layout(location = 0) in vec3 pos;
layout(location = 1) in vec2 texCoord;

out vec3 fragPos;
out vec2 tc;

void main() {
    fragPos = pos;
    tc = texCoord;

    gl_Position = vec4(pos, 1.0);
}
"
}
6099
/// Plain texture-copy fragment shader: samples a `sampler2D` at the
/// interpolated coordinate and writes the texel out unmodified.
fn generate_texture_fragment_shader() -> &'static str {
    "\
#version 300 es

precision mediump float;
uniform sampler2D tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

void main(){
    color = texture(tex, tc);
}
"
}
6116
/// `samplerExternalOES` variant of [`generate_texture_fragment_shader`] for
/// EGLImage-backed external textures (requires
/// `GL_OES_EGL_image_external_essl3`); any YUV→RGB conversion happens in the
/// external-sampler path, not in this shader.
fn generate_texture_fragment_shader_yuv() -> &'static str {
    "\
#version 300 es
#extension GL_OES_EGL_image_external_essl3 : require
precision mediump float;
uniform samplerExternalOES tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

void main(){
    color = texture(tex, tc);
}
"
}
6133
/// External-OES copy shader for the planar-RGB path. The GLSL body is
/// identical to [`generate_texture_fragment_shader_yuv`]; it is kept as a
/// separate entry point so the planar path can diverge independently.
fn generate_planar_rgb_shader() -> &'static str {
    "\
#version 300 es
#extension GL_OES_EGL_image_external_essl3 : require
precision mediump float;
uniform samplerExternalOES tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

void main(){
    color = texture(tex, tc);
}
"
}
6150
/// Int8 variant of [`generate_planar_rgb_shader`]. Applies the XOR 0x80 bias
/// (uint8 → int8 conversion) to each RGB channel using the bit-exact
/// quantize+mod form: `mod(floor(v * 255 + 0.5) + 128, 256) / 255`.
/// Alpha is passed through unchanged. Uses `highp` so the 0..255 quantization
/// is exact.
fn generate_planar_rgb_int8_shader() -> &'static str {
    "\
#version 300 es
#extension GL_OES_EGL_image_external_essl3 : require
precision highp float;
uniform samplerExternalOES tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

vec3 int8_bias(vec3 v) {
    vec3 q = floor(v * 255.0 + 0.5);
    return mod(q + 128.0, 256.0) / 255.0;
}

void main(){
    vec4 c = texture(tex, tc);
    color = vec4(int8_bias(c.rgb), c.a);
}
"
}
6176
/// Int8 variant of [`generate_texture_fragment_shader`]. Applies the XOR 0x80
/// bias (uint8 → int8 conversion) to each RGB channel using the bit-exact
/// quantize+mod form: `mod(floor(v * 255 + 0.5) + 128, 256) / 255`
/// (the previous doc mentioned `fract(v + 0.5)`, which is not what the
/// shader below does). Used by the direct RGB render path for RGB_INT8
/// output.
fn generate_texture_int8_shader() -> &'static str {
    "\
#version 300 es
precision highp float;
uniform sampler2D tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

// XOR 0x80 bias: quantize to uint8, add 128 mod 256, normalize back.
// This matches the CPU `byte ^ 0x80` operation exactly.
vec3 int8_bias(vec3 v) {
    vec3 q = floor(v * 255.0 + 0.5);
    return mod(q + 128.0, 256.0) / 255.0;
}

void main(){
    vec4 c = texture(tex, tc);
    color = vec4(int8_bias(c.rgb), c.a);
}
"
}
6203
/// Int8 variant of [`generate_texture_fragment_shader_yuv`]. Applies the XOR
/// 0x80 bias (uint8 → int8 conversion) to each RGB channel via the same
/// quantize+mod form as [`generate_texture_int8_shader`].
/// Used by the direct RGB render path for RGB_INT8 output with external OES
/// sources.
fn generate_texture_int8_shader_yuv() -> &'static str {
    "\
#version 300 es
#extension GL_OES_EGL_image_external_essl3 : require
precision highp float;
uniform samplerExternalOES tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

vec3 int8_bias(vec3 v) {
    vec3 q = floor(v * 255.0 + 0.5);
    return mod(q + 128.0, 256.0) / 255.0;
}

void main(){
    vec4 c = texture(tex, tc);
    color = vec4(int8_bias(c.rgb), c.a);
}
"
}
6229
/// Dense (ModelPack) segmentation fragment shader.
///
/// Requires the segmentation output tensor reshaped to `(H, W, C/4, 4)`:
/// each texture-array layer packs four class channels into RGBA. The shader
/// takes the argmax score across all layers, discards fragments whose
/// winning class equals `background_index`, and colors the rest from the
/// 20-entry `colors` palette (class index modulo 20).
fn generate_segmentation_shader() -> &'static str {
    "\
#version 300 es
precision mediump float;
precision mediump sampler2DArray;

uniform sampler2DArray tex;
uniform vec4 colors[20];
uniform int background_index;

in vec3 fragPos;
in vec2 tc;
in vec4 fragColor;

out vec4 color;

float max_arg(const in vec4 args, out int argmax) {
    if (args[0] >= args[1] && args[0] >= args[2] && args[0] >= args[3]) {
        argmax = 0;
        return args[0];
    }
    if (args[1] >= args[0] && args[1] >= args[2] && args[1] >= args[3]) {
        argmax = 1;
        return args[1];
    }
    if (args[2] >= args[0] && args[2] >= args[1] && args[2] >= args[3]) {
        argmax = 2;
        return args[2];
    }
    argmax = 3;
    return args[3];
}

void main() {
    mediump int layers = textureSize(tex, 0).z;
    float max_all = -4.0;
    int max_ind = 0;
    for (int i = 0; i < layers; i++) {
        vec4 d = texture(tex, vec3(tc, i));
        int max_ind_ = 0;
        float max_ = max_arg(d, max_ind_);
        if (max_ <= max_all) { continue; }
        max_all = max_;
        max_ind = i*4 + max_ind_;
    }
    if (max_ind == background_index) {
        discard;
    }
    max_ind = max_ind % 20;
    color = colors[max_ind];
}
"
}
6285
/// Per-detection binary-mask fragment shader: thresholds the red channel of
/// `mask0` at 0.5, discards fragments below it, and colors survivors by
/// `class_index` (modulo 20) from the palette.
fn generate_instanced_segmentation_shader() -> &'static str {
    "\
#version 300 es
precision mediump float;
uniform sampler2D mask0;
uniform vec4 colors[20];
uniform int class_index;
in vec3 fragPos;
in vec2 tc;
in vec4 fragColor;

out vec4 color;
void main() {
    float r0 = texture(mask0, tc).r;
    int arg = int(r0>=0.5);
    if (arg == 0) {
        discard;
    }
    color = colors[class_index % 20];
}
"
}
6308
/// Float (f16/f32) proto-mask fragment shader.
///
/// Accumulates `dot(mask_coeff[i], texture(..., layer i))` over `num_layers`
/// RGBA layers (4 protos per layer, hardware-interpolated sampling), applies
/// a sigmoid, discards fragments below 0.5, and colors survivors from the
/// 20-entry palette by `class_index`.
fn generate_proto_segmentation_shader() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp sampler2DArray;

uniform sampler2DArray proto_tex;  // ceil(num_protos/4) layers, RGBA = 4 channels per layer
uniform vec4 mask_coeff[8];        // 32 coefficients packed as 8 vec4s
uniform vec4 colors[20];
uniform int class_index;
uniform int num_layers;

in vec2 tc;
out vec4 color;

void main() {
    float acc = 0.0;
    for (int i = 0; i < num_layers; i++) {
        // texture() returns bilinearly interpolated proto values (GL_LINEAR)
        acc += dot(mask_coeff[i], texture(proto_tex, vec3(tc, float(i))));
    }
    float mask = 1.0 / (1.0 + exp(-acc));  // sigmoid
    if (mask < 0.5) discard;
    color = colors[class_index % 20];
}
"
}
6336
/// Int8 proto shader — nearest-neighbor only.
///
/// Uses `texelFetch()` at the nearest texel. No interpolation. Simplest and
/// fastest GPU execution but may show staircase artifacts at mask edges.
/// The integer cast of `tc * tex_size` truncates, which picks the texel
/// containing the sample point (nearest-style sampling without filtering).
///
/// Layout: `GL_R8I` texture with 1 proto per layer (32 layers).
/// Mask coefficients packed as `vec4[8]`, indexed `mask_coeff[k/4][k%4]`.
/// Dequantization per sample: `val = raw * proto_scale + proto_scaled_zp`.
fn generate_proto_segmentation_shader_int8_nearest() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;  // 32 layers, R channel = 1 proto per layer
uniform vec4 mask_coeff[8];         // 32 coefficients packed as 8 vec4s
uniform vec4 colors[20];
uniform int class_index;
uniform int num_protos;
uniform float proto_scale;
uniform float proto_scaled_zp;      // -zero_point * scale

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    int ix = clamp(int(tc.x * float(tex_size.x)), 0, tex_size.x - 1);
    int iy = clamp(int(tc.y * float(tex_size.y)), 0, tex_size.y - 1);

    float acc = 0.0;
    for (int k = 0; k < num_protos; k++) {
        float raw = float(texelFetch(proto_tex, ivec3(ix, iy, k), 0).r);
        float val = raw * proto_scale + proto_scaled_zp;
        acc += mask_coeff[k / 4][k % 4] * val;
    }
    float mask = 1.0 / (1.0 + exp(-acc));
    if (mask < 0.5) discard;
    color = colors[class_index % 20];
}
"
}
6379
/// Int8 proto shader — shader-based bilinear interpolation (recommended).
///
/// Uses `texelFetch()` to fetch 4 neighboring texels per fragment, dequantizes
/// each, and computes bilinear weights from `fract(tc * textureSize)`.
/// Clamping `p0`/`p1` to the texture bounds replicates `GL_CLAMP_TO_EDGE`
/// behavior at the borders.
///
/// Layout: `GL_R8I` texture with 1 proto per layer (32 layers).
fn generate_proto_segmentation_shader_int8_bilinear() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;  // 32 layers, R channel = 1 proto per layer
uniform vec4 mask_coeff[8];         // 32 coefficients packed as 8 vec4s
uniform vec4 colors[20];
uniform int class_index;
uniform int num_protos;
uniform float proto_scale;
uniform float proto_scaled_zp;      // -zero_point * scale

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    // Compute continuous position (matching GL_LINEAR convention: center at +0.5)
    vec2 pos = tc * vec2(tex_size.xy) - 0.5;
    vec2 f = fract(pos);
    ivec2 p0 = ivec2(floor(pos));
    ivec2 p1 = p0 + 1;
    // Clamp to texture bounds
    p0 = clamp(p0, ivec2(0), tex_size.xy - 1);
    p1 = clamp(p1, ivec2(0), tex_size.xy - 1);

    float w00 = (1.0 - f.x) * (1.0 - f.y);
    float w10 = f.x * (1.0 - f.y);
    float w01 = (1.0 - f.x) * f.y;
    float w11 = f.x * f.y;

    float acc = 0.0;
    for (int k = 0; k < num_protos; k++) {
        float r00 = float(texelFetch(proto_tex, ivec3(p0.x, p0.y, k), 0).r);
        float r10 = float(texelFetch(proto_tex, ivec3(p1.x, p0.y, k), 0).r);
        float r01 = float(texelFetch(proto_tex, ivec3(p0.x, p1.y, k), 0).r);
        float r11 = float(texelFetch(proto_tex, ivec3(p1.x, p1.y, k), 0).r);
        float interp = r00 * w00 + r10 * w10 + r01 * w01 + r11 * w11;
        float val = interp * proto_scale + proto_scaled_zp;
        acc += mask_coeff[k / 4][k % 4] * val;
    }
    float mask = 1.0 / (1.0 + exp(-acc));
    if (mask < 0.5) discard;
    color = colors[class_index % 20];
}
"
}
6436
/// Int8 dequantization pass shader (two-pass Option C, pass 1).
///
/// Reads `GL_R8I` texels with nearest-neighbor indexing, dequantizes
/// (`raw * proto_scale + proto_scaled_zp`), and writes floats to a
/// `GL_RGBA16F` render target. Four consecutive proto layers, starting at
/// `base_layer`, are packed into one RGBA output texel. After this pass, the
/// existing f16 shader reads the dequantized texture with `GL_LINEAR`.
fn generate_proto_dequant_shader_int8() -> &'static str {
    const FRAG: &str = "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;  // 32 layers of R8I (1 proto per layer)
uniform float proto_scale;
uniform float proto_scaled_zp;      // -zero_point * scale
uniform int base_layer;             // first proto index for this output layer (0, 4, 8, ...)

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    int ix = clamp(int(tc.x * float(tex_size.x)), 0, tex_size.x - 1);
    int iy = clamp(int(tc.y * float(tex_size.y)), 0, tex_size.y - 1);

    vec4 result;
    for (int c = 0; c < 4; c++) {
        int layer = base_layer + c;
        float raw = float(texelFetch(proto_tex, ivec3(ix, iy, layer), 0).r);
        result[c] = raw * proto_scale + proto_scaled_zp;
    }
    color = result;
}
";
    FRAG
}
6473
/// F32 proto shader — direct R32F texture with hardware bilinear filtering.
///
/// Same structure as the int8 bilinear shader but uses `texture()` so the
/// hardware interpolates (requires `GL_OES_texture_float_linear`). No
/// dequantization is needed; the weighted sum is squashed with a sigmoid and
/// fragments below 0.5 are discarded.
///
/// Layout: `GL_R32F` texture with 1 proto per layer (32 layers).
fn generate_proto_segmentation_shader_f32() -> &'static str {
    const FRAG: &str = "\
#version 300 es
precision highp float;
precision highp sampler2DArray;

uniform sampler2DArray proto_tex;  // 32 layers, R channel = 1 proto per layer
uniform vec4 mask_coeff[8];        // 32 coefficients packed as 8 vec4s
uniform vec4 colors[20];
uniform int class_index;
uniform int num_protos;

in vec2 tc;
out vec4 color;

void main() {
    float acc = 0.0;
    for (int k = 0; k < num_protos; k++) {
        // texture() returns bilinearly interpolated proto value (GL_LINEAR on R32F)
        float val = texture(proto_tex, vec3(tc, float(k))).r;
        acc += mask_coeff[k / 4][k % 4] * val;
    }
    float mask = 1.0 / (1.0 + exp(-acc));
    if (mask < 0.5) discard;
    color = colors[class_index % 20];
}
";
    FRAG
}
6508
/// Binary mask shader — int8, nearest-neighbor, logit threshold.
///
/// Outputs binary `logit > 0 ? 1.0 : 0.0` instead of `sigmoid(acc)`, which
/// avoids the per-fragment `exp()`. Used by `decode_masks_atlas` where only
/// mask presence matters. Protos are consumed four at a time via `dot()` with
/// the packed coefficient vec4s; when `num_protos` is not a multiple of 4 the
/// excess lanes are clamped to re-read the last proto.
/// NOTE(review): that clamping assumes the unused `mask_coeff` lanes are
/// zero-filled by the uploader — confirm against the Rust side.
fn generate_proto_mask_logit_shader_int8_nearest() -> &'static str {
    const FRAG: &str = "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;
uniform vec4 mask_coeff[8];
uniform int num_protos;
uniform float proto_scale;
uniform float coeff_sum_x_szp;

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    int ix = clamp(int(tc.x * float(tex_size.x)), 0, tex_size.x - 1);
    int iy = clamp(int(tc.y * float(tex_size.y)), 0, tex_size.y - 1);

    int groups = (num_protos + 3) / 4;
    float acc = 0.0;
    for (int i = 0; i < groups; i++) {
        int base = i * 4;
        vec4 raw = vec4(
            float(texelFetch(proto_tex, ivec3(ix, iy, min(base, num_protos - 1)), 0).r),
            float(texelFetch(proto_tex, ivec3(ix, iy, min(base + 1, num_protos - 1)), 0).r),
            float(texelFetch(proto_tex, ivec3(ix, iy, min(base + 2, num_protos - 1)), 0).r),
            float(texelFetch(proto_tex, ivec3(ix, iy, min(base + 3, num_protos - 1)), 0).r)
        );
        acc += dot(mask_coeff[i], raw);
    }
    float logit = acc * proto_scale + coeff_sum_x_szp;
    float mask = logit > 0.0 ? 1.0 : 0.0;
    color = vec4(mask, 0.0, 0.0, 1.0);
}
";
    FRAG
}
6553
/// Binary mask shader — int8, shader-based bilinear interpolation, logit
/// threshold.
///
/// Outputs binary `logit > 0 ? 1.0 : 0.0` instead of `sigmoid(acc)`. Used by
/// `decode_masks_atlas` for int8 models with bilinear interpolation. The four
/// neighbor texels are gathered per group of 4 protos and blended with the
/// `GL_LINEAR`-style weights before the coefficient `dot()`.
fn generate_proto_mask_logit_shader_int8_bilinear() -> &'static str {
    const FRAG: &str = "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;
uniform vec4 mask_coeff[8];
uniform int num_protos;
uniform float proto_scale;
uniform float coeff_sum_x_szp;

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    vec2 pos = tc * vec2(tex_size.xy) - 0.5;
    vec2 f = fract(pos);
    ivec2 p0 = ivec2(floor(pos));
    ivec2 p1 = p0 + 1;
    p0 = clamp(p0, ivec2(0), tex_size.xy - 1);
    p1 = clamp(p1, ivec2(0), tex_size.xy - 1);

    float w00 = (1.0 - f.x) * (1.0 - f.y);
    float w10 = f.x * (1.0 - f.y);
    float w01 = (1.0 - f.x) * f.y;
    float w11 = f.x * f.y;

    int groups = (num_protos + 3) / 4;
    float acc = 0.0;
    for (int i = 0; i < groups; i++) {
        int base = i * 4;
        int l0 = min(base, num_protos - 1);
        int l1 = min(base + 1, num_protos - 1);
        int l2 = min(base + 2, num_protos - 1);
        int l3 = min(base + 3, num_protos - 1);
        vec4 r00 = vec4(
            float(texelFetch(proto_tex, ivec3(p0.x, p0.y, l0), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p0.y, l1), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p0.y, l2), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p0.y, l3), 0).r)
        );
        vec4 r10 = vec4(
            float(texelFetch(proto_tex, ivec3(p1.x, p0.y, l0), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p0.y, l1), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p0.y, l2), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p0.y, l3), 0).r)
        );
        vec4 r01 = vec4(
            float(texelFetch(proto_tex, ivec3(p0.x, p1.y, l0), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p1.y, l1), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p1.y, l2), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p1.y, l3), 0).r)
        );
        vec4 r11 = vec4(
            float(texelFetch(proto_tex, ivec3(p1.x, p1.y, l0), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p1.y, l1), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p1.y, l2), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p1.y, l3), 0).r)
        );
        vec4 interp = r00 * w00 + r10 * w10 + r01 * w01 + r11 * w11;
        acc += dot(mask_coeff[i], interp);
    }
    float logit = acc * proto_scale + coeff_sum_x_szp;
    float mask = logit > 0.0 ? 1.0 : 0.0;
    color = vec4(mask, 0.0, 0.0, 1.0);
}
";
    FRAG
}
6629
/// Binary mask shader — f32 protos with hardware bilinear filtering, logit
/// threshold.
///
/// Outputs binary `acc > 0 ? 1.0 : 0.0` instead of `sigmoid(acc)`. Used by
/// `decode_masks_atlas` for f32 models; no dequantization terms are needed,
/// so the raw accumulated dot products are thresholded directly.
fn generate_proto_mask_logit_shader_f32() -> &'static str {
    const FRAG: &str = "\
#version 300 es
precision highp float;
precision highp sampler2DArray;

uniform sampler2DArray proto_tex;
uniform vec4 mask_coeff[8];
uniform int num_protos;

in vec2 tc;
out vec4 color;

void main() {
    int groups = (num_protos + 3) / 4;
    float acc = 0.0;
    for (int i = 0; i < groups; i++) {
        int base = i * 4;
        vec4 val = vec4(
            texture(proto_tex, vec3(tc, float(min(base, num_protos - 1)))).r,
            texture(proto_tex, vec3(tc, float(min(base + 1, num_protos - 1)))).r,
            texture(proto_tex, vec3(tc, float(min(base + 2, num_protos - 1)))).r,
            texture(proto_tex, vec3(tc, float(min(base + 3, num_protos - 1)))).r
        );
        acc += dot(mask_coeff[i], val);
    }
    float mask = acc > 0.0 ? 1.0 : 0.0;
    color = vec4(mask, 0.0, 0.0, 1.0);
}
";
    FRAG
}
6665
/// Solid-fill fragment shader: paints every fragment with
/// `colors[class_index % 20]`. Used for flat geometry such as box outlines.
fn generate_color_shader() -> &'static str {
    const FRAG: &str = "\
#version 300 es
precision mediump float;
uniform vec4 colors[20];
uniform int class_index;

out vec4 color;
void main() {
    int index = class_index % 20;
    color = colors[index];
}
";
    FRAG
}
6680
/// Packed RGB -> RGBA8 packing shader (2D texture source, pass 2).
///
/// Reads from an intermediate RGBA texture and packs 3 RGB channels into
/// RGBA8 output pixels: each output pixel stores 4 consecutive bytes of the
/// destination RGB buffer. Because 4 consecutive bytes span at most 2 source
/// pixels, only 2 texture fetches per fragment are needed (down from 4); the
/// channel selection depends on the byte phase (`base % 3`).
fn generate_packed_rgba8_shader_2d() -> &'static str {
    const FRAG: &str = "\
#version 300 es
precision highp float;
precision highp int;
uniform sampler2D tex;
out vec4 color;
void main() {
    // gl_FragCoord is at pixel center (n+0.5). Use floor() for robust
    // integer pixel index on all GPUs (Vivante, Mali, Adreno).
    int out_x = int(floor(gl_FragCoord.x));
    int out_y = int(floor(gl_FragCoord.y));
    int base = out_x * 4;
    // 4 consecutive byte indices map to at most 2 source pixels
    int px0 = base / 3;
    int px1 = (base + 3) / 3;
    vec4 s0 = texelFetch(tex, ivec2(px0, out_y), 0);
    vec4 s1 = (px1 != px0) ? texelFetch(tex, ivec2(px1, out_y), 0) : s0;
    // Extract channels based on phase (base % 3)
    int phase = base - px0 * 3;
    if (phase == 0) {
        color = vec4(s0.r, s0.g, s0.b, s1.r);
    } else if (phase == 1) {
        color = vec4(s0.g, s0.b, s1.r, s1.g);
    } else {
        color = vec4(s0.b, s1.r, s1.g, s1.b);
    }
}
";
    FRAG
}
6718
/// Packed RGB -> RGBA8 packing shader with int8 XOR 0x80 bias (2D source,
/// pass 2).
///
/// Same packing logic as [`generate_packed_rgba8_shader_2d`] but runs every
/// output channel through `int8_bias()`: quantize to a byte
/// (`floor(v * 255 + 0.5)`), add 128 modulo 256, renormalize. This is
/// bit-exact with the CPU `byte ^ 0x80` operation.
fn generate_packed_rgba8_int8_shader_2d() -> &'static str {
    const FRAG: &str = "\
#version 300 es
precision highp float;
precision highp int;
uniform sampler2D tex;
out vec4 color;

vec4 int8_bias(vec4 v) {
    vec4 q = floor(v * 255.0 + 0.5);
    return mod(q + 128.0, 256.0) / 255.0;
}

void main() {
    // gl_FragCoord is at pixel center (n+0.5). Use floor() for robust
    // integer pixel index on all GPUs (Vivante, Mali, Adreno).
    int out_x = int(floor(gl_FragCoord.x));
    int out_y = int(floor(gl_FragCoord.y));
    int base = out_x * 4;
    // 4 consecutive byte indices map to at most 2 source pixels
    int px0 = base / 3;
    int px1 = (base + 3) / 3;
    vec4 s0 = texelFetch(tex, ivec2(px0, out_y), 0);
    vec4 s1 = (px1 != px0) ? texelFetch(tex, ivec2(px1, out_y), 0) : s0;
    // Extract channels based on phase (base % 3), then apply int8 bias
    int phase = base - px0 * 3;
    if (phase == 0) {
        color = int8_bias(vec4(s0.r, s0.g, s0.b, s1.r));
    } else if (phase == 1) {
        color = int8_bias(vec4(s0.g, s0.b, s1.r, s1.g));
    } else {
        color = int8_bias(vec4(s0.b, s1.r, s1.g, s1.b));
    }
}
";
    FRAG
}
6760
6761#[cfg(test)]
6762#[cfg(feature = "opengl")]
6763mod gl_tests {
6764    use super::*;
6765    use crate::{TensorImage, BGRA, RGBA};
6766    #[cfg(feature = "dma_test_formats")]
6767    use crate::{NV12, YUYV};
6768    #[cfg(feature = "dma_test_formats")]
6769    use edgefirst_tensor::{is_dma_available, TensorMemory};
6770    use edgefirst_tensor::{TensorMapTrait, TensorTrait};
6771    use image::buffer::ConvertBuffer;
6772    use ndarray::Array3;
6773
6774    #[test]
6775    fn test_segmentation() {
6776        use edgefirst_decoder::Segmentation;
6777
6778        if !is_opengl_available() {
6779            eprintln!("SKIPPED: {} - OpenGL not available", function!());
6780            return;
6781        }
6782
6783        let mut image = TensorImage::load(
6784            include_bytes!("../../../testdata/giraffe.jpg"),
6785            Some(RGBA),
6786            None,
6787        )
6788        .unwrap();
6789
6790        let mut segmentation = Array3::from_shape_vec(
6791            (2, 160, 160),
6792            include_bytes!("../../../testdata/modelpack_seg_2x160x160.bin").to_vec(),
6793        )
6794        .unwrap();
6795        segmentation.swap_axes(0, 1);
6796        segmentation.swap_axes(1, 2);
6797        let segmentation = segmentation.as_standard_layout().to_owned();
6798
6799        let seg = Segmentation {
6800            segmentation,
6801            xmin: 0.0,
6802            ymin: 0.0,
6803            xmax: 1.0,
6804            ymax: 1.0,
6805        };
6806
6807        let mut renderer = GLProcessorThreaded::new(None).unwrap();
6808        renderer.draw_masks(&mut image, &[], &[seg]).unwrap();
6809    }
6810
6811    #[test]
6812    fn test_segmentation_mem() {
6813        use edgefirst_decoder::Segmentation;
6814
6815        if !is_opengl_available() {
6816            eprintln!("SKIPPED: {} - OpenGL not available", function!());
6817            return;
6818        }
6819
6820        let mut image = TensorImage::load(
6821            include_bytes!("../../../testdata/giraffe.jpg"),
6822            Some(RGBA),
6823            Some(edgefirst_tensor::TensorMemory::Mem),
6824        )
6825        .unwrap();
6826
6827        let mut segmentation = Array3::from_shape_vec(
6828            (2, 160, 160),
6829            include_bytes!("../../../testdata/modelpack_seg_2x160x160.bin").to_vec(),
6830        )
6831        .unwrap();
6832        segmentation.swap_axes(0, 1);
6833        segmentation.swap_axes(1, 2);
6834        let segmentation = segmentation.as_standard_layout().to_owned();
6835
6836        let seg = Segmentation {
6837            segmentation,
6838            xmin: 0.0,
6839            ymin: 0.0,
6840            xmax: 1.0,
6841            ymax: 1.0,
6842        };
6843
6844        let mut renderer = GLProcessorThreaded::new(None).unwrap();
6845        renderer.draw_masks(&mut image, &[], &[seg]).unwrap();
6846    }
6847
6848    #[test]
6849    fn test_segmentation_yolo() {
6850        use edgefirst_decoder::Segmentation;
6851        use ndarray::Array3;
6852
6853        if !is_opengl_available() {
6854            eprintln!("SKIPPED: {} - OpenGL not available", function!());
6855            return;
6856        }
6857
6858        let mut image = TensorImage::load(
6859            include_bytes!("../../../testdata/giraffe.jpg"),
6860            Some(RGBA),
6861            None,
6862        )
6863        .unwrap();
6864
6865        let segmentation = Array3::from_shape_vec(
6866            (76, 55, 1),
6867            include_bytes!("../../../testdata/yolov8_seg_crop_76x55.bin").to_vec(),
6868        )
6869        .unwrap();
6870
6871        let detect = DetectBox {
6872            bbox: [0.59375, 0.25, 0.9375, 0.725].into(),
6873            score: 0.99,
6874            label: 1,
6875        };
6876
6877        let seg = Segmentation {
6878            segmentation,
6879            xmin: 0.59375,
6880            ymin: 0.25,
6881            xmax: 0.9375,
6882            ymax: 0.725,
6883        };
6884
6885        let mut renderer = GLProcessorThreaded::new(None).unwrap();
6886        renderer
6887            .set_class_colors(&[[255, 255, 0, 233], [128, 128, 255, 100]])
6888            .unwrap();
6889        renderer.draw_masks(&mut image, &[detect], &[seg]).unwrap();
6890
6891        let expected = TensorImage::load(
6892            include_bytes!("../../../testdata/output_render_gl.jpg"),
6893            Some(RGBA),
6894            None,
6895        )
6896        .unwrap();
6897
6898        compare_images(&image, &expected, 0.99, function!());
6899    }
6900
6901    #[test]
6902    fn test_boxes() {
6903        use edgefirst_decoder::DetectBox;
6904
6905        if !is_opengl_available() {
6906            eprintln!("SKIPPED: {} - OpenGL not available", function!());
6907            return;
6908        }
6909
6910        let mut image = TensorImage::load(
6911            include_bytes!("../../../testdata/giraffe.jpg"),
6912            Some(RGBA),
6913            None,
6914        )
6915        .unwrap();
6916
6917        let detect = DetectBox {
6918            bbox: [0.59375, 0.25, 0.9375, 0.725].into(),
6919            score: 0.99,
6920            label: 0,
6921        };
6922        let mut renderer = GLProcessorThreaded::new(None).unwrap();
6923        renderer
6924            .set_class_colors(&[[255, 255, 0, 233], [128, 128, 255, 100]])
6925            .unwrap();
6926        renderer.draw_masks(&mut image, &[detect], &[]).unwrap();
6927    }
6928
6929    static GL_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
6930    // Helper function to check if OpenGL is available
6931    fn is_opengl_available() -> bool {
6932        #[cfg(all(target_os = "linux", feature = "opengl"))]
6933        {
6934            *GL_AVAILABLE.get_or_init(|| GLProcessorThreaded::new(None).is_ok())
6935        }
6936
6937        #[cfg(not(all(target_os = "linux", feature = "opengl")))]
6938        {
6939            false
6940        }
6941    }
6942
6943    fn compare_images(img1: &TensorImage, img2: &TensorImage, threshold: f64, name: &str) {
6944        assert_eq!(img1.height(), img2.height(), "Heights differ");
6945        assert_eq!(img1.width(), img2.width(), "Widths differ");
6946        assert_eq!(img1.fourcc(), img2.fourcc(), "FourCC differ");
6947        assert!(
6948            matches!(img1.fourcc(), RGB | RGBA | GREY | PLANAR_RGB),
6949            "FourCC must be RGB or RGBA for comparison"
6950        );
6951
6952        let image1 = match img1.fourcc() {
6953            RGB => image::RgbImage::from_vec(
6954                img1.width() as u32,
6955                img1.height() as u32,
6956                img1.tensor().map().unwrap().to_vec(),
6957            )
6958            .unwrap(),
6959            RGBA => image::RgbaImage::from_vec(
6960                img1.width() as u32,
6961                img1.height() as u32,
6962                img1.tensor().map().unwrap().to_vec(),
6963            )
6964            .unwrap()
6965            .convert(),
6966            GREY => image::GrayImage::from_vec(
6967                img1.width() as u32,
6968                img1.height() as u32,
6969                img1.tensor().map().unwrap().to_vec(),
6970            )
6971            .unwrap()
6972            .convert(),
6973            PLANAR_RGB => image::GrayImage::from_vec(
6974                img1.width() as u32,
6975                (img1.height() * 3) as u32,
6976                img1.tensor().map().unwrap().to_vec(),
6977            )
6978            .unwrap()
6979            .convert(),
6980            _ => return,
6981        };
6982
6983        let image2 = match img2.fourcc() {
6984            RGB => image::RgbImage::from_vec(
6985                img2.width() as u32,
6986                img2.height() as u32,
6987                img2.tensor().map().unwrap().to_vec(),
6988            )
6989            .unwrap(),
6990            RGBA => image::RgbaImage::from_vec(
6991                img2.width() as u32,
6992                img2.height() as u32,
6993                img2.tensor().map().unwrap().to_vec(),
6994            )
6995            .unwrap()
6996            .convert(),
6997            GREY => image::GrayImage::from_vec(
6998                img2.width() as u32,
6999                img2.height() as u32,
7000                img2.tensor().map().unwrap().to_vec(),
7001            )
7002            .unwrap()
7003            .convert(),
7004            PLANAR_RGB => image::GrayImage::from_vec(
7005                img2.width() as u32,
7006                (img2.height() * 3) as u32,
7007                img2.tensor().map().unwrap().to_vec(),
7008            )
7009            .unwrap()
7010            .convert(),
7011            _ => return,
7012        };
7013
7014        let similarity = image_compare::rgb_similarity_structure(
7015            &image_compare::Algorithm::RootMeanSquared,
7016            &image1,
7017            &image2,
7018        )
7019        .expect("Image Comparison failed");
7020        if similarity.score < threshold {
7021            // image1.save(format!("{name}_1.png"));
7022            // image2.save(format!("{name}_2.png"));
7023            similarity
7024                .image
7025                .to_color_map()
7026                .save(format!("{name}.png"))
7027                .unwrap();
7028            panic!(
7029                "{name}: converted image and target image have similarity score too low: {} < {}",
7030                similarity.score, threshold
7031            )
7032        }
7033    }
7034
7035    // =========================================================================
7036    // NV12 Reference Validation Tests
7037    // These tests compare OpenGL NV12 conversions against ffmpeg-generated
7038    // references
7039    // =========================================================================
7040
7041    #[cfg(feature = "dma_test_formats")]
7042    fn load_raw_image(
7043        width: usize,
7044        height: usize,
7045        fourcc: FourCharCode,
7046        memory: Option<TensorMemory>,
7047        bytes: &[u8],
7048    ) -> Result<TensorImage, crate::Error> {
7049        let img = TensorImage::new(width, height, fourcc, memory)?;
7050        let mut map = img.tensor().map()?;
7051        map.as_mut_slice()[..bytes.len()].copy_from_slice(bytes);
7052        Ok(img)
7053    }
7054
7055    /// Test OpenGL NV12→RGBA conversion against ffmpeg reference
7056    #[test]
7057    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7058    fn test_opengl_nv12_to_rgba_reference() {
7059        if !is_dma_available() {
7060            return;
7061        }
7062        // Load NV12 source with DMA
7063        let src = load_raw_image(
7064            1280,
7065            720,
7066            NV12,
7067            Some(TensorMemory::Dma),
7068            include_bytes!("../../../testdata/camera720p.nv12"),
7069        )
7070        .unwrap();
7071
7072        // Load RGBA reference (ffmpeg-generated)
7073        let reference = load_raw_image(
7074            1280,
7075            720,
7076            RGBA,
7077            None,
7078            include_bytes!("../../../testdata/camera720p.rgba"),
7079        )
7080        .unwrap();
7081
7082        // Convert using OpenGL
7083        let mut dst = TensorImage::new(1280, 720, RGBA, Some(TensorMemory::Dma)).unwrap();
7084        let mut gl = GLProcessorThreaded::new(None).unwrap();
7085        gl.convert(&src, &mut dst, Rotation::None, Flip::None, Crop::no_crop())
7086            .unwrap();
7087
7088        // Copy to CPU for comparison
7089        let cpu_dst = TensorImage::new(1280, 720, RGBA, None).unwrap();
7090        cpu_dst
7091            .tensor()
7092            .map()
7093            .unwrap()
7094            .as_mut_slice()
7095            .copy_from_slice(dst.tensor().map().unwrap().as_slice());
7096
7097        compare_images(&reference, &cpu_dst, 0.98, "opengl_nv12_to_rgba_reference");
7098    }
7099
7100    /// Test OpenGL YUYV→RGBA conversion against ffmpeg reference
7101    #[test]
7102    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7103    fn test_opengl_yuyv_to_rgba_reference() {
7104        if !is_dma_available() {
7105            return;
7106        }
7107        // Load YUYV source with DMA
7108        let src = load_raw_image(
7109            1280,
7110            720,
7111            YUYV,
7112            Some(TensorMemory::Dma),
7113            include_bytes!("../../../testdata/camera720p.yuyv"),
7114        )
7115        .unwrap();
7116
7117        // Load RGBA reference (ffmpeg-generated)
7118        let reference = load_raw_image(
7119            1280,
7120            720,
7121            RGBA,
7122            None,
7123            include_bytes!("../../../testdata/camera720p.rgba"),
7124        )
7125        .unwrap();
7126
7127        // Convert using OpenGL
7128        let mut dst = TensorImage::new(1280, 720, RGBA, Some(TensorMemory::Dma)).unwrap();
7129        let mut gl = GLProcessorThreaded::new(None).unwrap();
7130        gl.convert(&src, &mut dst, Rotation::None, Flip::None, Crop::no_crop())
7131            .unwrap();
7132
7133        // Copy to CPU for comparison
7134        let cpu_dst = TensorImage::new(1280, 720, RGBA, None).unwrap();
7135        cpu_dst
7136            .tensor()
7137            .map()
7138            .unwrap()
7139            .as_mut_slice()
7140            .copy_from_slice(dst.tensor().map().unwrap().as_slice());
7141
7142        compare_images(&reference, &cpu_dst, 0.98, "opengl_yuyv_to_rgba_reference");
7143    }
7144
7145    // =========================================================================
7146    // EGL Display Probe & Override Tests
7147    // =========================================================================
7148
7149    /// Validate that probe_egl_displays() discovers available display types
7150    /// and returns them in priority order (GBM first).
7151    ///
7152    /// On headless i.MX hardware, GBM and PlatformDevice are typically
7153    /// available. Default requires a running compositor (Wayland/X11) and
7154    /// may not be present on headless targets.
7155    #[test]
7156    fn test_probe_egl_displays() {
7157        let displays = match probe_egl_displays() {
7158            Ok(d) => d,
7159            Err(e) => {
7160                eprintln!("SKIPPED: {} - EGL not available: {e:?}", function!());
7161                return;
7162            }
7163        };
7164
7165        if displays.is_empty() {
7166            eprintln!("SKIPPED: {} - No EGL displays available", function!());
7167            return;
7168        }
7169
7170        let kinds: Vec<_> = displays.iter().map(|d| d.kind).collect();
7171        eprintln!("Probed EGL displays: {kinds:?}");
7172        for d in &displays {
7173            eprintln!("  {:?}: {}", d.kind, d.description);
7174        }
7175
7176        // Verify priority ordering: PlatformDevice > GBM > Default.
7177        // Not all display types are available on every system, but the
7178        // ones that are present must appear in this order.
7179        let priority = |k: &EglDisplayKind| match k {
7180            EglDisplayKind::PlatformDevice => 0,
7181            EglDisplayKind::Gbm => 1,
7182            EglDisplayKind::Default => 2,
7183        };
7184        for w in kinds.windows(2) {
7185            assert!(
7186                priority(&w[0]) < priority(&w[1]),
7187                "Display ordering violated: {:?} should come after {:?}",
7188                w[1],
7189                w[0],
7190            );
7191        }
7192    }
7193
7194    /// Validate that explicitly selecting each available display kind via
7195    /// GLProcessorThreaded::new(Some(kind)) succeeds and produces a working
7196    /// converter.
7197    #[test]
7198    fn test_override_each_display_kind() {
7199        let displays = match probe_egl_displays() {
7200            Ok(d) => d,
7201            Err(e) => {
7202                eprintln!("SKIPPED: {} - EGL not available: {e:?}", function!());
7203                return;
7204            }
7205        };
7206
7207        if displays.is_empty() {
7208            eprintln!("SKIPPED: {} - No EGL displays available", function!());
7209            return;
7210        }
7211
7212        for display in &displays {
7213            eprintln!(
7214                "Testing override: {:?} ({})",
7215                display.kind, display.description
7216            );
7217            let mut gl = GLProcessorThreaded::new(Some(display.kind)).unwrap_or_else(|e| {
7218                panic!(
7219                    "GLProcessorThreaded::new(Some({:?})) failed: {e:?}",
7220                    display.kind
7221                )
7222            });
7223
7224            // Smoke test: do a simple RGBA → RGBA conversion to verify the
7225            // GL context is fully functional.
7226            let src = TensorImage::load(
7227                include_bytes!("../../../testdata/zidane.jpg"),
7228                Some(RGBA),
7229                None,
7230            )
7231            .unwrap();
7232            let mut dst = TensorImage::new(320, 240, RGBA, None).unwrap();
7233            gl.convert(&src, &mut dst, Rotation::None, Flip::None, Crop::no_crop())
7234                .unwrap_or_else(|e| {
7235                    panic!("convert() with {:?} display failed: {e:?}", display.kind)
7236                });
7237            eprintln!("  {:?} display: convert OK", display.kind);
7238        }
7239    }
7240
7241    /// Validate that requesting a display kind that doesn't exist on the
7242    /// system returns an error rather than falling back silently.
7243    #[test]
7244    fn test_override_unavailable_display_errors() {
7245        let displays = match probe_egl_displays() {
7246            Ok(d) => d,
7247            Err(e) => {
7248                eprintln!("SKIPPED: {} - EGL not available: {e:?}", function!());
7249                return;
7250            }
7251        };
7252        let available_kinds: Vec<_> = displays.iter().map(|d| d.kind).collect();
7253
7254        // Find a kind that is NOT available; if all three are available,
7255        // this test has nothing to verify — skip it.
7256        let unavailable = [
7257            EglDisplayKind::PlatformDevice,
7258            EglDisplayKind::Gbm,
7259            EglDisplayKind::Default,
7260        ]
7261        .into_iter()
7262        .find(|k| !available_kinds.contains(k));
7263
7264        if let Some(kind) = unavailable {
7265            eprintln!("Testing override with unavailable kind: {kind:?}");
7266            let result = GLProcessorThreaded::new(Some(kind));
7267            assert!(
7268                result.is_err(),
7269                "Expected error for unavailable display kind {kind:?}, got Ok"
7270            );
7271            eprintln!("  Correctly returned error: {:?}", result.unwrap_err());
7272        } else {
7273            eprintln!(
7274                "SKIPPED: {} - All three display kinds are available",
7275                function!()
7276            );
7277        }
7278    }
7279
7280    /// Validate that auto-detection (None) still works — this is the existing
7281    /// default behaviour and must not regress.
7282    #[test]
7283    fn test_auto_detect_display() {
7284        if !is_opengl_available() {
7285            eprintln!("SKIPPED: {} - OpenGL not available", function!());
7286            return;
7287        }
7288
7289        let mut gl = GLProcessorThreaded::new(None).expect("auto-detect should succeed");
7290        let src = TensorImage::load(
7291            include_bytes!("../../../testdata/zidane.jpg"),
7292            Some(RGBA),
7293            None,
7294        )
7295        .unwrap();
7296        let mut dst = TensorImage::new(320, 240, RGBA, None).unwrap();
7297        gl.convert(&src, &mut dst, Rotation::None, Flip::None, Crop::no_crop())
7298            .expect("auto-detect convert should succeed");
7299    }
7300
7301    #[test]
7302    fn test_packed_rgb_width_constraint() {
7303        // Standard ML model input widths — all satisfy W*3 % 4 == 0
7304        assert_eq!((640usize * 3) % 4, 0);
7305        assert_eq!((320usize * 3) % 4, 0);
7306        assert_eq!((1280usize * 3) % 4, 0);
7307
7308        // Non-divisible widths should be rejected
7309        assert_ne!((322usize * 3) % 4, 0);
7310        assert_ne!((333usize * 3) % 4, 0);
7311    }
7312
7313    // =========================================================================
7314    // Packed RGB Correctness Tests (two-pass pipeline)
7315    // These tests compare GL RGBA output (alpha stripped) against GL packed
7316    // RGB output. Both use the same GPU color conversion, so differences
7317    // isolate packing shader bugs rather than CPU-vs-GPU YUV conversion.
7318    // They require DMA + OpenGL hardware (on-target only).
7319    // =========================================================================
7320
7321    /// Compare two byte slices pixel-by-pixel with tolerance.
7322    /// Panics with details if any byte differs by more than `tolerance`.
7323    #[cfg(feature = "dma_test_formats")]
7324    fn assert_pixels_match(expected: &[u8], actual: &[u8], tolerance: u8) {
7325        assert_eq!(expected.len(), actual.len(), "Buffer size mismatch");
7326        let mut max_diff: u8 = 0;
7327        let mut diff_count: usize = 0;
7328        let mut first_diff_idx = None;
7329        for (i, (&e, &a)) in expected.iter().zip(actual.iter()).enumerate() {
7330            let diff = (e as i16 - a as i16).unsigned_abs() as u8;
7331            if diff > tolerance {
7332                diff_count += 1;
7333                if first_diff_idx.is_none() {
7334                    first_diff_idx = Some(i);
7335                }
7336            }
7337            max_diff = max_diff.max(diff);
7338        }
7339        assert!(
7340            diff_count == 0,
7341            "Pixel mismatch: {diff_count} bytes differ (max_diff={max_diff}, first at index {})",
7342            first_diff_idx.unwrap_or(0)
7343        );
7344    }
7345
7346    /// Build a letterbox crop that fits src into dst_w x dst_h, preserving aspect ratio.
7347    #[cfg(feature = "dma_test_formats")]
7348    fn letterbox_crop(src_w: usize, src_h: usize, dst_w: usize, dst_h: usize) -> Crop {
7349        let src_aspect = src_w as f64 / src_h as f64;
7350        let dst_aspect = dst_w as f64 / dst_h as f64;
7351        let (new_w, new_h) = if src_aspect > dst_aspect {
7352            let new_h = (dst_w as f64 / src_aspect).round() as usize;
7353            (dst_w, new_h)
7354        } else {
7355            let new_w = (dst_h as f64 * src_aspect).round() as usize;
7356            (new_w, dst_h)
7357        };
7358        let left = (dst_w - new_w) / 2;
7359        let top = (dst_h - new_h) / 2;
7360        Crop::new()
7361            .with_dst_rect(Some(crate::Rect::new(left, top, new_w, new_h)))
7362            .with_dst_color(Some([114, 114, 114, 255]))
7363    }
7364
7365    /// Strip alpha from RGBA bytes → packed RGB bytes.
7366    #[cfg(feature = "dma_test_formats")]
7367    fn rgba_to_rgb(rgba: &[u8]) -> Vec<u8> {
7368        assert_eq!(
7369            rgba.len() % 4,
7370            0,
7371            "RGBA buffer length must be divisible by 4"
7372        );
7373        let mut rgb = Vec::with_capacity(rgba.len() / 4 * 3);
7374        for pixel in rgba.chunks_exact(4) {
7375            rgb.push(pixel[0]);
7376            rgb.push(pixel[1]);
7377            rgb.push(pixel[2]);
7378        }
7379        rgb
7380    }
7381
7382    /// Convert uint8 RGB bytes to int8 (XOR 0x80 each byte).
7383    #[cfg(feature = "dma_test_formats")]
7384    fn uint8_to_int8(data: &[u8]) -> Vec<u8> {
7385        data.iter().map(|&b| b ^ 0x80).collect()
7386    }
7387
7388    /// YUYV 1080p → RGB 640x640 with letterbox (two-pass packed RGB pipeline).
7389    /// Compares GL RGBA (alpha-stripped) against GL packed RGB to validate packing.
7390    #[test]
7391    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7392    fn test_opengl_rgb_correctness() {
7393        if !is_dma_available() {
7394            return;
7395        }
7396        let src_dma = load_raw_image(
7397            1920,
7398            1080,
7399            YUYV,
7400            Some(TensorMemory::Dma),
7401            include_bytes!("../../../testdata/camera1080p.yuyv"),
7402        )
7403        .unwrap();
7404
7405        let crop = letterbox_crop(1920, 1080, 640, 640);
7406        let mut gl = GLProcessorThreaded::new(None).unwrap();
7407
7408        // GL RGBA reference
7409        let mut dst_rgba = TensorImage::new(640, 640, RGBA, Some(TensorMemory::Dma)).unwrap();
7410        gl.convert(&src_dma, &mut dst_rgba, Rotation::None, Flip::None, crop)
7411            .unwrap();
7412
7413        // GL packed RGB output
7414        let mut dst_rgb = TensorImage::new(640, 640, RGB, Some(TensorMemory::Dma)).unwrap();
7415        gl.convert(&src_dma, &mut dst_rgb, Rotation::None, Flip::None, crop)
7416            .unwrap();
7417
7418        let rgba_data = dst_rgba.tensor().map().unwrap();
7419        let expected_rgb = rgba_to_rgb(rgba_data.as_slice());
7420        let gl_data = dst_rgb.tensor().map().unwrap();
7421        assert_pixels_match(&expected_rgb, gl_data.as_slice(), 1);
7422    }
7423
7424    /// YUYV 1080p → RGB_INT8 640x640 with letterbox.
7425    /// Compares GL RGBA (alpha-stripped, XOR 0x80) against GL packed RGB_INT8.
7426    #[test]
7427    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7428    fn test_opengl_rgb_int8_correctness() {
7429        if !is_dma_available() {
7430            return;
7431        }
7432        let src_dma = load_raw_image(
7433            1920,
7434            1080,
7435            YUYV,
7436            Some(TensorMemory::Dma),
7437            include_bytes!("../../../testdata/camera1080p.yuyv"),
7438        )
7439        .unwrap();
7440
7441        let crop = letterbox_crop(1920, 1080, 640, 640);
7442        // Use GLProcessorST with direct RGB disabled to validate two-pass int8
7443        // pipeline against RGBA reference. The direct path renders to a different
7444        // framebuffer format (RGB8 renderbuffer vs RGBA8 texture) which produces
7445        // different YUV interpolation results; it is validated separately by
7446        // test_opengl_rgb_direct_matches_two_pass.
7447        let mut gl = match GLProcessorST::new(None) {
7448            Ok(gl) => gl,
7449            Err(e) => {
7450                eprintln!("SKIPPED: {} - GL not available: {e}", function!());
7451                return;
7452            }
7453        };
7454        gl.support_rgb_direct = false;
7455
7456        // GL RGBA reference
7457        let mut dst_rgba = TensorImage::new(640, 640, RGBA, Some(TensorMemory::Dma)).unwrap();
7458        gl.convert(&src_dma, &mut dst_rgba, Rotation::None, Flip::None, crop)
7459            .unwrap();
7460
7461        // GL packed RGB_INT8 output (two-pass path)
7462        let mut dst_rgb = TensorImage::new(640, 640, RGB_INT8, Some(TensorMemory::Dma)).unwrap();
7463        gl.convert(&src_dma, &mut dst_rgb, Rotation::None, Flip::None, crop)
7464            .unwrap();
7465
7466        let rgba_data = dst_rgba.tensor().map().unwrap();
7467        let expected_rgb = uint8_to_int8(&rgba_to_rgb(rgba_data.as_slice()));
7468        let gl_data = dst_rgb.tensor().map().unwrap();
7469        assert_pixels_match(&expected_rgb, gl_data.as_slice(), 1);
7470    }
7471
7472    /// YUYV 1080p → RGB 1920x1080 (no letterbox, same size).
7473    /// Compares GL RGBA (alpha-stripped) against GL packed RGB without scaling.
7474    #[test]
7475    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7476    fn test_opengl_rgb_no_letterbox_correctness() {
7477        if !is_dma_available() {
7478            return;
7479        }
7480        let src_dma = load_raw_image(
7481            1920,
7482            1080,
7483            YUYV,
7484            Some(TensorMemory::Dma),
7485            include_bytes!("../../../testdata/camera1080p.yuyv"),
7486        )
7487        .unwrap();
7488
7489        let mut gl = GLProcessorThreaded::new(None).unwrap();
7490
7491        // GL RGBA reference (no letterbox — 1920 satisfies W*3 % 4 == 0)
7492        let mut dst_rgba = TensorImage::new(1920, 1080, RGBA, Some(TensorMemory::Dma)).unwrap();
7493        gl.convert(
7494            &src_dma,
7495            &mut dst_rgba,
7496            Rotation::None,
7497            Flip::None,
7498            Crop::no_crop(),
7499        )
7500        .unwrap();
7501
7502        // GL packed RGB output
7503        let mut dst_rgb = TensorImage::new(1920, 1080, RGB, Some(TensorMemory::Dma)).unwrap();
7504        gl.convert(
7505            &src_dma,
7506            &mut dst_rgb,
7507            Rotation::None,
7508            Flip::None,
7509            Crop::no_crop(),
7510        )
7511        .unwrap();
7512
7513        let rgba_data = dst_rgba.tensor().map().unwrap();
7514        let expected_rgb = rgba_to_rgb(rgba_data.as_slice());
7515        let gl_data = dst_rgb.tensor().map().unwrap();
7516        assert_pixels_match(&expected_rgb, gl_data.as_slice(), 1);
7517    }
7518
7519    // =========================================================================
7520    // Direct RGB Render Path Tests
7521    // These tests exercise the single-pass BGR888 renderbuffer path added by
7522    // the GL cache work (EDGEAI-776). They require DMA + OpenGL hardware.
7523    // =========================================================================
7524
7525    /// Verify that the direct RGB probe runs without crashing.
7526    #[test]
7527    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7528    fn test_probe_rgb_direct_support() {
7529        if !is_dma_available() {
7530            eprintln!("SKIPPED: {} - DMA not available", function!());
7531            return;
7532        }
7533        let gl = match GLProcessorST::new(None) {
7534            Ok(gl) => gl,
7535            Err(e) => {
7536                eprintln!("SKIPPED: {} - GL not available: {e}", function!());
7537                return;
7538            }
7539        };
7540        // The probe runs during new(). Just check the field is set.
7541        eprintln!(
7542            "support_rgb_direct = {} (probe completed without crash)",
7543            gl.support_rgb_direct
7544        );
7545    }
7546
7547    /// Compare direct RGB path against two-pass path pixel-for-pixel.
7548    /// If GPU doesn't support direct RGB, this test is a no-op.
7549    #[test]
7550    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7551    fn test_opengl_rgb_direct_matches_two_pass() {
7552        if !is_dma_available() {
7553            eprintln!("SKIPPED: {} - DMA not available", function!());
7554            return;
7555        }
7556        let mut gl = match GLProcessorST::new(None) {
7557            Ok(gl) => gl,
7558            Err(e) => {
7559                eprintln!("SKIPPED: {} - GL not available: {e}", function!());
7560                return;
7561            }
7562        };
7563        if !gl.support_rgb_direct {
7564            eprintln!("SKIPPED: {} - GPU does not support direct RGB", function!());
7565            return;
7566        }
7567
7568        // Create RGBA source with deterministic pattern
7569        // Use 640x480 source → 320x320 output so pitch (320*3=960) is 64-byte aligned
7570        // for Mali GPU DMA-buf import requirements.
7571        let src = TensorImage::new(640, 480, RGBA, Some(TensorMemory::Dma)).unwrap();
7572        {
7573            let mut map = src.tensor().map().unwrap();
7574            for (i, byte) in map.as_mut_slice().iter_mut().enumerate() {
7575                *byte = (i % 251) as u8; // deterministic pattern
7576            }
7577        }
7578
7579        let crop = crate::Crop {
7580            src_rect: None,
7581            dst_rect: None,
7582            dst_color: None,
7583        };
7584
7585        // Direct path (support_rgb_direct = true)
7586        let mut dst_direct = TensorImage::new(320, 320, RGB, Some(TensorMemory::Dma)).unwrap();
7587        gl.convert(&src, &mut dst_direct, Rotation::None, Flip::None, crop)
7588            .unwrap();
7589
7590        // Force two-pass path
7591        gl.support_rgb_direct = false;
7592        let mut dst_twop = TensorImage::new(320, 320, RGB, Some(TensorMemory::Dma)).unwrap();
7593        gl.convert(&src, &mut dst_twop, Rotation::None, Flip::None, crop)
7594            .unwrap();
7595        gl.support_rgb_direct = true;
7596
7597        // Compare
7598        let map_direct = dst_direct.tensor().map().unwrap();
7599        let map_twop = dst_twop.tensor().map().unwrap();
7600        // Allow ±1 tolerance for potential rounding differences
7601        let mut max_diff = 0i32;
7602        for (a, b) in map_direct.as_slice().iter().zip(map_twop.as_slice().iter()) {
7603            let diff = (*a as i32 - *b as i32).abs();
7604            max_diff = max_diff.max(diff);
7605        }
7606        eprintln!("RGB direct vs two-pass max pixel diff: {max_diff}");
7607        assert!(max_diff <= 1, "Pixel mismatch > 1: max_diff={max_diff}");
7608    }
7609
7610    // ---- BGRA destination tests ----
7611
7612    /// Test OpenGL NV12→BGRA conversion with DMA buffers.
7613    /// Compares against NV12→RGBA by verifying R↔B swap.
7614    #[test]
7615    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7616    fn test_opengl_nv12_to_bgra() {
7617        if !is_dma_available() {
7618            eprintln!("SKIPPED: test_opengl_nv12_to_bgra - DMA not available");
7619            return;
7620        }
7621
7622        let src = load_raw_image(
7623            1280,
7624            720,
7625            NV12,
7626            Some(TensorMemory::Dma),
7627            include_bytes!("../../../testdata/camera720p.nv12"),
7628        )
7629        .unwrap();
7630
7631        let mut gl = GLProcessorThreaded::new(None).unwrap();
7632
7633        // Convert to RGBA as reference
7634        let mut rgba_dst = TensorImage::new(1280, 720, RGBA, Some(TensorMemory::Dma)).unwrap();
7635        gl.convert(
7636            &src,
7637            &mut rgba_dst,
7638            Rotation::None,
7639            Flip::None,
7640            Crop::no_crop(),
7641        )
7642        .unwrap();
7643
7644        // Convert to BGRA
7645        let mut bgra_dst = TensorImage::new(1280, 720, BGRA, Some(TensorMemory::Dma)).unwrap();
7646        gl.convert(
7647            &src,
7648            &mut bgra_dst,
7649            Rotation::None,
7650            Flip::None,
7651            Crop::no_crop(),
7652        )
7653        .unwrap();
7654
7655        // Compare: BGRA[B,G,R,A] should match RGBA[R,G,B,A] with R↔B swapped
7656        let bgra_map = bgra_dst.tensor().map().unwrap();
7657        let rgba_map = rgba_dst.tensor().map().unwrap();
7658        let bgra_buf = bgra_map.as_slice();
7659        let rgba_buf = rgba_map.as_slice();
7660
7661        assert_eq!(bgra_buf.len(), rgba_buf.len());
7662        let mut max_diff = 0i32;
7663        for (bc, rc) in bgra_buf.chunks_exact(4).zip(rgba_buf.chunks_exact(4)) {
7664            max_diff = max_diff.max((bc[0] as i32 - rc[2] as i32).abs()); // B
7665            max_diff = max_diff.max((bc[1] as i32 - rc[1] as i32).abs()); // G
7666            max_diff = max_diff.max((bc[2] as i32 - rc[0] as i32).abs()); // R
7667            max_diff = max_diff.max((bc[3] as i32 - rc[3] as i32).abs()); // A
7668        }
7669        eprintln!("NV12→BGRA vs NV12→RGBA max channel diff: {max_diff}");
7670        assert!(
7671            max_diff <= 1,
7672            "BGRA/RGBA channel mismatch > 1: max_diff={max_diff}"
7673        );
7674    }
7675
7676    /// Test OpenGL YUYV→BGRA conversion with DMA buffers.
7677    #[test]
7678    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7679    fn test_opengl_yuyv_to_bgra() {
7680        if !is_dma_available() {
7681            eprintln!("SKIPPED: test_opengl_yuyv_to_bgra - DMA not available");
7682            return;
7683        }
7684
7685        let src = load_raw_image(
7686            1280,
7687            720,
7688            YUYV,
7689            Some(TensorMemory::Dma),
7690            include_bytes!("../../../testdata/camera720p.yuyv"),
7691        )
7692        .unwrap();
7693
7694        let mut gl = GLProcessorThreaded::new(None).unwrap();
7695
7696        let mut rgba_dst = TensorImage::new(1280, 720, RGBA, Some(TensorMemory::Dma)).unwrap();
7697        gl.convert(
7698            &src,
7699            &mut rgba_dst,
7700            Rotation::None,
7701            Flip::None,
7702            Crop::no_crop(),
7703        )
7704        .unwrap();
7705
7706        let mut bgra_dst = TensorImage::new(1280, 720, BGRA, Some(TensorMemory::Dma)).unwrap();
7707        gl.convert(
7708            &src,
7709            &mut bgra_dst,
7710            Rotation::None,
7711            Flip::None,
7712            Crop::no_crop(),
7713        )
7714        .unwrap();
7715
7716        let bgra_map = bgra_dst.tensor().map().unwrap();
7717        let rgba_map = rgba_dst.tensor().map().unwrap();
7718        let bgra_buf = bgra_map.as_slice();
7719        let rgba_buf = rgba_map.as_slice();
7720
7721        let mut max_diff = 0i32;
7722        for (bc, rc) in bgra_buf.chunks_exact(4).zip(rgba_buf.chunks_exact(4)) {
7723            max_diff = max_diff.max((bc[0] as i32 - rc[2] as i32).abs());
7724            max_diff = max_diff.max((bc[1] as i32 - rc[1] as i32).abs());
7725            max_diff = max_diff.max((bc[2] as i32 - rc[0] as i32).abs());
7726            max_diff = max_diff.max((bc[3] as i32 - rc[3] as i32).abs());
7727        }
7728        eprintln!("YUYV→BGRA vs YUYV→RGBA max channel diff: {max_diff}");
7729        assert!(
7730            max_diff <= 1,
7731            "BGRA/RGBA channel mismatch > 1: max_diff={max_diff}"
7732        );
7733    }
7734
    /// Test draw_masks() with BGRA destination (segmentation).
    /// Draws the same masks to both RGBA and BGRA, then verifies R↔B swap.
    ///
    /// The same segmentation input is rendered onto an RGBA canvas and onto a
    /// BGRA canvas (produced by converting the same JPEG), so the BGRA result
    /// should mirror the RGBA result channel-for-channel with R and B
    /// exchanged, within ±1 for rounding.
    #[test]
    fn test_draw_masks_bgra() {
        use edgefirst_decoder::Segmentation;

        if !is_opengl_available() {
            eprintln!("SKIPPED: test_draw_masks_bgra - OpenGL not available");
            return;
        }

        let seg_bytes = include_bytes!("../../../testdata/modelpack_seg_2x160x160.bin").to_vec();

        // Build segmentation data (shared between both renders)
        let make_seg = || {
            // The raw file is class-major (2, 160, 160); the two swaps reorder
            // the view to (160, 160, 2) and as_standard_layout() materializes
            // it contiguously in that order.
            let mut s = Array3::from_shape_vec((2, 160, 160), seg_bytes.clone()).unwrap();
            s.swap_axes(0, 1);
            s.swap_axes(1, 2);
            let s = s.as_standard_layout().to_owned();
            // Extent 0.0..=1.0 on both axes — presumably normalized
            // coordinates covering the whole frame (matches usage elsewhere).
            Segmentation {
                segmentation: s,
                xmin: 0.0,
                ymin: 0.0,
                xmax: 1.0,
                ymax: 1.0,
            }
        };

        let mut gl = GLProcessorThreaded::new(None).unwrap();

        // Render to RGBA
        let mut rgba_img = TensorImage::load(
            include_bytes!("../../../testdata/giraffe.jpg"),
            Some(RGBA),
            None,
        )
        .unwrap();
        gl.draw_masks(&mut rgba_img, &[], &[make_seg()]).unwrap();

        // Render to BGRA (convert source to BGRA first)
        let rgba_src = TensorImage::load(
            include_bytes!("../../../testdata/giraffe.jpg"),
            Some(RGBA),
            None,
        )
        .unwrap();
        let mut bgra_img =
            TensorImage::new(rgba_src.width(), rgba_src.height(), BGRA, None).unwrap();
        gl.convert(
            &rgba_src,
            &mut bgra_img,
            Rotation::None,
            Flip::None,
            Crop::no_crop(),
        )
        .unwrap();
        gl.draw_masks(&mut bgra_img, &[], &[make_seg()]).unwrap();

        // Verify BGRA output matches RGBA output with R↔B swapped
        let rgba_map = rgba_img.tensor().map().unwrap();
        let bgra_map = bgra_img.tensor().map().unwrap();
        let rgba_buf = rgba_map.as_slice();
        let bgra_buf = bgra_map.as_slice();
        assert_eq!(rgba_buf.len(), bgra_buf.len());

        let mut max_diff = 0i32;
        for (rc, bc) in rgba_buf.chunks_exact(4).zip(bgra_buf.chunks_exact(4)) {
            max_diff = max_diff.max((rc[0] as i32 - bc[2] as i32).abs()); // R
            max_diff = max_diff.max((rc[1] as i32 - bc[1] as i32).abs()); // G
            max_diff = max_diff.max((rc[2] as i32 - bc[0] as i32).abs()); // B
            max_diff = max_diff.max((rc[3] as i32 - bc[3] as i32).abs()); // A
        }
        eprintln!("draw_masks BGRA vs RGBA max channel diff: {max_diff}");
        assert!(
            max_diff <= 1,
            "draw_masks BGRA/RGBA channel mismatch > 1: max_diff={max_diff}"
        );
    }
7813
7814    /// Test draw_masks() with BGRA destination using Mem memory (boxes).
7815    /// Draws same boxes to RGBA and BGRA, then verifies R↔B swap.
7816    #[test]
7817    fn test_draw_masks_bgra_mem() {
7818        use edgefirst_decoder::DetectBox;
7819
7820        if !is_opengl_available() {
7821            eprintln!("SKIPPED: test_draw_masks_bgra_mem - OpenGL not available");
7822            return;
7823        }
7824
7825        let detect = DetectBox {
7826            bbox: [0.59375, 0.25, 0.9375, 0.725].into(),
7827            score: 0.99,
7828            label: 0,
7829        };
7830        let colors = [[255, 255, 0, 233], [128, 128, 255, 100]];
7831
7832        let mut gl = GLProcessorThreaded::new(None).unwrap();
7833        gl.set_class_colors(&colors).unwrap();
7834
7835        // Render boxes to RGBA
7836        let mut rgba_img = TensorImage::load(
7837            include_bytes!("../../../testdata/giraffe.jpg"),
7838            Some(RGBA),
7839            Some(edgefirst_tensor::TensorMemory::Mem),
7840        )
7841        .unwrap();
7842        gl.draw_masks(&mut rgba_img, &[detect], &[]).unwrap();
7843
7844        // Render boxes to BGRA
7845        let rgba_src = TensorImage::load(
7846            include_bytes!("../../../testdata/giraffe.jpg"),
7847            Some(RGBA),
7848            Some(edgefirst_tensor::TensorMemory::Mem),
7849        )
7850        .unwrap();
7851        let mut bgra_img = TensorImage::new(
7852            rgba_src.width(),
7853            rgba_src.height(),
7854            BGRA,
7855            Some(edgefirst_tensor::TensorMemory::Mem),
7856        )
7857        .unwrap();
7858        gl.convert(
7859            &rgba_src,
7860            &mut bgra_img,
7861            Rotation::None,
7862            Flip::None,
7863            Crop::no_crop(),
7864        )
7865        .unwrap();
7866        gl.draw_masks(&mut bgra_img, &[detect], &[]).unwrap();
7867
7868        // Verify BGRA output matches RGBA output with R↔B swapped
7869        let rgba_map = rgba_img.tensor().map().unwrap();
7870        let bgra_map = bgra_img.tensor().map().unwrap();
7871        let rgba_buf = rgba_map.as_slice();
7872        let bgra_buf = bgra_map.as_slice();
7873
7874        let mut max_diff = 0i32;
7875        for (rc, bc) in rgba_buf.chunks_exact(4).zip(bgra_buf.chunks_exact(4)) {
7876            max_diff = max_diff.max((rc[0] as i32 - bc[2] as i32).abs());
7877            max_diff = max_diff.max((rc[1] as i32 - bc[1] as i32).abs());
7878            max_diff = max_diff.max((rc[2] as i32 - bc[0] as i32).abs());
7879            max_diff = max_diff.max((rc[3] as i32 - bc[3] as i32).abs());
7880        }
7881        eprintln!("draw_masks_mem BGRA vs RGBA max channel diff: {max_diff}");
7882        assert!(
7883            max_diff <= 1,
7884            "draw_masks_mem BGRA/RGBA channel mismatch > 1: max_diff={max_diff}"
7885        );
7886    }
7887
7888    // ========================================================================
7889    // GL smoke tests for mask rendering and PBO destinations
7890    // ========================================================================
7891
7892    #[test]
7893    fn test_gl_mask_render_smoke() {
7894        if !is_opengl_available() {
7895            eprintln!("SKIPPED: {} - OpenGL not available", function!());
7896            return;
7897        }
7898
7899        let mut gl = GLProcessorThreaded::new(None).unwrap();
7900        let mut image = TensorImage::new(64, 64, RGBA, None).unwrap();
7901
7902        // Render with empty detections and segmentations — should succeed trivially
7903        let result = gl.draw_masks(&mut image, &[], &[]);
7904        assert!(
7905            result.is_ok(),
7906            "GL mask render with empty data should succeed: {result:?}"
7907        );
7908
7909        // Verify output dimensions are unchanged
7910        assert_eq!(image.width(), 64);
7911        assert_eq!(image.height(), 64);
7912    }
7913
7914    #[test]
7915    fn test_gl_pbo_destination_smoke() {
7916        if !is_opengl_available() {
7917            eprintln!("SKIPPED: {} - OpenGL not available", function!());
7918            return;
7919        }
7920
7921        let gl = GLProcessorThreaded::new(None).unwrap();
7922        let result = gl.create_pbo_image(64, 64, RGBA);
7923        match result {
7924            Ok(pbo_img) => {
7925                assert_eq!(pbo_img.width(), 64);
7926                assert_eq!(pbo_img.height(), 64);
7927                assert_eq!(pbo_img.fourcc(), RGBA);
7928            }
7929            Err(e) => {
7930                // PBO may not be supported on all GL implementations
7931                eprintln!("SKIPPED: {} - PBO not supported: {e:?}", function!());
7932            }
7933        }
7934    }
7935}