Skip to main content

edgefirst_image/
opengl_headless.rs

1// SPDX-FileCopyrightText: Copyright 2025 Au-Zone Technologies
2// SPDX-License-Identifier: Apache-2.0
3
4#![cfg(target_os = "linux")]
5#![cfg(feature = "opengl")]
6
7use edgefirst_decoder::{DetectBox, ProtoData, ProtoTensor, Segmentation};
8use edgefirst_tensor::{TensorMemory, TensorTrait};
9use four_char_code::FourCharCode;
10use gbm::{
11    drm::{buffer::DrmFourcc, control::Device as DrmControlDevice, Device as DrmDevice},
12    AsRaw, Device,
13};
14use khronos_egl::{self as egl, Attrib, Display, Dynamic, Instance, EGL1_4};
15use log::{debug, error};
16use std::{
17    collections::BTreeSet,
18    ffi::{c_char, c_void, CStr, CString},
19    mem::ManuallyDrop,
20    os::fd::AsRawFd,
21    ptr::{null, null_mut, NonNull},
22    rc::Rc,
23    str::FromStr,
24    sync::OnceLock,
25    thread::JoinHandle,
26    time::Instant,
27};
28use tokio::sync::mpsc::{Sender, WeakSender};
29
/// Expands to the name of the enclosing function as a `&'static str`.
///
/// Works by defining a local `fn f()` and inspecting its type name, which
/// includes the full module path of the enclosing function; the macro then
/// strips the trailing `::f` and keeps only the last path segment.
macro_rules! function {
    () => {{
        fn f() {}
        fn type_name_of<T>(_: T) -> &'static str {
            std::any::type_name::<T>()
        }
        let full = type_name_of(f);

        // Drop the trailing "::f", then keep only the last path segment.
        let trimmed = &full[..full.len() - 3];
        if let Some(pos) = trimmed.rfind(':') {
            &trimmed[pos + 1..]
        } else {
            trimmed
        }
    }};
}
45
46use crate::{
47    fourcc_is_int8, fourcc_is_packed_rgb, CPUProcessor, Crop, Error, Flip, ImageProcessorTrait,
48    MaskRegion, Rect, Rotation, TensorImage, TensorImageRef, DEFAULT_COLORS, GREY, NV12,
49    PLANAR_RGB, PLANAR_RGBA, PLANAR_RGB_INT8, RGB, RGBA, RGB_INT8, VYUY, YUYV,
50};
51
52/// Identifies the type of EGL display used for headless OpenGL ES rendering.
53///
54/// The HAL creates a surfaceless GLES 3.0 context
55/// (`EGL_KHR_surfaceless_context` + `EGL_KHR_no_config_context`) and
56/// renders exclusively through FBOs backed by EGLImages imported from
57/// DMA-buf file descriptors. No window or PBuffer surface is created.
58///
59/// Displays are probed in priority order: PlatformDevice first (zero
60/// external dependencies), then GBM, then Default. Use
61/// [`probe_egl_displays`] to discover which are available and
62/// [`ImageProcessorConfig::egl_display`](crate::ImageProcessorConfig::egl_display)
63/// to override the auto-detection.
64///
65/// # Display Types
66///
67/// - **`PlatformDevice`** — Uses `EGL_EXT_device_enumeration` to query
68///   available EGL devices via `eglQueryDevicesEXT`, then selects the first
69///   device with `eglGetPlatformDisplay(EGL_EXT_platform_device, ...)`.
70///   Headless and compositor-free with zero external library dependencies.
71///   Works on NVIDIA GPUs and newer Vivante drivers.
72///
73/// - **`Gbm`** — Opens a DRM render node (e.g. `/dev/dri/renderD128`) and
74///   creates a GBM (Generic Buffer Manager) device, then calls
75///   `eglGetPlatformDisplay(EGL_PLATFORM_GBM_KHR, gbm_device)`. Requires
76///   `libgbm` and a DRM render node. Needed on ARM Mali (i.MX95) and older
77///   Vivante drivers that do not expose `EGL_EXT_platform_device`.
78///
79/// - **`Default`** — Calls `eglGetDisplay(EGL_DEFAULT_DISPLAY)`, letting the
80///   EGL implementation choose the display. On Wayland systems this connects
81///   to the compositor; on X11 it connects to the X server. May block on
82///   headless systems where a compositor is expected but not running.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EglDisplayKind {
    Gbm,
    PlatformDevice,
    Default,
}

impl std::fmt::Display for EglDisplayKind {
    /// Writes the human-readable name used in logs and diagnostics.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::Gbm => "GBM",
            Self::PlatformDevice => "PlatformDevice",
            Self::Default => "Default",
        };
        f.write_str(name)
    }
}
99
/// A validated, available EGL display discovered by [`probe_egl_displays`].
///
/// "Validated" means the display passed `eglInitialize` plus the
/// surfaceless/no-config extension checks before being reported.
#[derive(Debug, Clone)]
pub struct EglDisplayInfo {
    /// The type of EGL display.
    pub kind: EglDisplayKind,
    /// Human-readable description for logging/diagnostics
    /// (e.g. "GBM via /dev/dri/renderD128").
    pub description: String,
}
109
110/// EGL library handle. Intentionally leaked (never dlclose'd) to avoid SIGBUS
111/// on process exit: GPU drivers may keep internal state that outlives explicit
112/// EGL cleanup, and dlclose can unmap memory still referenced by the driver.
113static EGL_LIB: OnceLock<&'static libloading::Library> = OnceLock::new();
114
115fn get_egl_lib() -> Result<&'static libloading::Library, crate::Error> {
116    if let Some(egl) = EGL_LIB.get() {
117        Ok(egl)
118    } else {
119        let egl = unsafe { libloading::Library::new("libEGL.so.1")? };
120        // Leak the library to prevent dlclose on process exit
121        let egl: &'static libloading::Library = Box::leak(Box::new(egl));
122        Ok(EGL_LIB.get_or_init(|| egl))
123    }
124}
125
126type Egl = Instance<Dynamic<&'static libloading::Library, EGL1_4>>;
127
128/// Check whether an EGL display supports the surfaceless + no-config context
129/// extensions required by the HAL's FBO-based rendering pipeline.
130///
131/// Queries `eglQueryString(display, EGL_EXTENSIONS)` and checks for
132/// `EGL_KHR_surfaceless_context` and `EGL_KHR_no_config_context`.
133fn probe_display_extensions(egl: &Egl, display: egl::Display) -> bool {
134    let Ok(ext_str) = egl.query_string(Some(display), egl::EXTENSIONS) else {
135        return false;
136    };
137    let exts = ext_str.to_string_lossy();
138
139    let required = ["EGL_KHR_surfaceless_context", "EGL_KHR_no_config_context"];
140
141    for r in &required {
142        if !exts.contains(r) {
143            log::debug!("Display missing required extension: {r}");
144            return false;
145        }
146    }
147
148    egl.bind_api(egl::OPENGL_ES_API).is_ok()
149}
150
151/// Probe for available EGL displays supporting headless OpenGL ES 3.0.
152///
153/// Returns validated displays in priority order (PlatformDevice, GBM,
154/// Default). Each display is validated with `eglInitialize` + extension
155/// checks for `EGL_KHR_surfaceless_context` and `EGL_KHR_no_config_context`.
156/// Probed state is cleaned up with `eglTerminate` — no EGL resources are
157/// left alive.
158///
159/// An empty list means OpenGL is not available on this system.
160///
161/// # Errors
162///
163/// Returns an error only if `libEGL.so.1` cannot be loaded. Individual
164/// display probe failures are silently skipped.
165pub fn probe_egl_displays() -> Result<Vec<EglDisplayInfo>, Error> {
166    let egl: Egl = unsafe { Instance::<Dynamic<_, EGL1_4>>::load_required_from(get_egl_lib()?)? };
167
168    let mut results = Vec::new();
169
170    // PlatformDevice first (zero external deps, works on NVIDIA + newer Vivante)
171    if let Ok(display_type) = GlContext::egl_get_platform_display_from_device(&egl) {
172        let display = display_type.as_display();
173        if egl.initialize(display).is_ok() {
174            if probe_display_extensions(&egl, display) {
175                results.push(EglDisplayInfo {
176                    kind: EglDisplayKind::PlatformDevice,
177                    description: "EGL platform device via EGL_EXT_device_enumeration".to_string(),
178                });
179            }
180            let _ = egl.terminate(display);
181        }
182    }
183
184    // GBM second (needed for Mali + old Vivante)
185    if let Ok(display_type) = GlContext::egl_get_gbm_display(&egl) {
186        let display = display_type.as_display();
187        if egl.initialize(display).is_ok() {
188            if probe_display_extensions(&egl, display) {
189                results.push(EglDisplayInfo {
190                    kind: EglDisplayKind::Gbm,
191                    description: "GBM via /dev/dri/renderD128".to_string(),
192                });
193            }
194            let _ = egl.terminate(display);
195        }
196    }
197
198    // Default last (needs compositor)
199    if let Ok(display_type) = GlContext::egl_get_default_display(&egl) {
200        let display = display_type.as_display();
201        if egl.initialize(display).is_ok() {
202            if probe_display_extensions(&egl, display) {
203                results.push(EglDisplayInfo {
204                    kind: EglDisplayKind::Default,
205                    description: "EGL default display".to_string(),
206                });
207            }
208            let _ = egl.terminate(display);
209        }
210    }
211
212    Ok(results)
213}
214
/// Tracks which data-transfer method is active for moving pixels
/// between CPU memory and GPU textures/framebuffers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum TransferBackend {
    /// Zero-copy via EGLImage imported from DMA-buf file descriptors.
    /// Available on i.MX8 (Vivante), i.MX95 (Mali), Jetson, and any
    /// platform where `EGL_EXT_image_dma_buf_import` is present AND
    /// the GPU can actually render through DMA-buf-backed textures.
    DmaBuf,

    /// GPU buffer via Pixel Buffer Object. Used when DMA-buf is unavailable
    /// but OpenGL is present. Data stays in GPU-accessible memory.
    Pbo,

    /// Synchronous `glTexSubImage2D` upload + `glReadnPixels` readback.
    /// Used when DMA-buf is unavailable or when the DMA-buf verification
    /// probe fails (e.g. NVIDIA discrete GPUs where EGLImage creation
    /// succeeds but rendered data is all zeros).
    Sync,
}

impl TransferBackend {
    /// Returns `true` if DMA-buf zero-copy is available.
    pub(crate) fn is_dma(self) -> bool {
        matches!(self, TransferBackend::DmaBuf)
    }

    /// Returns `true` if PBO transfer is active.
    #[allow(dead_code)]
    pub(crate) fn is_pbo(self) -> bool {
        matches!(self, TransferBackend::Pbo)
    }
}
248
/// Owns the EGL display, context, and API instance used by the GL thread.
pub(crate) struct GlContext {
    /// Pixel-transfer path selected at initialization (DMA-buf or sync).
    pub(crate) transfer_backend: TransferBackend,
    /// The EGL display plus any backing platform resources (e.g. GBM device).
    pub(crate) display: EglDisplayType,
    /// The surfaceless, no-config GLES context created against `display`.
    pub(crate) ctx: egl::Context,
    /// Wrapped in ManuallyDrop because the khronos-egl Dynamic instance's
    /// Drop calls eglReleaseThread() which can panic during process shutdown
    /// if the EGL library has been partially unloaded. It is intentionally
    /// never dropped — see the note at the end of `GlContext::drop`.
    pub(crate) egl: ManuallyDrop<Rc<Egl>>,
}
259
260pub(crate) enum EglDisplayType {
261    Default(egl::Display),
262    Gbm(egl::Display, #[allow(dead_code)] Device<Card>),
263    PlatformDisplay(egl::Display),
264}
265
266impl EglDisplayType {
267    fn as_display(&self) -> egl::Display {
268        match self {
269            EglDisplayType::Default(disp) => *disp,
270            EglDisplayType::Gbm(disp, _) => *disp,
271            EglDisplayType::PlatformDisplay(disp) => *disp,
272        }
273    }
274}
275
276impl GlContext {
277    pub(crate) fn new(kind: Option<EglDisplayKind>) -> Result<GlContext, crate::Error> {
278        // Create an EGL API instance.
279        let egl: Rc<Egl> =
280            Rc::new(unsafe { Instance::<Dynamic<_, EGL1_4>>::load_required_from(get_egl_lib()?)? });
281
282        if let Some(kind) = kind {
283            // Specific display type requested — try only that one.
284            let display_fn = match kind {
285                EglDisplayKind::Gbm => Self::egl_get_gbm_display as fn(&Egl) -> _,
286                EglDisplayKind::PlatformDevice => Self::egl_get_platform_display_from_device,
287                EglDisplayKind::Default => Self::egl_get_default_display,
288            };
289            return Self::try_initialize_egl(egl, display_fn).map_err(|e| {
290                log::debug!("Failed to initialize EGL with {kind} display: {e:?}");
291                e
292            });
293        }
294
295        // Try PlatformDevice first (zero external deps, works on NVIDIA + newer Vivante)
296        if let Ok(headless) =
297            Self::try_initialize_egl(egl.clone(), Self::egl_get_platform_display_from_device)
298        {
299            return Ok(headless);
300        } else {
301            log::debug!("Didn't initialize EGL with platform display from device enumeration");
302        }
303
304        // GBM second (needed for Mali + old Vivante that lack EGL_EXT_platform_device)
305        if let Ok(headless) = Self::try_initialize_egl(egl.clone(), Self::egl_get_gbm_display) {
306            return Ok(headless);
307        } else {
308            log::debug!("Didn't initialize EGL with GBM Display");
309        }
310
311        // Default display last (needs compositor)
312        if let Ok(headless) = Self::try_initialize_egl(egl.clone(), Self::egl_get_default_display) {
313            return Ok(headless);
314        } else {
315            log::debug!("Didn't initialize EGL with Default Display");
316        }
317
318        Err(Error::OpenGl(
319            "Could not initialize EGL with any known method".to_string(),
320        ))
321    }
322
323    fn try_initialize_egl(
324        egl: Rc<Egl>,
325        display_fn: impl Fn(&Egl) -> Result<EglDisplayType, crate::Error>,
326    ) -> Result<GlContext, crate::Error> {
327        let display = display_fn(&egl)?;
328        log::debug!("egl initialize with display: {:x?}", display.as_display());
329        egl.initialize(display.as_display())?;
330
331        // Verify required extensions for surfaceless + no-config context
332        let ext_str = egl.query_string(Some(display.as_display()), egl::EXTENSIONS)?;
333        let exts = ext_str.to_string_lossy();
334
335        if !exts.contains("EGL_KHR_surfaceless_context") {
336            return Err(crate::Error::GLVersion(
337                "EGL display does not support EGL_KHR_surfaceless_context".to_string(),
338            ));
339        }
340
341        if !exts.contains("EGL_KHR_no_config_context") {
342            return Err(crate::Error::GLVersion(
343                "EGL display does not support EGL_KHR_no_config_context".to_string(),
344            ));
345        }
346
347        egl.bind_api(egl::OPENGL_ES_API)?;
348
349        // No-config context: pass EGL_NO_CONFIG_KHR (null) instead of a
350        // real config. The context is not bound to any specific framebuffer
351        // format — it works with any FBO attachment format.
352        let context_attributes = [egl::CONTEXT_MAJOR_VERSION, 3, egl::NONE, egl::NONE];
353        let ctx = egl.create_context(
354            display.as_display(),
355            egl_ext::NO_CONFIG_KHR,
356            None,
357            &context_attributes,
358        )?;
359        debug!("ctx: {ctx:?}");
360
361        // Surfaceless context: no PBuffer surface needed. All rendering
362        // goes through FBOs backed by EGLImages.
363        egl.make_current(display.as_display(), None, None, Some(ctx))?;
364
365        let has_dma_extensions = Self::egl_check_support_dma(&egl).is_ok();
366        let transfer_backend = if has_dma_extensions {
367            TransferBackend::DmaBuf
368        } else {
369            TransferBackend::Sync
370        };
371        Ok(GlContext {
372            display,
373            ctx,
374            egl: ManuallyDrop::new(egl),
375            transfer_backend,
376        })
377    }
378
379    fn egl_get_default_display(egl: &Egl) -> Result<EglDisplayType, crate::Error> {
380        // get the default display
381        if let Some(display) = unsafe { egl.get_display(egl::DEFAULT_DISPLAY) } {
382            debug!("default display: {display:?}");
383            return Ok(EglDisplayType::Default(display));
384        }
385
386        Err(Error::OpenGl(
387            "Could not obtain EGL Default Display".to_string(),
388        ))
389    }
390
391    fn egl_get_gbm_display(egl: &Egl) -> Result<EglDisplayType, crate::Error> {
392        // init a GBM device
393        let gbm = Device::new(Card::open_global()?)?;
394
395        debug!("gbm: {gbm:?}");
396        let display = Self::egl_get_platform_display_with_fallback(
397            egl,
398            egl_ext::PLATFORM_GBM_KHR,
399            gbm.as_raw() as *mut c_void,
400            &[egl::ATTRIB_NONE],
401        )?;
402
403        Ok(EglDisplayType::Gbm(display, gbm))
404    }
405
406    fn egl_get_platform_display_from_device(egl: &Egl) -> Result<EglDisplayType, crate::Error> {
407        let extensions = egl.query_string(None, egl::EXTENSIONS)?;
408        let extensions = extensions.to_string_lossy();
409        log::debug!("EGL Extensions: {}", extensions);
410
411        if !extensions.contains("EGL_EXT_device_enumeration") {
412            return Err(Error::GLVersion(
413                "EGL doesn't supported EGL_EXT_device_enumeration extension".to_string(),
414            ));
415        }
416
417        type EGLDeviceEXT = *mut c_void;
418        let devices = if let Some(ext) = egl.get_proc_address("eglQueryDevicesEXT") {
419            let func: unsafe extern "system" fn(
420                max_devices: egl::Int,
421                devices: *mut EGLDeviceEXT,
422                num_devices: *mut egl::Int,
423            ) -> *const c_char = unsafe { std::mem::transmute(ext) };
424            let mut devices = [std::ptr::null_mut(); 10];
425            let mut num_devices = 0;
426            unsafe { func(devices.len() as i32, devices.as_mut_ptr(), &mut num_devices) };
427            for i in 0..num_devices {
428                log::debug!("EGL device: {:?}", devices[i as usize]);
429            }
430            devices[0..num_devices as usize].to_vec()
431        } else {
432            return Err(Error::GLVersion(
433                "EGL doesn't supported eglQueryDevicesEXT function".to_string(),
434            ));
435        };
436
437        if !extensions.contains("EGL_EXT_platform_device") {
438            return Err(Error::GLVersion(
439                "EGL doesn't supported EGL_EXT_platform_device extension".to_string(),
440            ));
441        }
442
443        if devices.is_empty() {
444            return Err(Error::GLVersion(
445                "EGL_EXT_device_enumeration returned 0 devices".to_string(),
446            ));
447        }
448        let disp = Self::egl_get_platform_display_with_fallback(
449            egl,
450            egl_ext::PLATFORM_DEVICE_EXT,
451            devices[0],
452            &[egl::ATTRIB_NONE],
453        )?;
454        Ok(EglDisplayType::PlatformDisplay(disp))
455    }
456
457    fn egl_check_support_dma(egl: &Egl) -> Result<(), crate::Error> {
458        let extensions = egl.query_string(None, egl::EXTENSIONS)?;
459        let extensions = extensions.to_string_lossy();
460        log::debug!("EGL Extensions: {}", extensions);
461
462        if egl.upcast::<egl::EGL1_5>().is_some() {
463            return Ok(());
464        }
465
466        if !extensions.contains("EGL_EXT_image_dma_buf_import") {
467            return Err(crate::Error::GLVersion(
468                "EGL does not support EGL_EXT_image_dma_buf_import extension".to_string(),
469            ));
470        }
471
472        if egl.get_proc_address("eglCreateImageKHR").is_none() {
473            return Err(crate::Error::GLVersion(
474                "EGL does not support eglCreateImageKHR function".to_string(),
475            ));
476        }
477
478        if egl.get_proc_address("eglDestroyImageKHR").is_none() {
479            return Err(crate::Error::GLVersion(
480                "EGL does not support eglDestroyImageKHR function".to_string(),
481            ));
482        }
483        Ok(())
484    }
485
486    fn egl_get_platform_display_with_fallback(
487        egl: &Egl,
488        platform: egl::Enum,
489        native_display: *mut c_void,
490        attrib_list: &[Attrib],
491    ) -> Result<Display, Error> {
492        if let Some(egl) = egl.upcast::<egl::EGL1_5>() {
493            unsafe { egl.get_platform_display(platform, native_display, attrib_list) }
494                .map_err(|e| e.into())
495        } else if let Some(ext) = egl.get_proc_address("eglGetPlatformDisplayEXT") {
496            let func: unsafe extern "system" fn(
497                platform: egl::Enum,
498                native_display: *mut c_void,
499                attrib_list: *const Attrib,
500            ) -> egl::EGLDisplay = unsafe { std::mem::transmute(ext) };
501            let disp = unsafe { func(platform, native_display, attrib_list.as_ptr()) };
502            if disp != egl::NO_DISPLAY {
503                Ok(unsafe { Display::from_ptr(disp) })
504            } else {
505                Err(egl.get_error().map(|e| e.into()).unwrap_or(Error::Internal(
506                    "EGL failed but no error was reported".to_owned(),
507                )))
508            }
509        } else {
510            Err(Error::EGLLoad(egl::LoadError::InvalidVersion {
511                provided: egl.version(),
512                required: khronos_egl::Version::EGL1_5,
513            }))
514        }
515    }
516
517    fn egl_create_image_with_fallback(
518        egl: &Egl,
519        display: Display,
520        ctx: egl::Context,
521        target: egl::Enum,
522        buffer: egl::ClientBuffer,
523        attrib_list: &[Attrib],
524    ) -> Result<egl::Image, Error> {
525        if let Some(egl) = egl.upcast::<egl::EGL1_5>() {
526            egl.create_image(display, ctx, target, buffer, attrib_list)
527                .map_err(|e| e.into())
528        } else if let Some(ext) = egl.get_proc_address("eglCreateImageKHR") {
529            log::trace!("eglCreateImageKHR addr: {:?}", ext);
530            let func: unsafe extern "system" fn(
531                display: egl::EGLDisplay,
532                ctx: egl::EGLContext,
533                target: egl::Enum,
534                buffer: egl::EGLClientBuffer,
535                attrib_list: *const egl::Int,
536            ) -> egl::EGLImage = unsafe { std::mem::transmute(ext) };
537            let new_attrib_list = attrib_list
538                .iter()
539                .map(|x| *x as egl::Int)
540                .collect::<Vec<_>>();
541
542            let image = unsafe {
543                func(
544                    display.as_ptr(),
545                    ctx.as_ptr(),
546                    target,
547                    buffer.as_ptr(),
548                    new_attrib_list.as_ptr(),
549                )
550            };
551            if image != egl::NO_IMAGE {
552                Ok(unsafe { egl::Image::from_ptr(image) })
553            } else {
554                Err(egl.get_error().map(|e| e.into()).unwrap_or(Error::Internal(
555                    "EGL failed but no error was reported".to_owned(),
556                )))
557            }
558        } else {
559            Err(Error::EGLLoad(egl::LoadError::InvalidVersion {
560                provided: egl.version(),
561                required: khronos_egl::Version::EGL1_5,
562            }))
563        }
564    }
565
566    fn egl_destroy_image_with_fallback(
567        egl: &Egl,
568        display: Display,
569        image: egl::Image,
570    ) -> Result<(), Error> {
571        if let Some(egl) = egl.upcast::<egl::EGL1_5>() {
572            egl.destroy_image(display, image).map_err(|e| e.into())
573        } else if let Some(ext) = egl.get_proc_address("eglDestroyImageKHR") {
574            let func: unsafe extern "system" fn(
575                display: egl::EGLDisplay,
576                image: egl::EGLImage,
577            ) -> egl::Boolean = unsafe { std::mem::transmute(ext) };
578            let res = unsafe { func(display.as_ptr(), image.as_ptr()) };
579            if res == egl::TRUE {
580                Ok(())
581            } else {
582                Err(egl.get_error().map(|e| e.into()).unwrap_or(Error::Internal(
583                    "EGL failed but no error was reported".to_owned(),
584                )))
585            }
586        } else {
587            Err(Error::EGLLoad(egl::LoadError::InvalidVersion {
588                provided: egl.version(),
589                required: khronos_egl::Version::EGL1_5,
590            }))
591        }
592    }
593}
594
impl Drop for GlContext {
    fn drop(&mut self) {
        // During process shutdown (e.g. Python interpreter exit), the EGL/GL
        // shared libraries may already be partially unloaded, causing panics
        // or heap corruption when calling cleanup functions. We suppress
        // panic output and catch panics to prevent propagation.
        //
        // NOTE(review): take_hook/set_hook swap the process-global panic
        // hook, so a panic on any other thread in this window is also
        // silenced — acceptable for a short teardown, but worth confirming.
        let prev_hook = std::panic::take_hook();
        std::panic::set_hook(Box::new(|_| {}));
        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            // Unbind the context from this thread before destroying it.
            let _ = self
                .egl
                .make_current(self.display.as_display(), None, None, None);

            let _ = self
                .egl
                .destroy_context(self.display.as_display(), self.ctx);

            // eglTerminate is ref-counted per the EGL spec: each eglInitialize
            // increments a counter and each eglTerminate decrements it. The
            // display is only truly torn down when the last reference is
            // released. catch_unwind absorbs any driver-side misbehaviour.
            let _ = self.egl.terminate(self.display.as_display());
        }));
        std::panic::set_hook(prev_hook);

        // The Rc<Egl> (ManuallyDrop) is intentionally NOT dropped. The
        // khronos-egl Dynamic instance's Drop calls eglReleaseThread() which
        // panics if the EGL library has been unloaded (local/x86_64) or
        // causes heap corruption by calling into invalid memory (ARM).
    }
}
626
#[derive(Debug)]
/// A simple wrapper for a device node.
/// Holds the opened DRM node; its fd is exposed through the `AsFd` impl.
pub(crate) struct Card(std::fs::File);
630
631/// Implementing `AsFd` is a prerequisite to implementing the traits found
632/// in this crate. Here, we are just calling `as_fd()` on the inner File.
633impl std::os::unix::io::AsFd for Card {
634    fn as_fd(&self) -> std::os::unix::io::BorrowedFd<'_> {
635        self.0.as_fd()
636    }
637}
638
/// With `AsFd` implemented, we can now implement `drm::Device`. Both are
/// marker impls: the drm traits supply their behaviour via default methods
/// over the file descriptor.
impl DrmDevice for Card {}
impl DrmControlDevice for Card {}
642
643/// Simple helper methods for opening a `Card`.
644impl Card {
645    pub fn open(path: &str) -> Result<Self, crate::Error> {
646        let mut options = std::fs::OpenOptions::new();
647        options.read(true);
648        options.write(true);
649        let c = options.open(path);
650        match c {
651            Ok(c) => Ok(Card(c)),
652            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
653                Err(Error::NotFound(format!("File not found: {path}")))
654            }
655            Err(e) => Err(e.into()),
656        }
657    }
658
659    pub fn open_global() -> Result<Self, crate::Error> {
660        let targets = ["/dev/dri/renderD128", "/dev/dri/card0", "/dev/dri/card1"];
661        let e = Self::open(targets[0]);
662        if let Ok(t) = e {
663            return Ok(t);
664        }
665        for t in &targets[1..] {
666            if let Ok(t) = Self::open(t) {
667                return Ok(t);
668            }
669        }
670        e
671    }
672}
673
/// A rectangular region of interest expressed as four edge coordinates.
/// NOTE(review): units (pixels vs. normalized 0..1 texture coordinates) are
/// not evident from this chunk — confirm at the call sites.
#[derive(Debug, Clone, Copy)]
struct RegionOfInterest {
    left: f32,
    top: f32,
    right: f32,
    bottom: f32,
}
681
/// Commands sent from callers to the dedicated GL rendering thread. Each
/// variant carries a oneshot sender for its reply, except `PboDelete`,
/// which is fire-and-forget.
#[allow(clippy::type_complexity)]
enum GLProcessorMessage {
    /// Convert the source image into the destination image, applying the
    /// given rotation, flip, and crop.
    ImageConvert(
        SendablePtr<TensorImage>,
        SendablePtr<TensorImage>,
        Rotation,
        Flip,
        Crop,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Replace the RGBA color palette used for mask rendering.
    SetColors(
        Vec<[u8; 4]>,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Draw segmentation masks for the given detection boxes onto the image.
    DrawMasks(
        SendablePtr<TensorImage>,
        SendablePtr<DetectBox>,
        SendablePtr<Segmentation>,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Draw masks derived from prototype-tensor data onto the image.
    DrawMasksProto(
        SendablePtr<TensorImage>,
        SendablePtr<DetectBox>,
        Box<ProtoData>,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Select the interpolation mode used for int8 data.
    SetInt8Interpolation(
        Int8InterpolationMode,
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    /// Decode prototype masks into an atlas buffer; replies with the atlas
    /// bytes and the per-mask regions within it.
    DecodeMasksAtlas(
        SendablePtr<DetectBox>,
        Box<ProtoData>,
        usize, // output_width
        usize, // output_height
        tokio::sync::oneshot::Sender<Result<(Vec<u8>, Vec<MaskRegion>), Error>>,
    ),
    /// Create a PBO of the given size; replies with the GL buffer id.
    PboCreate(
        usize, // buffer size in bytes
        tokio::sync::oneshot::Sender<Result<u32, Error>>,
    ),
    /// Map a PBO into CPU-visible memory; replies with the mapping.
    PboMap(
        u32,   // buffer_id
        usize, // size
        tokio::sync::oneshot::Sender<Result<edgefirst_tensor::PboMapping, Error>>,
    ),
    /// Unmap a previously mapped PBO.
    PboUnmap(
        u32, // buffer_id
        tokio::sync::oneshot::Sender<Result<(), Error>>,
    ),
    PboDelete(u32), // fire-and-forget, no reply
}
734
/// Implements PboOps by sending commands to the GL thread.
///
/// Uses a `WeakSender` so that PBO images don't keep the GL thread's channel
/// alive. When the `GLProcessorThreaded` is dropped, its `Sender` is the last
/// strong reference — dropping it closes the channel and lets the GL thread
/// exit. PBO operations after that return `PboDisconnected`.
struct GlPboOps {
    // Weak handle to the GL thread's command channel; upgraded per call.
    sender: WeakSender<GLProcessorMessage>,
}
744
745// SAFETY: GlPboOps sends all GL operations to the dedicated GL thread via a
746// channel. `map_buffer` returns a CPU-visible pointer from `glMapBufferRange`
747// that remains valid until `unmap_buffer` calls `glUnmapBuffer` on the GL thread.
748// `delete_buffer` sends a fire-and-forget deletion command to the GL thread.
749unsafe impl edgefirst_tensor::PboOps for GlPboOps {
750    fn map_buffer(
751        &self,
752        buffer_id: u32,
753        size: usize,
754    ) -> edgefirst_tensor::Result<edgefirst_tensor::PboMapping> {
755        let sender = self
756            .sender
757            .upgrade()
758            .ok_or(edgefirst_tensor::Error::PboDisconnected)?;
759        let (tx, rx) = tokio::sync::oneshot::channel();
760        sender
761            .blocking_send(GLProcessorMessage::PboMap(buffer_id, size, tx))
762            .map_err(|_| edgefirst_tensor::Error::PboDisconnected)?;
763        rx.blocking_recv()
764            .map_err(|_| edgefirst_tensor::Error::PboDisconnected)?
765            .map_err(|e| {
766                edgefirst_tensor::Error::NotImplemented(format!("GL PBO map failed: {e:?}"))
767            })
768    }
769
770    fn unmap_buffer(&self, buffer_id: u32) -> edgefirst_tensor::Result<()> {
771        let sender = self
772            .sender
773            .upgrade()
774            .ok_or(edgefirst_tensor::Error::PboDisconnected)?;
775        let (tx, rx) = tokio::sync::oneshot::channel();
776        sender
777            .blocking_send(GLProcessorMessage::PboUnmap(buffer_id, tx))
778            .map_err(|_| edgefirst_tensor::Error::PboDisconnected)?;
779        rx.blocking_recv()
780            .map_err(|_| edgefirst_tensor::Error::PboDisconnected)?
781            .map_err(|e| {
782                edgefirst_tensor::Error::NotImplemented(format!("GL PBO unmap failed: {e:?}"))
783            })
784    }
785
786    fn delete_buffer(&self, buffer_id: u32) {
787        if let Some(sender) = self.sender.upgrade() {
788            let _ = sender.blocking_send(GLProcessorMessage::PboDelete(buffer_id));
789        }
790    }
791}
792
/// OpenGL multi-threaded image converter. The actual conversion is done in a
/// separate rendering thread, as OpenGL contexts are not thread-safe. This can
/// be safely sent between threads. The `convert()` call sends the conversion
/// request to the rendering thread and waits for the result.
#[derive(Debug)]
pub struct GLProcessorThreaded {
    // This is only None when the converter is being dropped.
    handle: Option<JoinHandle<()>>,

    // This is only None when the converter is being dropped.
    sender: Option<Sender<GLProcessorMessage>>,
    // Transfer backend reported by the GL thread at startup.
    transfer_backend: TransferBackend,
}
806
// SAFETY: NOTE(review) — all GL state lives on the dedicated render thread;
// this type only holds a JoinHandle, a Sender, and a Copy backend value.
// These impls bypass the compiler's auto-trait checks, so confirm that
// GLProcessorMessage stays Send-safe if new variants are added.
unsafe impl Send for GLProcessorThreaded {}
unsafe impl Sync for GLProcessorThreaded {}
809
/// A raw pointer plus element count that can be sent across the channel to
/// the GL render thread without cloning the pointee. The sender must block
/// on the response oneshot before releasing the borrow it took the pointer
/// from (see the SAFETY comments at the use sites in the message loop).
struct SendablePtr<T: Send> {
    ptr: NonNull<T>,
    // Element count when `ptr` refers to a slice; 1 for a single value.
    len: usize,
}

// SAFETY: the wrapped pointee is `T: Send`; the pointer itself is only
// dereferenced on the GL thread while the originating borrow is kept alive
// by the blocking request/response protocol described above.
unsafe impl<T> Send for SendablePtr<T> where T: Send {}
816
impl GLProcessorThreaded {
    /// Creates a new OpenGL multi-threaded image converter.
    ///
    /// Spawns a dedicated OS thread that owns the GL context (a
    /// [`GLProcessorST`]) and services requests sent over an mpsc channel.
    /// Blocks until that thread reports either a successfully created
    /// context (along with its negotiated transfer backend) or a creation
    /// error, which is propagated to the caller.
    pub fn new(kind: Option<EglDisplayKind>) -> Result<Self, Error> {
        // Capacity 1 is sufficient: every caller blocks on a oneshot
        // response before issuing its next request.
        let (send, mut recv) = tokio::sync::mpsc::channel::<GLProcessorMessage>(1);

        // Oneshot used by the render thread to report context creation
        // success (carrying the TransferBackend) or failure exactly once.
        let (create_ctx_send, create_ctx_recv) = tokio::sync::oneshot::channel();

        let func = move || {
            // GL contexts are thread-affine: create and use it only here.
            let mut gl_converter = match GLProcessorST::new(kind) {
                Ok(gl) => gl,
                Err(e) => {
                    let _ = create_ctx_send.send(Err(e));
                    return;
                }
            };
            let _ = create_ctx_send.send(Ok(gl_converter.gl_context.transfer_backend));
            // Serve requests until all senders are dropped (see Drop impl).
            while let Some(msg) = recv.blocking_recv() {
                match msg {
                    GLProcessorMessage::ImageConvert(src, mut dst, rotation, flip, crop, resp) => {
                        // SAFETY: This is safe because the convert() function waits for the resp to
                        // be sent before dropping the borrow for src and dst
                        let src = unsafe { src.ptr.as_ref() };
                        let dst = unsafe { dst.ptr.as_mut() };
                        let res = gl_converter.convert(src, dst, rotation, flip, crop);
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::DrawMasks(mut dst, det, seg, resp) => {
                        // SAFETY: This is safe because the draw_masks() function waits for the
                        // resp to be sent before dropping the borrow for dst, detect, and
                        // segmentation
                        let dst = unsafe { dst.ptr.as_mut() };
                        let det = unsafe { std::slice::from_raw_parts(det.ptr.as_ptr(), det.len) };
                        let seg = unsafe { std::slice::from_raw_parts(seg.ptr.as_ptr(), seg.len) };
                        let res = gl_converter.draw_masks(dst, det, seg);
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::DrawMasksProto(mut dst, det, proto_data, resp) => {
                        // SAFETY: Same safety invariant as DrawMasks — caller
                        // blocks on resp before dropping borrows.
                        let dst = unsafe { dst.ptr.as_mut() };
                        let det = unsafe { std::slice::from_raw_parts(det.ptr.as_ptr(), det.len) };
                        let res = gl_converter.draw_masks_proto(dst, det, &proto_data);
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::SetColors(colors, resp) => {
                        let res = gl_converter.set_class_colors(&colors);
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::SetInt8Interpolation(mode, resp) => {
                        // Infallible setter; reply Ok so the caller unblocks.
                        gl_converter.set_int8_interpolation_mode(mode);
                        let _ = resp.send(Ok(()));
                    }
                    GLProcessorMessage::DecodeMasksAtlas(
                        det,
                        proto_data,
                        output_width,
                        output_height,
                        resp,
                    ) => {
                        // SAFETY: caller blocks on resp before dropping the
                        // detect slice borrow (same invariant as DrawMasks).
                        let det = unsafe { std::slice::from_raw_parts(det.ptr.as_ptr(), det.len) };
                        let res = gl_converter.decode_masks_atlas(
                            det,
                            &proto_data,
                            output_width,
                            output_height,
                        );
                        let _ = resp.send(res);
                    }
                    GLProcessorMessage::PboCreate(size, resp) => {
                        // Allocate a PIXEL_PACK buffer of `size` bytes; on a
                        // GL error the id is deleted before reporting failure.
                        let result = unsafe {
                            let mut id: u32 = 0;
                            gls::gl::GenBuffers(1, &mut id);
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, id);
                            gls::gl::BufferData(
                                gls::gl::PIXEL_PACK_BUFFER,
                                size as isize,
                                std::ptr::null(),
                                gls::gl::STREAM_COPY,
                            );
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                            match check_gl_error("PboCreate", 0) {
                                Ok(()) => Ok(id),
                                Err(e) => {
                                    gls::gl::DeleteBuffers(1, &id);
                                    Err(e)
                                }
                            }
                        };
                        let _ = resp.send(result);
                    }
                    GLProcessorMessage::PboMap(buffer_id, size, resp) => {
                        // Map the full requested range read+write; the raw
                        // pointer is handed back to the requesting thread
                        // wrapped in a PboMapping.
                        let result = unsafe {
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, buffer_id);
                            let ptr = gls::gl::MapBufferRange(
                                gls::gl::PIXEL_PACK_BUFFER,
                                0,
                                size as isize,
                                gls::gl::MAP_READ_BIT | gls::gl::MAP_WRITE_BIT,
                            );
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                            if ptr.is_null() {
                                Err(crate::Error::OpenGl(
                                    "glMapBufferRange returned null".to_string(),
                                ))
                            } else {
                                Ok(edgefirst_tensor::PboMapping {
                                    ptr: ptr as *mut u8,
                                    size,
                                })
                            }
                        };
                        let _ = resp.send(result);
                    }
                    GLProcessorMessage::PboUnmap(buffer_id, resp) => {
                        // glUnmapBuffer returning FALSE indicates the buffer's
                        // data store became corrupted while it was mapped.
                        let result = unsafe {
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, buffer_id);
                            let ok = gls::gl::UnmapBuffer(gls::gl::PIXEL_PACK_BUFFER);
                            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                            if ok == gls::gl::FALSE {
                                Err(Error::OpenGl(
                                    "PBO data was corrupted during mapping".into(),
                                ))
                            } else {
                                check_gl_error("PboUnmap", 0)
                            }
                        };
                        let _ = resp.send(result);
                    }
                    GLProcessorMessage::PboDelete(buffer_id) => unsafe {
                        // Fire-and-forget: deletes carry no response channel.
                        gls::gl::DeleteBuffers(1, &buffer_id);
                    },
                }
            }
        };

        let handle = std::thread::spawn(func);

        // Wait for the render thread to confirm (or fail) context creation.
        let transfer_backend = match create_ctx_recv.blocking_recv() {
            Ok(Err(e)) => return Err(e),
            Err(_) => {
                return Err(Error::Internal(
                    "GL converter error messaging closed without update".to_string(),
                ));
            }
            Ok(Ok(tb)) => tb,
        };

        Ok(Self {
            handle: Some(handle),
            sender: Some(send),
            transfer_backend,
        })
    }
}
972
impl ImageProcessorTrait for GLProcessorThreaded {
    /// Converts `src` into `dst` on the GL render thread, blocking until
    /// the result arrives.
    ///
    /// Format support is validated here on the calling thread so cheap
    /// failures never cross the channel. The raw pointers wrapped in
    /// `SendablePtr` remain valid because this call blocks on `err_recv`
    /// before the `src`/`dst` borrows end.
    fn convert(
        &mut self,
        src: &TensorImage,
        dst: &mut TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        crop.check_crop(src, dst)?;
        if !GLProcessorST::check_src_format_supported(self.transfer_backend, src) {
            return Err(crate::Error::NotSupported(format!(
                "Opengl doesn't support {} source texture",
                src.fourcc().display()
            )));
        }

        if !GLProcessorST::check_dst_format_supported(self.transfer_backend, dst) {
            return Err(crate::Error::NotSupported(format!(
                "Opengl doesn't support {} destination texture",
                dst.fourcc().display()
            )));
        }

        let (err_send, err_recv) = tokio::sync::oneshot::channel();
        // `sender` is only None while Drop is running, so unwrap is safe.
        self.sender
            .as_ref()
            .unwrap()
            .blocking_send(GLProcessorMessage::ImageConvert(
                SendablePtr {
                    ptr: src.into(),
                    len: 1,
                },
                SendablePtr {
                    ptr: dst.into(),
                    len: 1,
                },
                rotation,
                flip,
                crop,
                err_send,
            ))
            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
        err_recv.blocking_recv().map_err(|_| {
            Error::Internal("GL converter error messaging closed without update".to_string())
        })?
    }

    /// Converts into a borrowed image reference.
    fn convert_ref(
        &mut self,
        src: &TensorImage,
        dst: &mut TensorImageRef<'_>,
        rotation: Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        // OpenGL doesn't support PLANAR_RGB output, delegate to CPU
        let mut cpu = CPUProcessor::new();
        cpu.convert_ref(src, dst, rotation, flip, crop)
    }

    /// Draws detection boxes and segmentation masks on `dst` via the GL
    /// thread. The `detect`/`segmentation` slices are passed as raw
    /// pointers; blocking on `err_recv` keeps those borrows alive until
    /// the GL thread is done reading them.
    fn draw_masks(
        &mut self,
        dst: &mut TensorImage,
        detect: &[crate::DetectBox],
        segmentation: &[crate::Segmentation],
    ) -> crate::Result<()> {
        let (err_send, err_recv) = tokio::sync::oneshot::channel();
        self.sender
            .as_ref()
            .unwrap()
            .blocking_send(GLProcessorMessage::DrawMasks(
                SendablePtr {
                    ptr: dst.into(),
                    len: 1,
                },
                SendablePtr {
                    // A slice's as_ptr is never null (dangling-but-aligned
                    // even for empty slices), so unwrap cannot fire.
                    ptr: NonNull::new(detect.as_ptr() as *mut DetectBox).unwrap(),
                    len: detect.len(),
                },
                SendablePtr {
                    ptr: NonNull::new(segmentation.as_ptr() as *mut Segmentation).unwrap(),
                    len: segmentation.len(),
                },
                err_send,
            ))
            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
        err_recv.blocking_recv().map_err(|_| {
            Error::Internal("GL converter error messaging closed without update".to_string())
        })?
    }

    /// Draws detection boxes plus prototype-based masks on `dst` via the
    /// GL thread. `proto_data` is cloned and boxed so the message owns it.
    fn draw_masks_proto(
        &mut self,
        dst: &mut TensorImage,
        detect: &[DetectBox],
        proto_data: &ProtoData,
    ) -> crate::Result<()> {
        let (err_send, err_recv) = tokio::sync::oneshot::channel();
        self.sender
            .as_ref()
            .unwrap()
            .blocking_send(GLProcessorMessage::DrawMasksProto(
                SendablePtr {
                    ptr: NonNull::new(dst as *mut TensorImage).unwrap(),
                    len: 1,
                },
                SendablePtr {
                    ptr: NonNull::new(detect.as_ptr() as *mut DetectBox).unwrap(),
                    len: detect.len(),
                },
                Box::new(proto_data.clone()),
                err_send,
            ))
            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
        err_recv.blocking_recv().map_err(|_| {
            Error::Internal("GL converter error messaging closed without update".to_string())
        })?
    }

    /// Trait entry point; forwards to the inherent
    /// `GLProcessorThreaded::decode_masks_atlas`.
    fn decode_masks_atlas(
        &mut self,
        detect: &[DetectBox],
        proto_data: ProtoData,
        output_width: usize,
        output_height: usize,
    ) -> crate::Result<(Vec<u8>, Vec<MaskRegion>)> {
        GLProcessorThreaded::decode_masks_atlas(
            self,
            detect,
            proto_data,
            output_width,
            output_height,
        )
    }

    /// Sends the per-class color table to the GL thread and waits for it
    /// to be applied.
    fn set_class_colors(&mut self, colors: &[[u8; 4]]) -> Result<(), crate::Error> {
        let (err_send, err_recv) = tokio::sync::oneshot::channel();
        self.sender
            .as_ref()
            .unwrap()
            .blocking_send(GLProcessorMessage::SetColors(colors.to_vec(), err_send))
            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
        err_recv.blocking_recv().map_err(|_| {
            Error::Internal("GL converter error messaging closed without update".to_string())
        })?
    }
}
1121
impl GLProcessorThreaded {
    /// Sets the interpolation mode for int8 proto textures.
    ///
    /// Blocks until the GL thread has applied the new mode.
    pub fn set_int8_interpolation_mode(
        &mut self,
        mode: Int8InterpolationMode,
    ) -> Result<(), crate::Error> {
        let (err_send, err_recv) = tokio::sync::oneshot::channel();
        // `sender` is only None while Drop is running, so unwrap is safe.
        self.sender
            .as_ref()
            .unwrap()
            .blocking_send(GLProcessorMessage::SetInt8Interpolation(mode, err_send))
            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
        err_recv.blocking_recv().map_err(|_| {
            Error::Internal("GL converter error messaging closed without update".to_string())
        })?
    }

    /// Decode all detection masks into a compact atlas via the GL thread.
    ///
    /// Returns `(atlas_pixels, regions)` where `atlas_pixels` is a contiguous
    /// `Vec<u8>` of shape `[atlas_h, output_width]` (compact, bbox-sized strips)
    /// and `regions` describes each detection's location within the atlas.
    ///
    /// The `detect` slice is sent as a raw pointer; blocking on `resp_recv`
    /// keeps that borrow alive until the GL thread has finished with it.
    pub fn decode_masks_atlas(
        &mut self,
        detect: &[DetectBox],
        proto_data: ProtoData,
        output_width: usize,
        output_height: usize,
    ) -> Result<(Vec<u8>, Vec<MaskRegion>), crate::Error> {
        let (resp_send, resp_recv) = tokio::sync::oneshot::channel();
        self.sender
            .as_ref()
            .unwrap()
            .blocking_send(GLProcessorMessage::DecodeMasksAtlas(
                SendablePtr {
                    ptr: NonNull::new(detect.as_ptr() as *mut DetectBox).unwrap(),
                    len: detect.len(),
                },
                Box::new(proto_data),
                output_width,
                output_height,
                resp_send,
            ))
            .map_err(|_| Error::Internal("GL converter thread exited".to_string()))?;
        resp_recv.blocking_recv().map_err(|_| {
            Error::Internal("GL converter error messaging closed without update".to_string())
        })?
    }

    /// Create a PBO-backed TensorImage on the GL thread.
    ///
    /// Allocates a GL pixel buffer object of `width * height * channels`
    /// bytes and wraps it in a `TensorImage` whose map/unmap/delete
    /// operations are proxied back to the GL thread through `GlPboOps`
    /// (holding only a weak sender, so the image does not keep the GL
    /// thread alive).
    pub fn create_pbo_image(
        &self,
        width: usize,
        height: usize,
        fourcc: four_char_code::FourCharCode,
    ) -> Result<crate::TensorImage, Error> {
        let sender = self
            .sender
            .as_ref()
            .ok_or(Error::OpenGl("GL processor is shutting down".to_string()))?;

        let channels = crate::fourcc_channels(fourcc)?;
        let size = width * height * channels;
        if size == 0 {
            return Err(Error::OpenGl("Invalid image dimensions".to_string()));
        }

        // Allocate PBO on the GL thread
        let (tx, rx) = tokio::sync::oneshot::channel();
        sender
            .blocking_send(GLProcessorMessage::PboCreate(size, tx))
            .map_err(|_| Error::OpenGl("GL thread channel closed".to_string()))?;
        let buffer_id = rx
            .blocking_recv()
            .map_err(|_| Error::OpenGl("GL thread did not respond".to_string()))??;

        let ops: std::sync::Arc<dyn edgefirst_tensor::PboOps> = std::sync::Arc::new(GlPboOps {
            sender: sender.downgrade(),
        });

        // Planar formats are shaped [C, H, W]; interleaved are [H, W, C].
        let shape = if crate::fourcc_planar(fourcc)? {
            vec![channels, height, width]
        } else {
            vec![height, width, channels]
        };

        let pbo_tensor =
            edgefirst_tensor::PboTensor::<u8>::from_pbo(buffer_id, size, &shape, None, ops)
                .map_err(|e| Error::OpenGl(format!("PBO tensor creation failed: {e:?}")))?;
        let tensor = edgefirst_tensor::Tensor::Pbo(pbo_tensor);
        crate::TensorImage::from_tensor(tensor, fourcc)
            .map_err(|e| Error::OpenGl(format!("Failed to wrap PBO tensor as image: {e:?}")))
    }

    /// Returns the active transfer backend.
    #[allow(dead_code)]
    pub(crate) fn transfer_backend(&self) -> TransferBackend {
        self.transfer_backend
    }
}
1222
1223impl Drop for GLProcessorThreaded {
1224    fn drop(&mut self) {
1225        drop(self.sender.take());
1226        let _ = self.handle.take().and_then(|h| h.join().ok());
1227    }
1228}
1229
/// Interpolation mode for int8 proto textures (GL_R8I cannot use GL_LINEAR).
///
/// `Bilinear` is the default mode, now encoded via `Default` so callers can
/// write `Int8InterpolationMode::default()` instead of relying on the doc
/// comment alone.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Int8InterpolationMode {
    /// texelFetch at nearest texel — simplest, fastest GPU execution.
    Nearest,
    /// texelFetch × 4 neighbors with shader-computed bilinear weights (default).
    #[default]
    Bilinear,
    /// Two-pass: dequant int8→f16 FBO, then existing f16 shader with GL_LINEAR.
    TwoPass,
}
1240
/// Selects which EGLImage cache to use (`src_egl_cache` vs `dst_egl_cache`
/// on [`GLProcessorST`]).
#[derive(Debug)]
enum CacheKind {
    /// The source-image EGLImage cache.
    Src,
    /// The destination-image EGLImage cache.
    Dst,
}
1247
1248/// A cached EGLImage with a weak reference to the source tensor's guard.
1249struct CachedEglImage {
1250    egl_image: EglImage,
1251    /// Weak reference to the source Tensor's BufferIdentity guard.
1252    guard: std::sync::Weak<()>,
1253    /// Optional GL renderbuffer backed by this EGLImage (used by direct RGB path).
1254    renderbuffer: Option<u32>,
1255    /// Monotonic access counter for LRU eviction.
1256    last_used: u64,
1257}
1258
/// EGLImage cache owned by GLProcessorST.
///
/// Uses a HashMap with a monotonic counter for LRU eviction: each access
/// updates the entry's `last_used` timestamp, and eviction removes the entry
/// with the smallest `last_used` value.
struct EglImageCache {
    // Cached images keyed by a u64 id — NOTE(review): presumably the
    // tensor's BufferIdentity; confirm at the insertion site.
    entries: std::collections::HashMap<u64, CachedEglImage>,
    // Target maximum number of entries (also used as initial HashMap capacity).
    capacity: usize,
    // Hit/miss counts, reported via debug log when the cache is dropped.
    hits: u64,
    misses: u64,
    /// Monotonic counter incremented on each access for LRU tracking.
    access_counter: u64,
}
1272
1273impl EglImageCache {
1274    fn new(capacity: usize) -> Self {
1275        Self {
1276            entries: std::collections::HashMap::with_capacity(capacity),
1277            capacity,
1278            hits: 0,
1279            misses: 0,
1280            access_counter: 0,
1281        }
1282    }
1283
1284    /// Allocate a new LRU timestamp.
1285    fn next_timestamp(&mut self) -> u64 {
1286        self.access_counter += 1;
1287        self.access_counter
1288    }
1289
1290    /// Evict the least recently used entry.
1291    fn evict_lru(&mut self) {
1292        if let Some((&evict_id, _)) = self.entries.iter().min_by_key(|(_, entry)| entry.last_used) {
1293            if let Some(evicted) = self.entries.remove(&evict_id) {
1294                if let Some(rbo) = evicted.renderbuffer {
1295                    unsafe { gls::gl::DeleteRenderbuffers(1, &rbo) };
1296                }
1297            }
1298        }
1299    }
1300
1301    /// Sweep dead entries (tensor dropped, Weak is dead).
1302    fn sweep(&mut self) {
1303        let before = self.entries.len();
1304        self.entries.retain(|_id, entry| {
1305            let alive = entry.guard.upgrade().is_some();
1306            if !alive {
1307                if let Some(rbo) = entry.renderbuffer {
1308                    unsafe { gls::gl::DeleteRenderbuffers(1, &rbo) };
1309                }
1310            }
1311            alive
1312        });
1313        let swept = before - self.entries.len();
1314        if swept > 0 {
1315            log::debug!("EglImageCache: swept {swept} dead entries");
1316        }
1317    }
1318}
1319
1320impl Drop for EglImageCache {
1321    fn drop(&mut self) {
1322        for entry in self.entries.values() {
1323            if let Some(rbo) = entry.renderbuffer {
1324                unsafe { gls::gl::DeleteRenderbuffers(1, &rbo) };
1325            }
1326        }
1327        log::debug!(
1328            "EglImageCache stats: {} hits, {} misses, {} entries remaining",
1329            self.hits,
1330            self.misses,
1331            self.entries.len()
1332        );
1333    }
1334}
1335
/// OpenGL single-threaded image converter.
///
/// Owns every GL object used by the headless pipeline: textures, shader
/// programs, FBOs, vertex/texcoord buffers, the EGLImage caches, and the
/// GL/EGL context itself. GL contexts are thread-affine, so this type must
/// be created and used on one thread; [`GLProcessorThreaded`] provides the
/// cross-thread wrapper.
pub struct GLProcessorST {
    // Input/render textures and shader programs. NOTE(review): roles are
    // inferred from names; the shaders are defined elsewhere in this file.
    camera_eglimage_texture: Texture,
    camera_normal_texture: Texture,
    render_texture: Texture,
    segmentation_texture: Texture,
    segmentation_program: GlProgram,
    instanced_segmentation_program: GlProgram,
    proto_texture: Texture,
    proto_segmentation_program: GlProgram,
    proto_segmentation_int8_nearest_program: GlProgram,
    proto_segmentation_int8_bilinear_program: GlProgram,
    proto_dequant_int8_program: GlProgram,
    proto_segmentation_f32_program: GlProgram,
    color_program: GlProgram,
    /// Whether GL_OES_texture_float_linear is available (allows GL_LINEAR on R32F textures).
    has_float_linear: bool,
    /// Interpolation mode for int8 proto textures.
    int8_interpolation_mode: Int8InterpolationMode,
    /// Intermediate FBO texture for two-pass int8 dequant path.
    proto_dequant_texture: Texture,
    proto_mask_logit_int8_bilinear_program: GlProgram,
    proto_mask_logit_int8_nearest_program: GlProgram,
    proto_mask_logit_f32_program: GlProgram,
    /// Dedicated FBO for mask rendering.
    mask_fbo: u32,
    /// R8 texture attached to mask_fbo.
    mask_fbo_texture: u32,
    /// Current allocated width of mask FBO texture.
    mask_fbo_width: usize,
    /// Current allocated height of mask FBO texture.
    mask_fbo_height: usize,
    /// PBO buffer ID for atlas readback (0 = not allocated).
    mask_atlas_pbo: u32,
    // Raw ids above (mask_fbo, mask_fbo_texture, mask_atlas_pbo) are freed
    // manually in Drop; the wrapped types below manage their own lifetime.
    vertex_buffer: Buffer,
    texture_buffer: Buffer,
    /// Persistent FBO for the convert() render path.
    /// Created once, reused by re-attaching textures each frame.
    convert_fbo: FrameBuffer,
    /// EGLImage cache for source DMA buffers.
    src_egl_cache: EglImageCache,
    /// EGLImage cache for destination DMA buffers.
    dst_egl_cache: EglImageCache,
    /// Intermediate RGBA texture for two-pass packed RGB conversion.
    /// Pass 1 renders YUYV/NV12→RGBA here; Pass 2 packs RGBA→RGB to DMA dest.
    packed_rgb_intermediate_tex: Texture,
    /// FBO for pass 1 of packed RGB conversion (renders to intermediate texture).
    packed_rgb_fbo: FrameBuffer,
    /// Current allocated size of the intermediate texture (0,0 = unallocated).
    packed_rgb_intermediate_size: (usize, usize),
    texture_program: GlProgram,
    texture_program_yuv: GlProgram,
    texture_program_planar: GlProgram,
    /// Shader: existing planar RGB with int8 bias (XOR 0x80) applied to output.
    texture_program_planar_int8: GlProgram,
    /// Shader: packed RGB -> RGBA8 packing (2D texture source, pass 2).
    packed_rgba8_program_2d: GlProgram,
    /// Shader: packed RGB int8 -> RGBA8 packing with XOR 0x80 (2D texture source, pass 2).
    packed_rgba8_int8_program_2d: GlProgram,
    /// Shader: direct RGB render with int8 XOR 0x80 bias (2D texture source).
    texture_int8_program: GlProgram,
    /// Shader: direct RGB render with int8 XOR 0x80 bias (external OES source).
    texture_int8_program_yuv: GlProgram,
    /// Whether the GPU supports direct RGB rendering via BGR888 renderbuffer.
    support_rgb_direct: bool,
    // EGL display/context state; also records the negotiated transfer backend.
    gl_context: GlContext,
}
1403
1404impl Drop for GLProcessorST {
1405    fn drop(&mut self) {
1406        unsafe {
1407            {
1408                if self.mask_fbo != 0 {
1409                    gls::gl::DeleteFramebuffers(1, &self.mask_fbo);
1410                }
1411                if self.mask_fbo_texture != 0 {
1412                    gls::gl::DeleteTextures(1, &self.mask_fbo_texture);
1413                }
1414                if self.mask_atlas_pbo != 0 {
1415                    gls::gl::DeleteBuffers(1, &self.mask_atlas_pbo);
1416                }
1417            }
1418        }
1419    }
1420}
1421
1422impl ImageProcessorTrait for GLProcessorST {
    /// Converts `src` into `dst`, dispatching to the most direct path for
    /// the tensor memory types involved, checked in priority order:
    /// DMA destination, PBO→PBO, any→PBO, PBO→Mem, then the generic
    /// non-DMA path.
    fn convert(
        &mut self,
        src: &TensorImage,
        dst: &mut TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        crop.check_crop(src, dst)?;
        if !Self::check_src_format_supported(self.gl_context.transfer_backend, src) {
            return Err(crate::Error::NotSupported(format!(
                "Opengl doesn't support {} source texture",
                src.fourcc().display()
            )));
        }

        if !Self::check_dst_format_supported(self.gl_context.transfer_backend, dst) {
            return Err(crate::Error::NotSupported(format!(
                "Opengl doesn't support {} destination texture",
                dst.fourcc().display()
            )));
        }
        log::debug!(
            "dst tensor: {:?} src tensor :{:?}",
            dst.tensor().memory(),
            src.tensor().memory()
        );
        // Surface any GL error left over from a previous operation before
        // starting this conversion.
        check_gl_error(function!(), line!())?;
        if self.gl_context.transfer_backend.is_dma() && dst.tensor().memory() == TensorMemory::Dma {
            // Packed RGB is now supported via DMA with buffer reinterpretation
            let res = self.convert_dest_dma(dst, src, rotation, flip, crop);
            return res;
        }
        // PBO-to-PBO: both tensors are PBO-backed, use GL buffer bindings for
        // both upload and readback (zero CPU copy for both directions)
        if src.tensor().memory() == TensorMemory::Pbo && dst.tensor().memory() == TensorMemory::Pbo
        {
            return self.convert_pbo_to_pbo(dst, src, rotation, flip, crop);
        }
        // PBO dst with non-PBO src: use normal texture upload for src (which
        // maps the Mem/DMA tensor), but PBO PACK readback for dst.
        // This avoids the deadlock that would occur if convert_dest_non_dma
        // tried to map() the PBO dst on the GL thread.
        if dst.tensor().memory() == TensorMemory::Pbo {
            return self.convert_any_to_pbo(dst, src, rotation, flip, crop);
        }
        // PBO src with non-PBO dst: the src tensor's map() would deadlock on
        // the GL thread, so use PBO UNPACK upload. Readback goes to Mem dst
        // via normal ReadnPixels into mapped memory.
        if src.tensor().memory() == TensorMemory::Pbo {
            return self.convert_pbo_to_mem(dst, src, rotation, flip, crop);
        }
        let start = Instant::now();
        let res = self.convert_dest_non_dma(dst, src, rotation, flip, crop);
        log::debug!("convert_dest_non_dma takes {:?}", start.elapsed());
        res
    }
1480
1481    fn convert_ref(
1482        &mut self,
1483        src: &TensorImage,
1484        dst: &mut TensorImageRef<'_>,
1485        rotation: Rotation,
1486        flip: Flip,
1487        crop: Crop,
1488    ) -> crate::Result<()> {
1489        // OpenGL doesn't support PLANAR_RGB output, delegate to CPU
1490        let mut cpu = CPUProcessor::new();
1491        cpu.convert_ref(src, dst, rotation, flip, crop)
1492    }
1493
    /// Draws detection boxes and segmentation masks over `dst` (RGBA or
    /// RGB only), alpha-blending on top of the existing pixels. DMA
    /// destinations are rendered in place via a renderbuffer; otherwise
    /// the framebuffer is read back into the tensor's mapped memory.
    fn draw_masks(
        &mut self,
        dst: &mut TensorImage,
        detect: &[DetectBox],
        segmentation: &[Segmentation],
    ) -> Result<(), crate::Error> {
        use crate::FunctionTimer;

        let _timer = FunctionTimer::new("GLProcessorST::draw_masks");
        if !matches!(dst.fourcc(), RGBA | RGB) {
            return Err(crate::Error::NotSupported(
                "Opengl image rendering only supports RGBA or RGB images".to_string(),
            ));
        }

        // Prefer the in-place DMA renderbuffer path; fall back to the
        // non-DMA setup (with a later ReadnPixels readback) if it fails.
        let is_dma = match dst.tensor.memory() {
            edgefirst_tensor::TensorMemory::Dma if self.setup_renderbuffer_dma(dst).is_ok() => true,
            _ => {
                // Add dest rect to make sure dst is rendered fully
                self.setup_renderbuffer_non_dma(
                    dst,
                    Crop::new().with_dst_rect(Some(Rect::new(0, 0, 0, 0))),
                )?;
                false
            }
        };

        // Standard source-over blending for color; destination alpha kept.
        gls::enable(gls::gl::BLEND);
        gls::blend_func_separate(
            gls::gl::SRC_ALPHA,
            gls::gl::ONE_MINUS_SRC_ALPHA,
            gls::gl::ZERO,
            gls::gl::ONE,
        );

        self.render_box(dst, detect)?;
        self.render_segmentation(detect, segmentation)?;

        // Wait for the GPU to finish before mapping/reading the result.
        gls::finish();
        if !is_dma {
            let mut dst_map = dst.tensor().map()?;
            let format = match dst.fourcc() {
                RGB => gls::gl::RGB,
                RGBA => gls::gl::RGBA,
                _ => unreachable!(),
            };
            // SAFETY: dst_map covers dst.tensor.len() bytes, which is the
            // bound passed to ReadnPixels.
            unsafe {
                gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
                gls::gl::ReadnPixels(
                    0,
                    0,
                    dst.width() as i32,
                    dst.height() as i32,
                    format,
                    gls::gl::UNSIGNED_BYTE,
                    dst.tensor.len() as i32,
                    dst_map.as_mut_ptr() as *mut c_void,
                );
            }
        }

        Ok(())
    }
1557
1558    fn draw_masks_proto(
1559        &mut self,
1560        dst: &mut TensorImage,
1561        detect: &[DetectBox],
1562        proto_data: &ProtoData,
1563    ) -> crate::Result<()> {
1564        use crate::FunctionTimer;
1565
1566        let _timer = FunctionTimer::new("GLProcessorST::draw_masks_proto");
1567        if !matches!(dst.fourcc(), RGBA | RGB) {
1568            return Err(crate::Error::NotSupported(
1569                "Opengl image rendering only supports RGBA or RGB images".to_string(),
1570            ));
1571        }
1572
1573        let is_dma = match dst.tensor.memory() {
1574            edgefirst_tensor::TensorMemory::Dma if self.setup_renderbuffer_dma(dst).is_ok() => true,
1575            _ => {
1576                self.setup_renderbuffer_non_dma(
1577                    dst,
1578                    Crop::new().with_dst_rect(Some(Rect::new(0, 0, 0, 0))),
1579                )?;
1580                false
1581            }
1582        };
1583
1584        gls::enable(gls::gl::BLEND);
1585        gls::blend_func_separate(
1586            gls::gl::SRC_ALPHA,
1587            gls::gl::ONE_MINUS_SRC_ALPHA,
1588            gls::gl::ZERO,
1589            gls::gl::ONE,
1590        );
1591
1592        self.render_box(dst, detect)?;
1593        self.render_proto_segmentation(detect, proto_data)?;
1594
1595        gls::finish();
1596        if !is_dma {
1597            let mut dst_map = dst.tensor().map()?;
1598            let format = match dst.fourcc() {
1599                RGB => gls::gl::RGB,
1600                RGBA => gls::gl::RGBA,
1601                _ => unreachable!(),
1602            };
1603            unsafe {
1604                gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
1605                gls::gl::ReadnPixels(
1606                    0,
1607                    0,
1608                    dst.width() as i32,
1609                    dst.height() as i32,
1610                    format,
1611                    gls::gl::UNSIGNED_BYTE,
1612                    dst.tensor.len() as i32,
1613                    dst_map.as_mut_ptr() as *mut c_void,
1614                );
1615            }
1616        }
1617
1618        Ok(())
1619    }
1620
1621    fn decode_masks_atlas(
1622        &mut self,
1623        detect: &[DetectBox],
1624        proto_data: ProtoData,
1625        output_width: usize,
1626        output_height: usize,
1627    ) -> crate::Result<(Vec<u8>, Vec<MaskRegion>)> {
1628        GLProcessorST::decode_masks_atlas(self, detect, &proto_data, output_width, output_height)
1629    }
1630
1631    fn set_class_colors(&mut self, colors: &[[u8; 4]]) -> crate::Result<()> {
1632        if colors.is_empty() {
1633            return Ok(());
1634        }
1635        let mut colors_f32 = colors
1636            .iter()
1637            .map(|c| {
1638                [
1639                    c[0] as f32 / 255.0,
1640                    c[1] as f32 / 255.0,
1641                    c[2] as f32 / 255.0,
1642                    c[3] as f32 / 255.0,
1643                ]
1644            })
1645            .take(20)
1646            .collect::<Vec<[f32; 4]>>();
1647
1648        self.segmentation_program
1649            .load_uniform_4fv(c"colors", &colors_f32)?;
1650        self.instanced_segmentation_program
1651            .load_uniform_4fv(c"colors", &colors_f32)?;
1652        self.proto_segmentation_program
1653            .load_uniform_4fv(c"colors", &colors_f32)?;
1654        self.proto_segmentation_int8_nearest_program
1655            .load_uniform_4fv(c"colors", &colors_f32)?;
1656        self.proto_segmentation_int8_bilinear_program
1657            .load_uniform_4fv(c"colors", &colors_f32)?;
1658        self.proto_segmentation_f32_program
1659            .load_uniform_4fv(c"colors", &colors_f32)?;
1660
1661        colors_f32.iter_mut().for_each(|c| {
1662            c[3] = 1.0; // set alpha to 1.0 for color rendering
1663        });
1664        self.color_program
1665            .load_uniform_4fv(c"colors", &colors_f32)?;
1666
1667        Ok(())
1668    }
1669}
1670
1671impl GLProcessorST {
    /// Creates a headless GL processor.
    ///
    /// Builds an EGL/GLES context for the requested display `kind`, loads the
    /// GL symbol table through EGL, compiles every shader program used by the
    /// conversion and rendering paths, allocates the shared textures/buffers,
    /// then probes runtime capabilities (direct RGB rendering, DMA-buf
    /// round-trip) and selects the transfer backend accordingly.
    ///
    /// # Errors
    /// Returns an error if context creation, the GL support check, shader
    /// compilation, uniform upload, or the final GL error check fails.
    pub fn new(kind: Option<EglDisplayKind>) -> Result<GLProcessorST, crate::Error> {
        let gl_context = GlContext::new(kind)?;
        // Resolve GL entry points through EGL; unresolved names become null.
        gls::load_with(|s| {
            gl_context
                .egl
                .get_proc_address(s)
                .map_or(std::ptr::null(), |p| p as *const _)
        });

        let has_float_linear = Self::gl_check_support()?;

        // Uploads and downloads are all packed with no alignment requirements
        unsafe {
            gls::gl::PixelStorei(gls::gl::PACK_ALIGNMENT, 1);
            gls::gl::PixelStorei(gls::gl::UNPACK_ALIGNMENT, 1);
        }

        // Compile all shader programs up front; every program shares the same
        // vertex shader and differs only in its fragment stage.
        let texture_program_planar =
            GlProgram::new(generate_vertex_shader(), generate_planar_rgb_shader())?;

        let texture_program =
            GlProgram::new(generate_vertex_shader(), generate_texture_fragment_shader())?;

        let texture_program_yuv = GlProgram::new(
            generate_vertex_shader(),
            generate_texture_fragment_shader_yuv(),
        )?;

        // Segmentation programs get the default palette immediately; callers
        // may replace it later through set_class_colors().
        let segmentation_program =
            GlProgram::new(generate_vertex_shader(), generate_segmentation_shader())?;
        segmentation_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;
        let instanced_segmentation_program = GlProgram::new(
            generate_vertex_shader(),
            generate_instanced_segmentation_shader(),
        )?;
        instanced_segmentation_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        // Existing f16 proto shader (RGBA16F, 4 protos per layer)
        let proto_segmentation_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_segmentation_shader(),
        )?;
        proto_segmentation_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        // Int8 proto shaders (R8I, 1 proto per layer, 32 layers)
        let proto_segmentation_int8_nearest_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_segmentation_shader_int8_nearest(),
        )?;
        proto_segmentation_int8_nearest_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        let proto_segmentation_int8_bilinear_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_segmentation_shader_int8_bilinear(),
        )?;
        proto_segmentation_int8_bilinear_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        let proto_dequant_int8_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_dequant_shader_int8(),
        )?;

        // F32 proto shader (R32F, 1 proto per layer, 32 layers)
        let proto_segmentation_f32_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_segmentation_shader_f32(),
        )?;
        proto_segmentation_f32_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        let color_program = GlProgram::new(generate_vertex_shader(), generate_color_shader())?;
        color_program.load_uniform_4fv(c"colors", &DEFAULT_COLORS)?;

        // Binary logit-threshold mask shaders (atlas path — skip sigmoid)
        let proto_mask_logit_int8_nearest_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_mask_logit_shader_int8_nearest(),
        )?;
        let proto_mask_logit_int8_bilinear_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_mask_logit_shader_int8_bilinear(),
        )?;
        let proto_mask_logit_f32_program = GlProgram::new(
            generate_vertex_shader(),
            generate_proto_mask_logit_shader_f32(),
        )?;

        // Int8 variant of the existing planar RGB shader (for PLANAR_RGB_INT8 destinations).
        let texture_program_planar_int8 =
            GlProgram::new(generate_vertex_shader(), generate_planar_rgb_int8_shader())?;

        // RGB packing shaders (2D only — used in pass 2 of two-pass pipeline)
        let packed_rgba8_program_2d =
            GlProgram::new(generate_vertex_shader(), generate_packed_rgba8_shader_2d())?;
        let packed_rgba8_int8_program_2d = GlProgram::new(
            generate_vertex_shader(),
            generate_packed_rgba8_int8_shader_2d(),
        )?;

        // Int8 direct-render shaders (for RGB_INT8 destinations via direct path)
        let texture_int8_program =
            GlProgram::new(generate_vertex_shader(), generate_texture_int8_shader())?;
        let texture_int8_program_yuv =
            GlProgram::new(generate_vertex_shader(), generate_texture_int8_shader_yuv())?;

        // Shared GL objects reused across conversions and render passes.
        let camera_eglimage_texture = Texture::new();
        let camera_normal_texture = Texture::new();
        let render_texture = Texture::new();
        let segmentation_texture = Texture::new();
        let proto_texture = Texture::new();
        let proto_dequant_texture = Texture::new();
        let vertex_buffer = Buffer::new(0, 3, 100);
        let texture_buffer = Buffer::new(1, 2, 100);

        let mut converter = GLProcessorST {
            gl_context,
            texture_program,
            texture_program_yuv,
            texture_program_planar,
            texture_program_planar_int8,
            packed_rgba8_program_2d,
            packed_rgba8_int8_program_2d,
            texture_int8_program,
            texture_int8_program_yuv,
            support_rgb_direct: false, // probed just below via probe_rgb_direct_support()
            camera_eglimage_texture,
            camera_normal_texture,
            segmentation_texture,
            proto_texture,
            proto_segmentation_int8_nearest_program,
            proto_segmentation_int8_bilinear_program,
            proto_dequant_int8_program,
            proto_segmentation_f32_program,
            has_float_linear,
            int8_interpolation_mode: Int8InterpolationMode::Bilinear,
            proto_dequant_texture,
            proto_mask_logit_int8_bilinear_program,
            proto_mask_logit_int8_nearest_program,
            proto_mask_logit_f32_program,
            // Mask-atlas GL objects are created lazily (0 == not yet allocated).
            mask_fbo: 0,
            mask_fbo_texture: 0,
            mask_fbo_width: 0,
            mask_fbo_height: 0,
            mask_atlas_pbo: 0,
            vertex_buffer,
            texture_buffer,
            convert_fbo: FrameBuffer::new(),
            src_egl_cache: EglImageCache::new(8),
            dst_egl_cache: EglImageCache::new(8),
            packed_rgb_intermediate_tex: Texture::new(),
            packed_rgb_fbo: FrameBuffer::new(),
            packed_rgb_intermediate_size: (0, 0),
            render_texture,
            segmentation_program,
            instanced_segmentation_program,
            proto_segmentation_program,
            color_program,
        };
        check_gl_error(function!(), line!())?;

        // Probe GPU capability for direct RGB rendering
        converter.support_rgb_direct = converter.probe_rgb_direct_support();

        // Verify DMA-buf actually works (catches NVIDIA discrete GPUs where
        // EGLImage creation succeeds but rendered data is all zeros)
        if converter.gl_context.transfer_backend.is_dma() && !converter.verify_dma_buf_roundtrip() {
            log::info!("DMA-buf verification failed — falling back to PBO transfers");
            converter.gl_context.transfer_backend = TransferBackend::Pbo;
            // RGB direct rendering also requires DMA, so disable it
            converter.support_rgb_direct = false;
        }

        // If DMA-buf failed/unavailable but GL is alive, use PBO transfers
        if converter.gl_context.transfer_backend == TransferBackend::Sync {
            log::info!("Upgrading transfer backend from Sync to Pbo (GL context available)");
            converter.gl_context.transfer_backend = TransferBackend::Pbo;
        }

        log::debug!(
            "GLConverter created (transfer={:?}, rgb_direct={})",
            converter.gl_context.transfer_backend,
            converter.support_rgb_direct
        );
        Ok(converter)
    }
1856
    /// Probe whether the GPU supports direct RGB rendering via BGR888 DMA-buf
    /// backed renderbuffer. Creates a small test FBO and checks completeness.
    /// Returns `false` on any failure (DMA unavailable, EGLImage rejected, FBO incomplete).
    fn probe_rgb_direct_support(&self) -> bool {
        // Direct RGB rendering is only meaningful on a DMA transfer backend.
        if !self.gl_context.transfer_backend.is_dma() {
            log::debug!("probe_rgb_direct: no DMA support");
            return false;
        }

        // Check glEGLImageTargetRenderbufferStorageOES is available
        if self
            .gl_context
            .egl
            .get_proc_address("glEGLImageTargetRenderbufferStorageOES")
            .is_none()
        {
            log::debug!("probe_rgb_direct: glEGLImageTargetRenderbufferStorageOES not available");
            return false;
        }

        // Allocate a small test DMA buffer (64x64 RGB = 12288 bytes)
        let test_img = match TensorImage::new(64, 64, RGB, Some(TensorMemory::Dma)) {
            Ok(img) => img,
            Err(e) => {
                log::debug!("probe_rgb_direct: failed to allocate test DMA buffer: {e}");
                return false;
            }
        };

        // Create EGLImage from the test DMA buffer
        let egl_image =
            match self.create_egl_image_with_dims(&test_img, 64, 64, DrmFourcc::Bgr888, 3) {
                Ok(img) => img,
                Err(e) => {
                    log::debug!("probe_rgb_direct: EGLImage creation failed: {e}");
                    return false;
                }
            };

        // Create renderbuffer, bind EGLImage, create FBO, check completeness
        let result = unsafe {
            let mut rbo = 0u32;
            gls::gl::GenRenderbuffers(1, &mut rbo);
            gls::gl::BindRenderbuffer(gls::gl::RENDERBUFFER, rbo);
            gls::gl::EGLImageTargetRenderbufferStorageOES(
                gls::gl::RENDERBUFFER,
                egl_image.egl_image.as_ptr(),
            );

            let gl_err = gls::gl::GetError();
            if gl_err != gls::gl::NO_ERROR {
                log::debug!(
                    "probe_rgb_direct: EGLImageTargetRenderbufferStorageOES failed: {gl_err:#X}"
                );
                // Early return from the whole function; the renderbuffer is
                // unbound and deleted first (egl_image/test_img drop normally).
                gls::gl::BindRenderbuffer(gls::gl::RENDERBUFFER, 0);
                gls::gl::DeleteRenderbuffers(1, &rbo);
                return false;
            }

            let mut fbo = 0u32;
            gls::gl::GenFramebuffers(1, &mut fbo);
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, fbo);
            gls::gl::FramebufferRenderbuffer(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::RENDERBUFFER,
                rbo,
            );

            // A complete FBO means the driver accepts the imported BGR888
            // buffer as a render target.
            let status = gls::gl::CheckFramebufferStatus(gls::gl::FRAMEBUFFER);
            let complete = status == gls::gl::FRAMEBUFFER_COMPLETE;

            // Cleanup
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, 0);
            gls::gl::DeleteFramebuffers(1, &fbo);
            gls::gl::BindRenderbuffer(gls::gl::RENDERBUFFER, 0);
            gls::gl::DeleteRenderbuffers(1, &rbo);

            complete
        };
        // egl_image and test_img drop automatically here

        log::info!("probe_rgb_direct: BGR888 renderbuffer FBO support = {result}");
        result
    }
1942
1943    /// Verify that DMA-buf EGLImage round-trip actually works on this GPU.
1944    ///
1945    /// Renders a solid red quad to a 64x64 DMA-buf-backed RGBA texture via
1946    /// EGLImage, then reads it back and checks that the center pixel is red.
1947    /// Returns `true` if the data round-trips correctly.
1948    ///
1949    /// This catches GPUs like NVIDIA discrete where `eglCreateImage` from
1950    /// `dma_heap` fds succeeds but the rendered data is all zeros.
1951    fn verify_dma_buf_roundtrip(&mut self) -> bool {
1952        // Allocate a 64x64 RGBA DMA source tensor and fill it with solid red
1953        let src = match TensorImage::new(64, 64, RGBA, Some(TensorMemory::Dma)) {
1954            Ok(img) => img,
1955            Err(e) => {
1956                log::info!("verify_dma_buf_roundtrip: failed to allocate DMA source: {e}");
1957                return false;
1958            }
1959        };
1960
1961        {
1962            let mut map = match src.tensor().map() {
1963                Ok(m) => m,
1964                Err(e) => {
1965                    log::info!("verify_dma_buf_roundtrip: failed to map DMA source: {e}");
1966                    return false;
1967                }
1968            };
1969            for pixel in map.chunks_exact_mut(4) {
1970                pixel[0] = 255; // R
1971                pixel[1] = 0; // G
1972                pixel[2] = 0; // B
1973                pixel[3] = 255; // A
1974            }
1975        }
1976
1977        // Allocate a 64x64 RGBA DMA destination tensor
1978        let mut dst = match TensorImage::new(64, 64, RGBA, Some(TensorMemory::Dma)) {
1979            Ok(img) => img,
1980            Err(e) => {
1981                log::info!("verify_dma_buf_roundtrip: failed to allocate DMA destination: {e}");
1982                return false;
1983            }
1984        };
1985
1986        // Run the full DMA-buf EGLImage render pipeline
1987        if let Err(e) =
1988            self.convert_dest_dma(&mut dst, &src, Rotation::None, Flip::None, Crop::no_crop())
1989        {
1990            log::info!("verify_dma_buf_roundtrip: convert_dest_dma failed: {e}");
1991            return false;
1992        }
1993
1994        // Read back the center pixel at (32, 32) from the destination
1995        let map = match dst.tensor().map() {
1996            Ok(m) => m,
1997            Err(e) => {
1998                log::info!("verify_dma_buf_roundtrip: failed to map DMA destination: {e}");
1999                return false;
2000            }
2001        };
2002
2003        let offset = (32 * 64 + 32) * 4;
2004        if map.len() < offset + 4 {
2005            log::info!("verify_dma_buf_roundtrip: destination buffer too small");
2006            return false;
2007        }
2008
2009        let r = map[offset];
2010        let g = map[offset + 1];
2011        let b = map[offset + 2];
2012        let a = map[offset + 3];
2013
2014        let pass = r > 250 && g < 5 && b < 5 && a > 250;
2015
2016        if pass {
2017            log::info!("verify_dma_buf_roundtrip: PASSED (center pixel RGBA={r},{g},{b},{a})");
2018        } else {
2019            log::info!(
2020                "verify_dma_buf_roundtrip: FAILED (center pixel RGBA={r},{g},{b},{a}, \
2021                 expected ~255,0,0,255)"
2022            );
2023        }
2024
2025        pass
2026    }
2027
2028    /// Compute padded bbox regions and atlas offsets for a set of detections.
2029    ///
2030    /// Returns the vector of `MaskRegion` with stacked atlas_y_offset values
2031    /// and the total compact atlas height.
2032    fn compute_atlas_regions(
2033        detect: &[DetectBox],
2034        output_width: usize,
2035        output_height: usize,
2036        padding: usize,
2037    ) -> (Vec<MaskRegion>, usize) {
2038        let ow = output_width as i32;
2039        let oh = output_height as i32;
2040        let owf = output_width as f32;
2041        let ohf = output_height as f32;
2042        let pad = padding as i32;
2043
2044        let mut regions = Vec::with_capacity(detect.len());
2045        let mut atlas_y = 0usize;
2046        for det in detect.iter() {
2047            let bbox_x = (det.bbox.xmin * owf).round() as i32;
2048            let bbox_y = (det.bbox.ymin * ohf).round() as i32;
2049            let bbox_w = ((det.bbox.xmax - det.bbox.xmin) * owf).round() as i32;
2050            let bbox_h = ((det.bbox.ymax - det.bbox.ymin) * ohf).round() as i32;
2051            let bbox_x = bbox_x.max(0).min(ow);
2052            let bbox_y = bbox_y.max(0).min(oh);
2053            let bbox_w = bbox_w.max(1).min(ow - bbox_x);
2054            let bbox_h = bbox_h.max(1).min(oh - bbox_y);
2055
2056            let padded_x = (bbox_x - pad).max(0);
2057            let padded_y = (bbox_y - pad).max(0);
2058            let padded_w = ((bbox_x + bbox_w + pad).min(ow) - padded_x).max(1);
2059            let padded_h = ((bbox_y + bbox_h + pad).min(oh) - padded_y).max(1);
2060
2061            regions.push(MaskRegion {
2062                atlas_y_offset: atlas_y,
2063                padded_x: padded_x as usize,
2064                padded_y: padded_y as usize,
2065                padded_w: padded_w as usize,
2066                padded_h: padded_h as usize,
2067                bbox_x: bbox_x as usize,
2068                bbox_y: bbox_y as usize,
2069                bbox_w: bbox_w as usize,
2070                bbox_h: bbox_h as usize,
2071            });
2072            atlas_y += padded_h as usize;
2073        }
2074        (regions, atlas_y)
2075    }
2076
2077    /// Sets the interpolation mode for int8 proto textures.
2078    pub fn set_int8_interpolation_mode(&mut self, mode: Int8InterpolationMode) {
2079        self.int8_interpolation_mode = mode;
2080        log::debug!("Int8 interpolation mode set to {:?}", mode);
2081    }
2082
2083    /// Ensures the mask FBO + R8 texture are allocated at the given dimensions.
2084    /// Creates or resizes the FBO and texture as needed.
2085    fn ensure_mask_fbo(&mut self, width: usize, height: usize) -> crate::Result<()> {
2086        if self.mask_fbo_width == width && self.mask_fbo_height == height && self.mask_fbo != 0 {
2087            return Ok(());
2088        }
2089
2090        // Create FBO if needed
2091        if self.mask_fbo == 0 {
2092            unsafe {
2093                gls::gl::GenFramebuffers(1, &mut self.mask_fbo);
2094            }
2095        }
2096        // Create texture if needed
2097        if self.mask_fbo_texture == 0 {
2098            unsafe {
2099                gls::gl::GenTextures(1, &mut self.mask_fbo_texture);
2100            }
2101        }
2102
2103        // Allocate R8 texture
2104        unsafe {
2105            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.mask_fbo_texture);
2106            gls::gl::TexImage2D(
2107                gls::gl::TEXTURE_2D,
2108                0,
2109                gls::gl::R8 as i32,
2110                width as i32,
2111                height as i32,
2112                0,
2113                gls::gl::RED,
2114                gls::gl::UNSIGNED_BYTE,
2115                std::ptr::null(),
2116            );
2117            gls::gl::TexParameteri(
2118                gls::gl::TEXTURE_2D,
2119                gls::gl::TEXTURE_MIN_FILTER,
2120                gls::gl::NEAREST as i32,
2121            );
2122            gls::gl::TexParameteri(
2123                gls::gl::TEXTURE_2D,
2124                gls::gl::TEXTURE_MAG_FILTER,
2125                gls::gl::NEAREST as i32,
2126            );
2127        }
2128
2129        // Attach to FBO
2130        unsafe {
2131            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, self.mask_fbo);
2132            gls::gl::FramebufferTexture2D(
2133                gls::gl::FRAMEBUFFER,
2134                gls::gl::COLOR_ATTACHMENT0,
2135                gls::gl::TEXTURE_2D,
2136                self.mask_fbo_texture,
2137                0,
2138            );
2139            let status = gls::gl::CheckFramebufferStatus(gls::gl::FRAMEBUFFER);
2140            if status != gls::gl::FRAMEBUFFER_COMPLETE {
2141                return Err(crate::Error::OpenGl(format!(
2142                    "Mask FBO incomplete: status=0x{status:X}"
2143                )));
2144            }
2145            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, 0);
2146        }
2147
2148        self.mask_fbo_width = width;
2149        self.mask_fbo_height = height;
2150        log::debug!("Mask FBO allocated at {width}x{height}");
2151        Ok(())
2152    }
2153
    /// Ensures the mask atlas FBO and PBO are allocated for the given total
    /// atlas dimensions.  Unlike `ensure_mask_atlas`, the caller provides
    /// the exact atlas height (e.g. sum of padded bbox heights).
    fn ensure_mask_atlas_size(&mut self, width: usize, atlas_height: usize) -> crate::Result<()> {
        // Fast path: FBO is at least tall enough and the PBO already exists.
        // NOTE(review): this assumes the PBO was last sized for a height >=
        // atlas_height, which holds when only this function resizes the mask
        // FBO. If ensure_mask_fbo is ever called directly with a larger
        // height after the PBO was allocated, the PBO could be smaller than a
        // later readback needs — confirm against the other callers.
        if self.mask_fbo_width == width
            && self.mask_fbo_height >= atlas_height
            && self.mask_fbo != 0
            && self.mask_atlas_pbo != 0
        {
            return Ok(());
        }
        self.ensure_mask_fbo(width, atlas_height)?;
        // One byte per pixel: the mask atlas texture is R8.
        let pbo_size = width * atlas_height;
        unsafe {
            if self.mask_atlas_pbo == 0 {
                gls::gl::GenBuffers(1, &mut self.mask_atlas_pbo);
            }
            // (Re)allocate PBO storage for atlas readback (DYNAMIC_READ usage).
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, self.mask_atlas_pbo);
            gls::gl::BufferData(
                gls::gl::PIXEL_PACK_BUFFER,
                pbo_size as isize,
                std::ptr::null(),
                gls::gl::DYNAMIC_READ,
            );
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
        }
        Ok(())
    }
2182
    /// Decode all detection masks into a single atlas texture and read back
    /// as a contiguous buffer, with one PBO readback for all masks.
    ///
    /// Returns `(atlas_pixels, metadata)` where `atlas_pixels` is a contiguous
    /// `Vec<u8>` of size `output_width * compact_atlas_height` (where
    /// `compact_atlas_height` is the sum of padded bbox heights) and `metadata`
    /// contains per-detection bbox info (with empty pixel vecs).
    pub fn decode_masks_atlas(
        &mut self,
        detect: &[DetectBox],
        proto_data: &ProtoData,
        output_width: usize,
        output_height: usize,
    ) -> crate::Result<(Vec<u8>, Vec<MaskRegion>)> {
        use crate::FunctionTimer;

        let _timer = FunctionTimer::new("GLProcessorST::decode_masks_atlas");

        // Nothing to decode: no detections or no mask coefficients.
        if detect.is_empty() || proto_data.mask_coefficients.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }

        // Padding (in output pixels) added around each bbox strip in the atlas.
        let padding = 4usize;

        let (height, width, num_protos) = proto_data.protos.dim();
        let texture_target = gls::gl::TEXTURE_2D_ARRAY;

        // Pre-compute atlas regions and total height to size the FBO/PBO
        let (regions, compact_atlas_height) =
            Self::compute_atlas_regions(detect, output_width, output_height, padding);

        // Save current FBO and viewport
        let (saved_fbo, saved_viewport) = unsafe {
            let mut fbo: i32 = 0;
            gls::gl::GetIntegerv(gls::gl::FRAMEBUFFER_BINDING, &mut fbo);
            let mut vp = [0i32; 4];
            gls::gl::GetIntegerv(gls::gl::VIEWPORT, vp.as_mut_ptr());
            (fbo as u32, vp)
        };

        // Ensure atlas FBO and PBO are allocated for the compact size
        // NOTE(review): this `?` (and the uniform-load `?`s below) returns
        // before the saved FBO/viewport are restored — confirm callers can
        // tolerate modified GL state on the error path.
        self.ensure_mask_atlas_size(output_width, compact_atlas_height)?;

        // Upload proto texture array and select the logit-threshold shader
        gls::active_texture(gls::gl::TEXTURE0);
        gls::bind_texture(texture_target, self.proto_texture.id);
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::NEAREST as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::NEAREST as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        let atlas_result = match &proto_data.protos {
            ProtoTensor::Quantized {
                protos,
                quantization,
            } => {
                // Repack HWC protos into layer-major order (proto k becomes
                // layer k of the 2D texture array).
                let mut tex_data = vec![0i8; height * width * num_protos];
                for k in 0..num_protos {
                    for y in 0..height {
                        for x in 0..width {
                            tex_data[k * height * width + y * width + x] = protos[[y, x, k]];
                        }
                    }
                }
                gls::tex_image3d(
                    texture_target,
                    0,
                    gls::gl::R8I as i32,
                    width as i32,
                    height as i32,
                    num_protos as i32,
                    0,
                    gls::gl::RED_INTEGER,
                    gls::gl::BYTE,
                    Some(&tex_data),
                );

                // Dequantization: value = scale * q + (-zero_point * scale),
                // so the shader only needs a multiply-add per sample.
                let proto_scale = quantization.scale;
                let proto_scaled_zp = -(quantization.zero_point as f32) * quantization.scale;

                // Pick the int8 shader matching the configured sampling mode.
                let program = match self.int8_interpolation_mode {
                    Int8InterpolationMode::Nearest => &self.proto_mask_logit_int8_nearest_program,
                    _ => &self.proto_mask_logit_int8_bilinear_program,
                };
                gls::use_program(program.id);
                program.load_uniform_1i(c"num_protos", num_protos as i32)?;
                program.load_uniform_1f(c"proto_scale", proto_scale)?;

                self.render_mask_atlas_compact(
                    program,
                    regions,
                    &proto_data.mask_coefficients,
                    output_width,
                    output_height,
                    Some(proto_scaled_zp),
                )
            }
            ProtoTensor::Float(protos_f32) => {
                // Repack HWC f32 protos into layer-major order (see above).
                let mut tex_data = vec![0.0f32; height * width * num_protos];
                for k in 0..num_protos {
                    for y in 0..height {
                        for x in 0..width {
                            tex_data[k * height * width + y * width + x] = protos_f32[[y, x, k]];
                        }
                    }
                }
                gls::tex_image3d(
                    texture_target,
                    0,
                    gls::gl::R32F as i32,
                    width as i32,
                    height as i32,
                    num_protos as i32,
                    0,
                    gls::gl::RED,
                    gls::gl::FLOAT,
                    Some(&tex_data),
                );
                // Linear filtering on float textures only when the driver
                // supports it (has_float_linear probed at construction).
                if self.has_float_linear {
                    gls::tex_parameteri(
                        texture_target,
                        gls::gl::TEXTURE_MIN_FILTER,
                        gls::gl::LINEAR as i32,
                    );
                    gls::tex_parameteri(
                        texture_target,
                        gls::gl::TEXTURE_MAG_FILTER,
                        gls::gl::LINEAR as i32,
                    );
                }

                let program = &self.proto_mask_logit_f32_program;
                gls::use_program(program.id);
                program.load_uniform_1i(c"num_protos", num_protos as i32)?;

                self.render_mask_atlas_compact(
                    program,
                    regions,
                    &proto_data.mask_coefficients,
                    output_width,
                    output_height,
                    None,
                )
            }
        };

        // Restore previous FBO + viewport
        unsafe {
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, saved_fbo);
            gls::gl::Viewport(
                saved_viewport[0],
                saved_viewport[1],
                saved_viewport[2],
                saved_viewport[3],
            );
        }

        // Render errors are propagated only after GL state is restored.
        let (atlas_pixels, regions) = atlas_result?;
        Ok((atlas_pixels, regions))
    }
2359
    /// Render all detection masks into a compact atlas where each strip is
    /// sized to the padded bounding box, not the full output resolution.
    ///
    /// The atlas width equals `output_width`; each detection occupies a
    /// horizontal strip whose height is the padded bbox height.  Strips are
    /// stacked vertically.  A single PBO readback retrieves the entire atlas.
    ///
    /// The caller is expected to have already bound the proto texture and
    /// activated `program` (with `num_protos` / scale uniforms loaded); this
    /// method only loads the per-detection uniforms.
    ///
    /// * `program` - mask-logit shader (f32 or int8 variant) already in use.
    /// * `regions` - one `MaskRegion` per detection, pre-sorted so that the
    ///   last region's `atlas_y_offset + padded_h` equals the atlas height.
    /// * `mask_coefficients` - per-detection coefficient vectors, paired with
    ///   `regions` by index; at most 32 coefficients are consumed (shader limit).
    /// * `output_width`/`output_height` - full output resolution used to
    ///   normalize bbox coordinates.
    /// * `proto_scaled_zp` - for int8 protos, `-zero_point * scale`; `None`
    ///   for the float path.
    ///
    /// Returns `(atlas_pixels, regions)` where `regions` describes each
    /// detection's location within the atlas.  `atlas_pixels` holds
    /// `output_width * atlas_height` single-channel bytes.
    #[allow(clippy::too_many_arguments)]
    fn render_mask_atlas_compact(
        &self,
        program: &GlProgram,
        regions: Vec<MaskRegion>,
        mask_coefficients: &[Vec<f32>],
        output_width: usize,
        output_height: usize,
        proto_scaled_zp: Option<f32>,
    ) -> crate::Result<(Vec<u8>, Vec<MaskRegion>)> {
        // No detections: nothing to render or read back.
        if regions.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }

        let owf = output_width as f32;
        let ohf = output_height as f32;

        // Strips are stacked vertically, so the last region defines the
        // total atlas height.
        let atlas_height = regions.last().map_or(0, |r| r.atlas_y_offset + r.padded_h);
        let ahf = atlas_height as f32;

        // SAFETY: raw GL calls; valid because we are on the GL thread with a
        // current context and `mask_fbo` is a live framebuffer object.
        unsafe {
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, self.mask_fbo);
            gls::gl::Viewport(0, 0, output_width as i32, atlas_height as i32);
            // Blending would corrupt the raw logit values; each strip is
            // written exactly once.
            gls::gl::Disable(gls::gl::BLEND);
            gls::gl::ClearColor(0.0, 0.0, 0.0, 0.0);
            gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
        }

        // The shaders pack coefficients into 8 vec4 uniforms (32 floats max);
        // warn once if the model produces more — the excess is dropped below.
        if let Some(first_coeff) = mask_coefficients.first() {
            if first_coeff.len() > 32 {
                log::warn!(
                    "render_mask_atlas_compact: {} mask coefficients exceeds shader \
                     limit of 32 — coefficients will be truncated",
                    first_coeff.len()
                );
            }
        }

        for (region, coeff) in regions.iter().zip(mask_coefficients.iter()) {
            // Pack up to 32 coefficients into the vec4[8] uniform layout.
            let mut packed_coeff = [[0.0f32; 4]; 8];
            for (j, val) in coeff.iter().enumerate().take(32) {
                packed_coeff[j / 4][j % 4] = *val;
            }
            program.load_uniform_4fv(c"mask_coeff", &packed_coeff)?;

            // For int8 paths: upload precomputed coeff_sum * scaled_zp
            if let Some(szp) = proto_scaled_zp {
                let coeff_sum: f32 = coeff.iter().take(32).sum();
                program.load_uniform_1f(c"coeff_sum_x_szp", coeff_sum * szp)?;
            }

            // The bbox quad position in the atlas (NDC, hence * 2 - 1):
            // - X: the padded bbox horizontal position (same as in output coords)
            // - Y: the strip's vertical offset in the atlas
            let dst_left = region.padded_x as f32 / owf * 2.0 - 1.0;
            let dst_right = (region.padded_x + region.padded_w) as f32 / owf * 2.0 - 1.0;
            let dst_bottom = region.atlas_y_offset as f32 / ahf * 2.0 - 1.0;
            let dst_top = (region.atlas_y_offset + region.padded_h) as f32 / ahf * 2.0 - 1.0;

            // Proto texture coords map the padded bbox to proto space
            // (normalized 0..1 against the full output resolution).
            let src_left = region.padded_x as f32 / owf;
            let src_right = (region.padded_x + region.padded_w) as f32 / owf;
            let src_bottom = region.padded_y as f32 / ohf;
            let src_top = (region.padded_y + region.padded_h) as f32 / ohf;

            // SAFETY: GL thread with current context; vertex/texture buffers
            // were allocated large enough for 12 and 8 floats respectively.
            unsafe {
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
                let verts: [f32; 12] = [
                    dst_left, dst_top, 0.0, dst_right, dst_top, 0.0, dst_right, dst_bottom, 0.0,
                    dst_left, dst_bottom, 0.0,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 12) as isize,
                    verts.as_ptr() as *const c_void,
                );

                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
                gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
                let tc: [f32; 8] = [
                    src_left, src_top, src_right, src_top, src_right, src_bottom, src_left,
                    src_bottom,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 8) as isize,
                    tc.as_ptr() as *const c_void,
                );

                // NOTE(review): client-side index array — this is only valid
                // while no GL_ELEMENT_ARRAY_BUFFER is bound; confirm no
                // element buffer is left bound elsewhere on this thread.
                let idx: [u32; 4] = [0, 1, 2, 3];
                gls::gl::DrawElements(
                    gls::gl::TRIANGLE_FAN,
                    4,
                    gls::gl::UNSIGNED_INT,
                    idx.as_ptr() as *const c_void,
                );
            }
        }

        // Single readback for the compact atlas
        // (1 byte per pixel: RED / UNSIGNED_BYTE).
        // NOTE(review): assumes GL_PACK_ALIGNMENT is 1 or output_width is a
        // multiple of 4 — otherwise row padding would overflow atlas_bytes;
        // confirm the pack alignment set during context setup.
        let atlas_bytes = output_width * atlas_height;
        let mut pixels = vec![0u8; atlas_bytes];

        // SAFETY: GL thread with current context; `mask_atlas_pbo` is a PBO
        // sized to hold at least `atlas_bytes` (allocated elsewhere).  The
        // mapped pointer is only dereferenced after the null check and while
        // the buffer remains mapped.
        unsafe {
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, self.mask_atlas_pbo);
            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
            // With a PACK buffer bound, the null data pointer means "write
            // into the PBO at offset 0".
            gls::gl::ReadnPixels(
                0,
                0,
                output_width as i32,
                atlas_height as i32,
                gls::gl::RED,
                gls::gl::UNSIGNED_BYTE,
                atlas_bytes as i32,
                std::ptr::null_mut(),
            );
            gls::gl::Finish();

            let ptr = gls::gl::MapBufferRange(
                gls::gl::PIXEL_PACK_BUFFER,
                0,
                atlas_bytes as isize,
                gls::gl::MAP_READ_BIT,
            );
            if ptr.is_null() {
                // Unbind before returning so later PBO users see clean state.
                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
                return Err(crate::Error::OpenGl(
                    "Failed to map compact atlas PBO for readback".to_string(),
                ));
            }
            std::ptr::copy_nonoverlapping(ptr as *const u8, pixels.as_mut_ptr(), atlas_bytes);
            gls::gl::UnmapBuffer(gls::gl::PIXEL_PACK_BUFFER);
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
        }

        Ok((pixels, regions))
    }
2509
2510    fn check_src_format_supported(backend: TransferBackend, img: &TensorImage) -> bool {
2511        if backend.is_dma() && img.tensor().memory() == TensorMemory::Dma {
2512            // EGLImage supports RGBA, GREY, YUYV, and NV12 for DMA buffers.
2513            // VYUY excluded: Vivante GPU accepts the DRM fourcc but produces
2514            // incorrect output (similarity ~0.28 vs reference).
2515            matches!(img.fourcc(), RGBA | GREY | YUYV | NV12)
2516        } else {
2517            matches!(img.fourcc(), RGB | RGBA | GREY)
2518        }
2519    }
2520
2521    fn check_dst_format_supported(backend: TransferBackend, img: &TensorImage) -> bool {
2522        if backend.is_dma() && img.tensor().memory() == TensorMemory::Dma {
2523            matches!(
2524                img.fourcc(),
2525                RGBA | GREY | PLANAR_RGB | RGB | RGB_INT8 | PLANAR_RGB_INT8
2526            )
2527        } else {
2528            matches!(img.fourcc(), RGB | RGBA | GREY | RGB_INT8)
2529        }
2530    }
2531
2532    /// Checks required GL extensions and returns whether optional capabilities
2533    /// are available. Returns `has_float_linear` (GL_OES_texture_float_linear).
2534    fn gl_check_support() -> Result<bool, crate::Error> {
2535        if let Ok(version) = gls::get_string(gls::gl::SHADING_LANGUAGE_VERSION) {
2536            log::debug!("GL Shading Language Version: {version:?}");
2537        } else {
2538            log::warn!("Could not get GL Shading Language Version");
2539        }
2540
2541        let extensions = unsafe {
2542            let str = gls::gl::GetString(gls::gl::EXTENSIONS);
2543            if str.is_null() {
2544                return Err(crate::Error::GLVersion(
2545                    "GL returned no supported extensions".to_string(),
2546                ));
2547            }
2548            CStr::from_ptr(str as *const c_char)
2549                .to_string_lossy()
2550                .to_string()
2551        };
2552        log::debug!("GL Extensions: {extensions}");
2553        let required_ext = ["GL_OES_EGL_image_external_essl3"];
2554        let extensions = extensions.split_ascii_whitespace().collect::<BTreeSet<_>>();
2555        for required in required_ext {
2556            if !extensions.contains(required) {
2557                return Err(crate::Error::GLVersion(format!(
2558                    "GL does not support {required} extension",
2559                )));
2560            }
2561        }
2562
2563        let has_float_linear = extensions.contains("GL_OES_texture_float_linear");
2564        log::debug!("GL_OES_texture_float_linear: {has_float_linear}");
2565
2566        Ok(has_float_linear)
2567    }
2568
    /// Binds the conversion FBO and attaches a render target backed by an
    /// EGLImage imported from the destination image's DMA-buf.
    ///
    /// Planar RGB destinations render into a buffer three planes tall
    /// (width x height*3) so each channel occupies its own full-resolution
    /// plane; all other formats use the image's natural dimensions.
    ///
    /// # Errors
    ///
    /// Propagates EGLImage creation failures and any GL error raised while
    /// attaching the texture to the framebuffer.
    fn setup_renderbuffer_dma(&mut self, dst: &TensorImage) -> crate::Result<()> {
        self.convert_fbo.bind();

        // Planar RGB stacks R, G, B planes vertically in one buffer.
        let (width, height) = if matches!(dst.fourcc(), PLANAR_RGB | PLANAR_RGB_INT8) {
            let width = dst.width();
            let height = dst.height() * 3;
            (width as i32, height as i32)
        } else {
            (dst.width() as i32, dst.height() as i32)
        };
        // Cached per-destination: repeated conversions reuse the EGLImage.
        let dest_egl = self.get_or_create_egl_image(CacheKind::Dst, dst)?;
        // SAFETY: raw GL calls on the GL thread with a current context;
        // `dest_egl` is a valid EGLImage created above.
        unsafe {
            gls::gl::UseProgram(self.texture_program_yuv.id);
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.render_texture.id);
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::LINEAR as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::LINEAR as i32,
            );
            // Back the bound texture with the destination DMA-buf, then use
            // it as the FBO's color attachment so rendering writes straight
            // into the destination buffer (zero-copy).
            gls::gl::EGLImageTargetTexture2DOES(gls::gl::TEXTURE_2D, dest_egl.as_ptr());
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.render_texture.id,
                0,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::Viewport(0, 0, width, height);
        }
        Ok(())
    }
2607
2608    fn convert_dest_dma(
2609        &mut self,
2610        dst: &mut TensorImage,
2611        src: &TensorImage,
2612        rotation: crate::Rotation,
2613        flip: Flip,
2614        crop: Crop,
2615    ) -> crate::Result<()> {
2616        assert!(self.gl_context.transfer_backend.is_dma());
2617        if fourcc_is_packed_rgb(dst.fourcc()) {
2618            if self.support_rgb_direct {
2619                self.convert_to_rgb_direct(src, dst, rotation, flip, crop)
2620            } else {
2621                // Two-pass packed RGB is slower than G2D/CPU; decline so
2622                // ImageProcessor falls through to a faster backend.
2623                Err(crate::Error::NotSupported(
2624                    "OpenGL two-pass packed RGB disabled (no direct RGB support)".into(),
2625                ))
2626            }
2627        } else if dst.is_planar() {
2628            self.setup_renderbuffer_dma(dst)?;
2629            self.convert_to_planar(src, dst, rotation, flip, crop)
2630        } else {
2631            self.setup_renderbuffer_dma(dst)?;
2632            self.convert_to(src, dst, rotation, flip, crop)
2633        }
2634    }
2635
    /// Binds the conversion FBO and sizes/attaches a plain (non-DMA) GL
    /// texture as its color attachment for `dst`.
    ///
    /// When the destination crop covers only part of the output, the existing
    /// destination pixels are uploaded first so the un-cropped area is
    /// preserved; otherwise the texture is allocated uninitialized (NULL
    /// pixels pointer).
    ///
    /// # Errors
    ///
    /// Propagates tensor map failures and GL errors from texture allocation
    /// or framebuffer attachment.
    fn setup_renderbuffer_non_dma(&mut self, dst: &TensorImage, crop: Crop) -> crate::Result<()> {
        debug_assert!(matches!(
            dst.fourcc(),
            RGB | RGBA | GREY | PLANAR_RGB | RGB_INT8
        ));
        // NOTE(review): the planar branch below matches RGBA/RGB/GREY on
        // dst.fourcc(), but the debug_assert admits PLANAR_RGB — if a planar
        // image reports fourcc PLANAR_RGB here, this hits unreachable!().
        // Confirm what fourcc() returns for planar destinations on this path.
        let (width, height) = if dst.is_planar() {
            let width = dst.width() / 4;
            let height = match dst.fourcc() {
                RGBA => dst.height() * 4,
                RGB => dst.height() * 3,
                GREY => dst.height(),
                _ => unreachable!(),
            };
            (width as i32, height as i32)
        } else {
            (dst.width() as i32, dst.height() as i32)
        };

        // Planar output renders one channel per plane into a single-channel
        // (RED) texture; interleaved formats use their natural GL format.
        let format = if dst.is_planar() {
            gls::gl::RED
        } else {
            match dst.fourcc() {
                RGB | RGB_INT8 => gls::gl::RGB,
                RGBA => gls::gl::RGBA,
                GREY => gls::gl::RED,
                _ => unreachable!(),
            }
        };

        let start = Instant::now();
        self.convert_fbo.bind();

        // Keeps the tensor mapping alive for the duration of TexImage2D when
        // existing destination pixels must be uploaded.
        let map;

        // Full-frame destination (or no dst crop): no need to seed the
        // texture with current contents — pass NULL to allocate only.
        let pixels = if crop.dst_rect.is_none_or(|crop| {
            crop.top == 0
                && crop.left == 0
                && crop.height == dst.height()
                && crop.width == dst.width()
        }) {
            std::ptr::null()
        } else {
            map = dst.tensor().map()?;
            map.as_ptr() as *const c_void
        };
        // SAFETY: raw GL calls on the GL thread with a current context;
        // `pixels` is either NULL or a live mapping held by `map` above.
        unsafe {
            gls::gl::UseProgram(self.texture_program.id);
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.render_texture.id);
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::LINEAR as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::LINEAR as i32,
            );

            gls::gl::TexImage2D(
                gls::gl::TEXTURE_2D,
                0,
                format as i32,
                width,
                height,
                0,
                format,
                gls::gl::UNSIGNED_BYTE,
                pixels,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.render_texture.id,
                0,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::Viewport(0, 0, width, height);
        }
        log::debug!("Set up framebuffer takes {:?}", start.elapsed());
        Ok(())
    }
2721
    /// Converts `src` into a CPU-mappable (non-DMA) destination: renders to
    /// the FBO, then reads the result back into the destination tensor with
    /// `glReadnPixels`.
    ///
    /// Int8 destinations are produced by XOR-ing every byte with 0x80, which
    /// re-biases the unsigned framebuffer values into the int8 range.
    ///
    /// # Errors
    ///
    /// Propagates renderbuffer setup, conversion, and tensor map failures.
    fn convert_dest_non_dma(
        &mut self,
        dst: &mut TensorImage,
        src: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        self.setup_renderbuffer_non_dma(dst, crop)?;
        let start = Instant::now();
        if dst.is_planar() {
            self.convert_to_planar(src, dst, rotation, flip, crop)?;
        } else {
            self.convert_to(src, dst, rotation, flip, crop)?;
        }
        log::debug!("Draw to framebuffer takes {:?}", start.elapsed());
        let start = Instant::now();
        // Readback format mirrors the destination fourcc; the debug_assert in
        // setup_renderbuffer_non_dma restricts which fourccs can reach here.
        let dest_format = match dst.fourcc() {
            RGB | RGB_INT8 => gls::gl::RGB,
            RGBA => gls::gl::RGBA,
            GREY => gls::gl::RED,
            _ => unreachable!(),
        };

        // SAFETY: raw GL calls on the GL thread; `dst_map` stays alive for
        // the whole block, so the pointer passed to ReadnPixels is valid, and
        // the bufSize argument bounds the driver's write.
        unsafe {
            let mut dst_map = dst.tensor().map()?;
            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
            gls::gl::ReadnPixels(
                0,
                0,
                dst.width() as i32,
                dst.height() as i32,
                dest_format,
                gls::gl::UNSIGNED_BYTE,
                dst.tensor.len() as i32,
                dst_map.as_mut_ptr() as *mut c_void,
            );
            // Apply XOR 0x80 for int8 formats (convert uint8 → int8 representation)
            if fourcc_is_int8(dst.fourcc()) {
                for byte in dst_map.iter_mut() {
                    *byte ^= 0x80;
                }
            }
        }
        log::debug!("Read from framebuffer takes {:?}", start.elapsed());
        Ok(())
    }
2769
    /// Convert between two PBO-backed images.
    ///
    /// Source PBO is bound as `GL_PIXEL_UNPACK_BUFFER` for zero-copy texture upload
    /// (avoids `tensor.map()` to prevent GL-thread deadlocks). Destination uses
    /// `GL_PIXEL_PACK_BUFFER` for zero-copy readback into the PBO.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::OpenGl`] when either tensor is not PBO-backed
    /// or is currently mapped, [`crate::Error::NotSupported`] for destination
    /// formats the PBO readback cannot produce, and propagates GL errors.
    fn convert_pbo_to_pbo(
        &mut self,
        dst: &mut TensorImage,
        src: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        // Safety check: neither PBO must be mapped; extract buffer IDs before releasing borrows
        let (src_buffer_id, dst_buffer_id) = {
            let src_pbo = match &src.tensor {
                edgefirst_tensor::Tensor::Pbo(p) => p,
                _ => {
                    return Err(crate::Error::OpenGl(
                        "convert_pbo_to_pbo: src is not a PBO tensor".to_string(),
                    ))
                }
            };
            let dst_pbo = match &dst.tensor {
                edgefirst_tensor::Tensor::Pbo(p) => p,
                _ => {
                    return Err(crate::Error::OpenGl(
                        "convert_pbo_to_pbo: dst is not a PBO tensor".to_string(),
                    ))
                }
            };

            // A mapped PBO must not be read/written by GL commands.
            if src_pbo.is_mapped() || dst_pbo.is_mapped() {
                return Err(crate::Error::OpenGl(
                    "Cannot convert PBO tensors while they are mapped".to_string(),
                ));
            }

            (src_pbo.buffer_id(), dst_pbo.buffer_id())
        };

        // Setup renderbuffer (same as non-DMA path)
        self.setup_renderbuffer_non_dma(dst, crop)?;

        // Upload source from PBO and render.
        // We cannot call convert_to/draw_src_texture directly because they
        // call src.tensor().map() which sends a message back to THIS thread,
        // causing a deadlock. Instead, bind the source PBO as UNPACK buffer
        // and upload to the texture with a NULL pointer — GL reads directly
        // from the PBO, zero CPU copy.
        let start = Instant::now();
        self.draw_src_texture_from_pbo(src, src_buffer_id, dst, rotation, flip, crop)?;
        log::debug!("PBO render takes {:?}", start.elapsed());

        // Readback into destination PBO instead of CPU memory
        let start_read = Instant::now();
        let dest_format = match dst.fourcc() {
            crate::RGB | crate::RGB_INT8 => gls::gl::RGB,
            crate::RGBA => gls::gl::RGBA,
            crate::GREY => gls::gl::RED,
            _ => {
                return Err(crate::Error::NotSupported(format!(
                    "PBO readback not supported for {}",
                    dst.fourcc().display()
                )))
            }
        };

        // SAFETY: raw GL calls on the GL thread with a current context;
        // `dst_buffer_id` was validated above as an unmapped PBO, and the
        // bufSize argument bounds the driver's write into it.
        unsafe {
            // Bind destination PBO as PACK buffer — glReadnPixels will write into it
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, dst_buffer_id);
            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
            gls::gl::ReadnPixels(
                0,
                0,
                dst.width() as i32,
                dst.height() as i32,
                dest_format,
                gls::gl::UNSIGNED_BYTE,
                dst.tensor.len() as i32,
                std::ptr::null_mut(), // NULL pointer = write to bound PACK buffer
            );
            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
            gls::gl::Finish();
        }

        check_gl_error(function!(), line!())?;

        // Handle int8 XOR if needed (must map PBO to do this on the GL thread
        // directly, since we're already on the GL thread)
        if fourcc_is_int8(dst.fourcc()) {
            // SAFETY: the mapped range covers exactly dst.tensor.len() bytes
            // and the slice is dropped before UnmapBuffer invalidates it.
            unsafe {
                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, dst_buffer_id);
                let ptr = gls::gl::MapBufferRange(
                    gls::gl::PIXEL_PACK_BUFFER,
                    0,
                    dst.tensor.len() as isize,
                    gls::gl::MAP_READ_BIT | gls::gl::MAP_WRITE_BIT,
                );
                // NOTE(review): a failed map silently skips the uint8→int8
                // re-bias, leaving the buffer in unsigned representation —
                // confirm whether this should surface an error instead.
                if !ptr.is_null() {
                    let slice = std::slice::from_raw_parts_mut(ptr as *mut u8, dst.tensor.len());
                    for byte in slice.iter_mut() {
                        *byte ^= 0x80;
                    }
                    gls::gl::UnmapBuffer(gls::gl::PIXEL_PACK_BUFFER);
                }
                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
            }
            check_gl_error(function!(), line!())?;
        }

        log::debug!("PBO readback takes {:?}", start_read.elapsed());
        Ok(())
    }
2884
2885    /// Upload source image from a PBO and render to the current framebuffer.
2886    /// This is the PBO equivalent of draw_src_texture — instead of mapping
2887    /// the tensor to CPU and calling glTexImage2D with a data pointer, we
2888    /// bind the source PBO as GL_PIXEL_UNPACK_BUFFER and pass NULL, causing
2889    /// GL to read directly from the PBO (zero CPU copy).
2890    fn draw_src_texture_from_pbo(
2891        &mut self,
2892        src: &TensorImage,
2893        src_buffer_id: u32,
2894        dst: &TensorImage,
2895        rotation: crate::Rotation,
2896        flip: Flip,
2897        crop: Crop,
2898    ) -> Result<(), Error> {
2899        let texture_target = gls::gl::TEXTURE_2D;
2900        let texture_format = match src.fourcc() {
2901            crate::RGB | crate::RGB_INT8 => gls::gl::RGB,
2902            crate::RGBA => gls::gl::RGBA,
2903            crate::GREY => gls::gl::RED,
2904            _ => {
2905                return Err(Error::NotSupported(format!(
2906                    "PBO upload not supported for {:?}",
2907                    src.fourcc()
2908                )));
2909            }
2910        };
2911
2912        let has_crop = crop.dst_rect.is_some_and(|x| {
2913            x.left != 0 || x.top != 0 || x.width != dst.width() || x.height != dst.height()
2914        });
2915
2916        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
2917        let src_roi = if let Some(crop) = crop.src_rect {
2918            RegionOfInterest {
2919                left: crop.left as f32 / src.width() as f32,
2920                top: (crop.top + crop.height) as f32 / src.height() as f32,
2921                right: (crop.left + crop.width) as f32 / src.width() as f32,
2922                bottom: crop.top as f32 / src.height() as f32,
2923            }
2924        } else {
2925            RegionOfInterest {
2926                left: 0.,
2927                top: 1.,
2928                right: 1.,
2929                bottom: 0.,
2930            }
2931        };
2932
2933        let cvt_screen_coord = |normalized| normalized * 2.0 - 1.0;
2934        let mut dst_roi = if let Some(crop) = crop.dst_rect {
2935            RegionOfInterest {
2936                left: cvt_screen_coord(crop.left as f32 / dst.width() as f32),
2937                top: cvt_screen_coord((crop.top + crop.height) as f32 / dst.height() as f32),
2938                right: cvt_screen_coord((crop.left + crop.width) as f32 / dst.width() as f32),
2939                bottom: cvt_screen_coord(crop.top as f32 / dst.height() as f32),
2940            }
2941        } else {
2942            RegionOfInterest {
2943                left: -1.,
2944                top: 1.,
2945                right: 1.,
2946                bottom: -1.,
2947            }
2948        };
2949
2950        let rotation_offset = match rotation {
2951            crate::Rotation::None => 0,
2952            crate::Rotation::Clockwise90 => 1,
2953            crate::Rotation::Rotate180 => 2,
2954            crate::Rotation::CounterClockwise90 => 3,
2955        };
2956
2957        unsafe {
2958            if has_crop {
2959                if let Some(dst_color) = crop.dst_color {
2960                    gls::gl::ClearColor(
2961                        dst_color[0] as f32 / 255.0,
2962                        dst_color[1] as f32 / 255.0,
2963                        dst_color[2] as f32 / 255.0,
2964                        dst_color[3] as f32 / 255.0,
2965                    );
2966                    gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
2967                }
2968            }
2969
2970            gls::gl::UseProgram(self.texture_program.id);
2971            gls::gl::BindTexture(texture_target, self.camera_normal_texture.id);
2972            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
2973            gls::gl::TexParameteri(
2974                texture_target,
2975                gls::gl::TEXTURE_MIN_FILTER,
2976                gls::gl::LINEAR as i32,
2977            );
2978            gls::gl::TexParameteri(
2979                texture_target,
2980                gls::gl::TEXTURE_MAG_FILTER,
2981                gls::gl::LINEAR as i32,
2982            );
2983            if src.fourcc() == crate::GREY {
2984                for swizzle in [
2985                    gls::gl::TEXTURE_SWIZZLE_R,
2986                    gls::gl::TEXTURE_SWIZZLE_G,
2987                    gls::gl::TEXTURE_SWIZZLE_B,
2988                ] {
2989                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, gls::gl::RED as i32);
2990                }
2991            } else {
2992                for (swizzle, src_component) in [
2993                    (gls::gl::TEXTURE_SWIZZLE_R, gls::gl::RED),
2994                    (gls::gl::TEXTURE_SWIZZLE_G, gls::gl::GREEN),
2995                    (gls::gl::TEXTURE_SWIZZLE_B, gls::gl::BLUE),
2996                ] {
2997                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, src_component as i32);
2998                }
2999            }
3000
3001            // Bind source PBO as UNPACK buffer — glTexImage2D reads from it
3002            gls::gl::BindBuffer(gls::gl::PIXEL_UNPACK_BUFFER, src_buffer_id);
3003            gls::gl::TexImage2D(
3004                texture_target,
3005                0,
3006                texture_format as i32,
3007                src.width() as i32,
3008                src.height() as i32,
3009                0,
3010                texture_format,
3011                gls::gl::UNSIGNED_BYTE,
3012                std::ptr::null(), // NULL = read from bound UNPACK buffer
3013            );
3014            gls::gl::BindBuffer(gls::gl::PIXEL_UNPACK_BUFFER, 0);
3015
3016            // Force texture cache state to be rebuilt next call
3017            self.camera_normal_texture.width = 0;
3018
3019            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
3020            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
3021
3022            match flip {
3023                crate::Flip::None => {}
3024                crate::Flip::Vertical => {
3025                    std::mem::swap(&mut dst_roi.top, &mut dst_roi.bottom);
3026                }
3027                crate::Flip::Horizontal => {
3028                    std::mem::swap(&mut dst_roi.left, &mut dst_roi.right);
3029                }
3030            }
3031
3032            let camera_vertices: [f32; 12] = [
3033                dst_roi.left,
3034                dst_roi.top,
3035                0., // left top
3036                dst_roi.right,
3037                dst_roi.top,
3038                0., // right top
3039                dst_roi.right,
3040                dst_roi.bottom,
3041                0., // right bottom
3042                dst_roi.left,
3043                dst_roi.bottom,
3044                0., // left bottom
3045            ];
3046            gls::gl::BufferData(
3047                gls::gl::ARRAY_BUFFER,
3048                (camera_vertices.len() * std::mem::size_of::<f32>()) as isize,
3049                camera_vertices.as_ptr() as *const c_void,
3050                gls::gl::STATIC_DRAW,
3051            );
3052            gls::gl::VertexAttribPointer(
3053                self.vertex_buffer.buffer_index,
3054                3,
3055                gls::gl::FLOAT,
3056                gls::gl::FALSE,
3057                0,
3058                std::ptr::null(),
3059            );
3060
3061            let texture_coords: [[f32; 8]; 4] = [
3062                [
3063                    src_roi.left,
3064                    src_roi.top,
3065                    src_roi.right,
3066                    src_roi.top,
3067                    src_roi.right,
3068                    src_roi.bottom,
3069                    src_roi.left,
3070                    src_roi.bottom,
3071                ],
3072                [
3073                    src_roi.left,
3074                    src_roi.bottom,
3075                    src_roi.left,
3076                    src_roi.top,
3077                    src_roi.right,
3078                    src_roi.top,
3079                    src_roi.right,
3080                    src_roi.bottom,
3081                ],
3082                [
3083                    src_roi.right,
3084                    src_roi.bottom,
3085                    src_roi.left,
3086                    src_roi.bottom,
3087                    src_roi.left,
3088                    src_roi.top,
3089                    src_roi.right,
3090                    src_roi.top,
3091                ],
3092                [
3093                    src_roi.right,
3094                    src_roi.top,
3095                    src_roi.right,
3096                    src_roi.bottom,
3097                    src_roi.left,
3098                    src_roi.bottom,
3099                    src_roi.left,
3100                    src_roi.top,
3101                ],
3102            ];
3103            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
3104            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
3105            gls::gl::BufferData(
3106                gls::gl::ARRAY_BUFFER,
3107                (texture_coords[0].len() * std::mem::size_of::<f32>()) as isize,
3108                texture_coords[rotation_offset].as_ptr() as *const c_void,
3109                gls::gl::STATIC_DRAW,
3110            );
3111            gls::gl::VertexAttribPointer(
3112                self.texture_buffer.buffer_index,
3113                2,
3114                gls::gl::FLOAT,
3115                gls::gl::FALSE,
3116                0,
3117                std::ptr::null(),
3118            );
3119            gls::gl::DrawArrays(gls::gl::TRIANGLE_FAN, 0, 4);
3120            gls::gl::DisableVertexAttribArray(self.vertex_buffer.buffer_index);
3121            gls::gl::DisableVertexAttribArray(self.texture_buffer.buffer_index);
3122
3123            gls::gl::Finish();
3124        }
3125
3126        check_gl_error(function!(), line!())?;
3127        Ok(())
3128    }
3129
3130    /// Convert any source (Mem/DMA) to a PBO destination.
3131    /// Source is uploaded via normal texture path (maps tensor for CPU upload).
3132    /// Destination readback uses PBO PACK binding (no map on GL thread).
3133    fn convert_any_to_pbo(
3134        &mut self,
3135        dst: &mut TensorImage,
3136        src: &TensorImage,
3137        rotation: crate::Rotation,
3138        flip: Flip,
3139        crop: Crop,
3140    ) -> crate::Result<()> {
3141        let dst_buffer_id = match &dst.tensor {
3142            edgefirst_tensor::Tensor::Pbo(p) => {
3143                if p.is_mapped() {
3144                    return Err(crate::Error::OpenGl(
3145                        "Cannot convert to a mapped PBO tensor".to_string(),
3146                    ));
3147                }
3148                p.buffer_id()
3149            }
3150            _ => {
3151                return Err(crate::Error::OpenGl(
3152                    "convert_any_to_pbo: dst is not a PBO tensor".to_string(),
3153                ))
3154            }
3155        };
3156
3157        self.setup_renderbuffer_non_dma(dst, crop)?;
3158        let start = Instant::now();
3159        if dst.is_planar() {
3160            self.convert_to_planar(src, dst, rotation, flip, crop)?;
3161        } else {
3162            self.convert_to(src, dst, rotation, flip, crop)?;
3163        }
3164        log::debug!("any-to-PBO render takes {:?}", start.elapsed());
3165
3166        // PBO readback
3167        let start_read = Instant::now();
3168        let dest_format = match dst.fourcc() {
3169            crate::RGB | crate::RGB_INT8 => gls::gl::RGB,
3170            crate::RGBA => gls::gl::RGBA,
3171            crate::GREY => gls::gl::RED,
3172            _ => {
3173                return Err(crate::Error::NotSupported(format!(
3174                    "PBO readback not supported for {}",
3175                    dst.fourcc().display()
3176                )))
3177            }
3178        };
3179        unsafe {
3180            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, dst_buffer_id);
3181            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
3182            gls::gl::ReadnPixels(
3183                0,
3184                0,
3185                dst.width() as i32,
3186                dst.height() as i32,
3187                dest_format,
3188                gls::gl::UNSIGNED_BYTE,
3189                dst.tensor.len() as i32,
3190                std::ptr::null_mut(),
3191            );
3192            gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
3193            gls::gl::Finish();
3194        }
3195        check_gl_error(function!(), line!())?;
3196
3197        if fourcc_is_int8(dst.fourcc()) {
3198            unsafe {
3199                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, dst_buffer_id);
3200                let ptr = gls::gl::MapBufferRange(
3201                    gls::gl::PIXEL_PACK_BUFFER,
3202                    0,
3203                    dst.tensor.len() as isize,
3204                    gls::gl::MAP_READ_BIT | gls::gl::MAP_WRITE_BIT,
3205                );
3206                if !ptr.is_null() {
3207                    let slice = std::slice::from_raw_parts_mut(ptr as *mut u8, dst.tensor.len());
3208                    for byte in slice.iter_mut() {
3209                        *byte ^= 0x80;
3210                    }
3211                    gls::gl::UnmapBuffer(gls::gl::PIXEL_PACK_BUFFER);
3212                }
3213                gls::gl::BindBuffer(gls::gl::PIXEL_PACK_BUFFER, 0);
3214            }
3215            check_gl_error(function!(), line!())?;
3216        }
3217
3218        log::debug!("any-to-PBO readback takes {:?}", start_read.elapsed());
3219        Ok(())
3220    }
3221
3222    /// Convert a PBO source to a non-PBO (Mem) destination.
3223    /// Source is uploaded via PBO UNPACK binding (no map on GL thread).
3224    /// Destination readback uses normal ReadnPixels into mapped Mem tensor.
3225    fn convert_pbo_to_mem(
3226        &mut self,
3227        dst: &mut TensorImage,
3228        src: &TensorImage,
3229        rotation: crate::Rotation,
3230        flip: Flip,
3231        crop: Crop,
3232    ) -> crate::Result<()> {
3233        let src_buffer_id = match &src.tensor {
3234            edgefirst_tensor::Tensor::Pbo(p) => {
3235                if p.is_mapped() {
3236                    return Err(crate::Error::OpenGl(
3237                        "Cannot convert from a mapped PBO tensor".to_string(),
3238                    ));
3239                }
3240                p.buffer_id()
3241            }
3242            _ => {
3243                return Err(crate::Error::OpenGl(
3244                    "convert_pbo_to_mem: src is not a PBO tensor".to_string(),
3245                ))
3246            }
3247        };
3248
3249        self.setup_renderbuffer_non_dma(dst, crop)?;
3250        let start = Instant::now();
3251        self.draw_src_texture_from_pbo(src, src_buffer_id, dst, rotation, flip, crop)?;
3252        log::debug!("PBO-to-mem render takes {:?}", start.elapsed());
3253
3254        // Normal readback into Mem dst
3255        let start = Instant::now();
3256        let dest_format = match dst.fourcc() {
3257            crate::RGB | crate::RGB_INT8 => gls::gl::RGB,
3258            crate::RGBA => gls::gl::RGBA,
3259            crate::GREY => gls::gl::RED,
3260            _ => unreachable!(),
3261        };
3262        unsafe {
3263            let mut dst_map = dst.tensor().map()?;
3264            gls::gl::ReadBuffer(gls::gl::COLOR_ATTACHMENT0);
3265            gls::gl::ReadnPixels(
3266                0,
3267                0,
3268                dst.width() as i32,
3269                dst.height() as i32,
3270                dest_format,
3271                gls::gl::UNSIGNED_BYTE,
3272                dst.tensor.len() as i32,
3273                dst_map.as_mut_ptr() as *mut c_void,
3274            );
3275            if fourcc_is_int8(dst.fourcc()) {
3276                for byte in dst_map.iter_mut() {
3277                    *byte ^= 0x80;
3278                }
3279            }
3280        }
3281        log::debug!("PBO-to-mem readback takes {:?}", start.elapsed());
3282        Ok(())
3283    }
3284
    /// Render `src` into the currently-bound framebuffer as a packed format,
    /// applying crop, letterbox fill, rotation, and flip.
    ///
    /// `dst` is only read for its dimensions (ROI coordinate math); the draw
    /// itself targets whatever FBO the caller bound before this call.
    fn convert_to(
        &mut self,
        src: &TensorImage,
        dst: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> Result<(), crate::Error> {
        check_gl_error(function!(), line!())?;

        // Letterboxing: if the dst crop leaves part of the output uncovered,
        // pre-fill the entire target with the fill color; the image is then
        // drawn on top inside dst_roi.
        let has_crop = crop.dst_rect.is_some_and(|x| {
            x.left != 0 || x.top != 0 || x.width != dst.width() || x.height != dst.height()
        });
        if has_crop {
            if let Some(dst_color) = crop.dst_color {
                unsafe {
                    gls::gl::ClearColor(
                        dst_color[0] as f32 / 255.0,
                        dst_color[1] as f32 / 255.0,
                        dst_color[2] as f32 / 255.0,
                        dst_color[3] as f32 / 255.0,
                    );
                    gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
                };
            }
        }

        // Source ROI in normalized texture coordinates (0..1).
        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
        let src_roi = if let Some(crop) = crop.src_rect {
            RegionOfInterest {
                left: crop.left as f32 / src.width() as f32,
                top: (crop.top + crop.height) as f32 / src.height() as f32,
                right: (crop.left + crop.width) as f32 / src.width() as f32,
                bottom: crop.top as f32 / src.height() as f32,
            }
        } else {
            RegionOfInterest {
                left: 0.,
                top: 1.,
                right: 1.,
                bottom: 0.,
            }
        };

        // Destination ROI in clip-space coordinates (-1..1).
        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
        let cvt_screen_coord = |normalized| normalized * 2.0 - 1.0;
        let dst_roi = if let Some(crop) = crop.dst_rect {
            RegionOfInterest {
                left: cvt_screen_coord(crop.left as f32 / dst.width() as f32),
                top: cvt_screen_coord((crop.top + crop.height) as f32 / dst.height() as f32),
                right: cvt_screen_coord((crop.left + crop.width) as f32 / dst.width() as f32),
                bottom: cvt_screen_coord(crop.top as f32 / dst.height() as f32),
            }
        } else {
            RegionOfInterest {
                left: -1.,
                top: 1.,
                right: 1.,
                bottom: -1.,
            }
        };
        // Index into the per-rotation texture-coordinate tables used by the
        // draw helpers.
        let rotation_offset = match rotation {
            crate::Rotation::None => 0,
            crate::Rotation::Clockwise90 => 1,
            crate::Rotation::Rotate180 => 2,
            crate::Rotation::CounterClockwise90 => 3,
        };
        // Prefer zero-copy EGLImage import for DMA sources; on import failure
        // (or for non-DMA sources) fall back to the CPU texture-upload path.
        if self.gl_context.transfer_backend.is_dma() && src.tensor().memory() == TensorMemory::Dma {
            match self.get_or_create_egl_image(CacheKind::Src, src) {
                Ok(src_egl) => self.draw_camera_texture_eglimage(
                    src,
                    src_egl,
                    src_roi,
                    dst_roi,
                    rotation_offset,
                    flip,
                )?,
                Err(e) => {
                    log::warn!("EGL image creation failed for {:?}: {:?}", src.fourcc(), e);
                    let start = Instant::now();
                    self.draw_src_texture(src, src_roi, dst_roi, rotation_offset, flip)?;
                    log::debug!("draw_src_texture takes {:?}", start.elapsed());
                }
            }
        } else {
            let start = Instant::now();
            self.draw_src_texture(src, src_roi, dst_roi, rotation_offset, flip)?;
            log::debug!("draw_src_texture takes {:?}", start.elapsed());
        }

        // Block until the GPU has finished writing the output.
        let start = Instant::now();
        unsafe { gls::gl::Finish() };
        log::debug!("gl_Finish takes {:?}", start.elapsed());
        check_gl_error(function!(), line!())?;
        Ok(())
    }
3381
    /// Render `src` into the currently-bound framebuffer as planar RGB(A),
    /// one channel per horizontal band, via the planar shader programs.
    ///
    /// The destination FourCC selects the plane count: 3 bands for
    /// `PLANAR_RGB`/`PLANAR_RGB_INT8`, 4 for `PLANAR_RGBA`; int8 formats use
    /// the int8 shader variant. The source must be importable as an EGLImage
    /// (errors from `get_or_create_egl_image` propagate — unlike
    /// `convert_to`, there is no CPU texture-upload fallback here).
    fn convert_to_planar(
        &mut self,
        src: &TensorImage,
        dst: &TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> Result<(), crate::Error> {
        // NOTE(review): earlier revisions rejected src/dst cropping for the
        // planar path right here; those checks were commented out, and
        // cropping is now handled through the ROI math below.

        let alpha = match dst.fourcc() {
            PLANAR_RGB | PLANAR_RGB_INT8 => false,
            PLANAR_RGBA => true,
            _ => {
                return Err(crate::Error::NotSupported(
                    "Destination format must be PLANAR_RGB, PLANAR_RGB_INT8, or PLANAR_RGBA"
                        .to_string(),
                ));
            }
        };
        let is_int8 = fourcc_is_int8(dst.fourcc());

        // Source ROI in normalized texture coordinates (0..1).
        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
        let src_roi = if let Some(crop) = crop.src_rect {
            RegionOfInterest {
                left: crop.left as f32 / src.width() as f32,
                top: (crop.top + crop.height) as f32 / src.height() as f32,
                right: (crop.left + crop.width) as f32 / src.width() as f32,
                bottom: crop.top as f32 / src.height() as f32,
            }
        } else {
            RegionOfInterest {
                left: 0.,
                top: 1.,
                right: 1.,
                bottom: 0.,
            }
        };

        // Destination ROI in clip-space coordinates (-1..1).
        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
        let cvt_screen_coord = |normalized| normalized * 2.0 - 1.0;
        let dst_roi = if let Some(crop) = crop.dst_rect {
            RegionOfInterest {
                left: cvt_screen_coord(crop.left as f32 / dst.width() as f32),
                top: cvt_screen_coord((crop.top + crop.height) as f32 / dst.height() as f32),
                right: cvt_screen_coord((crop.left + crop.width) as f32 / dst.width() as f32),
                bottom: cvt_screen_coord(crop.top as f32 / dst.height() as f32),
            }
        } else {
            RegionOfInterest {
                left: -1.,
                top: 1.,
                right: 1.,
                bottom: -1.,
            }
        };
        // Index into the per-rotation texture-coordinate tables used by the
        // draw helpers.
        let rotation_offset = match rotation {
            crate::Rotation::None => 0,
            crate::Rotation::Clockwise90 => 1,
            crate::Rotation::Rotate180 => 2,
            crate::Rotation::CounterClockwise90 => 3,
        };

        // Letterboxing: when the dst crop leaves part of the output
        // uncovered, pre-fill with the per-plane scissor-clear helper
        // (a plain glClear cannot write per-channel planar values).
        let has_crop = crop.dst_rect.is_some_and(|x| {
            x.left != 0 || x.top != 0 || x.width != dst.width() || x.height != dst.height()
        });
        if has_crop {
            if let Some(dst_color) = crop.dst_color {
                self.clear_rect_planar(
                    dst.width(),
                    dst.height(),
                    dst_roi,
                    [
                        dst_color[0] as f32 / 255.0,
                        dst_color[1] as f32 / 255.0,
                        dst_color[2] as f32 / 255.0,
                        dst_color[3] as f32 / 255.0,
                    ],
                    alpha,
                )?;
            }
        }

        let src_egl = self.get_or_create_egl_image(CacheKind::Src, src)?;

        self.draw_camera_texture_to_rgb_planar(
            src_egl,
            src_roi,
            dst_roi,
            rotation_offset,
            flip,
            alpha,
            is_int8,
        )?;
        // Block until the GPU has finished writing the planar output.
        unsafe { gls::gl::Finish() };

        Ok(())
    }
3500
    /// Render packed RGB (or RGB_INT8) to a DMA destination buffer using a
    /// two-pass architecture:
    ///
    /// **Pass 1:** Render source → intermediate RGBA texture via `convert_to()`
    /// (reuses the battle-tested RGBA path with full crop/letterbox/rotation/flip).
    ///
    /// **Pass 2:** Pack intermediate RGBA → RGB DMA destination using a simple
    /// packing shader with 2D sampler. The destination DMA buffer is reinterpreted
    /// as RGBA8 at (W*3/4) x H dimensions.
    ///
    /// # Errors
    /// Returns [`crate::Error::NotSupported`] when `dst.width() * 3` is not a
    /// multiple of 4 (the RGBA8 reinterpretation requires whole texels per row).
    fn convert_to_packed_rgb(
        &mut self,
        src: &TensorImage,
        dst: &mut TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        let dst_w = dst.width();
        let dst_h = dst.height();
        let is_int8 = fourcc_is_int8(dst.fourcc());

        // Width must satisfy PackedRgba8 constraint: W*3 divisible by 4, so
        // W RGB pixels re-pack exactly into W*3/4 RGBA texels per row.
        if !(dst_w * 3).is_multiple_of(4) {
            return Err(crate::Error::NotSupported(format!(
                "Packed RGB requires width*3 divisible by 4, got width={dst_w}"
            )));
        }

        let render_w = dst_w * 3 / 4;
        let render_h = dst_h;

        log::debug!(
            "convert_to_packed_rgb: {dst_w}x{dst_h} -> {render_w}x{render_h} two-pass int8={is_int8}",
        );

        // --- Pass 1: Render source → intermediate RGBA texture ---
        self.ensure_packed_rgb_intermediate(dst_w, dst_h)?;
        self.packed_rgb_fbo.bind();
        unsafe {
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.packed_rgb_intermediate_tex.id,
                0,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::Viewport(0, 0, dst_w as i32, dst_h as i32);
        }
        // convert_to() renders to the currently-bound FBO (packed_rgb_fbo → intermediate).
        // It uses dst only for width/height in ROI coordinate math.
        // Handles: source binding (DMA EGLImage or upload), crop, letterbox, rotation, flip.
        self.convert_to(src, dst, rotation, flip, crop)?;

        // --- Pass 2: Pack intermediate RGBA → RGB DMA destination ---
        self.convert_fbo.bind();
        // Import the RGB DMA-buf as an Abgr8888 EGLImage of render_w x
        // render_h (4 bytes per texel), so each fragment writes 4 bytes.
        let dest_egl =
            self.get_or_create_egl_image_rgb(dst, render_w, render_h, DrmFourcc::Abgr8888, 4)?;
        unsafe {
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.render_texture.id);
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::NEAREST as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::NEAREST as i32,
            );
            gls::gl::EGLImageTargetTexture2DOES(gls::gl::TEXTURE_2D, dest_egl.as_ptr());
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.render_texture.id,
                0,
            );
            check_gl_error(function!(), line!())?;
            gls::gl::Viewport(0, 0, render_w as i32, render_h as i32);
        }

        // Bind intermediate RGBA texture as source for the packing shader;
        // int8 destinations select the int8 variant of the program.
        let program = if is_int8 {
            &self.packed_rgba8_int8_program_2d
        } else {
            &self.packed_rgba8_program_2d
        };
        unsafe {
            gls::gl::UseProgram(program.id);
            gls::gl::ActiveTexture(gls::gl::TEXTURE1);
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.packed_rgb_intermediate_tex.id);
            // NEAREST: the packing shader samples the intermediate 1:1.
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::NEAREST as i32,
            );
            gls::gl::TexParameteri(
                gls::gl::TEXTURE_2D,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::NEAREST as i32,
            );
        }

        // Set uniform: tex = TEXTURE1 (intermediate RGBA texture)
        unsafe {
            let loc_tex = gls::gl::GetUniformLocation(program.id, c"tex".as_ptr());
            gls::gl::Uniform1i(loc_tex, 1);
        }

        // Draw full-viewport quad to pack RGBA→RGB
        self.draw_fullscreen_quad()?;

        // Block until the GPU has finished writing the DMA destination.
        unsafe { gls::gl::Finish() };
        Ok(())
    }
3618
    /// Render directly to an RGB8 renderbuffer backed by BGR888 DMA-buf.
    /// Single-pass: no intermediate texture, no packing shader.
    ///
    /// Falls back to the two-pass [`Self::convert_to_packed_rgb`] path when
    /// the driver rejects the renderbuffer attachment (incomplete FBO).
    ///
    /// For int8 destinations the regular shader programs are temporarily
    /// swapped for their int8 variants around the `convert_to` call (and
    /// swapped back afterwards, even when `convert_to` errors), and the
    /// letterbox fill color is pre-biased because `glClear` bypasses the
    /// fragment shader.
    fn convert_to_rgb_direct(
        &mut self,
        src: &TensorImage,
        dst: &mut TensorImage,
        rotation: crate::Rotation,
        flip: Flip,
        crop: Crop,
    ) -> crate::Result<()> {
        let is_int8 = fourcc_is_int8(dst.fourcc());

        log::debug!(
            "convert_to_rgb_direct: {}x{} single-pass int8={is_int8}",
            dst.width(),
            dst.height(),
        );

        // Get or create cached renderbuffer
        let (rbo, width, height) = self.get_or_create_rgb_direct_rbo(dst)?;

        // Bind FBO with renderbuffer attachment
        self.convert_fbo.bind();
        unsafe {
            gls::gl::FramebufferRenderbuffer(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::RENDERBUFFER,
                rbo,
            );
            check_gl_error(function!(), line!())?;

            // Some drivers do not support RGB8 renderbuffers as color
            // attachments; detect that here and use the two-pass path.
            let status = gls::gl::CheckFramebufferStatus(gls::gl::FRAMEBUFFER);
            if status != gls::gl::FRAMEBUFFER_COMPLETE {
                log::warn!("convert_to_rgb_direct: FBO incomplete (0x{status:x}), falling back");
                return self.convert_to_packed_rgb(src, dst, rotation, flip, crop);
            }

            gls::gl::Viewport(0, 0, width, height);
        }

        // For int8, temporarily swap to int8 shader programs and bias the clear color
        let crop = if is_int8 {
            std::mem::swap(&mut self.texture_program, &mut self.texture_int8_program);
            std::mem::swap(
                &mut self.texture_program_yuv,
                &mut self.texture_int8_program_yuv,
            );
            // Bias the letterbox clear color with XOR 0x80 since glClear bypasses
            // the fragment shader — the int8 bias must be applied to the color directly.
            let mut crop = crop;
            if let Some(ref mut color) = crop.dst_color {
                color[0] ^= 0x80;
                color[1] ^= 0x80;
                color[2] ^= 0x80;
            }
            crop
        } else {
            crop
        };

        // Capture the result so the program swap below always runs.
        let result = self.convert_to(src, dst, rotation, flip, crop);

        // Swap back
        if is_int8 {
            std::mem::swap(&mut self.texture_program, &mut self.texture_int8_program);
            std::mem::swap(
                &mut self.texture_program_yuv,
                &mut self.texture_int8_program_yuv,
            );
        }

        result
    }
3693
3694    /// Allocates or resizes the intermediate RGBA texture for two-pass packed RGB.
3695    fn ensure_packed_rgb_intermediate(&mut self, width: usize, height: usize) -> crate::Result<()> {
3696        if self.packed_rgb_intermediate_size == (width, height) {
3697            return Ok(());
3698        }
3699        unsafe {
3700            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.packed_rgb_intermediate_tex.id);
3701            gls::gl::TexParameteri(
3702                gls::gl::TEXTURE_2D,
3703                gls::gl::TEXTURE_MIN_FILTER,
3704                gls::gl::NEAREST as i32,
3705            );
3706            gls::gl::TexParameteri(
3707                gls::gl::TEXTURE_2D,
3708                gls::gl::TEXTURE_MAG_FILTER,
3709                gls::gl::NEAREST as i32,
3710            );
3711            gls::gl::TexImage2D(
3712                gls::gl::TEXTURE_2D,
3713                0,
3714                gls::gl::RGBA as i32,
3715                width as i32,
3716                height as i32,
3717                0,
3718                gls::gl::RGBA,
3719                gls::gl::UNSIGNED_BYTE,
3720                std::ptr::null(),
3721            );
3722            check_gl_error(function!(), line!())?;
3723        }
3724        self.packed_rgb_intermediate_size = (width, height);
3725        Ok(())
3726    }
3727
3728    /// Draw a fullscreen quad for the currently-bound shader program.
3729    /// Used by the pass-2 packing shader in the two-pass packed RGB pipeline.
3730    fn draw_fullscreen_quad(&self) -> Result<(), crate::Error> {
3731        unsafe {
3732            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
3733            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
3734
3735            let vertices: [f32; 12] = [
3736                -1.0, 1.0, 0.0, // top-left
3737                1.0, 1.0, 0.0, // top-right
3738                1.0, -1.0, 0.0, // bottom-right
3739                -1.0, -1.0, 0.0, // bottom-left
3740            ];
3741            gls::gl::BufferSubData(
3742                gls::gl::ARRAY_BUFFER,
3743                0,
3744                (size_of::<f32>() * vertices.len()) as isize,
3745                vertices.as_ptr() as *const c_void,
3746            );
3747
3748            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
3749            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
3750
3751            // Texture coordinates (the packed shader uses gl_FragCoord, not tc,
3752            // but we still need valid buffers for the vertex attribute layout)
3753            let tex_coords: [f32; 8] = [0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0];
3754            gls::gl::BufferSubData(
3755                gls::gl::ARRAY_BUFFER,
3756                0,
3757                (size_of::<f32>() * tex_coords.len()) as isize,
3758                tex_coords.as_ptr() as *const c_void,
3759            );
3760
3761            let indices: [u32; 4] = [0, 1, 2, 3];
3762            gls::gl::DrawElements(
3763                gls::gl::TRIANGLE_FAN,
3764                indices.len() as i32,
3765                gls::gl::UNSIGNED_INT,
3766                indices.as_ptr() as *const c_void,
3767            );
3768        }
3769        check_gl_error(function!(), line!())?;
3770        Ok(())
3771    }
3772
3773    fn clear_rect_planar(
3774        &self,
3775        width: usize,
3776        height: usize,
3777        dst_roi: RegionOfInterest,
3778        color: [f32; 4],
3779        alpha: bool,
3780    ) -> Result<(), Error> {
3781        if !alpha && color[0] == color[1] && color[1] == color[2] {
3782            unsafe {
3783                gls::gl::ClearColor(color[0], color[0], color[0], 1.0);
3784                gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
3785            };
3786        }
3787
3788        let split = if alpha { 4 } else { 3 };
3789
3790        unsafe {
3791            gls::gl::Enable(gls::gl::SCISSOR_TEST);
3792            let x = (((dst_roi.left + 1.0) / 2.0) * width as f32).round() as i32;
3793            let y = (((dst_roi.bottom + 1.0) / 2.0) * height as f32).round() as i32;
3794            let width = (((dst_roi.right - dst_roi.left) / 2.0) * width as f32).round() as i32;
3795            let height = (((dst_roi.top - dst_roi.bottom) / 2.0) * height as f32 / split as f32)
3796                .round() as i32;
3797            for (i, c) in color.iter().enumerate().take(split) {
3798                gls::gl::Scissor(x, y + i as i32 * height, width, height);
3799                gls::gl::ClearColor(*c, *c, *c, 1.0);
3800                gls::gl::Clear(gls::gl::COLOR_BUFFER_BIT);
3801            }
3802            gls::gl::Disable(gls::gl::SCISSOR_TEST);
3803        }
3804        Ok(())
3805    }
3806
3807    #[allow(clippy::too_many_arguments)]
3808    fn draw_camera_texture_to_rgb_planar(
3809        &self,
3810        egl_img: egl::Image,
3811        src_roi: RegionOfInterest,
3812        mut dst_roi: RegionOfInterest,
3813        rotation_offset: usize,
3814        flip: Flip,
3815        alpha: bool,
3816        int8: bool,
3817    ) -> Result<(), Error> {
3818        let texture_target = gls::gl::TEXTURE_EXTERNAL_OES;
3819        match flip {
3820            Flip::None => {}
3821            Flip::Vertical => {
3822                std::mem::swap(&mut dst_roi.top, &mut dst_roi.bottom);
3823            }
3824            Flip::Horizontal => {
3825                std::mem::swap(&mut dst_roi.left, &mut dst_roi.right);
3826            }
3827        }
3828        unsafe {
3829            let program = if int8 {
3830                &self.texture_program_planar_int8
3831            } else {
3832                &self.texture_program_planar
3833            };
3834            gls::gl::UseProgram(program.id);
3835            gls::gl::BindTexture(texture_target, self.camera_eglimage_texture.id);
3836            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
3837            gls::gl::TexParameteri(
3838                texture_target,
3839                gls::gl::TEXTURE_MIN_FILTER,
3840                gls::gl::LINEAR as i32,
3841            );
3842            gls::gl::TexParameteri(
3843                texture_target,
3844                gls::gl::TEXTURE_MAG_FILTER,
3845                gls::gl::LINEAR as i32,
3846            );
3847            gls::gl::TexParameteri(
3848                texture_target,
3849                gls::gl::TEXTURE_WRAP_S,
3850                gls::gl::CLAMP_TO_EDGE as i32,
3851            );
3852
3853            gls::gl::TexParameteri(
3854                texture_target,
3855                gls::gl::TEXTURE_WRAP_T,
3856                gls::gl::CLAMP_TO_EDGE as i32,
3857            );
3858
3859            gls::egl_image_target_texture_2d_oes(texture_target, egl_img.as_ptr());
3860            check_gl_error(function!(), line!())?;
3861            let y_centers = if alpha {
3862                vec![-3.0 / 4.0, -1.0 / 4.0, 1.0 / 4.0, 3.0 / 4.0]
3863            } else {
3864                vec![-2.0 / 3.0, 0.0, 2.0 / 3.0]
3865            };
3866            let swizzles = [gls::gl::RED, gls::gl::GREEN, gls::gl::BLUE, gls::gl::ALPHA];
3867            // starts from bottom
3868            for (i, y_center) in y_centers.iter().enumerate() {
3869                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
3870                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
3871                let camera_vertices: [f32; 12] = [
3872                    dst_roi.left,
3873                    dst_roi.top / 3.0 + y_center,
3874                    0., // left top
3875                    dst_roi.right,
3876                    dst_roi.top / 3.0 + y_center,
3877                    0., // right top
3878                    dst_roi.right,
3879                    dst_roi.bottom / 3.0 + y_center,
3880                    0., // right bottom
3881                    dst_roi.left,
3882                    dst_roi.bottom / 3.0 + y_center,
3883                    0., // left bottom
3884                ];
3885                gls::gl::BufferData(
3886                    gls::gl::ARRAY_BUFFER,
3887                    (size_of::<f32>() * camera_vertices.len()) as isize,
3888                    camera_vertices.as_ptr() as *const c_void,
3889                    gls::gl::DYNAMIC_DRAW,
3890                );
3891
3892                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
3893                gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
3894                let texture_vertices: [f32; 16] = [
3895                    src_roi.left,
3896                    src_roi.top,
3897                    src_roi.right,
3898                    src_roi.top,
3899                    src_roi.right,
3900                    src_roi.bottom,
3901                    src_roi.left,
3902                    src_roi.bottom,
3903                    src_roi.left,
3904                    src_roi.top,
3905                    src_roi.right,
3906                    src_roi.top,
3907                    src_roi.right,
3908                    src_roi.bottom,
3909                    src_roi.left,
3910                    src_roi.bottom,
3911                ];
3912
3913                gls::gl::BufferData(
3914                    gls::gl::ARRAY_BUFFER,
3915                    (size_of::<f32>() * 8) as isize,
3916                    (texture_vertices[(rotation_offset * 2)..]).as_ptr() as *const c_void,
3917                    gls::gl::DYNAMIC_DRAW,
3918                );
3919                let vertices_index: [u32; 4] = [0, 1, 2, 3];
3920                // self.texture_program_planar
3921                //     .load_uniform_1i(c"color_index", 2 - i as i32);
3922
3923                gls::gl::TexParameteri(
3924                    texture_target,
3925                    gls::gl::TEXTURE_SWIZZLE_R,
3926                    swizzles[i] as i32,
3927                );
3928
3929                gls::gl::DrawElements(
3930                    gls::gl::TRIANGLE_FAN,
3931                    vertices_index.len() as i32,
3932                    gls::gl::UNSIGNED_INT,
3933                    vertices_index.as_ptr() as *const c_void,
3934                );
3935            }
3936            check_gl_error(function!(), line!())?;
3937        }
3938        Ok(())
3939    }
3940
    /// Draw a packed CPU-visible image (`RGB`, `RGBA`, or `GREY`) as a quad.
    ///
    /// The pixels are uploaded into the persistent `camera_normal_texture`
    /// (a plain `TEXTURE_2D`) and rendered so that the `src_roi` texel region
    /// maps onto the `dst_roi` output region. `rotation_offset` selects the
    /// starting pair in a duplicated texture-coordinate table, rotating the
    /// sampled quad in quarter-turn steps; `flip` mirrors the output by
    /// swapping `dst_roi` edges. Any other FourCC is rejected — YUV sources
    /// go through the DMA-BUF/EGLImage path instead.
    ///
    /// # Errors
    /// Returns `Error::NotSupported` for unsupported formats, an error from
    /// mapping the tensor, or a GL error reported by `check_gl_error`.
    fn draw_src_texture(
        &mut self,
        src: &TensorImage,
        src_roi: RegionOfInterest,
        mut dst_roi: RegionOfInterest,
        rotation_offset: usize,
        flip: Flip,
    ) -> Result<(), Error> {
        let texture_target = gls::gl::TEXTURE_2D;
        // Pick the GL upload format matching the tensor's packed layout.
        let texture_format = match src.fourcc() {
            RGB => gls::gl::RGB,
            RGBA => gls::gl::RGBA,
            GREY => gls::gl::RED,
            _ => {
                return Err(Error::NotSupported(format!(
                    "draw_src_texture does not support {:?} (use DMA-BUF path for YUV)",
                    src.fourcc()
                )));
            }
        };
        unsafe {
            gls::gl::UseProgram(self.texture_program.id);
            gls::gl::BindTexture(texture_target, self.camera_normal_texture.id);
            // NOTE(review): ActiveTexture is issued after BindTexture, so the
            // bind relies on unit 0 already being active — confirm intended.
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::TexParameteri(
                texture_target,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::LINEAR as i32,
            );
            gls::gl::TexParameteri(
                texture_target,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::LINEAR as i32,
            );
            if src.fourcc() == GREY {
                // Replicate the single GREY channel into R, G and B so the
                // shader samples grey without a dedicated code path.
                for swizzle in [
                    gls::gl::TEXTURE_SWIZZLE_R,
                    gls::gl::TEXTURE_SWIZZLE_G,
                    gls::gl::TEXTURE_SWIZZLE_B,
                ] {
                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, gls::gl::RED as i32);
                }
            } else {
                // Identity swizzles — undoes any GREY replication left on
                // this shared texture by an earlier draw.
                for (swizzle, src) in [
                    (gls::gl::TEXTURE_SWIZZLE_R, gls::gl::RED),
                    (gls::gl::TEXTURE_SWIZZLE_G, gls::gl::GREEN),
                    (gls::gl::TEXTURE_SWIZZLE_B, gls::gl::BLUE),
                ] {
                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, src as i32);
                }
            }
            // Upload pixel data; map() presumably exposes the tensor memory
            // to the CPU — `?` propagates mapping failures.
            self.camera_normal_texture.update_texture(
                texture_target,
                src.width(),
                src.height(),
                texture_format,
                &src.tensor().map()?,
            );

            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);

            // Mirroring is done by swapping the destination edges rather
            // than by touching the texture coordinates.
            match flip {
                Flip::None => {}
                Flip::Vertical => {
                    std::mem::swap(&mut dst_roi.top, &mut dst_roi.bottom);
                }
                Flip::Horizontal => {
                    std::mem::swap(&mut dst_roi.left, &mut dst_roi.right);
                }
            }

            // Destination quad: (x, y, z) per corner.
            let camera_vertices: [f32; 12] = [
                dst_roi.left,
                dst_roi.top,
                0., // left top
                dst_roi.right,
                dst_roi.top,
                0., // right top
                dst_roi.right,
                dst_roi.bottom,
                0., // right bottom
                dst_roi.left,
                dst_roi.bottom,
                0., // left bottom
            ];
            gls::gl::BufferData(
                gls::gl::ARRAY_BUFFER,
                (size_of::<f32>() * camera_vertices.len()) as isize,
                camera_vertices.as_ptr() as *const c_void,
                gls::gl::DYNAMIC_DRAW,
            );
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
            // The four corner texcoords are listed twice so that any
            // rotation offset of 0..=3 pairs still yields four valid pairs.
            let texture_vertices: [f32; 16] = [
                src_roi.left,
                src_roi.top,
                src_roi.right,
                src_roi.top,
                src_roi.right,
                src_roi.bottom,
                src_roi.left,
                src_roi.bottom,
                src_roi.left,
                src_roi.top,
                src_roi.right,
                src_roi.top,
                src_roi.right,
                src_roi.bottom,
                src_roi.left,
                src_roi.bottom,
            ];

            // Upload only 4 pairs (8 floats) starting at the rotation
            // offset, rotating the corner assignment in 90° steps.
            gls::gl::BufferData(
                gls::gl::ARRAY_BUFFER,
                (size_of::<f32>() * 8) as isize,
                (texture_vertices[(rotation_offset * 2)..]).as_ptr() as *const c_void,
                gls::gl::DYNAMIC_DRAW,
            );
            let vertices_index: [u32; 4] = [0, 1, 2, 3];
            gls::gl::DrawElements(
                gls::gl::TRIANGLE_FAN,
                vertices_index.len() as i32,
                gls::gl::UNSIGNED_INT,
                vertices_index.as_ptr() as *const c_void,
            );
            check_gl_error(function!(), line!())?;

            Ok(())
        }
    }
4072
    /// Draw a DMA-BUF-backed frame through its EGLImage as a textured quad.
    ///
    /// Binds `camera_eglimage_texture` as `TEXTURE_EXTERNAL_OES`, attaches
    /// `egl_img` to it via `glEGLImageTargetTexture2DOES`, and renders the
    /// `src_roi` region onto `dst_roi` with the YUV texture program.
    /// `rotation_offset` and `flip` behave as in `draw_src_texture`. Unlike
    /// that path, the vertex/texcoord VBOs are updated with `BufferSubData`,
    /// so their storage must have been allocated previously.
    ///
    /// # Errors
    /// Propagates GL errors detected by `check_gl_error` after the EGLImage
    /// attach and after the draw.
    fn draw_camera_texture_eglimage(
        &self,
        src: &TensorImage,
        egl_img: egl::Image,
        src_roi: RegionOfInterest,
        mut dst_roi: RegionOfInterest,
        rotation_offset: usize,
        flip: Flip,
    ) -> Result<(), Error> {
        // let texture_target = gls::gl::TEXTURE_2D;
        let texture_target = gls::gl::TEXTURE_EXTERNAL_OES;
        unsafe {
            gls::gl::UseProgram(self.texture_program_yuv.id);
            gls::gl::BindTexture(texture_target, self.camera_eglimage_texture.id);
            // NOTE(review): ActiveTexture is issued after BindTexture, so the
            // bind relies on unit 0 already being active — confirm intended.
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::TexParameteri(
                texture_target,
                gls::gl::TEXTURE_MIN_FILTER,
                gls::gl::LINEAR as i32,
            );
            gls::gl::TexParameteri(
                texture_target,
                gls::gl::TEXTURE_MAG_FILTER,
                gls::gl::LINEAR as i32,
            );

            // NOTE(review): these swizzles are set on the TEXTURE_2D binding
            // while rendering samples `texture_target`
            // (TEXTURE_EXTERNAL_OES); they likely do not affect the external
            // texture — confirm whether this was meant to use
            // `texture_target`.
            if src.fourcc() == GREY {
                for swizzle in [
                    gls::gl::TEXTURE_SWIZZLE_R,
                    gls::gl::TEXTURE_SWIZZLE_G,
                    gls::gl::TEXTURE_SWIZZLE_B,
                ] {
                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, gls::gl::RED as i32);
                }
            } else {
                for (swizzle, src) in [
                    (gls::gl::TEXTURE_SWIZZLE_R, gls::gl::RED),
                    (gls::gl::TEXTURE_SWIZZLE_G, gls::gl::GREEN),
                    (gls::gl::TEXTURE_SWIZZLE_B, gls::gl::BLUE),
                ] {
                    gls::gl::TexParameteri(gls::gl::TEXTURE_2D, swizzle, src as i32);
                }
            }

            // Attach the imported DMA-buf image as the texture's storage.
            gls::egl_image_target_texture_2d_oes(texture_target, egl_img.as_ptr());
            check_gl_error(function!(), line!())?;
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);

            // Mirroring is done by swapping the destination edges.
            match flip {
                Flip::None => {}
                Flip::Vertical => {
                    std::mem::swap(&mut dst_roi.top, &mut dst_roi.bottom);
                }
                Flip::Horizontal => {
                    std::mem::swap(&mut dst_roi.left, &mut dst_roi.right);
                }
            }

            // Destination quad: (x, y, z) per corner.
            let camera_vertices: [f32; 12] = [
                dst_roi.left,
                dst_roi.top,
                0., // left top
                dst_roi.right,
                dst_roi.top,
                0., // right top
                dst_roi.right,
                dst_roi.bottom,
                0., // right bottom
                dst_roi.left,
                dst_roi.bottom,
                0., // left bottom
            ];
            // BufferSubData: reuses existing VBO storage (allocated by an
            // earlier BufferData on this buffer).
            gls::gl::BufferSubData(
                gls::gl::ARRAY_BUFFER,
                0,
                (size_of::<f32>() * camera_vertices.len()) as isize,
                camera_vertices.as_ptr() as *const c_void,
            );

            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);

            // Corner texcoords duplicated so rotation_offset (0..=3 pairs)
            // always has four valid pairs to read.
            let texture_vertices: [f32; 16] = [
                src_roi.left,
                src_roi.top,
                src_roi.right,
                src_roi.top,
                src_roi.right,
                src_roi.bottom,
                src_roi.left,
                src_roi.bottom,
                src_roi.left,
                src_roi.top,
                src_roi.right,
                src_roi.top,
                src_roi.right,
                src_roi.bottom,
                src_roi.left,
                src_roi.bottom,
            ];
            // Upload 4 pairs starting at the rotation offset (90° steps).
            gls::gl::BufferSubData(
                gls::gl::ARRAY_BUFFER,
                0,
                (size_of::<f32>() * 8) as isize,
                (texture_vertices[(rotation_offset * 2)..]).as_ptr() as *const c_void,
            );

            let vertices_index: [u32; 4] = [0, 1, 2, 3];
            gls::gl::DrawElements(
                gls::gl::TRIANGLE_FAN,
                vertices_index.len() as i32,
                gls::gl::UNSIGNED_INT,
                vertices_index.as_ptr() as *const c_void,
            );
        }
        check_gl_error(function!(), line!())?;
        Ok(())
    }
4192
4193    fn create_image_from_dma2(&self, src: &TensorImage) -> Result<EglImage, crate::Error> {
4194        let width;
4195        let height;
4196        let format;
4197        let channels;
4198
4199        // NV12 is semi-planar but handled specially via EGL multi-plane import
4200        if src.fourcc() == NV12 {
4201            if !src.width().is_multiple_of(4) {
4202                return Err(Error::NotSupported(
4203                    "OpenGL EGLImage doesn't support image widths which are not multiples of 4"
4204                        .to_string(),
4205                ));
4206            }
4207            width = src.width();
4208            height = src.height();
4209            format = fourcc_to_drm(NV12)?;
4210            channels = 1; // Y plane pitch is 1 byte per pixel
4211        } else if src.is_planar() {
4212            if !src.width().is_multiple_of(16) {
4213                return Err(Error::NotSupported(
4214                    "OpenGL Planar RGB EGLImage doesn't support image widths which are not multiples of 16"
4215                        .to_string(),
4216                ));
4217            }
4218            match src.fourcc() {
4219                PLANAR_RGB | PLANAR_RGB_INT8 => {
4220                    format = DrmFourcc::R8;
4221                    width = src.width();
4222                    height = src.height() * 3;
4223                    channels = 1;
4224                }
4225                fourcc => {
4226                    return Err(crate::Error::NotSupported(format!(
4227                        "Unsupported Planar FourCC {fourcc:?}"
4228                    )));
4229                }
4230            };
4231        } else {
4232            if !src.width().is_multiple_of(4) {
4233                return Err(Error::NotSupported(
4234                    "OpenGL EGLImage doesn't support image widths which are not multiples of 4"
4235                        .to_string(),
4236                ));
4237            }
4238            width = src.width();
4239            height = src.height();
4240            format = fourcc_to_drm(src.fourcc())?;
4241            channels = src.channels();
4242        }
4243
4244        let fd = match &src.tensor {
4245            edgefirst_tensor::Tensor::Dma(dma_tensor) => dma_tensor.fd.as_raw_fd(),
4246            edgefirst_tensor::Tensor::Shm(_) => {
4247                return Err(Error::NotImplemented(
4248                    "OpenGL EGLImage doesn't support SHM".to_string(),
4249                ));
4250            }
4251            edgefirst_tensor::Tensor::Mem(_) => {
4252                return Err(Error::NotImplemented(
4253                    "OpenGL EGLImage doesn't support MEM".to_string(),
4254                ));
4255            }
4256            edgefirst_tensor::Tensor::Pbo(_) => {
4257                return Err(Error::NotImplemented(
4258                    "OpenGL EGLImage doesn't support PBO".to_string(),
4259                ));
4260            }
4261        };
4262
4263        // For NV12, plane0 pitch is width (Y is 1 byte/pixel)
4264        // For other formats, pitch is width * channels
4265        let plane0_pitch = if src.fourcc() == NV12 {
4266            width
4267        } else {
4268            width * channels
4269        };
4270
4271        let mut egl_img_attr = vec![
4272            egl_ext::LINUX_DRM_FOURCC as Attrib,
4273            format as Attrib,
4274            khronos_egl::WIDTH as Attrib,
4275            width as Attrib,
4276            khronos_egl::HEIGHT as Attrib,
4277            height as Attrib,
4278            egl_ext::DMA_BUF_PLANE0_PITCH as Attrib,
4279            plane0_pitch as Attrib,
4280            egl_ext::DMA_BUF_PLANE0_OFFSET as Attrib,
4281            0 as Attrib,
4282            egl_ext::DMA_BUF_PLANE0_FD as Attrib,
4283            fd as Attrib,
4284            egl::IMAGE_PRESERVED as Attrib,
4285            egl::TRUE as Attrib,
4286        ];
4287
4288        // NV12 requires a second plane for UV data
4289        if src.fourcc() == NV12 {
4290            let uv_offset = width * height; // Y plane size
4291            egl_img_attr.append(&mut vec![
4292                egl_ext::DMA_BUF_PLANE1_FD as Attrib,
4293                fd as Attrib,
4294                egl_ext::DMA_BUF_PLANE1_OFFSET as Attrib,
4295                uv_offset as Attrib,
4296                egl_ext::DMA_BUF_PLANE1_PITCH as Attrib,
4297                width as Attrib, // UV plane has same width as Y plane
4298            ]);
4299        }
4300
4301        if matches!(src.fourcc(), YUYV | VYUY | NV12) {
4302            egl_img_attr.append(&mut vec![
4303                egl_ext::YUV_COLOR_SPACE_HINT as Attrib,
4304                egl_ext::ITU_REC709 as Attrib,
4305                egl_ext::SAMPLE_RANGE_HINT as Attrib,
4306                egl_ext::YUV_NARROW_RANGE as Attrib,
4307            ]);
4308        }
4309
4310        egl_img_attr.push(khronos_egl::NONE as Attrib);
4311
4312        match self.new_egl_image_owned(egl_ext::LINUX_DMA_BUF, &egl_img_attr) {
4313            Ok(v) => Ok(v),
4314            Err(e) => Err(e),
4315        }
4316    }
4317
4318    fn new_egl_image_owned(
4319        &'_ self,
4320        target: egl::Enum,
4321        attrib_list: &[Attrib],
4322    ) -> Result<EglImage, Error> {
4323        let image = GlContext::egl_create_image_with_fallback(
4324            &self.gl_context.egl,
4325            self.gl_context.display.as_display(),
4326            unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) },
4327            target,
4328            unsafe { egl::ClientBuffer::from_ptr(null_mut()) },
4329            attrib_list,
4330        )?;
4331        Ok(EglImage {
4332            egl_image: image,
4333            display: self.gl_context.display.as_display(),
4334            egl: Rc::clone(&self.gl_context.egl),
4335        })
4336    }
4337
4338    /// Look up or create an EGLImage for a DMA tensor, returning the EGL image handle.
4339    ///
4340    /// Returns `egl::Image` (a `Copy` type wrapping `*const c_void`) to avoid borrow
4341    /// conflicts with the caller. The cache retains ownership of the `EglImage` value;
4342    /// the handle remains valid as long as the entry lives in the cache.
4343    fn get_or_create_egl_image(
4344        &mut self,
4345        cache: CacheKind,
4346        img: &TensorImage,
4347    ) -> Result<egl::Image, crate::Error> {
4348        let id = img.buffer_identity().id();
4349
4350        // Sweep dead entries opportunistically before looking up.
4351        match cache {
4352            CacheKind::Src => self.src_egl_cache.sweep(),
4353            CacheKind::Dst => self.dst_egl_cache.sweep(),
4354        }
4355
4356        {
4357            let egl_cache = match cache {
4358                CacheKind::Src => &mut self.src_egl_cache,
4359                CacheKind::Dst => &mut self.dst_egl_cache,
4360            };
4361            let ts = egl_cache.next_timestamp();
4362            if let Some(cached) = egl_cache.entries.get_mut(&id) {
4363                egl_cache.hits += 1;
4364                cached.last_used = ts;
4365                log::trace!("EglImageCache {:?} hit: id={id:#x}", cache);
4366                return Ok(cached.egl_image.egl_image);
4367            }
4368            egl_cache.misses += 1;
4369            log::trace!("EglImageCache {:?} miss: id={id:#x}", cache);
4370            // Evict least-recently-used entry if at capacity.
4371            if egl_cache.entries.len() >= egl_cache.capacity {
4372                egl_cache.evict_lru();
4373            }
4374        }
4375
4376        let egl_image_obj = self.create_image_from_dma2(img)?;
4377        let handle = egl_image_obj.egl_image;
4378        let guard = img.buffer_identity().weak();
4379        let egl_cache = match cache {
4380            CacheKind::Src => &mut self.src_egl_cache,
4381            CacheKind::Dst => &mut self.dst_egl_cache,
4382        };
4383        let ts = egl_cache.next_timestamp();
4384        egl_cache.entries.insert(
4385            id,
4386            CachedEglImage {
4387                egl_image: egl_image_obj,
4388                guard,
4389                renderbuffer: None,
4390                last_used: ts,
4391            },
4392        );
4393        Ok(handle)
4394    }
4395
4396    /// Create an EGLImage from a DMA buffer with explicitly specified internal
4397    /// dimensions and format. Used when the GL render surface differs from the
4398    /// logical TensorImage dimensions (e.g., packed RGB reinterpretation).
4399    fn create_egl_image_with_dims(
4400        &self,
4401        img: &TensorImage,
4402        width: usize,
4403        height: usize,
4404        drm_format: DrmFourcc,
4405        bpp: usize,
4406    ) -> Result<EglImage, crate::Error> {
4407        let fd = match &img.tensor {
4408            edgefirst_tensor::Tensor::Dma(dma_tensor) => dma_tensor.fd.as_raw_fd(),
4409            _ => {
4410                return Err(Error::NotImplemented(
4411                    "create_egl_image_with_dims requires DMA tensor".to_string(),
4412                ));
4413            }
4414        };
4415
4416        let pitch = width * bpp;
4417        let egl_img_attr = vec![
4418            egl_ext::LINUX_DRM_FOURCC as Attrib,
4419            drm_format as u32 as Attrib,
4420            khronos_egl::WIDTH as Attrib,
4421            width as Attrib,
4422            khronos_egl::HEIGHT as Attrib,
4423            height as Attrib,
4424            egl_ext::DMA_BUF_PLANE0_PITCH as Attrib,
4425            pitch as Attrib,
4426            egl_ext::DMA_BUF_PLANE0_OFFSET as Attrib,
4427            0 as Attrib,
4428            egl_ext::DMA_BUF_PLANE0_FD as Attrib,
4429            fd as Attrib,
4430            egl::IMAGE_PRESERVED as Attrib,
4431            egl::TRUE as Attrib,
4432            khronos_egl::NONE as Attrib,
4433        ];
4434
4435        self.new_egl_image_owned(egl_ext::LINUX_DMA_BUF, &egl_img_attr)
4436    }
4437
4438    /// Get or create an EGLImage for a packed RGB DMA destination with
4439    /// reinterpreted dimensions. Uses the dst cache keyed by buffer identity.
4440    fn get_or_create_egl_image_rgb(
4441        &mut self,
4442        img: &TensorImage,
4443        width: usize,
4444        height: usize,
4445        drm_format: DrmFourcc,
4446        bpp: usize,
4447    ) -> Result<egl::Image, crate::Error> {
4448        let id = img.buffer_identity().id();
4449        self.dst_egl_cache.sweep();
4450
4451        let ts = self.dst_egl_cache.next_timestamp();
4452        if let Some(cached) = self.dst_egl_cache.entries.get_mut(&id) {
4453            self.dst_egl_cache.hits += 1;
4454            cached.last_used = ts;
4455            log::trace!("EglImageCache dst (RGB) hit: id={id:#x}");
4456            return Ok(cached.egl_image.egl_image);
4457        }
4458        self.dst_egl_cache.misses += 1;
4459        log::trace!("EglImageCache dst (RGB) miss: id={id:#x}");
4460
4461        if self.dst_egl_cache.entries.len() >= self.dst_egl_cache.capacity {
4462            self.dst_egl_cache.evict_lru();
4463        }
4464
4465        let egl_image_obj = self.create_egl_image_with_dims(img, width, height, drm_format, bpp)?;
4466        let handle = egl_image_obj.egl_image;
4467        let guard = img.buffer_identity().weak();
4468        let ts = self.dst_egl_cache.next_timestamp();
4469        self.dst_egl_cache.entries.insert(
4470            id,
4471            CachedEglImage {
4472                egl_image: egl_image_obj,
4473                guard,
4474                renderbuffer: None,
4475                last_used: ts,
4476            },
4477        );
4478        Ok(handle)
4479    }
4480
    /// Get or create an EGLImage + renderbuffer for direct RGB rendering.
    /// Both are cached in dst_egl_cache keyed by buffer identity.
    /// Returns (renderbuffer_id, width, height).
    ///
    /// The destination DMA-buf is imported as a `Bgr888` EGLImage and bound
    /// as the storage of a GL renderbuffer, so draws targeting the
    /// renderbuffer write straight into the destination tensor's buffer.
    /// If binding fails, the renderbuffer is deleted and the freshly created
    /// EGLImage is dropped before the error is returned.
    fn get_or_create_rgb_direct_rbo(
        &mut self,
        dst: &TensorImage,
    ) -> crate::Result<(u32, i32, i32)> {
        let id = dst.buffer_identity().id();
        let width = dst.width() as i32;
        let height = dst.height() as i32;

        self.dst_egl_cache.sweep();

        // Check cache for existing entry with renderbuffer
        let ts = self.dst_egl_cache.next_timestamp();
        if let Some(cached) = self.dst_egl_cache.entries.get_mut(&id) {
            // Only a hit if the entry already carries an RBO; an entry made
            // by the non-RBO path falls through and is replaced below.
            if let Some(rbo) = cached.renderbuffer {
                self.dst_egl_cache.hits += 1;
                cached.last_used = ts;
                log::trace!("EglImageCache dst (rgb_direct) hit: id={id:#x}");
                return Ok((rbo, width, height));
            }
        }
        self.dst_egl_cache.misses += 1;
        log::trace!("EglImageCache dst (rgb_direct) miss: id={id:#x}");

        // Evict least-recently-used entry if at capacity
        if self.dst_egl_cache.entries.len() >= self.dst_egl_cache.capacity {
            self.dst_egl_cache.evict_lru();
        }

        // Create EGLImage from BGR888 DMA-buf
        let egl_image_obj =
            self.create_egl_image_with_dims(dst, dst.width(), dst.height(), DrmFourcc::Bgr888, 3)?;

        // Create renderbuffer and bind EGLImage to it
        let rbo = unsafe {
            let mut rbo = 0u32;
            gls::gl::GenRenderbuffers(1, &mut rbo);
            gls::gl::BindRenderbuffer(gls::gl::RENDERBUFFER, rbo);
            gls::gl::EGLImageTargetRenderbufferStorageOES(
                gls::gl::RENDERBUFFER,
                egl_image_obj.egl_image.as_ptr(),
            );
            // Clean up the half-built RBO on failure; `egl_image_obj` is
            // dropped by the early return.
            if let Err(e) = check_gl_error(function!(), line!()) {
                gls::gl::DeleteRenderbuffers(1, &rbo);
                return Err(e);
            }
            rbo
        };

        // Cache both
        let guard = dst.buffer_identity().weak();
        let ts = self.dst_egl_cache.next_timestamp();
        self.dst_egl_cache.entries.insert(
            id,
            CachedEglImage {
                egl_image: egl_image_obj,
                guard,
                renderbuffer: Some(rbo),
                last_used: ts,
            },
        );

        Ok((rbo, width, height))
    }
4547
4548    // Reshapes the segmentation to be compatible with RGBA texture array rendering.
4549    fn reshape_segmentation_to_rgba(&self, segmentation: &[u8], shape: [usize; 3]) -> Vec<u8> {
4550        let [height, width, classes] = shape;
4551
4552        let n_layer_stride = height * width * 4;
4553        let n_row_stride = width * 4;
4554        let n_col_stride = 4;
4555        let row_stride = width * classes;
4556        let col_stride = classes;
4557
4558        let mut new_segmentation = vec![0u8; n_layer_stride * classes.div_ceil(4)];
4559
4560        for i in 0..height {
4561            for j in 0..width {
4562                for k in 0..classes.div_ceil(4) * 4 {
4563                    if k >= classes {
4564                        new_segmentation[n_layer_stride * (k / 4)
4565                            + i * n_row_stride
4566                            + j * n_col_stride
4567                            + k % 4] = 0;
4568                    } else {
4569                        new_segmentation[n_layer_stride * (k / 4)
4570                            + i * n_row_stride
4571                            + j * n_col_stride
4572                            + k % 4] = segmentation[i * row_stride + j * col_stride + k];
4573                    }
4574                }
4575            }
4576        }
4577
4578        new_segmentation
4579    }
4580
    /// Render ModelPack segmentation scores as a quad over `dst_roi`.
    ///
    /// The `[height, width, classes]` byte tensor is repacked into RGBA
    /// layers by `reshape_segmentation_to_rgba`, uploaded as a
    /// `TEXTURE_2D_ARRAY` with `classes.div_ceil(4)` layers, and drawn with
    /// the segmentation program. The `background_index` uniform is set to
    /// the last class index (`shape[2] - 1`).
    ///
    /// # Errors
    /// Propagates failures from setting the uniform; GL calls themselves are
    /// not error-checked here.
    fn render_modelpack_segmentation(
        &mut self,
        dst_roi: RegionOfInterest,
        segmentation: &[u8],
        shape: [usize; 3],
    ) -> Result<(), crate::Error> {
        log::debug!("start render_segmentation_to_image");

        // TODO: Implement specialization for 2 classes and 4 classes which shouldn't
        // need rearranging the data
        let new_segmentation = self.reshape_segmentation_to_rgba(segmentation, shape);

        let [height, width, classes] = shape;

        let format = gls::gl::RGBA;
        let texture_target = gls::gl::TEXTURE_2D_ARRAY;
        // shape[2] == classes; background is the last class plane.
        self.segmentation_program
            .load_uniform_1i(c"background_index", shape[2] as i32 - 1)?;

        gls::use_program(self.segmentation_program.id);

        gls::bind_texture(texture_target, self.segmentation_texture.id);
        gls::active_texture(gls::gl::TEXTURE0);
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        // Upload the repacked layers: one RGBA layer per 4 classes.
        gls::tex_image3d(
            texture_target,
            0,
            format as i32,
            width as i32,
            height as i32,
            classes.div_ceil(4) as i32,
            0,
            format,
            gls::gl::UNSIGNED_BYTE,
            Some(&new_segmentation),
        );

        // Full-texture source rectangle with the V axis inverted
        // (top = 1, bottom = 0).
        let src_roi = RegionOfInterest {
            left: 0.,
            top: 1.,
            right: 1.,
            bottom: 0.,
        };

        unsafe {
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);

            // Destination quad: (x, y, z) per corner.
            let camera_vertices: [f32; 12] = [
                dst_roi.left,
                dst_roi.top,
                0., // left top
                dst_roi.right,
                dst_roi.top,
                0., // right top
                dst_roi.right,
                dst_roi.bottom,
                0., // right bottom
                dst_roi.left,
                dst_roi.bottom,
                0., // left bottom
            ];
            // BufferSubData: reuses existing VBO storage (allocated by an
            // earlier BufferData on this buffer).
            gls::gl::BufferSubData(
                gls::gl::ARRAY_BUFFER,
                0,
                (size_of::<f32>() * camera_vertices.len()) as isize,
                camera_vertices.as_ptr() as *const c_void,
            );

            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);

            let texture_vertices: [f32; 8] = [
                src_roi.left,
                src_roi.top,
                src_roi.right,
                src_roi.top,
                src_roi.right,
                src_roi.bottom,
                src_roi.left,
                src_roi.bottom,
            ];
            gls::gl::BufferSubData(
                gls::gl::ARRAY_BUFFER,
                0,
                (size_of::<f32>() * 8) as isize,
                (texture_vertices[0..]).as_ptr() as *const c_void,
            );

            let vertices_index: [u32; 4] = [0, 1, 2, 3];
            gls::gl::DrawElements(
                gls::gl::TRIANGLE_FAN,
                vertices_index.len() as i32,
                gls::gl::UNSIGNED_INT,
                vertices_index.as_ptr() as *const c_void,
            );
        }

        Ok(())
    }
4702
    /// Draw a single YOLO instance mask into `dst_roi` as a textured quad.
    ///
    /// The mask is uploaded as a single-channel (`GL_RED`) `GL_TEXTURE_2D`
    /// texture and rendered with the instanced segmentation program, which
    /// colors it according to the `class_index` uniform.
    ///
    /// * `dst_roi` - destination rectangle in NDC ([-1, 1]) coordinates.
    /// * `segmentation` - mask bytes, row-major, `shape[0] * shape[1]` long.
    /// * `shape` - `[height, width]` of the mask.
    /// * `class` - detection class forwarded to the shader.
    fn render_yolo_segmentation(
        &mut self,
        dst_roi: RegionOfInterest,
        segmentation: &[u8],
        shape: [usize; 2],
        class: usize,
    ) -> Result<(), crate::Error> {
        log::debug!("start render_yolo_segmentation");

        let [height, width] = shape;

        let format = gls::gl::RED;
        let texture_target = gls::gl::TEXTURE_2D;
        gls::use_program(self.instanced_segmentation_program.id);
        self.instanced_segmentation_program
            .load_uniform_1i(c"class_index", class as i32)?;
        // NOTE(review): the texture is bound before TEXTURE0 is selected;
        // this only targets unit 0 if it is already the active unit —
        // confirm, or swap the two calls.
        gls::bind_texture(texture_target, self.segmentation_texture.id);
        gls::active_texture(gls::gl::TEXTURE0);
        // Bilinear filtering; clamp at the edges so the mask does not wrap.
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        // NOTE(review): GL_UNPACK_ALIGNMENT defaults to 4; a GL_RED upload
        // whose width is not a multiple of 4 will skew rows unless the gls
        // wrapper adjusts the alignment — verify.
        gls::tex_image2d(
            texture_target,
            0,
            format as i32,
            width as i32,
            height as i32,
            0,
            format,
            gls::gl::UNSIGNED_BYTE,
            Some(segmentation),
        );

        // Source ROI spans the whole texture with top/bottom swapped to
        // compensate for OpenGL's bottom-left origin.
        let src_roi = RegionOfInterest {
            left: 0.,
            top: 1.,
            right: 1.,
            bottom: 0.,
        };

        unsafe {
            // Fill the shared vertex buffer with the destination quad
            // (x, y, z per corner) and draw it as an indexed triangle fan.
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
            gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);

            let camera_vertices: [f32; 12] = [
                dst_roi.left,
                dst_roi.top,
                0., // left top
                dst_roi.right,
                dst_roi.top,
                0., // right top
                dst_roi.right,
                dst_roi.bottom,
                0., // right bottom
                dst_roi.left,
                dst_roi.bottom,
                0., // left bottom
            ];
            gls::gl::BufferSubData(
                gls::gl::ARRAY_BUFFER,
                0,
                (size_of::<f32>() * camera_vertices.len()) as isize,
                camera_vertices.as_ptr() as *const c_void,
            );

            // Matching texture coordinates (u, v per corner).
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
            gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);

            let texture_vertices: [f32; 8] = [
                src_roi.left,
                src_roi.top,
                src_roi.right,
                src_roi.top,
                src_roi.right,
                src_roi.bottom,
                src_roi.left,
                src_roi.bottom,
            ];
            gls::gl::BufferSubData(
                gls::gl::ARRAY_BUFFER,
                0,
                (size_of::<f32>() * 8) as isize,
                (texture_vertices).as_ptr() as *const c_void,
            );

            let vertices_index: [u32; 4] = [0, 1, 2, 3];
            gls::gl::DrawElements(
                gls::gl::TRIANGLE_FAN,
                vertices_index.len() as i32,
                gls::gl::UNSIGNED_INT,
                vertices_index.as_ptr() as *const c_void,
            );
            // Block until the GPU has consumed the stack-allocated vertex
            // data and finished the draw.
            gls::gl::Finish();
        }

        Ok(())
    }
4819
4820    /// Repack proto tensor `(H, W, num_protos)` as f32 into RGBA f16 layers
4821    /// suitable for upload to a GL_TEXTURE_2D_ARRAY with GL_RGBA16F.
4822    ///
4823    /// Returns `(repacked_bytes, num_layers)` where each layer is H*W*4 half-floats.
4824    fn repack_protos_to_rgba_f16(protos: &ndarray::Array3<f32>) -> (Vec<u8>, usize) {
4825        let (height, width, num_protos) = protos.dim();
4826        let num_layers = num_protos.div_ceil(4);
4827        // Each layer is H*W*4 half-floats, each half-float is 2 bytes
4828        let layer_stride = height * width * 4;
4829        let mut buf = vec![0u16; layer_stride * num_layers];
4830
4831        for y in 0..height {
4832            for x in 0..width {
4833                for k in 0..num_layers * 4 {
4834                    let val = if k < num_protos {
4835                        half::f16::from_f32(protos[[y, x, k]])
4836                    } else {
4837                        half::f16::ZERO
4838                    };
4839                    let layer = k / 4;
4840                    let channel = k % 4;
4841                    buf[layer * layer_stride + y * width * 4 + x * 4 + channel] = val.to_bits();
4842                }
4843            }
4844        }
4845
4846        // Reinterpret u16 buffer as bytes
4847        let byte_buf = unsafe {
4848            std::slice::from_raw_parts(buf.as_ptr() as *const u8, buf.len() * 2).to_vec()
4849        };
4850        (byte_buf, num_layers)
4851    }
4852
4853    /// Render YOLO proto segmentation masks using the fused GPU pipeline.
4854    ///
4855    /// Dispatches to the appropriate shader based on `ProtoTensor` variant:
4856    /// - `Quantized`: uploads raw int8 as `GL_R8I`, dequantizes in shader
4857    /// - `Float`: uploads as `GL_R32F` with hardware bilinear (if available),
4858    ///   or falls back to f16 repack path
4859    fn render_proto_segmentation(
4860        &mut self,
4861        detect: &[DetectBox],
4862        proto_data: &ProtoData,
4863    ) -> crate::Result<()> {
4864        if detect.is_empty() || proto_data.mask_coefficients.is_empty() {
4865            return Ok(());
4866        }
4867
4868        let (height, width, num_protos) = proto_data.protos.dim();
4869        let texture_target = gls::gl::TEXTURE_2D_ARRAY;
4870
4871        match &proto_data.protos {
4872            ProtoTensor::Quantized {
4873                protos,
4874                quantization,
4875            } => {
4876                self.render_proto_segmentation_int8(
4877                    detect,
4878                    &proto_data.mask_coefficients,
4879                    protos,
4880                    quantization,
4881                    height,
4882                    width,
4883                    num_protos,
4884                    texture_target,
4885                )?;
4886            }
4887            ProtoTensor::Float(protos_f32) => {
4888                if self.has_float_linear {
4889                    self.render_proto_segmentation_f32(
4890                        detect,
4891                        &proto_data.mask_coefficients,
4892                        protos_f32,
4893                        height,
4894                        width,
4895                        num_protos,
4896                        texture_target,
4897                    )?;
4898                } else {
4899                    // Fallback: repack to RGBA16F and use existing f16 shader
4900                    self.render_proto_segmentation_f16(
4901                        detect,
4902                        &proto_data.mask_coefficients,
4903                        protos_f32,
4904                        height,
4905                        width,
4906                        num_protos,
4907                        texture_target,
4908                    )?;
4909                }
4910            }
4911        }
4912
4913        unsafe { gls::gl::Finish() };
4914        Ok(())
4915    }
4916
    /// Render detection quads using the active program. Shared by all proto
    /// shader paths.
    ///
    /// For each `(detection, coefficients)` pair: packs the first 32 mask
    /// coefficients into eight vec4 uniforms (`mask_coeff`), sets the
    /// shader's `class_index`, and draws one triangle-fan quad covering the
    /// detection box. The caller must have bound the program and the proto
    /// texture beforehand.
    fn render_proto_detection_quads(
        &self,
        program: &GlProgram,
        detect: &[DetectBox],
        mask_coefficients: &[Vec<f32>],
    ) -> crate::Result<()> {
        // Map normalized [0, 1] box coordinates into NDC [-1, 1].
        let cvt_screen_coord = |normalized: f32| normalized * 2.0 - 1.0;

        for (det, coeff) in detect.iter().zip(mask_coefficients.iter()) {
            // Pack up to 32 coefficients into 8 vec4s; coefficients beyond 32
            // are ignored, missing entries stay 0.0.
            let mut packed_coeff = [[0.0f32; 4]; 8];
            for (i, val) in coeff.iter().enumerate().take(32) {
                packed_coeff[i / 4][i % 4] = *val;
            }

            program.load_uniform_4fv(c"mask_coeff", &packed_coeff)?;
            program.load_uniform_1i(c"class_index", det.label as i32)?;

            // Destination quad in NDC; ymax maps to the NDC top since GL's
            // Y axis points upward.
            let dst_roi = RegionOfInterest {
                left: cvt_screen_coord(det.bbox.xmin),
                top: cvt_screen_coord(det.bbox.ymax),
                right: cvt_screen_coord(det.bbox.xmax),
                bottom: cvt_screen_coord(det.bbox.ymin),
            };

            // Proto texture coords: tex row 0 = image top (data uploaded in
            // row-major order where y=0 is top of image, and GL treats the
            // first row of pixel data as the bottom of the texture — but
            // texelFetch(y=0) returns that bottom row, which is our image top).
            // So tc.y=0 → image top, tc.y=1 → image bottom.
            // At NDC top (higher Y = image bottom = ymax), we want tc.y = ymax.
            // At NDC bottom (lower Y = image top = ymin), we want tc.y = ymin.
            let src_roi = RegionOfInterest {
                left: det.bbox.xmin,
                top: det.bbox.ymax,
                right: det.bbox.xmax,
                bottom: det.bbox.ymin,
            };

            unsafe {
                // Update the shared vertex buffer in place with the quad
                // corners (x, y, z) and draw an indexed triangle fan.
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);

                let camera_vertices: [f32; 12] = [
                    dst_roi.left,
                    dst_roi.top,
                    0.,
                    dst_roi.right,
                    dst_roi.top,
                    0.,
                    dst_roi.right,
                    dst_roi.bottom,
                    0.,
                    dst_roi.left,
                    dst_roi.bottom,
                    0.,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * camera_vertices.len()) as isize,
                    camera_vertices.as_ptr() as *const c_void,
                );

                // Matching texture coordinates (u, v per corner).
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
                gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);

                let texture_vertices: [f32; 8] = [
                    src_roi.left,
                    src_roi.top,
                    src_roi.right,
                    src_roi.top,
                    src_roi.right,
                    src_roi.bottom,
                    src_roi.left,
                    src_roi.bottom,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 8) as isize,
                    texture_vertices.as_ptr() as *const c_void,
                );

                let vertices_index: [u32; 4] = [0, 1, 2, 3];
                gls::gl::DrawElements(
                    gls::gl::TRIANGLE_FAN,
                    vertices_index.len() as i32,
                    gls::gl::UNSIGNED_INT,
                    vertices_index.as_ptr() as *const c_void,
                );
            }
        }
        Ok(())
    }
5013
5014    /// Int8 proto path: upload raw i8 protos as `GL_R8I`, dispatch by
5015    /// interpolation mode.
5016    #[allow(clippy::too_many_arguments)]
5017    fn render_proto_segmentation_int8(
5018        &mut self,
5019        detect: &[DetectBox],
5020        mask_coefficients: &[Vec<f32>],
5021        protos: &ndarray::Array3<i8>,
5022        quantization: &edgefirst_decoder::Quantization,
5023        height: usize,
5024        width: usize,
5025        num_protos: usize,
5026        texture_target: u32,
5027    ) -> crate::Result<()> {
5028        // Upload raw int8 protos as R8I texture array (1 proto per layer)
5029        gls::bind_texture(texture_target, self.proto_texture.id);
5030        gls::active_texture(gls::gl::TEXTURE0);
5031        gls::tex_parameteri(
5032            texture_target,
5033            gls::gl::TEXTURE_MIN_FILTER,
5034            gls::gl::NEAREST as i32,
5035        );
5036        gls::tex_parameteri(
5037            texture_target,
5038            gls::gl::TEXTURE_MAG_FILTER,
5039            gls::gl::NEAREST as i32,
5040        );
5041        gls::tex_parameteri(
5042            texture_target,
5043            gls::gl::TEXTURE_WRAP_S,
5044            gls::gl::CLAMP_TO_EDGE as i32,
5045        );
5046        gls::tex_parameteri(
5047            texture_target,
5048            gls::gl::TEXTURE_WRAP_T,
5049            gls::gl::CLAMP_TO_EDGE as i32,
5050        );
5051
5052        // Protos are (H, W, num_protos) in row-major. We need to repack to
5053        // layer-first layout: layer k = all (H, W) texels for proto k.
5054        let mut tex_data = vec![0i8; height * width * num_protos];
5055        for k in 0..num_protos {
5056            for y in 0..height {
5057                for x in 0..width {
5058                    tex_data[k * height * width + y * width + x] = protos[[y, x, k]];
5059                }
5060            }
5061        }
5062
5063        gls::tex_image3d(
5064            texture_target,
5065            0,
5066            gls::gl::R8I as i32,
5067            width as i32,
5068            height as i32,
5069            num_protos as i32,
5070            0,
5071            gls::gl::RED_INTEGER,
5072            gls::gl::BYTE,
5073            Some(&tex_data),
5074        );
5075
5076        let proto_scale = quantization.scale;
5077        let proto_scaled_zp = -(quantization.zero_point as f32) * quantization.scale;
5078
5079        match self.int8_interpolation_mode {
5080            Int8InterpolationMode::Nearest => {
5081                let program = &self.proto_segmentation_int8_nearest_program;
5082                gls::use_program(program.id);
5083                program.load_uniform_1i(c"num_protos", num_protos as i32)?;
5084                program.load_uniform_1f(c"proto_scale", proto_scale)?;
5085                program.load_uniform_1f(c"proto_scaled_zp", proto_scaled_zp)?;
5086                self.render_proto_detection_quads(program, detect, mask_coefficients)?;
5087            }
5088            Int8InterpolationMode::Bilinear => {
5089                let program = &self.proto_segmentation_int8_bilinear_program;
5090                gls::use_program(program.id);
5091                program.load_uniform_1i(c"num_protos", num_protos as i32)?;
5092                program.load_uniform_1f(c"proto_scale", proto_scale)?;
5093                program.load_uniform_1f(c"proto_scaled_zp", proto_scaled_zp)?;
5094                self.render_proto_detection_quads(program, detect, mask_coefficients)?;
5095            }
5096            Int8InterpolationMode::TwoPass => {
5097                self.render_proto_int8_two_pass(
5098                    detect,
5099                    mask_coefficients,
5100                    quantization,
5101                    height,
5102                    width,
5103                    num_protos,
5104                    texture_target,
5105                )?;
5106            }
5107        }
5108
5109        Ok(())
5110    }
5111
    /// Two-pass int8 path: dequant int8→RGBA16F FBO, then render with
    /// existing f16 shader using GL_LINEAR.
    ///
    /// Pass 1 draws one full-screen quad per RGBA16F layer with the dequant
    /// shader, converting the `GL_R8I` proto texture (already uploaded by the
    /// caller) into `proto_dequant_texture` with four protos packed per
    /// layer. Pass 2 restores the caller's framebuffer and viewport, then
    /// renders the detection quads with the f16 proto shader, which can use
    /// hardware GL_LINEAR filtering.
    #[allow(clippy::too_many_arguments)]
    fn render_proto_int8_two_pass(
        &self,
        detect: &[DetectBox],
        mask_coefficients: &[Vec<f32>],
        quantization: &edgefirst_decoder::Quantization,
        height: usize,
        width: usize,
        num_protos: usize,
        texture_target: u32,
    ) -> crate::Result<()> {
        // Four protos per RGBA layer; round up for a partial final layer.
        let num_layers = num_protos.div_ceil(4);

        // Save the caller's FBO and viewport so we can restore after dequant.
        let (saved_fbo, saved_viewport) = unsafe {
            let mut fbo: i32 = 0;
            gls::gl::GetIntegerv(gls::gl::FRAMEBUFFER_BINDING, &mut fbo);
            let mut vp = [0i32; 4];
            gls::gl::GetIntegerv(gls::gl::VIEWPORT, vp.as_mut_ptr());
            (fbo as u32, vp)
        };

        // Pass 1: Dequantize int8 → RGBA16F texture via framebuffer
        let dequant_fbo = FrameBuffer::new();
        gls::bind_texture(texture_target, self.proto_dequant_texture.id);
        // Allocate the RGBA16F target storage; no initial data (None).
        gls::tex_image3d::<u8>(
            texture_target,
            0,
            gls::gl::RGBA16F as i32,
            width as i32,
            height as i32,
            num_layers as i32,
            0,
            gls::gl::RGBA,
            gls::gl::HALF_FLOAT,
            None,
        );
        // Linear filtering is the point of this path; clamp at the edges.
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        // Fold affine dequantization scale * (q - zero_point) into two
        // uniforms: q * proto_scale + proto_scaled_zp.
        let proto_scale = quantization.scale;
        let proto_scaled_zp = -(quantization.zero_point as f32) * quantization.scale;

        let dequant_program = &self.proto_dequant_int8_program;
        gls::use_program(dequant_program.id);
        dequant_program.load_uniform_1f(c"proto_scale", proto_scale)?;
        dequant_program.load_uniform_1f(c"proto_scaled_zp", proto_scaled_zp)?;

        // Bind the int8 proto texture to TEXTURE0 for the dequant shader
        gls::active_texture(gls::gl::TEXTURE0);
        gls::bind_texture(texture_target, self.proto_texture.id);

        // Render each RGBA16F layer (4 protos per layer)
        for layer in 0..num_layers {
            dequant_fbo.bind();
            unsafe {
                // Attach the current output layer and size the viewport to
                // the proto resolution.
                gls::gl::FramebufferTextureLayer(
                    gls::gl::FRAMEBUFFER,
                    gls::gl::COLOR_ATTACHMENT0,
                    self.proto_dequant_texture.id,
                    0,
                    layer as i32,
                );
                gls::gl::Viewport(0, 0, width as i32, height as i32);
            }
            // The shader reads protos [base_layer, base_layer + 3].
            dequant_program.load_uniform_1i(c"base_layer", (layer * 4) as i32)?;

            // Full-screen quad
            unsafe {
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
                let verts: [f32; 12] = [
                    -1.0, -1.0, 0.0, 1.0, -1.0, 0.0, 1.0, 1.0, 0.0, -1.0, 1.0, 0.0,
                ];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 12) as isize,
                    verts.as_ptr() as *const c_void,
                );
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.texture_buffer.id);
                gls::gl::EnableVertexAttribArray(self.texture_buffer.buffer_index);
                let tc: [f32; 8] = [0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0];
                gls::gl::BufferSubData(
                    gls::gl::ARRAY_BUFFER,
                    0,
                    (size_of::<f32>() * 8) as isize,
                    tc.as_ptr() as *const c_void,
                );
                let idx: [u32; 4] = [0, 1, 2, 3];
                gls::gl::DrawElements(
                    gls::gl::TRIANGLE_FAN,
                    4,
                    gls::gl::UNSIGNED_INT,
                    idx.as_ptr() as *const c_void,
                );
            }
        }

        // Drop the dequant FBO (its Drop unbinds to 0) and restore the caller's.
        drop(dequant_fbo);
        unsafe {
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, saved_fbo);
            gls::gl::Viewport(
                saved_viewport[0],
                saved_viewport[1],
                saved_viewport[2],
                saved_viewport[3],
            );
        }

        // Pass 2: render with existing f16 shader reading from dequant texture
        let program = &self.proto_segmentation_program;
        gls::use_program(program.id);
        gls::active_texture(gls::gl::TEXTURE0);
        gls::bind_texture(texture_target, self.proto_dequant_texture.id);
        program.load_uniform_1i(c"num_layers", num_layers as i32)?;
        self.render_proto_detection_quads(program, detect, mask_coefficients)?;

        Ok(())
    }
5253
    /// F32 proto path: upload as `GL_R32F` with `GL_LINEAR` filtering.
    ///
    /// Only valid when the driver supports linear filtering of float
    /// textures — the caller checks `has_float_linear` before taking this
    /// path. Each proto occupies one layer of the texture array.
    #[allow(clippy::too_many_arguments)]
    fn render_proto_segmentation_f32(
        &self,
        detect: &[DetectBox],
        mask_coefficients: &[Vec<f32>],
        protos_f32: &ndarray::Array3<f32>,
        height: usize,
        width: usize,
        num_protos: usize,
        texture_target: u32,
    ) -> crate::Result<()> {
        let program = &self.proto_segmentation_f32_program;
        gls::use_program(program.id);
        // NOTE(review): texture is bound before TEXTURE0 is selected; this
        // only targets unit 0 if it is already the active unit — confirm.
        gls::bind_texture(texture_target, self.proto_texture.id);
        gls::active_texture(gls::gl::TEXTURE0);
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        // Repack protos to layer-first layout: (num_protos, H, W)
        let mut tex_data = vec![0.0f32; height * width * num_protos];
        for k in 0..num_protos {
            for y in 0..height {
                for x in 0..width {
                    tex_data[k * height * width + y * width + x] = protos_f32[[y, x, k]];
                }
            }
        }

        gls::tex_image3d(
            texture_target,
            0,
            gls::gl::R32F as i32,
            width as i32,
            height as i32,
            num_protos as i32,
            0,
            gls::gl::RED,
            gls::gl::FLOAT,
            Some(&tex_data),
        );

        program.load_uniform_1i(c"num_protos", num_protos as i32)?;
        self.render_proto_detection_quads(program, detect, mask_coefficients)?;

        Ok(())
    }
5319
    /// F16 fallback path: repack f32 protos to RGBA16F and use existing
    /// f16 shader with GL_LINEAR. Used when GL_OES_texture_float_linear
    /// is not available.
    ///
    /// The repack packs four protos into each RGBA layer, so the texture
    /// array has `num_protos.div_ceil(4)` layers (the shader receives that
    /// count via the `num_layers` uniform).
    #[allow(clippy::too_many_arguments)]
    fn render_proto_segmentation_f16(
        &self,
        detect: &[DetectBox],
        mask_coefficients: &[Vec<f32>],
        protos_f32: &ndarray::Array3<f32>,
        height: usize,
        width: usize,
        num_protos: usize,
        texture_target: u32,
    ) -> crate::Result<()> {
        let num_layers = num_protos.div_ceil(4);
        let (tex_data, _) = Self::repack_protos_to_rgba_f16(protos_f32);

        let program = &self.proto_segmentation_program;
        gls::use_program(program.id);
        // NOTE(review): texture is bound before TEXTURE0 is selected; this
        // only targets unit 0 if it is already the active unit — confirm.
        gls::bind_texture(texture_target, self.proto_texture.id);
        gls::active_texture(gls::gl::TEXTURE0);
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MIN_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_MAG_FILTER,
            gls::gl::LINEAR as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_S,
            gls::gl::CLAMP_TO_EDGE as i32,
        );
        gls::tex_parameteri(
            texture_target,
            gls::gl::TEXTURE_WRAP_T,
            gls::gl::CLAMP_TO_EDGE as i32,
        );

        gls::tex_image3d(
            texture_target,
            0,
            gls::gl::RGBA16F as i32,
            width as i32,
            height as i32,
            num_layers as i32,
            0,
            gls::gl::RGBA,
            gls::gl::HALF_FLOAT,
            Some(&tex_data),
        );

        program.load_uniform_1i(c"num_layers", num_layers as i32)?;
        self.render_proto_detection_quads(program, detect, mask_coefficients)?;

        Ok(())
    }
5380
5381    fn render_segmentation(
5382        &mut self,
5383        detect: &[DetectBox],
5384        segmentation: &[Segmentation],
5385    ) -> crate::Result<()> {
5386        if segmentation.is_empty() {
5387            return Ok(());
5388        }
5389
5390        let is_modelpack = segmentation[0].segmentation.shape()[2] > 1;
5391        // top and bottom are flipped because OpenGL uses 0,0 as bottom left
5392        let cvt_screen_coord = |normalized| normalized * 2.0 - 1.0;
5393        if is_modelpack {
5394            let seg = &segmentation[0];
5395            let dst_roi = RegionOfInterest {
5396                left: cvt_screen_coord(seg.xmin),
5397                top: cvt_screen_coord(seg.ymax),
5398                right: cvt_screen_coord(seg.xmax),
5399                bottom: cvt_screen_coord(seg.ymin),
5400            };
5401            let segment = seg.segmentation.as_standard_layout();
5402            let slice = segment.as_slice().ok_or(Error::Internal(
5403                "Cannot get slice of segmentation".to_owned(),
5404            ))?;
5405
5406            self.render_modelpack_segmentation(
5407                dst_roi,
5408                slice,
5409                [
5410                    seg.segmentation.shape()[0],
5411                    seg.segmentation.shape()[1],
5412                    seg.segmentation.shape()[2],
5413                ],
5414            )?;
5415        } else {
5416            for (seg, det) in segmentation.iter().zip(detect) {
5417                let dst_roi = RegionOfInterest {
5418                    left: cvt_screen_coord(seg.xmin),
5419                    top: cvt_screen_coord(seg.ymax),
5420                    right: cvt_screen_coord(seg.xmax),
5421                    bottom: cvt_screen_coord(seg.ymin),
5422                };
5423
5424                let segment = seg.segmentation.as_standard_layout();
5425                let slice = segment.as_slice().ok_or(Error::Internal(
5426                    "Cannot get slice of segmentation".to_owned(),
5427                ))?;
5428
5429                self.render_yolo_segmentation(
5430                    dst_roi,
5431                    slice,
5432                    [seg.segmentation.shape()[0], seg.segmentation.shape()[1]],
5433                    det.label,
5434                )?;
5435            }
5436        }
5437
5438        gls::disable(gls::gl::BLEND);
5439        Ok(())
5440    }
5441
    /// Draw detection bounding boxes as rectangular outlines.
    ///
    /// Each box is rendered as the region between the bbox and an outer
    /// rectangle expanded by `thickness` pixels, stitched into a single
    /// triangle strip. `dst` is only used to convert the pixel thickness to
    /// normalized coordinates; the box color is selected by the shader from
    /// the `class_index` uniform.
    fn render_box(&mut self, dst: &TensorImage, detect: &[DetectBox]) -> Result<(), Error> {
        unsafe {
            gls::gl::UseProgram(self.color_program.id);
            // Normalized [0, 1] → NDC [-1, 1].
            let rescale = |x: f32| x * 2.0 - 1.0;
            // Outline thickness in destination pixels.
            let thickness = 3.0;
            for d in detect {
                self.color_program
                    .load_uniform_1i(c"class_index", d.label as i32)?;
                gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, self.vertex_buffer.id);
                gls::gl::EnableVertexAttribArray(self.vertex_buffer.buffer_index);
                // bbox = [xmin, ymin, xmax, ymax] in normalized coordinates.
                let bbox: [f32; 4] = d.bbox.into();
                // Expand by `thickness` pixels on every side (converted to
                // normalized units via the destination dimensions).
                let outer_box = [
                    bbox[0] - thickness / dst.width() as f32,
                    bbox[1] - thickness / dst.height() as f32,
                    bbox[2] + thickness / dst.width() as f32,
                    bbox[3] + thickness / dst.height() as f32,
                ];
                // Vertices 0-3: inner rectangle; 4-7: outer rectangle.
                let camera_vertices: [f32; 24] = [
                    rescale(bbox[0]),
                    rescale(bbox[3]),
                    0., // bottom left
                    rescale(bbox[2]),
                    rescale(bbox[3]),
                    0., // bottom right
                    rescale(bbox[2]),
                    rescale(bbox[1]),
                    0., // top right
                    rescale(bbox[0]),
                    rescale(bbox[1]),
                    0., // top left
                    rescale(outer_box[0]),
                    rescale(outer_box[3]),
                    0., // bottom left
                    rescale(outer_box[2]),
                    rescale(outer_box[3]),
                    0., // bottom right
                    rescale(outer_box[2]),
                    rescale(outer_box[1]),
                    0., // top right
                    rescale(outer_box[0]),
                    rescale(outer_box[1]),
                    0., // top left
                ];
                // NOTE(review): BufferData reallocates the buffer store for
                // every box; the segmentation paths use BufferSubData to
                // reuse it — confirm the reallocation here is intentional.
                gls::gl::BufferData(
                    gls::gl::ARRAY_BUFFER,
                    (size_of::<f32>() * camera_vertices.len()) as isize,
                    camera_vertices.as_ptr() as *const c_void,
                    gls::gl::DYNAMIC_DRAW,
                );

                // Strip alternating inner/outer indices; the 8 resulting
                // triangles tile the four trapezoids of the frame.
                let vertices_index: [u32; 10] = [0, 1, 5, 2, 6, 3, 7, 0, 4, 5];
                gls::gl::DrawElements(
                    gls::gl::TRIANGLE_STRIP,
                    vertices_index.len() as i32,
                    gls::gl::UNSIGNED_INT,
                    vertices_index.as_ptr() as *const c_void,
                );
            }
        }
        check_gl_error(function!(), line!())?;
        Ok(())
    }
5504}
/// Owns an `EGLImage` handle together with the EGL instance and display
/// needed to destroy it when dropped.
struct EglImage {
    /// Imported image handle; may be `EGL_NO_IMAGE` (skipped on drop).
    egl_image: egl::Image,
    /// Keeps the dynamically loaded EGL library alive while the image exists.
    egl: Rc<Egl>,
    /// Display the image was created on; destruction must use the same one.
    display: egl::Display,
}
5510
5511impl Drop for EglImage {
5512    fn drop(&mut self) {
5513        if self.egl_image.as_ptr() == egl::NO_IMAGE {
5514            return;
5515        }
5516
5517        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
5518            let e =
5519                GlContext::egl_destroy_image_with_fallback(&self.egl, self.display, self.egl_image);
5520            if let Err(e) = e {
5521                error!("Could not destroy EGL image: {e:?}");
5522            }
5523        }));
5524    }
5525}
5526
/// A GL texture object plus the parameters of its last upload, used by
/// `update_texture` to choose between reallocating storage and updating
/// in place.
struct Texture {
    /// GL texture object name from `glGenTextures`.
    id: u32,
    /// Texture target of the last upload (0 until first use).
    target: gls::gl::types::GLenum,
    /// Width in texels of the last upload (0 until first use).
    width: usize,
    /// Height in texels of the last upload (0 until first use).
    height: usize,
    /// Pixel format of the last upload (0 until first use).
    format: gls::gl::types::GLenum,
}
5534
5535impl Default for Texture {
5536    fn default() -> Self {
5537        Self::new()
5538    }
5539}
5540
impl Texture {
    /// Generates a new GL texture object. `target`/`width`/`height`/`format`
    /// start at 0 so the first `update_texture` call always takes the
    /// allocation path.
    fn new() -> Self {
        let mut id = 0;
        unsafe { gls::gl::GenTextures(1, &raw mut id) };
        Self {
            id,
            target: 0,
            width: 0,
            height: 0,
            format: 0,
        }
    }

    /// Uploads `data` to the texture, reallocating storage (`glTexImage2D`)
    /// only when target, dimensions, or format changed since the previous
    /// upload, and updating in place (`glTexSubImage2D`) otherwise.
    ///
    /// The caller is responsible for binding `self.id` to `target` first;
    /// this method operates on whatever texture is currently bound there.
    ///
    /// NOTE(review): `data` is assumed to hold at least
    /// `width * height * bytes_per_pixel(format)` bytes with default row
    /// unpack alignment — not validated here; confirm at call sites.
    fn update_texture(
        &mut self,
        target: gls::gl::types::GLenum,
        width: usize,
        height: usize,
        format: gls::gl::types::GLenum,
        data: &[u8],
    ) {
        if target != self.target
            || width != self.width
            || height != self.height
            || format != self.format
        {
            unsafe {
                gls::gl::TexImage2D(
                    target,
                    0,
                    format as i32,
                    width as i32,
                    height as i32,
                    0,
                    format,
                    gls::gl::UNSIGNED_BYTE,
                    data.as_ptr() as *const c_void,
                );
            }
            // Remember the allocation parameters for the in-place path above.
            self.target = target;
            self.format = format;
            self.width = width;
            self.height = height;
        } else {
            unsafe {
                gls::gl::TexSubImage2D(
                    target,
                    0,
                    0,
                    0,
                    width as i32,
                    height as i32,
                    format,
                    gls::gl::UNSIGNED_BYTE,
                    data.as_ptr() as *const c_void,
                );
            }
        }
    }
}
5601
5602impl Drop for Texture {
5603    fn drop(&mut self) {
5604        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| unsafe {
5605            gls::gl::DeleteTextures(1, &raw mut self.id)
5606        }));
5607    }
5608}
5609
/// A GL vertex buffer object wired to a fixed vertex-attribute slot.
struct Buffer {
    /// GL buffer object name from `glGenBuffers`.
    id: u32,
    /// Vertex attribute index this buffer feeds (see `glVertexAttribPointer`).
    buffer_index: u32,
}
5614
impl Buffer {
    /// Creates a vertex buffer bound to attribute slot `buffer_index`.
    ///
    /// The attribute is configured as `size_per_point` tightly packed floats
    /// per vertex, and storage for `max_points` vertices is preallocated with
    /// `GL_DYNAMIC_DRAW` for repeated uploads.
    ///
    /// The call order matters: `glVertexAttribPointer` captures the buffer
    /// bound to `GL_ARRAY_BUFFER` at the time of the call.
    fn new(buffer_index: u32, size_per_point: usize, max_points: usize) -> Buffer {
        let mut id = 0;
        unsafe {
            gls::gl::EnableVertexAttribArray(buffer_index);
            gls::gl::GenBuffers(1, &raw mut id);
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, id);
            gls::gl::VertexAttribPointer(
                buffer_index,
                size_per_point as i32,
                gls::gl::FLOAT,
                gls::gl::FALSE,
                0,      // stride 0 = tightly packed
                null(), // offset 0 into the bound buffer
            );
            gls::gl::BufferData(
                gls::gl::ARRAY_BUFFER,
                (size_of::<f32>() * size_per_point * max_points) as isize,
                null(), // allocate only; data is uploaded later
                gls::gl::DYNAMIC_DRAW,
            );
        }

        Buffer { id, buffer_index }
    }
}
5641
5642impl Drop for Buffer {
5643    fn drop(&mut self) {
5644        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| unsafe {
5645            gls::gl::DeleteBuffers(1, &raw mut self.id)
5646        }));
5647    }
5648}
5649
/// A GL framebuffer object (FBO) used as an offscreen render target.
struct FrameBuffer {
    /// GL framebuffer object name from `glGenFramebuffers`.
    id: u32,
}
5653
5654impl FrameBuffer {
5655    fn new() -> FrameBuffer {
5656        let mut id = 0;
5657        unsafe {
5658            gls::gl::GenFramebuffers(1, &raw mut id);
5659        }
5660
5661        FrameBuffer { id }
5662    }
5663
5664    fn bind(&self) {
5665        unsafe { gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, self.id) };
5666    }
5667
5668    fn unbind(&self) {
5669        unsafe { gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, 0) };
5670    }
5671}
5672
5673impl Drop for FrameBuffer {
5674    fn drop(&mut self) {
5675        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
5676            self.unbind();
5677            unsafe {
5678                gls::gl::DeleteFramebuffers(1, &raw mut self.id);
5679            }
5680        }));
5681    }
5682}
5683
/// A linked GL shader program together with its two shader objects, so all
/// three can be released on drop.
pub struct GlProgram {
    /// GL program object name.
    id: u32,
    /// Vertex shader object attached to `id`.
    vertex_id: u32,
    /// Fragment shader object attached to `id`.
    fragment_id: u32,
}
5689
5690impl GlProgram {
5691    fn new(vertex_shader: &str, fragment_shader: &str) -> Result<Self, crate::Error> {
5692        let id = unsafe { gls::gl::CreateProgram() };
5693        let vertex_id = unsafe { gls::gl::CreateShader(gls::gl::VERTEX_SHADER) };
5694        if compile_shader_from_str(vertex_id, vertex_shader, "shader_vert").is_err() {
5695            log::debug!("Vertex shader source:\n{}", vertex_shader);
5696            return Err(crate::Error::OpenGl(format!(
5697                "Shader compile error: {vertex_shader}"
5698            )));
5699        }
5700        unsafe {
5701            gls::gl::AttachShader(id, vertex_id);
5702        }
5703
5704        let fragment_id = unsafe { gls::gl::CreateShader(gls::gl::FRAGMENT_SHADER) };
5705        if compile_shader_from_str(fragment_id, fragment_shader, "shader_frag").is_err() {
5706            log::debug!("Fragment shader source:\n{}", fragment_shader);
5707            return Err(crate::Error::OpenGl(format!(
5708                "Shader compile error: {fragment_shader}"
5709            )));
5710        }
5711
5712        unsafe {
5713            gls::gl::AttachShader(id, fragment_id);
5714            gls::gl::LinkProgram(id);
5715            gls::gl::UseProgram(id);
5716        }
5717
5718        Ok(Self {
5719            id,
5720            vertex_id,
5721            fragment_id,
5722        })
5723    }
5724
5725    #[allow(dead_code)]
5726    fn load_uniform_1f(&self, name: &CStr, value: f32) -> Result<(), crate::Error> {
5727        unsafe {
5728            gls::gl::UseProgram(self.id);
5729            let location = gls::gl::GetUniformLocation(self.id, name.as_ptr());
5730            gls::gl::Uniform1f(location, value);
5731        }
5732        Ok(())
5733    }
5734
5735    #[allow(dead_code)]
5736    fn load_uniform_1i(&self, name: &CStr, value: i32) -> Result<(), crate::Error> {
5737        unsafe {
5738            gls::gl::UseProgram(self.id);
5739            let location = gls::gl::GetUniformLocation(self.id, name.as_ptr());
5740            gls::gl::Uniform1i(location, value);
5741        }
5742        Ok(())
5743    }
5744
5745    fn load_uniform_4fv(&self, name: &CStr, value: &[[f32; 4]]) -> Result<(), crate::Error> {
5746        unsafe {
5747            gls::gl::UseProgram(self.id);
5748            let location = gls::gl::GetUniformLocation(self.id, name.as_ptr());
5749            if location == -1 {
5750                return Err(crate::Error::OpenGl(format!(
5751                    "Could not find uniform location for '{}'",
5752                    name.to_string_lossy().into_owned()
5753                )));
5754            }
5755            gls::gl::Uniform4fv(location, value.len() as i32, value.as_flattened().as_ptr());
5756        }
5757        check_gl_error(function!(), line!())?;
5758        Ok(())
5759    }
5760}
5761
5762impl Drop for GlProgram {
5763    fn drop(&mut self) {
5764        let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| unsafe {
5765            gls::gl::DeleteProgram(self.id);
5766            gls::gl::DeleteShader(self.fragment_id);
5767            gls::gl::DeleteShader(self.vertex_id);
5768        }));
5769    }
5770}
5771
5772fn compile_shader_from_str(shader: u32, shader_source: &str, shader_name: &str) -> Result<(), ()> {
5773    let src = match CString::from_str(shader_source) {
5774        Ok(v) => v,
5775        Err(_) => return Err(()),
5776    };
5777    let src_ptr = src.as_ptr();
5778    unsafe {
5779        gls::gl::ShaderSource(shader, 1, &raw const src_ptr, null());
5780        gls::gl::CompileShader(shader);
5781        let mut is_compiled = 0;
5782        gls::gl::GetShaderiv(shader, gls::gl::COMPILE_STATUS, &raw mut is_compiled);
5783        if is_compiled == 0 {
5784            let mut max_length = 0;
5785            gls::gl::GetShaderiv(shader, gls::gl::INFO_LOG_LENGTH, &raw mut max_length);
5786            let mut error_log: Vec<u8> = vec![0; max_length as usize];
5787            gls::gl::GetShaderInfoLog(
5788                shader,
5789                max_length,
5790                &raw mut max_length,
5791                error_log.as_mut_ptr() as *mut c_char,
5792            );
5793            error!(
5794                "Shader '{}' failed: {:?}\n",
5795                shader_name,
5796                CString::from_vec_with_nul(error_log)
5797                    .unwrap()
5798                    .into_string()
5799                    .unwrap()
5800            );
5801            gls::gl::DeleteShader(shader);
5802            return Err(());
5803        }
5804        Ok(())
5805    }
5806}
5807
5808fn check_gl_error(name: &str, line: u32) -> Result<(), Error> {
5809    unsafe {
5810        let err = gls::gl::GetError();
5811        if err != gls::gl::NO_ERROR {
5812            error!("GL Error: {name}:{line}: {err:#X}");
5813            // panic!("GL Error: {err}");
5814            return Err(Error::OpenGl(format!("{err:#X}")));
5815        }
5816    }
5817    Ok(())
5818}
5819
5820fn fourcc_to_drm(fourcc: FourCharCode) -> Result<DrmFourcc, Error> {
5821    match fourcc {
5822        RGBA => Ok(DrmFourcc::Abgr8888),
5823        YUYV => Ok(DrmFourcc::Yuyv),
5824        VYUY => Ok(DrmFourcc::Vyuy),
5825        RGB | RGB_INT8 => Ok(DrmFourcc::Bgr888),
5826        GREY => Ok(DrmFourcc::R8),
5827        NV12 => Ok(DrmFourcc::Nv12),
5828        PLANAR_RGB | PLANAR_RGB_INT8 => Ok(DrmFourcc::R8),
5829        _ => Err(Error::NotSupported(format!(
5830            "FourCC {fourcc:?} has no DRM format mapping"
5831        ))),
5832    }
5833}
5834
/// EGL extension constants missing from the `khronos_egl` bindings.
///
/// The 0x3270–0x3285 tokens come from `EGL_EXT_image_dma_buf_import`
/// (DMA-buf plane attributes and YUV sampling hints); the platform tokens
/// come from `EGL_KHR_platform_gbm` and `EGL_EXT_platform_device`.
mod egl_ext {
    #![allow(dead_code)]
    // eglCreateImage target for importing a Linux DMA-buf.
    pub(crate) const LINUX_DMA_BUF: u32 = 0x3270;
    // DRM FourCC describing the buffer's pixel format.
    pub(crate) const LINUX_DRM_FOURCC: u32 = 0x3271;
    // Per-plane fd/offset/pitch attributes (up to 3 planes).
    pub(crate) const DMA_BUF_PLANE0_FD: u32 = 0x3272;
    pub(crate) const DMA_BUF_PLANE0_OFFSET: u32 = 0x3273;
    pub(crate) const DMA_BUF_PLANE0_PITCH: u32 = 0x3274;
    pub(crate) const DMA_BUF_PLANE1_FD: u32 = 0x3275;
    pub(crate) const DMA_BUF_PLANE1_OFFSET: u32 = 0x3276;
    pub(crate) const DMA_BUF_PLANE1_PITCH: u32 = 0x3277;
    pub(crate) const DMA_BUF_PLANE2_FD: u32 = 0x3278;
    pub(crate) const DMA_BUF_PLANE2_OFFSET: u32 = 0x3279;
    pub(crate) const DMA_BUF_PLANE2_PITCH: u32 = 0x327A;
    // Optional hints controlling YUV→RGB sampling of the imported image.
    pub(crate) const YUV_COLOR_SPACE_HINT: u32 = 0x327B;
    pub(crate) const SAMPLE_RANGE_HINT: u32 = 0x327C;
    pub(crate) const YUV_CHROMA_HORIZONTAL_SITING_HINT: u32 = 0x327D;
    pub(crate) const YUV_CHROMA_VERTICAL_SITING_HINT: u32 = 0x327E;

    // Values for YUV_COLOR_SPACE_HINT.
    pub(crate) const ITU_REC601: u32 = 0x327F;
    pub(crate) const ITU_REC709: u32 = 0x3280;
    pub(crate) const ITU_REC2020: u32 = 0x3281;

    // Values for SAMPLE_RANGE_HINT.
    pub(crate) const YUV_FULL_RANGE: u32 = 0x3282;
    pub(crate) const YUV_NARROW_RANGE: u32 = 0x3283;

    // Values for the chroma siting hints.
    pub(crate) const YUV_CHROMA_SITING_0: u32 = 0x3284;
    pub(crate) const YUV_CHROMA_SITING_0_5: u32 = 0x3285;

    // eglGetPlatformDisplay platform id for GBM devices (EGL_KHR_platform_gbm).
    pub(crate) const PLATFORM_GBM_KHR: u32 = 0x31D7;

    // eglGetPlatformDisplay platform id for EGLDevice (EGL_EXT_platform_device).
    pub(crate) const PLATFORM_DEVICE_EXT: u32 = 0x313F;

    /// EGL_KHR_no_config_context: null config for eglCreateContext.
    /// Defined as ((EGLConfig)0) in the EGL spec.
    ///
    /// # Safety
    /// The EGL spec defines EGL_NO_CONFIG_KHR as a null pointer. This is
    /// a safe transmute since `Config` is a newtype wrapper around `*mut c_void`.
    pub(crate) const NO_CONFIG_KHR: khronos_egl::Config =
        unsafe { std::mem::transmute(std::ptr::null_mut::<std::ffi::c_void>()) };
}
5876
/// Shared vertex shader for all render passes: forwards the vertex position
/// (already in NDC) and its texture coordinate unchanged to the fragment
/// stage.
fn generate_vertex_shader() -> &'static str {
    "\
#version 300 es
precision mediump float;
layout(location = 0) in vec3 pos;
layout(location = 1) in vec2 texCoord;

out vec3 fragPos;
out vec2 tc;

void main() {
    fragPos = pos;
    tc = texCoord;

    gl_Position = vec4(pos, 1.0);
}
"
}
5895
/// Plain texture-sampling fragment shader for regular 2D (`sampler2D`)
/// sources: outputs the sampled texel unchanged.
fn generate_texture_fragment_shader() -> &'static str {
    "\
#version 300 es

precision mediump float;
uniform sampler2D tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

void main(){
    color = texture(tex, tc);
}
"
}
5912
/// Texture-sampling fragment shader for external OES images
/// (`samplerExternalOES`), used with EGLImage-backed sources; per
/// `OES_EGL_image_external`, YUV→RGB conversion happens during sampling.
fn generate_texture_fragment_shader_yuv() -> &'static str {
    "\
#version 300 es
#extension GL_OES_EGL_image_external_essl3 : require
precision mediump float;
uniform samplerExternalOES tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

void main(){
    color = texture(tex, tc);
}
"
}
5929
/// Fragment shader for planar RGB sources imported as external OES images.
///
/// NOTE(review): the body is currently byte-identical to
/// [`generate_texture_fragment_shader_yuv`]; kept as a separate entry point,
/// presumably so the planar path can diverge independently.
fn generate_planar_rgb_shader() -> &'static str {
    "\
#version 300 es
#extension GL_OES_EGL_image_external_essl3 : require
precision mediump float;
uniform samplerExternalOES tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

void main(){
    color = texture(tex, tc);
}
"
}
5946
/// Int8 variant of [`generate_planar_rgb_shader`]. Applies XOR 0x80 bias
/// to each RGB channel (uint8 → int8 conversion) using the bit-exact
/// quantize+mod approach: `floor(v * 255 + 0.5) + 128 mod 256 / 255`
/// (the shader's `int8_bias` helper). Declares `highp` so the 0–255
/// quantization stays exact; alpha passes through unchanged.
fn generate_planar_rgb_int8_shader() -> &'static str {
    "\
#version 300 es
#extension GL_OES_EGL_image_external_essl3 : require
precision highp float;
uniform samplerExternalOES tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

vec3 int8_bias(vec3 v) {
    vec3 q = floor(v * 255.0 + 0.5);
    return mod(q + 128.0, 256.0) / 255.0;
}

void main(){
    vec4 c = texture(tex, tc);
    color = vec4(int8_bias(c.rgb), c.a);
}
"
}
5972
/// Int8 variant of [`generate_texture_fragment_shader`]. Applies XOR 0x80
/// bias to each RGB channel (uint8 → int8 conversion) using the bit-exact
/// quantize+mod approach: `floor(v * 255 + 0.5) + 128 mod 256 / 255`
/// (the previous doc claimed `fract(v + 0.5)`, which is not what the shader
/// does). Matches the CPU `byte ^ 0x80` operation exactly.
/// Used by the direct RGB render path for RGB_INT8 output.
fn generate_texture_int8_shader() -> &'static str {
    "\
#version 300 es
precision highp float;
uniform sampler2D tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

// XOR 0x80 bias: quantize to uint8, add 128 mod 256, normalize back.
// This matches the CPU `byte ^ 0x80` operation exactly.
vec3 int8_bias(vec3 v) {
    vec3 q = floor(v * 255.0 + 0.5);
    return mod(q + 128.0, 256.0) / 255.0;
}

void main(){
    vec4 c = texture(tex, tc);
    color = vec4(int8_bias(c.rgb), c.a);
}
"
}
5999
/// Int8 variant of [`generate_texture_fragment_shader_yuv`]. Applies XOR 0x80 bias
/// to each RGB channel (uint8 → int8 conversion) via the same quantize+mod
/// `int8_bias` helper as [`generate_texture_int8_shader`].
/// Used by the direct RGB render path for RGB_INT8 output with external OES sources.
fn generate_texture_int8_shader_yuv() -> &'static str {
    "\
#version 300 es
#extension GL_OES_EGL_image_external_essl3 : require
precision highp float;
uniform samplerExternalOES tex;
in vec3 fragPos;
in vec2 tc;

out vec4 color;

vec3 int8_bias(vec3 v) {
    vec3 q = floor(v * 255.0 + 0.5);
    return mod(q + 128.0, 256.0) / 255.0;
}

void main(){
    vec4 c = texture(tex, tc);
    color = vec4(int8_bias(c.rgb), c.a);
}
"
}
6025
/// Argmax-over-classes segmentation shader.
///
/// This shader requires a reshape of the segmentation output tensor to
/// (H, W, C/4, 4) so each `sampler2DArray` layer carries 4 class scores in
/// its RGBA channels. Each fragment takes the argmax over all layers,
/// discards background pixels, and outputs `colors[class % 20]`.
///
/// NOTE(review): `max_all` starts at -4.0, so class scores at or below -4.0
/// can never win the argmax — confirm against the model's output range.
fn generate_segmentation_shader() -> &'static str {
    "\
#version 300 es
precision mediump float;
precision mediump sampler2DArray;

uniform sampler2DArray tex;
uniform vec4 colors[20];
uniform int background_index;

in vec3 fragPos;
in vec2 tc;
in vec4 fragColor;

out vec4 color;

float max_arg(const in vec4 args, out int argmax) {
    if (args[0] >= args[1] && args[0] >= args[2] && args[0] >= args[3]) {
        argmax = 0;
        return args[0];
    }
    if (args[1] >= args[0] && args[1] >= args[2] && args[1] >= args[3]) {
        argmax = 1;
        return args[1];
    }
    if (args[2] >= args[0] && args[2] >= args[1] && args[2] >= args[3]) {
        argmax = 2;
        return args[2];
    }
    argmax = 3;
    return args[3];
}

void main() {
    mediump int layers = textureSize(tex, 0).z;
    float max_all = -4.0;
    int max_ind = 0;
    for (int i = 0; i < layers; i++) {
        vec4 d = texture(tex, vec3(tc, i));
        int max_ind_ = 0;
        float max_ = max_arg(d, max_ind_);
        if (max_ <= max_all) { continue; }
        max_all = max_;
        max_ind = i*4 + max_ind_;
    }
    if (max_ind == background_index) {
        discard;
    }
    max_ind = max_ind % 20;
    color = colors[max_ind];
}
"
}
6081
/// Per-instance binary mask shader: thresholds `mask0`'s red channel at 0.5,
/// discards sub-threshold fragments, and colors the rest with
/// `colors[class_index % 20]`.
fn generate_instanced_segmentation_shader() -> &'static str {
    "\
#version 300 es
precision mediump float;
uniform sampler2D mask0;
uniform vec4 colors[20];
uniform int class_index;
in vec3 fragPos;
in vec2 tc;
in vec4 fragColor;

out vec4 color;
void main() {
    float r0 = texture(mask0, tc).r;
    int arg = int(r0>=0.5);
    if (arg == 0) {
        discard;
    }
    color = colors[class_index % 20];
}
"
}
6104
/// Prototype-mask composition shader (float path).
///
/// Accumulates `dot(mask_coeff[i], proto_layer_i)` over `num_layers` RGBA
/// layers (4 protos per layer, bilinearly sampled), applies a sigmoid,
/// discards fragments below 0.5, and colors the rest with
/// `colors[class_index % 20]`.
fn generate_proto_segmentation_shader() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp sampler2DArray;

uniform sampler2DArray proto_tex;  // ceil(num_protos/4) layers, RGBA = 4 channels per layer
uniform vec4 mask_coeff[8];        // 32 coefficients packed as 8 vec4s
uniform vec4 colors[20];
uniform int class_index;
uniform int num_layers;

in vec2 tc;
out vec4 color;

void main() {
    float acc = 0.0;
    for (int i = 0; i < num_layers; i++) {
        // texture() returns bilinearly interpolated proto values (GL_LINEAR)
        acc += dot(mask_coeff[i], texture(proto_tex, vec3(tc, float(i))));
    }
    float mask = 1.0 / (1.0 + exp(-acc));  // sigmoid
    if (mask < 0.5) discard;
    color = colors[class_index % 20];
}
"
}
6132
/// Int8 proto shader — nearest-neighbor only.
///
/// Uses `texelFetch()` at the nearest texel. No interpolation. Simplest and
/// fastest GPU execution but may show staircase artifacts at mask edges.
///
/// Layout: `GL_R8I` texture with 1 proto per layer (32 layers).
/// Mask coefficients packed as `vec4[8]`, indexed `mask_coeff[k/4][k%4]`.
/// Per-texel dequantization: `val = raw * proto_scale + proto_scaled_zp`,
/// where `proto_scaled_zp = -zero_point * scale`.
fn generate_proto_segmentation_shader_int8_nearest() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;  // 32 layers, R channel = 1 proto per layer
uniform vec4 mask_coeff[8];         // 32 coefficients packed as 8 vec4s
uniform vec4 colors[20];
uniform int class_index;
uniform int num_protos;
uniform float proto_scale;
uniform float proto_scaled_zp;      // -zero_point * scale

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    int ix = clamp(int(tc.x * float(tex_size.x)), 0, tex_size.x - 1);
    int iy = clamp(int(tc.y * float(tex_size.y)), 0, tex_size.y - 1);

    float acc = 0.0;
    for (int k = 0; k < num_protos; k++) {
        float raw = float(texelFetch(proto_tex, ivec3(ix, iy, k), 0).r);
        float val = raw * proto_scale + proto_scaled_zp;
        acc += mask_coeff[k / 4][k % 4] * val;
    }
    float mask = 1.0 / (1.0 + exp(-acc));
    if (mask < 0.5) discard;
    color = colors[class_index % 20];
}
"
}
6175
/// Int8 proto shader — shader-based bilinear interpolation (recommended).
///
/// Uses `texelFetch()` to fetch 4 neighboring texels per fragment, dequantizes
/// each, and computes bilinear weights from `fract(tc * textureSize)`.
/// Sample positions follow the GL_LINEAR convention (texel centers at +0.5),
/// with edge texels clamped to the texture bounds.
///
/// Layout: `GL_R8I` texture with 1 proto per layer (32 layers).
fn generate_proto_segmentation_shader_int8_bilinear() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;  // 32 layers, R channel = 1 proto per layer
uniform vec4 mask_coeff[8];         // 32 coefficients packed as 8 vec4s
uniform vec4 colors[20];
uniform int class_index;
uniform int num_protos;
uniform float proto_scale;
uniform float proto_scaled_zp;      // -zero_point * scale

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    // Compute continuous position (matching GL_LINEAR convention: center at +0.5)
    vec2 pos = tc * vec2(tex_size.xy) - 0.5;
    vec2 f = fract(pos);
    ivec2 p0 = ivec2(floor(pos));
    ivec2 p1 = p0 + 1;
    // Clamp to texture bounds
    p0 = clamp(p0, ivec2(0), tex_size.xy - 1);
    p1 = clamp(p1, ivec2(0), tex_size.xy - 1);

    float w00 = (1.0 - f.x) * (1.0 - f.y);
    float w10 = f.x * (1.0 - f.y);
    float w01 = (1.0 - f.x) * f.y;
    float w11 = f.x * f.y;

    float acc = 0.0;
    for (int k = 0; k < num_protos; k++) {
        float r00 = float(texelFetch(proto_tex, ivec3(p0.x, p0.y, k), 0).r);
        float r10 = float(texelFetch(proto_tex, ivec3(p1.x, p0.y, k), 0).r);
        float r01 = float(texelFetch(proto_tex, ivec3(p0.x, p1.y, k), 0).r);
        float r11 = float(texelFetch(proto_tex, ivec3(p1.x, p1.y, k), 0).r);
        float interp = r00 * w00 + r10 * w10 + r01 * w01 + r11 * w11;
        float val = interp * proto_scale + proto_scaled_zp;
        acc += mask_coeff[k / 4][k % 4] * val;
    }
    float mask = 1.0 / (1.0 + exp(-acc));
    if (mask < 0.5) discard;
    color = colors[class_index % 20];
}
"
}
6232
/// Int8 dequantization pass shader (two-pass Option C, pass 1).
///
/// Reads `GL_R8I` texel, dequantizes, and writes float to `GL_RGBA16F` render
/// target. This shader processes 4 protos at a time (packing into RGBA):
/// each draw writes protos `base_layer .. base_layer+3` into one output
/// layer's RGBA channels, using nearest-neighbor fetches.
/// After this pass, the existing f16 shader reads the dequantized texture with
/// `GL_LINEAR`.
fn generate_proto_dequant_shader_int8() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;  // 32 layers of R8I (1 proto per layer)
uniform float proto_scale;
uniform float proto_scaled_zp;      // -zero_point * scale
uniform int base_layer;             // first proto index for this output layer (0, 4, 8, ...)

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    int ix = clamp(int(tc.x * float(tex_size.x)), 0, tex_size.x - 1);
    int iy = clamp(int(tc.y * float(tex_size.y)), 0, tex_size.y - 1);

    vec4 result;
    for (int c = 0; c < 4; c++) {
        int layer = base_layer + c;
        float raw = float(texelFetch(proto_tex, ivec3(ix, iy, layer), 0).r);
        result[c] = raw * proto_scale + proto_scaled_zp;
    }
    color = result;
}
"
}
6269
/// F32 proto shader — direct R32F texture with hardware bilinear filtering.
///
/// Same structure as int8 bilinear shader but uses `texture()` for hardware
/// interpolation (requires `GL_OES_texture_float_linear`). No dequantization.
/// Sigmoid-thresholds the accumulated logit at 0.5 and colors surviving
/// fragments with `colors[class_index % 20]`.
///
/// Layout: `GL_R32F` texture with 1 proto per layer (32 layers).
fn generate_proto_segmentation_shader_f32() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp sampler2DArray;

uniform sampler2DArray proto_tex;  // 32 layers, R channel = 1 proto per layer
uniform vec4 mask_coeff[8];        // 32 coefficients packed as 8 vec4s
uniform vec4 colors[20];
uniform int class_index;
uniform int num_protos;

in vec2 tc;
out vec4 color;

void main() {
    float acc = 0.0;
    for (int k = 0; k < num_protos; k++) {
        // texture() returns bilinearly interpolated proto value (GL_LINEAR on R32F)
        float val = texture(proto_tex, vec3(tc, float(k))).r;
        acc += mask_coeff[k / 4][k % 4] * val;
    }
    float mask = 1.0 / (1.0 + exp(-acc));
    if (mask < 0.5) discard;
    color = colors[class_index % 20];
}
"
}
6304
/// Binary mask shader — int8, nearest-neighbor, logit threshold.
///
/// Outputs binary `acc > 0 ? 1.0 : 0.0` instead of `sigmoid(acc)`.  Avoids
/// the `exp()` per fragment; used by `decode_masks_atlas` where only mask
/// presence matters. The zero-point term is hoisted out of the loop:
/// `logit = acc * proto_scale + coeff_sum_x_szp`.
///
/// NOTE(review): when `num_protos` is not a multiple of 4, the `min(...)`
/// clamps re-fetch the last proto; this presumes the corresponding
/// `mask_coeff` entries are zero-padded — confirm at the call site.
fn generate_proto_mask_logit_shader_int8_nearest() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;
uniform vec4 mask_coeff[8];
uniform int num_protos;
uniform float proto_scale;
uniform float coeff_sum_x_szp;

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    int ix = clamp(int(tc.x * float(tex_size.x)), 0, tex_size.x - 1);
    int iy = clamp(int(tc.y * float(tex_size.y)), 0, tex_size.y - 1);

    int groups = (num_protos + 3) / 4;
    float acc = 0.0;
    for (int i = 0; i < groups; i++) {
        int base = i * 4;
        vec4 raw = vec4(
            float(texelFetch(proto_tex, ivec3(ix, iy, min(base, num_protos - 1)), 0).r),
            float(texelFetch(proto_tex, ivec3(ix, iy, min(base + 1, num_protos - 1)), 0).r),
            float(texelFetch(proto_tex, ivec3(ix, iy, min(base + 2, num_protos - 1)), 0).r),
            float(texelFetch(proto_tex, ivec3(ix, iy, min(base + 3, num_protos - 1)), 0).r)
        );
        acc += dot(mask_coeff[i], raw);
    }
    float logit = acc * proto_scale + coeff_sum_x_szp;
    float mask = logit > 0.0 ? 1.0 : 0.0;
    color = vec4(mask, 0.0, 0.0, 1.0);
}
"
}
6349
/// Binary mask shader — int8, shader-based bilinear interpolation, logit threshold.
///
/// Outputs binary `acc > 0 ? 1.0 : 0.0` instead of `sigmoid(acc)`.  Used by
/// `decode_masks_atlas` for int8 models with bilinear interpolation.
/// Like the nearest variant, the zero-point term is hoisted out of the loop
/// (`logit = acc * proto_scale + coeff_sum_x_szp`), and the `min(...)`
/// clamps presume zero-padded `mask_coeff` entries when `num_protos` is not
/// a multiple of 4.
fn generate_proto_mask_logit_shader_int8_bilinear() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
precision highp isampler2DArray;

uniform isampler2DArray proto_tex;
uniform vec4 mask_coeff[8];
uniform int num_protos;
uniform float proto_scale;
uniform float coeff_sum_x_szp;

in vec2 tc;
out vec4 color;

void main() {
    ivec3 tex_size = textureSize(proto_tex, 0);
    vec2 pos = tc * vec2(tex_size.xy) - 0.5;
    vec2 f = fract(pos);
    ivec2 p0 = ivec2(floor(pos));
    ivec2 p1 = p0 + 1;
    p0 = clamp(p0, ivec2(0), tex_size.xy - 1);
    p1 = clamp(p1, ivec2(0), tex_size.xy - 1);

    float w00 = (1.0 - f.x) * (1.0 - f.y);
    float w10 = f.x * (1.0 - f.y);
    float w01 = (1.0 - f.x) * f.y;
    float w11 = f.x * f.y;

    int groups = (num_protos + 3) / 4;
    float acc = 0.0;
    for (int i = 0; i < groups; i++) {
        int base = i * 4;
        int l0 = min(base, num_protos - 1);
        int l1 = min(base + 1, num_protos - 1);
        int l2 = min(base + 2, num_protos - 1);
        int l3 = min(base + 3, num_protos - 1);
        vec4 r00 = vec4(
            float(texelFetch(proto_tex, ivec3(p0.x, p0.y, l0), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p0.y, l1), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p0.y, l2), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p0.y, l3), 0).r)
        );
        vec4 r10 = vec4(
            float(texelFetch(proto_tex, ivec3(p1.x, p0.y, l0), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p0.y, l1), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p0.y, l2), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p0.y, l3), 0).r)
        );
        vec4 r01 = vec4(
            float(texelFetch(proto_tex, ivec3(p0.x, p1.y, l0), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p1.y, l1), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p1.y, l2), 0).r),
            float(texelFetch(proto_tex, ivec3(p0.x, p1.y, l3), 0).r)
        );
        vec4 r11 = vec4(
            float(texelFetch(proto_tex, ivec3(p1.x, p1.y, l0), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p1.y, l1), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p1.y, l2), 0).r),
            float(texelFetch(proto_tex, ivec3(p1.x, p1.y, l3), 0).r)
        );
        vec4 interp = r00 * w00 + r10 * w10 + r01 * w01 + r11 * w11;
        acc += dot(mask_coeff[i], interp);
    }
    float logit = acc * proto_scale + coeff_sum_x_szp;
    float mask = logit > 0.0 ? 1.0 : 0.0;
    color = vec4(mask, 0.0, 0.0, 1.0);
}
"
}
6425
/// Binary mask shader — f32 protos with hardware bilinear filtering, logit threshold.
///
/// Outputs binary `acc > 0 ? 1.0 : 0.0` instead of `sigmoid(acc)`.  Used by
/// `decode_masks_atlas` for f32 models. Samples 4 layers per iteration; the
/// `min(...)` clamps presume zero-padded `mask_coeff` entries when
/// `num_protos` is not a multiple of 4.
fn generate_proto_mask_logit_shader_f32() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp sampler2DArray;

uniform sampler2DArray proto_tex;
uniform vec4 mask_coeff[8];
uniform int num_protos;

in vec2 tc;
out vec4 color;

void main() {
    int groups = (num_protos + 3) / 4;
    float acc = 0.0;
    for (int i = 0; i < groups; i++) {
        int base = i * 4;
        vec4 val = vec4(
            texture(proto_tex, vec3(tc, float(min(base, num_protos - 1)))).r,
            texture(proto_tex, vec3(tc, float(min(base + 1, num_protos - 1)))).r,
            texture(proto_tex, vec3(tc, float(min(base + 2, num_protos - 1)))).r,
            texture(proto_tex, vec3(tc, float(min(base + 3, num_protos - 1)))).r
        );
        acc += dot(mask_coeff[i], val);
    }
    float mask = acc > 0.0 ? 1.0 : 0.0;
    color = vec4(mask, 0.0, 0.0, 1.0);
}
"
}
6461
/// Solid-color fill fragment shader used when painting a class overlay.
///
/// Emits `colors[class_index % 20]` for every fragment: `colors[20]` is the
/// palette uniform and `class_index` selects the entry, wrapped modulo 20 so
/// any label value maps onto one of the 20 palette slots (see
/// `DEFAULT_COLORS` imported at the top of this file).
fn generate_color_shader() -> &'static str {
    "\
#version 300 es
precision mediump float;
uniform vec4 colors[20];
uniform int class_index;

out vec4 color;
void main() {
    int index = class_index % 20;
    color = colors[index];
}
"
}
6476
/// Packed RGB -> RGBA8 packing shader (2D texture source, pass 2).
///
/// Reads from an intermediate RGBA texture and packs 3 RGB channels into
/// RGBA8 output pixels. Each output pixel stores 4 consecutive bytes of the
/// destination RGB buffer. Uses only 2 texture fetches per fragment (down
/// from 4) by exploiting the fact that 4 consecutive bytes span at most 2
/// source pixels.
///
/// NOTE(review): since each output pixel carries 4 bytes of a 3-byte-per-
/// pixel stream, the destination width presumably must satisfy
/// `W * 3 % 4 == 0` (consistent with `test_packed_rgb_width_constraint`
/// below) — confirm at the call site that sizes the output framebuffer.
fn generate_packed_rgba8_shader_2d() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
uniform sampler2D tex;
out vec4 color;
void main() {
    // gl_FragCoord is at pixel center (n+0.5). Use floor() for robust
    // integer pixel index on all GPUs (Vivante, Mali, Adreno).
    int out_x = int(floor(gl_FragCoord.x));
    int out_y = int(floor(gl_FragCoord.y));
    int base = out_x * 4;
    // 4 consecutive byte indices map to at most 2 source pixels
    int px0 = base / 3;
    int px1 = (base + 3) / 3;
    vec4 s0 = texelFetch(tex, ivec2(px0, out_y), 0);
    vec4 s1 = (px1 != px0) ? texelFetch(tex, ivec2(px1, out_y), 0) : s0;
    // Extract channels based on phase (base % 3)
    int phase = base - px0 * 3;
    if (phase == 0) {
        color = vec4(s0.r, s0.g, s0.b, s1.r);
    } else if (phase == 1) {
        color = vec4(s0.g, s0.b, s1.r, s1.g);
    } else {
        color = vec4(s0.b, s1.r, s1.g, s1.b);
    }
}
"
}
6514
/// Packed RGB -> RGBA8 packing shader with int8 XOR 0x80 bias (2D source, pass 2).
///
/// Same packing logic as [`generate_packed_rgba8_shader_2d`] but applies
/// bit-exact XOR 0x80 bias via quantize+mod: `floor(v * 255 + 0.5) + 128
/// mod 256 / 255`. This matches the CPU `byte ^ 0x80` operation exactly.
///
/// The bias is applied *after* channel extraction so each packed byte is
/// quantized exactly once, keeping the result bitwise identical to the CPU
/// int8 conversion path (see `uint8_to_int8` in the tests below).
fn generate_packed_rgba8_int8_shader_2d() -> &'static str {
    "\
#version 300 es
precision highp float;
precision highp int;
uniform sampler2D tex;
out vec4 color;

vec4 int8_bias(vec4 v) {
    vec4 q = floor(v * 255.0 + 0.5);
    return mod(q + 128.0, 256.0) / 255.0;
}

void main() {
    // gl_FragCoord is at pixel center (n+0.5). Use floor() for robust
    // integer pixel index on all GPUs (Vivante, Mali, Adreno).
    int out_x = int(floor(gl_FragCoord.x));
    int out_y = int(floor(gl_FragCoord.y));
    int base = out_x * 4;
    // 4 consecutive byte indices map to at most 2 source pixels
    int px0 = base / 3;
    int px1 = (base + 3) / 3;
    vec4 s0 = texelFetch(tex, ivec2(px0, out_y), 0);
    vec4 s1 = (px1 != px0) ? texelFetch(tex, ivec2(px1, out_y), 0) : s0;
    // Extract channels based on phase (base % 3), then apply int8 bias
    int phase = base - px0 * 3;
    if (phase == 0) {
        color = int8_bias(vec4(s0.r, s0.g, s0.b, s1.r));
    } else if (phase == 1) {
        color = int8_bias(vec4(s0.g, s0.b, s1.r, s1.g));
    } else {
        color = int8_bias(vec4(s0.b, s1.r, s1.g, s1.b));
    }
}
"
}
6556
6557#[cfg(test)]
6558#[cfg(feature = "opengl")]
6559mod gl_tests {
6560    use super::*;
6561    use crate::{TensorImage, RGBA};
6562    #[cfg(feature = "dma_test_formats")]
6563    use crate::{NV12, YUYV};
6564    use edgefirst_tensor::TensorTrait;
6565    #[cfg(feature = "dma_test_formats")]
6566    use edgefirst_tensor::{is_dma_available, TensorMapTrait, TensorMemory};
6567    use image::buffer::ConvertBuffer;
6568    use ndarray::Array3;
6569
6570    #[test]
6571    fn test_segmentation() {
6572        use edgefirst_decoder::Segmentation;
6573
6574        if !is_opengl_available() {
6575            eprintln!("SKIPPED: {} - OpenGL not available", function!());
6576            return;
6577        }
6578
6579        let mut image = TensorImage::load(
6580            include_bytes!("../../../testdata/giraffe.jpg"),
6581            Some(RGBA),
6582            None,
6583        )
6584        .unwrap();
6585
6586        let mut segmentation = Array3::from_shape_vec(
6587            (2, 160, 160),
6588            include_bytes!("../../../testdata/modelpack_seg_2x160x160.bin").to_vec(),
6589        )
6590        .unwrap();
6591        segmentation.swap_axes(0, 1);
6592        segmentation.swap_axes(1, 2);
6593        let segmentation = segmentation.as_standard_layout().to_owned();
6594
6595        let seg = Segmentation {
6596            segmentation,
6597            xmin: 0.0,
6598            ymin: 0.0,
6599            xmax: 1.0,
6600            ymax: 1.0,
6601        };
6602
6603        let mut renderer = GLProcessorThreaded::new(None).unwrap();
6604        renderer.draw_masks(&mut image, &[], &[seg]).unwrap();
6605    }
6606
6607    #[test]
6608    fn test_segmentation_mem() {
6609        use edgefirst_decoder::Segmentation;
6610
6611        if !is_opengl_available() {
6612            eprintln!("SKIPPED: {} - OpenGL not available", function!());
6613            return;
6614        }
6615
6616        let mut image = TensorImage::load(
6617            include_bytes!("../../../testdata/giraffe.jpg"),
6618            Some(RGBA),
6619            Some(edgefirst_tensor::TensorMemory::Mem),
6620        )
6621        .unwrap();
6622
6623        let mut segmentation = Array3::from_shape_vec(
6624            (2, 160, 160),
6625            include_bytes!("../../../testdata/modelpack_seg_2x160x160.bin").to_vec(),
6626        )
6627        .unwrap();
6628        segmentation.swap_axes(0, 1);
6629        segmentation.swap_axes(1, 2);
6630        let segmentation = segmentation.as_standard_layout().to_owned();
6631
6632        let seg = Segmentation {
6633            segmentation,
6634            xmin: 0.0,
6635            ymin: 0.0,
6636            xmax: 1.0,
6637            ymax: 1.0,
6638        };
6639
6640        let mut renderer = GLProcessorThreaded::new(None).unwrap();
6641        renderer.draw_masks(&mut image, &[], &[seg]).unwrap();
6642    }
6643
6644    #[test]
6645    fn test_segmentation_yolo() {
6646        use edgefirst_decoder::Segmentation;
6647        use ndarray::Array3;
6648
6649        if !is_opengl_available() {
6650            eprintln!("SKIPPED: {} - OpenGL not available", function!());
6651            return;
6652        }
6653
6654        let mut image = TensorImage::load(
6655            include_bytes!("../../../testdata/giraffe.jpg"),
6656            Some(RGBA),
6657            None,
6658        )
6659        .unwrap();
6660
6661        let segmentation = Array3::from_shape_vec(
6662            (76, 55, 1),
6663            include_bytes!("../../../testdata/yolov8_seg_crop_76x55.bin").to_vec(),
6664        )
6665        .unwrap();
6666
6667        let detect = DetectBox {
6668            bbox: [0.59375, 0.25, 0.9375, 0.725].into(),
6669            score: 0.99,
6670            label: 1,
6671        };
6672
6673        let seg = Segmentation {
6674            segmentation,
6675            xmin: 0.59375,
6676            ymin: 0.25,
6677            xmax: 0.9375,
6678            ymax: 0.725,
6679        };
6680
6681        let mut renderer = GLProcessorThreaded::new(None).unwrap();
6682        renderer
6683            .set_class_colors(&[[255, 255, 0, 233], [128, 128, 255, 100]])
6684            .unwrap();
6685        renderer.draw_masks(&mut image, &[detect], &[seg]).unwrap();
6686
6687        let expected = TensorImage::load(
6688            include_bytes!("../../../testdata/output_render_gl.jpg"),
6689            Some(RGBA),
6690            None,
6691        )
6692        .unwrap();
6693
6694        compare_images(&image, &expected, 0.99, function!());
6695    }
6696
6697    #[test]
6698    fn test_boxes() {
6699        use edgefirst_decoder::DetectBox;
6700
6701        if !is_opengl_available() {
6702            eprintln!("SKIPPED: {} - OpenGL not available", function!());
6703            return;
6704        }
6705
6706        let mut image = TensorImage::load(
6707            include_bytes!("../../../testdata/giraffe.jpg"),
6708            Some(RGBA),
6709            None,
6710        )
6711        .unwrap();
6712
6713        let detect = DetectBox {
6714            bbox: [0.59375, 0.25, 0.9375, 0.725].into(),
6715            score: 0.99,
6716            label: 0,
6717        };
6718        let mut renderer = GLProcessorThreaded::new(None).unwrap();
6719        renderer
6720            .set_class_colors(&[[255, 255, 0, 233], [128, 128, 255, 100]])
6721            .unwrap();
6722        renderer.draw_masks(&mut image, &[detect], &[]).unwrap();
6723    }
6724
    // Cache the probe result: creating a GL context is expensive, and the
    // answer cannot change within a single test process.
    static GL_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
    // Helper function to check if OpenGL is available
    fn is_opengl_available() -> bool {
        #[cfg(all(target_os = "linux", feature = "opengl"))]
        {
            // Probe once by attempting to construct a processor; later calls
            // reuse the cached boolean.
            *GL_AVAILABLE.get_or_init(|| GLProcessorThreaded::new(None).is_ok())
        }

        // Off-Linux or without the "opengl" feature there is nothing to probe.
        #[cfg(not(all(target_os = "linux", feature = "opengl")))]
        {
            false
        }
    }
6738
6739    fn compare_images(img1: &TensorImage, img2: &TensorImage, threshold: f64, name: &str) {
6740        assert_eq!(img1.height(), img2.height(), "Heights differ");
6741        assert_eq!(img1.width(), img2.width(), "Widths differ");
6742        assert_eq!(img1.fourcc(), img2.fourcc(), "FourCC differ");
6743        assert!(
6744            matches!(img1.fourcc(), RGB | RGBA | GREY | PLANAR_RGB),
6745            "FourCC must be RGB or RGBA for comparison"
6746        );
6747
6748        let image1 = match img1.fourcc() {
6749            RGB => image::RgbImage::from_vec(
6750                img1.width() as u32,
6751                img1.height() as u32,
6752                img1.tensor().map().unwrap().to_vec(),
6753            )
6754            .unwrap(),
6755            RGBA => image::RgbaImage::from_vec(
6756                img1.width() as u32,
6757                img1.height() as u32,
6758                img1.tensor().map().unwrap().to_vec(),
6759            )
6760            .unwrap()
6761            .convert(),
6762            GREY => image::GrayImage::from_vec(
6763                img1.width() as u32,
6764                img1.height() as u32,
6765                img1.tensor().map().unwrap().to_vec(),
6766            )
6767            .unwrap()
6768            .convert(),
6769            PLANAR_RGB => image::GrayImage::from_vec(
6770                img1.width() as u32,
6771                (img1.height() * 3) as u32,
6772                img1.tensor().map().unwrap().to_vec(),
6773            )
6774            .unwrap()
6775            .convert(),
6776            _ => return,
6777        };
6778
6779        let image2 = match img2.fourcc() {
6780            RGB => image::RgbImage::from_vec(
6781                img2.width() as u32,
6782                img2.height() as u32,
6783                img2.tensor().map().unwrap().to_vec(),
6784            )
6785            .unwrap(),
6786            RGBA => image::RgbaImage::from_vec(
6787                img2.width() as u32,
6788                img2.height() as u32,
6789                img2.tensor().map().unwrap().to_vec(),
6790            )
6791            .unwrap()
6792            .convert(),
6793            GREY => image::GrayImage::from_vec(
6794                img2.width() as u32,
6795                img2.height() as u32,
6796                img2.tensor().map().unwrap().to_vec(),
6797            )
6798            .unwrap()
6799            .convert(),
6800            PLANAR_RGB => image::GrayImage::from_vec(
6801                img2.width() as u32,
6802                (img2.height() * 3) as u32,
6803                img2.tensor().map().unwrap().to_vec(),
6804            )
6805            .unwrap()
6806            .convert(),
6807            _ => return,
6808        };
6809
6810        let similarity = image_compare::rgb_similarity_structure(
6811            &image_compare::Algorithm::RootMeanSquared,
6812            &image1,
6813            &image2,
6814        )
6815        .expect("Image Comparison failed");
6816        if similarity.score < threshold {
6817            // image1.save(format!("{name}_1.png"));
6818            // image2.save(format!("{name}_2.png"));
6819            similarity
6820                .image
6821                .to_color_map()
6822                .save(format!("{name}.png"))
6823                .unwrap();
6824            panic!(
6825                "{name}: converted image and target image have similarity score too low: {} < {}",
6826                similarity.score, threshold
6827            )
6828        }
6829    }
6830
6831    // =========================================================================
6832    // NV12 Reference Validation Tests
6833    // These tests compare OpenGL NV12 conversions against ffmpeg-generated
6834    // references
6835    // =========================================================================
6836
6837    #[cfg(feature = "dma_test_formats")]
6838    fn load_raw_image(
6839        width: usize,
6840        height: usize,
6841        fourcc: FourCharCode,
6842        memory: Option<TensorMemory>,
6843        bytes: &[u8],
6844    ) -> Result<TensorImage, crate::Error> {
6845        let img = TensorImage::new(width, height, fourcc, memory)?;
6846        let mut map = img.tensor().map()?;
6847        map.as_mut_slice()[..bytes.len()].copy_from_slice(bytes);
6848        Ok(img)
6849    }
6850
6851    /// Test OpenGL NV12→RGBA conversion against ffmpeg reference
6852    #[test]
6853    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
6854    fn test_opengl_nv12_to_rgba_reference() {
6855        if !is_dma_available() {
6856            return;
6857        }
6858        // Load NV12 source with DMA
6859        let src = load_raw_image(
6860            1280,
6861            720,
6862            NV12,
6863            Some(TensorMemory::Dma),
6864            include_bytes!("../../../testdata/camera720p.nv12"),
6865        )
6866        .unwrap();
6867
6868        // Load RGBA reference (ffmpeg-generated)
6869        let reference = load_raw_image(
6870            1280,
6871            720,
6872            RGBA,
6873            None,
6874            include_bytes!("../../../testdata/camera720p.rgba"),
6875        )
6876        .unwrap();
6877
6878        // Convert using OpenGL
6879        let mut dst = TensorImage::new(1280, 720, RGBA, Some(TensorMemory::Dma)).unwrap();
6880        let mut gl = GLProcessorThreaded::new(None).unwrap();
6881        gl.convert(&src, &mut dst, Rotation::None, Flip::None, Crop::no_crop())
6882            .unwrap();
6883
6884        // Copy to CPU for comparison
6885        let cpu_dst = TensorImage::new(1280, 720, RGBA, None).unwrap();
6886        cpu_dst
6887            .tensor()
6888            .map()
6889            .unwrap()
6890            .as_mut_slice()
6891            .copy_from_slice(dst.tensor().map().unwrap().as_slice());
6892
6893        compare_images(&reference, &cpu_dst, 0.98, "opengl_nv12_to_rgba_reference");
6894    }
6895
6896    /// Test OpenGL YUYV→RGBA conversion against ffmpeg reference
6897    #[test]
6898    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
6899    fn test_opengl_yuyv_to_rgba_reference() {
6900        if !is_dma_available() {
6901            return;
6902        }
6903        // Load YUYV source with DMA
6904        let src = load_raw_image(
6905            1280,
6906            720,
6907            YUYV,
6908            Some(TensorMemory::Dma),
6909            include_bytes!("../../../testdata/camera720p.yuyv"),
6910        )
6911        .unwrap();
6912
6913        // Load RGBA reference (ffmpeg-generated)
6914        let reference = load_raw_image(
6915            1280,
6916            720,
6917            RGBA,
6918            None,
6919            include_bytes!("../../../testdata/camera720p.rgba"),
6920        )
6921        .unwrap();
6922
6923        // Convert using OpenGL
6924        let mut dst = TensorImage::new(1280, 720, RGBA, Some(TensorMemory::Dma)).unwrap();
6925        let mut gl = GLProcessorThreaded::new(None).unwrap();
6926        gl.convert(&src, &mut dst, Rotation::None, Flip::None, Crop::no_crop())
6927            .unwrap();
6928
6929        // Copy to CPU for comparison
6930        let cpu_dst = TensorImage::new(1280, 720, RGBA, None).unwrap();
6931        cpu_dst
6932            .tensor()
6933            .map()
6934            .unwrap()
6935            .as_mut_slice()
6936            .copy_from_slice(dst.tensor().map().unwrap().as_slice());
6937
6938        compare_images(&reference, &cpu_dst, 0.98, "opengl_yuyv_to_rgba_reference");
6939    }
6940
6941    // =========================================================================
6942    // EGL Display Probe & Override Tests
6943    // =========================================================================
6944
6945    /// Validate that probe_egl_displays() discovers available display types
6946    /// and returns them in priority order (GBM first).
6947    ///
6948    /// On headless i.MX hardware, GBM and PlatformDevice are typically
6949    /// available. Default requires a running compositor (Wayland/X11) and
6950    /// may not be present on headless targets.
6951    #[test]
6952    fn test_probe_egl_displays() {
6953        let displays = match probe_egl_displays() {
6954            Ok(d) => d,
6955            Err(e) => {
6956                eprintln!("SKIPPED: {} - EGL not available: {e:?}", function!());
6957                return;
6958            }
6959        };
6960
6961        if displays.is_empty() {
6962            eprintln!("SKIPPED: {} - No EGL displays available", function!());
6963            return;
6964        }
6965
6966        let kinds: Vec<_> = displays.iter().map(|d| d.kind).collect();
6967        eprintln!("Probed EGL displays: {kinds:?}");
6968        for d in &displays {
6969            eprintln!("  {:?}: {}", d.kind, d.description);
6970        }
6971
6972        // Verify priority ordering: PlatformDevice > GBM > Default.
6973        // Not all display types are available on every system, but the
6974        // ones that are present must appear in this order.
6975        let priority = |k: &EglDisplayKind| match k {
6976            EglDisplayKind::PlatformDevice => 0,
6977            EglDisplayKind::Gbm => 1,
6978            EglDisplayKind::Default => 2,
6979        };
6980        for w in kinds.windows(2) {
6981            assert!(
6982                priority(&w[0]) < priority(&w[1]),
6983                "Display ordering violated: {:?} should come after {:?}",
6984                w[1],
6985                w[0],
6986            );
6987        }
6988    }
6989
6990    /// Validate that explicitly selecting each available display kind via
6991    /// GLProcessorThreaded::new(Some(kind)) succeeds and produces a working
6992    /// converter.
6993    #[test]
6994    fn test_override_each_display_kind() {
6995        let displays = match probe_egl_displays() {
6996            Ok(d) => d,
6997            Err(e) => {
6998                eprintln!("SKIPPED: {} - EGL not available: {e:?}", function!());
6999                return;
7000            }
7001        };
7002
7003        if displays.is_empty() {
7004            eprintln!("SKIPPED: {} - No EGL displays available", function!());
7005            return;
7006        }
7007
7008        for display in &displays {
7009            eprintln!(
7010                "Testing override: {:?} ({})",
7011                display.kind, display.description
7012            );
7013            let mut gl = GLProcessorThreaded::new(Some(display.kind)).unwrap_or_else(|e| {
7014                panic!(
7015                    "GLProcessorThreaded::new(Some({:?})) failed: {e:?}",
7016                    display.kind
7017                )
7018            });
7019
7020            // Smoke test: do a simple RGBA → RGBA conversion to verify the
7021            // GL context is fully functional.
7022            let src = TensorImage::load(
7023                include_bytes!("../../../testdata/zidane.jpg"),
7024                Some(RGBA),
7025                None,
7026            )
7027            .unwrap();
7028            let mut dst = TensorImage::new(320, 240, RGBA, None).unwrap();
7029            gl.convert(&src, &mut dst, Rotation::None, Flip::None, Crop::no_crop())
7030                .unwrap_or_else(|e| {
7031                    panic!("convert() with {:?} display failed: {e:?}", display.kind)
7032                });
7033            eprintln!("  {:?} display: convert OK", display.kind);
7034        }
7035    }
7036
7037    /// Validate that requesting a display kind that doesn't exist on the
7038    /// system returns an error rather than falling back silently.
7039    #[test]
7040    fn test_override_unavailable_display_errors() {
7041        let displays = match probe_egl_displays() {
7042            Ok(d) => d,
7043            Err(e) => {
7044                eprintln!("SKIPPED: {} - EGL not available: {e:?}", function!());
7045                return;
7046            }
7047        };
7048        let available_kinds: Vec<_> = displays.iter().map(|d| d.kind).collect();
7049
7050        // Find a kind that is NOT available; if all three are available,
7051        // this test has nothing to verify — skip it.
7052        let unavailable = [
7053            EglDisplayKind::PlatformDevice,
7054            EglDisplayKind::Gbm,
7055            EglDisplayKind::Default,
7056        ]
7057        .into_iter()
7058        .find(|k| !available_kinds.contains(k));
7059
7060        if let Some(kind) = unavailable {
7061            eprintln!("Testing override with unavailable kind: {kind:?}");
7062            let result = GLProcessorThreaded::new(Some(kind));
7063            assert!(
7064                result.is_err(),
7065                "Expected error for unavailable display kind {kind:?}, got Ok"
7066            );
7067            eprintln!("  Correctly returned error: {:?}", result.unwrap_err());
7068        } else {
7069            eprintln!(
7070                "SKIPPED: {} - All three display kinds are available",
7071                function!()
7072            );
7073        }
7074    }
7075
7076    /// Validate that auto-detection (None) still works — this is the existing
7077    /// default behaviour and must not regress.
7078    #[test]
7079    fn test_auto_detect_display() {
7080        if !is_opengl_available() {
7081            eprintln!("SKIPPED: {} - OpenGL not available", function!());
7082            return;
7083        }
7084
7085        let mut gl = GLProcessorThreaded::new(None).expect("auto-detect should succeed");
7086        let src = TensorImage::load(
7087            include_bytes!("../../../testdata/zidane.jpg"),
7088            Some(RGBA),
7089            None,
7090        )
7091        .unwrap();
7092        let mut dst = TensorImage::new(320, 240, RGBA, None).unwrap();
7093        gl.convert(&src, &mut dst, Rotation::None, Flip::None, Crop::no_crop())
7094            .expect("auto-detect convert should succeed");
7095    }
7096
7097    #[test]
7098    fn test_packed_rgb_width_constraint() {
7099        // Standard ML model input widths — all satisfy W*3 % 4 == 0
7100        assert_eq!((640usize * 3) % 4, 0);
7101        assert_eq!((320usize * 3) % 4, 0);
7102        assert_eq!((1280usize * 3) % 4, 0);
7103
7104        // Non-divisible widths should be rejected
7105        assert_ne!((322usize * 3) % 4, 0);
7106        assert_ne!((333usize * 3) % 4, 0);
7107    }
7108
7109    // =========================================================================
7110    // Packed RGB Correctness Tests (two-pass pipeline)
7111    // These tests compare GL RGBA output (alpha stripped) against GL packed
7112    // RGB output. Both use the same GPU color conversion, so differences
7113    // isolate packing shader bugs rather than CPU-vs-GPU YUV conversion.
7114    // They require DMA + OpenGL hardware (on-target only).
7115    // =========================================================================
7116
7117    /// Compare two byte slices pixel-by-pixel with tolerance.
7118    /// Panics with details if any byte differs by more than `tolerance`.
7119    #[cfg(feature = "dma_test_formats")]
7120    fn assert_pixels_match(expected: &[u8], actual: &[u8], tolerance: u8) {
7121        assert_eq!(expected.len(), actual.len(), "Buffer size mismatch");
7122        let mut max_diff: u8 = 0;
7123        let mut diff_count: usize = 0;
7124        let mut first_diff_idx = None;
7125        for (i, (&e, &a)) in expected.iter().zip(actual.iter()).enumerate() {
7126            let diff = (e as i16 - a as i16).unsigned_abs() as u8;
7127            if diff > tolerance {
7128                diff_count += 1;
7129                if first_diff_idx.is_none() {
7130                    first_diff_idx = Some(i);
7131                }
7132            }
7133            max_diff = max_diff.max(diff);
7134        }
7135        assert!(
7136            diff_count == 0,
7137            "Pixel mismatch: {diff_count} bytes differ (max_diff={max_diff}, first at index {})",
7138            first_diff_idx.unwrap_or(0)
7139        );
7140    }
7141
7142    /// Build a letterbox crop that fits src into dst_w x dst_h, preserving aspect ratio.
7143    #[cfg(feature = "dma_test_formats")]
7144    fn letterbox_crop(src_w: usize, src_h: usize, dst_w: usize, dst_h: usize) -> Crop {
7145        let src_aspect = src_w as f64 / src_h as f64;
7146        let dst_aspect = dst_w as f64 / dst_h as f64;
7147        let (new_w, new_h) = if src_aspect > dst_aspect {
7148            let new_h = (dst_w as f64 / src_aspect).round() as usize;
7149            (dst_w, new_h)
7150        } else {
7151            let new_w = (dst_h as f64 * src_aspect).round() as usize;
7152            (new_w, dst_h)
7153        };
7154        let left = (dst_w - new_w) / 2;
7155        let top = (dst_h - new_h) / 2;
7156        Crop::new()
7157            .with_dst_rect(Some(crate::Rect::new(left, top, new_w, new_h)))
7158            .with_dst_color(Some([114, 114, 114, 255]))
7159    }
7160
7161    /// Strip alpha from RGBA bytes → packed RGB bytes.
7162    #[cfg(feature = "dma_test_formats")]
7163    fn rgba_to_rgb(rgba: &[u8]) -> Vec<u8> {
7164        assert_eq!(
7165            rgba.len() % 4,
7166            0,
7167            "RGBA buffer length must be divisible by 4"
7168        );
7169        let mut rgb = Vec::with_capacity(rgba.len() / 4 * 3);
7170        for pixel in rgba.chunks_exact(4) {
7171            rgb.push(pixel[0]);
7172            rgb.push(pixel[1]);
7173            rgb.push(pixel[2]);
7174        }
7175        rgb
7176    }
7177
7178    /// Convert uint8 RGB bytes to int8 (XOR 0x80 each byte).
7179    #[cfg(feature = "dma_test_formats")]
7180    fn uint8_to_int8(data: &[u8]) -> Vec<u8> {
7181        data.iter().map(|&b| b ^ 0x80).collect()
7182    }
7183
7184    /// YUYV 1080p → RGB 640x640 with letterbox (two-pass packed RGB pipeline).
7185    /// Compares GL RGBA (alpha-stripped) against GL packed RGB to validate packing.
7186    #[test]
7187    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7188    fn test_opengl_rgb_correctness() {
7189        if !is_dma_available() {
7190            return;
7191        }
7192        let src_dma = load_raw_image(
7193            1920,
7194            1080,
7195            YUYV,
7196            Some(TensorMemory::Dma),
7197            include_bytes!("../../../testdata/camera1080p.yuyv"),
7198        )
7199        .unwrap();
7200
7201        let crop = letterbox_crop(1920, 1080, 640, 640);
7202        let mut gl = GLProcessorThreaded::new(None).unwrap();
7203
7204        // GL RGBA reference
7205        let mut dst_rgba = TensorImage::new(640, 640, RGBA, Some(TensorMemory::Dma)).unwrap();
7206        gl.convert(&src_dma, &mut dst_rgba, Rotation::None, Flip::None, crop)
7207            .unwrap();
7208
7209        // GL packed RGB output
7210        let mut dst_rgb = TensorImage::new(640, 640, RGB, Some(TensorMemory::Dma)).unwrap();
7211        gl.convert(&src_dma, &mut dst_rgb, Rotation::None, Flip::None, crop)
7212            .unwrap();
7213
7214        let rgba_data = dst_rgba.tensor().map().unwrap();
7215        let expected_rgb = rgba_to_rgb(rgba_data.as_slice());
7216        let gl_data = dst_rgb.tensor().map().unwrap();
7217        assert_pixels_match(&expected_rgb, gl_data.as_slice(), 1);
7218    }
7219
7220    /// YUYV 1080p → RGB_INT8 640x640 with letterbox.
7221    /// Compares GL RGBA (alpha-stripped, XOR 0x80) against GL packed RGB_INT8.
7222    #[test]
7223    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7224    fn test_opengl_rgb_int8_correctness() {
7225        if !is_dma_available() {
7226            return;
7227        }
7228        let src_dma = load_raw_image(
7229            1920,
7230            1080,
7231            YUYV,
7232            Some(TensorMemory::Dma),
7233            include_bytes!("../../../testdata/camera1080p.yuyv"),
7234        )
7235        .unwrap();
7236
7237        let crop = letterbox_crop(1920, 1080, 640, 640);
7238        // Use GLProcessorST with direct RGB disabled to validate two-pass int8
7239        // pipeline against RGBA reference. The direct path renders to a different
7240        // framebuffer format (RGB8 renderbuffer vs RGBA8 texture) which produces
7241        // different YUV interpolation results; it is validated separately by
7242        // test_opengl_rgb_direct_matches_two_pass.
7243        let mut gl = match GLProcessorST::new(None) {
7244            Ok(gl) => gl,
7245            Err(e) => {
7246                eprintln!("SKIPPED: {} - GL not available: {e}", function!());
7247                return;
7248            }
7249        };
7250        gl.support_rgb_direct = false;
7251
7252        // GL RGBA reference
7253        let mut dst_rgba = TensorImage::new(640, 640, RGBA, Some(TensorMemory::Dma)).unwrap();
7254        gl.convert(&src_dma, &mut dst_rgba, Rotation::None, Flip::None, crop)
7255            .unwrap();
7256
7257        // GL packed RGB_INT8 output (two-pass path)
7258        let mut dst_rgb = TensorImage::new(640, 640, RGB_INT8, Some(TensorMemory::Dma)).unwrap();
7259        gl.convert(&src_dma, &mut dst_rgb, Rotation::None, Flip::None, crop)
7260            .unwrap();
7261
7262        let rgba_data = dst_rgba.tensor().map().unwrap();
7263        let expected_rgb = uint8_to_int8(&rgba_to_rgb(rgba_data.as_slice()));
7264        let gl_data = dst_rgb.tensor().map().unwrap();
7265        assert_pixels_match(&expected_rgb, gl_data.as_slice(), 1);
7266    }
7267
7268    /// YUYV 1080p → RGB 1920x1080 (no letterbox, same size).
7269    /// Compares GL RGBA (alpha-stripped) against GL packed RGB without scaling.
7270    #[test]
7271    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7272    fn test_opengl_rgb_no_letterbox_correctness() {
7273        if !is_dma_available() {
7274            return;
7275        }
7276        let src_dma = load_raw_image(
7277            1920,
7278            1080,
7279            YUYV,
7280            Some(TensorMemory::Dma),
7281            include_bytes!("../../../testdata/camera1080p.yuyv"),
7282        )
7283        .unwrap();
7284
7285        let mut gl = GLProcessorThreaded::new(None).unwrap();
7286
7287        // GL RGBA reference (no letterbox — 1920 satisfies W*3 % 4 == 0)
7288        let mut dst_rgba = TensorImage::new(1920, 1080, RGBA, Some(TensorMemory::Dma)).unwrap();
7289        gl.convert(
7290            &src_dma,
7291            &mut dst_rgba,
7292            Rotation::None,
7293            Flip::None,
7294            Crop::no_crop(),
7295        )
7296        .unwrap();
7297
7298        // GL packed RGB output
7299        let mut dst_rgb = TensorImage::new(1920, 1080, RGB, Some(TensorMemory::Dma)).unwrap();
7300        gl.convert(
7301            &src_dma,
7302            &mut dst_rgb,
7303            Rotation::None,
7304            Flip::None,
7305            Crop::no_crop(),
7306        )
7307        .unwrap();
7308
7309        let rgba_data = dst_rgba.tensor().map().unwrap();
7310        let expected_rgb = rgba_to_rgb(rgba_data.as_slice());
7311        let gl_data = dst_rgb.tensor().map().unwrap();
7312        assert_pixels_match(&expected_rgb, gl_data.as_slice(), 1);
7313    }
7314
7315    // =========================================================================
7316    // Direct RGB Render Path Tests
7317    // These tests exercise the single-pass BGR888 renderbuffer path added by
7318    // the GL cache work (EDGEAI-776). They require DMA + OpenGL hardware.
7319    // =========================================================================
7320
7321    /// Verify that the direct RGB probe runs without crashing.
7322    #[test]
7323    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7324    fn test_probe_rgb_direct_support() {
7325        if !is_dma_available() {
7326            eprintln!("SKIPPED: {} - DMA not available", function!());
7327            return;
7328        }
7329        let gl = match GLProcessorST::new(None) {
7330            Ok(gl) => gl,
7331            Err(e) => {
7332                eprintln!("SKIPPED: {} - GL not available: {e}", function!());
7333                return;
7334            }
7335        };
7336        // The probe runs during new(). Just check the field is set.
7337        eprintln!(
7338            "support_rgb_direct = {} (probe completed without crash)",
7339            gl.support_rgb_direct
7340        );
7341    }
7342
7343    /// Compare direct RGB path against two-pass path pixel-for-pixel.
7344    /// If GPU doesn't support direct RGB, this test is a no-op.
7345    #[test]
7346    #[cfg(all(target_os = "linux", feature = "dma_test_formats"))]
7347    fn test_opengl_rgb_direct_matches_two_pass() {
7348        if !is_dma_available() {
7349            eprintln!("SKIPPED: {} - DMA not available", function!());
7350            return;
7351        }
7352        let mut gl = match GLProcessorST::new(None) {
7353            Ok(gl) => gl,
7354            Err(e) => {
7355                eprintln!("SKIPPED: {} - GL not available: {e}", function!());
7356                return;
7357            }
7358        };
7359        if !gl.support_rgb_direct {
7360            eprintln!("SKIPPED: {} - GPU does not support direct RGB", function!());
7361            return;
7362        }
7363
7364        // Create RGBA source with deterministic pattern
7365        // Use 640x480 source → 320x320 output so pitch (320*3=960) is 64-byte aligned
7366        // for Mali GPU DMA-buf import requirements.
7367        let src = TensorImage::new(640, 480, RGBA, Some(TensorMemory::Dma)).unwrap();
7368        {
7369            let mut map = src.tensor().map().unwrap();
7370            for (i, byte) in map.as_mut_slice().iter_mut().enumerate() {
7371                *byte = (i % 251) as u8; // deterministic pattern
7372            }
7373        }
7374
7375        let crop = crate::Crop {
7376            src_rect: None,
7377            dst_rect: None,
7378            dst_color: None,
7379        };
7380
7381        // Direct path (support_rgb_direct = true)
7382        let mut dst_direct = TensorImage::new(320, 320, RGB, Some(TensorMemory::Dma)).unwrap();
7383        gl.convert(&src, &mut dst_direct, Rotation::None, Flip::None, crop)
7384            .unwrap();
7385
7386        // Force two-pass path
7387        gl.support_rgb_direct = false;
7388        let mut dst_twop = TensorImage::new(320, 320, RGB, Some(TensorMemory::Dma)).unwrap();
7389        gl.convert(&src, &mut dst_twop, Rotation::None, Flip::None, crop)
7390            .unwrap();
7391        gl.support_rgb_direct = true;
7392
7393        // Compare
7394        let map_direct = dst_direct.tensor().map().unwrap();
7395        let map_twop = dst_twop.tensor().map().unwrap();
7396        // Allow ±1 tolerance for potential rounding differences
7397        let mut max_diff = 0i32;
7398        for (a, b) in map_direct.as_slice().iter().zip(map_twop.as_slice().iter()) {
7399            let diff = (*a as i32 - *b as i32).abs();
7400            max_diff = max_diff.max(diff);
7401        }
7402        eprintln!("RGB direct vs two-pass max pixel diff: {max_diff}");
7403        assert!(max_diff <= 1, "Pixel mismatch > 1: max_diff={max_diff}");
7404    }
7405}