edgefirst-image 0.24.2

High-performance image processing with hardware acceleration for edge AI
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
// SPDX-FileCopyrightText: Copyright 2026 Au-Zone Technologies
// SPDX-License-Identifier: Apache-2.0

//! macOS GL image processor backed by ANGLE + IOSurface.
//!
//! Mirrors the role of `GLProcessorThreaded` on Linux — the parallel
//! Linux implementation lives in `gl/threaded.rs` and is structurally
//! more elaborate because of vendor-driver thread-safety constraints
//! (Vivante galcore in particular). ANGLE's Metal backend is
//! thread-safe enough that we run GL inline under a process-wide mutex
//! instead of through a dedicated thread + command channel.
//!
//! Format coverage in this initial implementation:
//!   * YUYV → RGBA — full shader-based BT.709 limited-range conversion
//!
//! Other format pairs and the mask-rendering / decoder paths return
//! `NotImplemented` and fall back to the CPU backend, matching the
//! contract the Linux backend uses for unsupported combinations on a
//! given GPU driver.
//!
//! ## Resource model
//!
//! The ANGLE EGL display + context + dummy pbuffer are *process-global*,
//! shared via `SHARED_DISPLAY` on first construction. The Linux backend
//! makes the same choice for the same reason — `eglTerminate` is
//! ref-counted but never safely terminable mid-process, and ANGLE's
//! Metal device is a singleton. Per-instance state is limited to the
//! cached shader program, VBO/VAO/FBO, transient texture handles, and
//! the IOSurface→pbuffer cache.
//!
//! GL/EGL calls are serialised behind a single static `GL_MUTEX` so
//! concurrent `MacosGlProcessor` instances do not race on the shared
//! context's current-thread state.
//!
//! See `crates/image/src/gl/platform/macos.rs` for the platform helpers
//! this processor builds on, and `crates/image/src/gl/iosurface_import.rs`
//! for the IOSurface allocation + EGL pbuffer attribute setup.

#![cfg(target_os = "macos")]

use super::iosurface_import;
use super::platform::macos::MacosPlatform;
// `MacosPlatform::{load_egl_lib, create_display}` are the two macOS-specific
// helpers; everything else (pbuffer creation, texture binding, FBO setup,
// shader compilation) is inline here. See platform/mod.rs for the seam
// rationale.
use super::Egl;
use crate::{Crop, Error, Flip, ImageProcessorTrait, MaskOverlay, Result, Rotation};
use edgefirst_decoder::{DetectBox, ProtoData, Segmentation};
use edgefirst_tensor::{PixelFormat, TensorDyn};
use khronos_egl as egl;
use log::debug;
use std::collections::HashMap;
use std::ffi::{c_void, CString};
use std::sync::{Mutex, MutexGuard, OnceLock};

// ---------------------------------------------------------------------------
// EGL constants reused across the macOS path. The "production" constants in
// `super::iosurface_import` cover the IOSurface-pbuffer attribute set; these
// are the additional constants needed at MacosGlProcessor::new time.
// ---------------------------------------------------------------------------

// Value for EGL_RENDERABLE_TYPE in the config request below: GLES 3.
const EGL_OPENGL_ES3_BIT: i32 = 0x0040;
// Value for EGL_SURFACE_TYPE in the config request below: pbuffer surfaces.
const EGL_PBUFFER_BIT: i32 = 0x0001;
// Config attribute keys; each is followed by its value in `cfg_attribs`.
const EGL_RENDERABLE_TYPE: i32 = 0x3040;
const EGL_SURFACE_TYPE: i32 = 0x3033;
// Per-channel size keys; the config request asks for 8 on each.
const EGL_RED_SIZE: i32 = 0x3024;
const EGL_GREEN_SIZE: i32 = 0x3023;
const EGL_BLUE_SIZE: i32 = 0x3022;
const EGL_ALPHA_SIZE: i32 = 0x3021;
// Context attribute key; paired with `3` when the context is created.
const EGL_CONTEXT_CLIENT_VERSION: i32 = 0x3098;
// Buffer selector passed to `bind_tex_image` / `release_tex_image`.
const EGL_BACK_BUFFER: i32 = 0x3084;

// ---------------------------------------------------------------------------
// Shaders. YUYV-as-GL_RG sampling: each source texel is (Y, C) where C
// alternates U/V every other column. We sample the current and partner
// texel to recover both chroma values for each output pixel, then apply
// the BT.709 limited-range matrix.
//
// The shader matches the spike at `spikes/angle_iosurface/`. Bit-near-
// exact (≤1 LSB) match to the CPU scalar reference was validated there.
// ---------------------------------------------------------------------------

/// Vertex stage for the fullscreen quad: attribute 0 is the clip-space
/// position (emitted with z = 0, w = 1) and attribute 1 is passed through
/// unchanged to the fragment stage as `v_uv`.
const VERTEX_SHADER: &str = r#"#version 300 es
precision mediump float;
layout(location = 0) in vec2 pos;
layout(location = 1) in vec2 uv_in;
out vec2 v_uv;
void main() {
    v_uv = uv_in;
    gl_Position = vec4(pos, 0.0, 1.0);
}
"#;

/// Fragment stage for YUYV → RGBA.
///
/// For each output pixel the shader snaps `v_uv` to a source column, reads
/// that texel and its horizontal partner (right neighbour for even columns,
/// left neighbour for odd ones), takes luma from the `.r` channel of its own
/// texel and the two chroma samples from the `.g` channels, then applies the
/// limited-range offset/scale and colour matrix before clamping to `[0, 1]`.
/// Alpha is always written as `1.0`.
///
/// Uniforms: `src` (the source sampler) and `src_size` (source width and
/// height in texels).
///
/// NOTE(review): all arithmetic, including `floor(v_uv * src_size)`, runs at
/// `mediump` precision — confirm this is sufficient for wide sources on the
/// targeted drivers.
const YUYV_TO_RGBA_FRAGMENT: &str = r#"#version 300 es
precision mediump float;
uniform sampler2D src;
uniform vec2 src_size;
in vec2 v_uv;
out vec4 frag;

void main() {
    vec2 texel = vec2(1.0) / src_size;
    vec2 col = floor(v_uv * src_size);
    bool even = mod(col.x, 2.0) < 0.5;
    vec2 self_uv = (col + vec2(0.5)) * texel;
    vec2 pair_uv = (col + vec2(even ? 1.5 : -0.5, 0.5)) * texel;

    vec4 self_rg = texture(src, self_uv);
    vec4 pair_rg = texture(src, pair_uv);
    float y = self_rg.r;
    float u, v;
    if (even) { u = self_rg.g; v = pair_rg.g; }
    else      { v = self_rg.g; u = pair_rg.g; }

    float yp = (y * 255.0 - 16.0) * (1.164 / 255.0);
    float up = u - 128.0/255.0;
    float vp = v - 128.0/255.0;
    float r = clamp(yp + 1.793 * vp, 0.0, 1.0);
    float g = clamp(yp - 0.213 * up - 0.533 * vp, 0.0, 1.0);
    float b = clamp(yp + 2.112 * up, 0.0, 1.0);
    frag = vec4(r, g, b, 1.0);
}
"#;

// ---------------------------------------------------------------------------
// One-shot GL function-pointer table.
//
// `gls::load_with` populates global function pointers — exists once per
// process. We load via EGL's `eglGetProcAddress` so the symbols come
// from ANGLE's libGLESv2.dylib.
// ---------------------------------------------------------------------------

/// Latch recording that the global GL function-pointer table was populated.
static GL_LOADED: OnceLock<()> = OnceLock::new();

/// Populate the process-wide `gls` function-pointer table the first time this
/// is called; later calls do nothing.
///
/// Every symbol is resolved through `egl.get_proc_address`. Symbols that EGL
/// cannot resolve are handed to the loader as null pointers.
fn load_gl_once(egl: &Egl) {
    GL_LOADED.get_or_init(|| {
        gls::load_with(|symbol| {
            egl.get_proc_address(symbol)
                .map_or(std::ptr::null::<c_void>(), |entry| entry as *const c_void)
        });
    });
}

// ---------------------------------------------------------------------------
// Process-global ANGLE EGL display + context + dummy pbuffer.
//
// `eglTerminate` is ref-counted but never safely terminable mid-process
// (ANGLE's Metal device is a per-process singleton, and any in-flight GL
// command from any thread aborts when the display goes away). The Linux
// backend uses a `SharedEglDisplay` in `context.rs` for the same reason.
// Sharing here also avoids hammering `eglInitialize` from every
// `MacosGlProcessor::new()` call.
//
// `GL_MUTEX` serialises every `eglMakeCurrent` + GL call across all
// `MacosGlProcessor` instances — ANGLE's Metal backend is internally
// thread-safe enough that a single global mutex is the right granularity.
// Per-instance mutexes would race on the current-thread context state
// because the EGL context is shared.
// ---------------------------------------------------------------------------

/// All process-global EGL state. Use [`shared_display`] to access.
///
/// Built once by `init_shared_display` and stored in `SHARED_DISPLAY`; the
/// processor's `Drop` never destroys any of these handles.
struct SharedAngleDisplay {
    /// Static-lifetime EGL handle. The actual ANGLE libEGL.dylib is
    /// leaked at first dlopen and never closed.
    egl: Egl,
    /// The initialised EGL display every surface and context belongs to.
    display: egl::Display,
    /// The config chosen at init time; reused when creating IOSurface pbuffers.
    config: egl::Config,
    /// The single GLES 3 context shared by every processor instance.
    context: egl::Context,
    /// Tiny scratch surface kept alive so the context can be made
    /// current outside of a `convert` call (e.g. for shader compile,
    /// resource allocation, or `Drop`-time cleanup).
    dummy_pbuffer: egl::Surface,
}

// SAFETY: every member is either a leak'd static, an EGL handle (which
// the ANGLE driver synchronises internally), or a pointer to driver-
// owned state. Access is gated by GL_MUTEX.
unsafe impl Send for SharedAngleDisplay {}
unsafe impl Sync for SharedAngleDisplay {}

// Lazily-initialised shared display. The `Err` side stores the stringified
// initialisation failure so it can be re-reported on every later access.
static SHARED_DISPLAY: OnceLock<std::result::Result<SharedAngleDisplay, String>> = OnceLock::new();
// Process-wide lock taken (via `lock_gl`) around make-current + GL work.
static GL_MUTEX: Mutex<()> = Mutex::new(());

/// Acquire a reference to the process-global ANGLE display, initialising
/// it on first call. Subsequent calls return the same handle. The error
/// case is cached too — once ANGLE fails to load we don't keep retrying.
fn shared_display() -> Result<&'static SharedAngleDisplay> {
    SHARED_DISPLAY
        .get_or_init(|| init_shared_display().map_err(|e| e.to_string()))
        .as_ref()
        .map_err(|s| Error::Io(std::io::Error::other(s.clone())))
}

fn init_shared_display() -> Result<SharedAngleDisplay> {
    let _span =
        tracing::info_span!("image.gl_init", platform = "macos", backend = "iosurface",).entered();

    // 1. Load ANGLE libEGL and bring up an EGL instance.
    let egl_lib = MacosPlatform::load_egl_lib()
        .map_err(|e| Error::Io(std::io::Error::other(format!("ANGLE libEGL: {e}"))))?;
    let egl: Egl = unsafe {
        khronos_egl::Instance::<
            khronos_egl::Dynamic<&'static libloading::Library, khronos_egl::EGL1_4>,
        >::load_required_from(egl_lib)
    }
    .map_err(|e| Error::Io(std::io::Error::other(format!("EGL load: {e:?}"))))?;

    // 2. Metal-backed display from MacosPlatform.
    let display = MacosPlatform::create_display(&egl)?;
    let (maj, min) = egl
        .initialize(display)
        .map_err(|e| Error::Io(std::io::Error::other(format!("eglInitialize: {e:?}"))))?;
    debug!("MacosGlProcessor: EGL {maj}.{min} initialised via ANGLE (process-global)");

    egl.bind_api(egl::OPENGL_ES_API)
        .map_err(|e| Error::Io(std::io::Error::other(format!("eglBindAPI: {e:?}"))))?;

    // 3. Choose an EGL config that supports GLES 3 + PBUFFER +
    //    EGL_BIND_TO_TEXTURE_TARGET_ANGLE = EGL_TEXTURE_2D.
    let cfg_attribs = [
        EGL_RENDERABLE_TYPE,
        EGL_OPENGL_ES3_BIT,
        EGL_SURFACE_TYPE,
        EGL_PBUFFER_BIT,
        EGL_RED_SIZE,
        8,
        EGL_GREEN_SIZE,
        8,
        EGL_BLUE_SIZE,
        8,
        EGL_ALPHA_SIZE,
        8,
        iosurface_import::EGL_BIND_TO_TEXTURE_TARGET_ANGLE,
        0x305F, // EGL_TEXTURE_2D
        egl::NONE,
    ];
    let config = egl
        .choose_first_config(display, &cfg_attribs)
        .map_err(|e| Error::Io(std::io::Error::other(format!("eglChooseConfig: {e:?}"))))?
        .ok_or_else(|| {
            Error::NotSupported("no EGL config with GLES3+PBUFFER+TEXTURE_2D bind".into())
        })?;

    // 4. GLES3 context.
    let ctx_attribs = [EGL_CONTEXT_CLIENT_VERSION, 3, egl::NONE];
    let context = egl
        .create_context(display, config, None, &ctx_attribs)
        .map_err(|e| Error::Io(std::io::Error::other(format!("eglCreateContext: {e:?}"))))?;

    // 5. Dummy pbuffer for context-current bring-up.
    let dummy_attribs = [egl::WIDTH, 16, egl::HEIGHT, 16, egl::NONE];
    let dummy_pbuffer = egl
        .create_pbuffer_surface(display, config, &dummy_attribs)
        .map_err(|e| {
            // Clean up the context we just created before bailing.
            let _ = egl.destroy_context(display, context);
            Error::Io(std::io::Error::other(format!(
                "eglCreatePbufferSurface(dummy): {e:?}"
            )))
        })?;

    // 6. Load GL function pointers via the now-initialised display.
    //    Make-current is required for some drivers to expose GLES symbols.
    if let Err(e) = egl.make_current(
        display,
        Some(dummy_pbuffer),
        Some(dummy_pbuffer),
        Some(context),
    ) {
        let _ = egl.destroy_surface(display, dummy_pbuffer);
        let _ = egl.destroy_context(display, context);
        return Err(Error::Io(std::io::Error::other(format!(
            "eglMakeCurrent(dummy): {e:?}"
        ))));
    }
    load_gl_once(&egl);
    let _ = egl.make_current(display, None, None, None);

    Ok(SharedAngleDisplay {
        egl,
        display,
        config,
        context,
        dummy_pbuffer,
    })
}

// ---------------------------------------------------------------------------
// Mutex helpers.
//
// `GL_MUTEX.lock()` can return a `PoisonError` if a previous panic left
// the mutex poisoned. Recover by extracting the inner guard — the GL
// state behind it is just a `()` and there's no invariant to honour.
// Using `unwrap()` here would turn any panic in `convert_*` into a
// permanent failure of every subsequent call.
// ---------------------------------------------------------------------------

/// Take the process-wide GL lock, recovering the guard if an earlier panic
/// poisoned the mutex (the protected value is `()`, so there is no state to
/// repair).
fn lock_gl() -> MutexGuard<'static, ()> {
    match GL_MUTEX.lock() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    }
}

// ---------------------------------------------------------------------------
// The processor itself.
//
// Holds: the compiled YUYV→RGBA program and its uniform locations, a
// fullscreen-quad VAO/VBO, an FBO for off-screen rendering, a pair of GL
// textures used for transient binding of the source/dest IOSurface
// pbuffers, and the per-instance pbuffer cache. The EGL display, config
// and context are process-global and live in `SHARED_DISPLAY`.
//
// GL state is shared across calls to amortize shader compilation and
// VAO/FBO setup. A mutex serializes calls to `convert` so EGL state
// changes (eglMakeCurrent, eglBindTexImage) are not racing.
// ---------------------------------------------------------------------------

/// GL-backed image processor for macOS; currently only drives the
/// YUYV → RGBA shader program.
pub struct MacosGlProcessor {
    /// Per-instance GL resources. EGL display/context/dummy_pbuffer
    /// live in `SHARED_DISPLAY` (process-global).
    program_yuyv_to_rgba: u32,
    /// Location of the `src` sampler uniform in the program.
    uniform_src: i32,
    /// Location of the `src_size` uniform in the program.
    uniform_src_size: i32,
    /// Vertex array object describing the fullscreen quad's two attributes.
    vao: u32,
    /// Vertex buffer holding the fullscreen quad (position + UV per vertex).
    vbo: u32,
    /// Framebuffer the destination texture is attached to while rendering.
    fbo: u32,
    /// Texture name the source pbuffer is bound to during a conversion.
    src_tex: u32,
    /// Texture name the destination pbuffer is bound to during a conversion.
    dst_tex: u32,

    /// (IOSurfaceID, format-as-u32) → cached EGL pbuffer surface.
    /// Same-tensor convert() calls reuse the pbuffer instead of paying
    /// `eglCreatePbufferFromClientBuffer` every frame.
    ///
    /// The cache is guarded by `GL_MUTEX` (so accessed only while the
    /// caller holds the GL lock), but lives on the processor rather
    /// than globally so each processor's resource lifetime is
    /// independent and easy to reason about. ANGLE's pbuffers are
    /// per-display, not per-context, so this is sound.
    pbuf_cache: Mutex<HashMap<PbufferCacheKey, egl::Surface>>,
}

/// Key for the per-processor pbuffer cache: one entry per
/// (IOSurface, pixel format) pair.
#[derive(Hash, Eq, PartialEq, Clone, Copy, Debug)]
struct PbufferCacheKey {
    /// ID of the IOSurface the pbuffer wraps. `0` is never inserted into
    /// the cache (see `get_or_create_pbuffer`).
    iosurface_id: u32,
    /// Discriminant of the [`PixelFormat`] — ANGLE validates
    /// FourCC/GL-format agreement at pbuffer-creation time, so two
    /// different formats over the same IOSurface need distinct pbuffers
    /// even though that pairing is unusual in practice.
    format_disc: u8,
}

/// Debug output names the backend only; the raw GL object names and the
/// pbuffer cache are intentionally omitted.
impl std::fmt::Debug for MacosGlProcessor {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("MacosGlProcessor");
        repr.field("backend", &"ANGLE+IOSurface");
        repr.finish()
    }
}

// ---------------------------------------------------------------------------
// RAII guards.
// ---------------------------------------------------------------------------

/// Makes the shared EGL context current on the calling thread for its
/// lifetime, then releases it on drop. Drop runs even on panic, so the
/// next `MacosGlProcessor::convert*` call on a different processor (or
/// the same one after a panic recovery) sees a clean make-current state.
struct MakeCurrentGuard<'d> {
    egl: &'d Egl,
    display: egl::Display,
}

impl<'d> MakeCurrentGuard<'d> {
    fn new(d: &'d SharedAngleDisplay) -> Result<Self> {
        d.egl
            .make_current(
                d.display,
                Some(d.dummy_pbuffer),
                Some(d.dummy_pbuffer),
                Some(d.context),
            )
            .map_err(|e| Error::Io(std::io::Error::other(format!("eglMakeCurrent: {e:?}"))))?;
        Ok(Self {
            egl: &d.egl,
            display: d.display,
        })
    }
}

impl Drop for MakeCurrentGuard<'_> {
    fn drop(&mut self) {
        // Release the context on this thread. Failure here is logged
        // but ignored — Drop must not panic, and the next make-current
        // will overwrite the state anyway.
        let _ = self.egl.make_current(self.display, None, None, None);
    }
}

/// Owns an `eglBindTexImage` binding. On drop calls `eglReleaseTexImage`
/// — required by the EGL spec before the pbuffer can be destroyed and
/// strictly necessary for ANGLE on some Metal device states.
struct BoundTexImage<'d> {
    egl: &'d Egl,
    display: egl::Display,
    pbuf: egl::Surface,
    bound: bool,
}

impl<'d> BoundTexImage<'d> {
    fn bind(d: &'d SharedAngleDisplay, pbuf: egl::Surface) -> Result<Self> {
        d.egl
            .bind_tex_image(d.display, pbuf, EGL_BACK_BUFFER)
            .map_err(|e| Error::Io(std::io::Error::other(format!("eglBindTexImage: {e:?}"))))?;
        Ok(Self {
            egl: &d.egl,
            display: d.display,
            pbuf,
            bound: true,
        })
    }
}

impl Drop for BoundTexImage<'_> {
    fn drop(&mut self) {
        if self.bound {
            let _ = self
                .egl
                .release_tex_image(self.display, self.pbuf, EGL_BACK_BUFFER);
        }
    }
}

impl MacosGlProcessor {
    pub fn new() -> Result<Self> {
        // SHARED_DISPLAY caches both successes and failures, so this is
        // cheap on the hot path. It also surfaces "ANGLE not installed"
        // exactly once per process.
        let d = shared_display()?;

        // Per-instance setup runs under the GL mutex so we don't race
        // with another processor's convert() on context-current state.
        let _guard = lock_gl();
        let _current = MakeCurrentGuard::new(d)?;

        // SAFETY: serialized via `_guard`; context is current via
        // `_current`. Each helper handles its own internal cleanup on
        // error; if a step later in this sequence fails, the
        // `InstanceCleanup` scope guard below tears down the partially
        // built state.
        unsafe {
            // Build the per-instance resources behind a scope guard so
            // any error path below cleans up GL allocations rather than
            // leaking them.
            let program = compile_program(VERTEX_SHADER, YUYV_TO_RGBA_FRAGMENT)?;
            // From here on, every fallible step must reach Drop-cleanup
            // for `program` if it fails. The simplest pattern: stash
            // resources in `Option<u32>` and let `InstanceCleanup` Drop
            // delete whichever are still `Some`.

            struct InstanceCleanup {
                program: Option<u32>,
                vbo: Option<u32>,
                vao: Option<u32>,
                fbo: Option<u32>,
                src_tex: Option<u32>,
                dst_tex: Option<u32>,
            }
            impl Drop for InstanceCleanup {
                fn drop(&mut self) {
                    // SAFETY: only one current context per thread; we
                    // hold the GL mutex transitively via the caller.
                    unsafe {
                        if let Some(p) = self.program {
                            gls::gl::DeleteProgram(p);
                        }
                        if let Some(b) = self.vbo {
                            gls::gl::DeleteBuffers(1, &b);
                        }
                        if let Some(a) = self.vao {
                            gls::gl::DeleteVertexArrays(1, &a);
                        }
                        if let Some(f) = self.fbo {
                            gls::gl::DeleteFramebuffers(1, &f);
                        }
                        if let Some(t) = self.src_tex {
                            gls::gl::DeleteTextures(1, &t);
                        }
                        if let Some(t) = self.dst_tex {
                            gls::gl::DeleteTextures(1, &t);
                        }
                    }
                }
            }
            let mut cleanup = InstanceCleanup {
                program: Some(program),
                vbo: None,
                vao: None,
                fbo: None,
                src_tex: None,
                dst_tex: None,
            };

            let (uniform_src, uniform_src_size) = {
                let loc_src = gls::gl::GetUniformLocation(program, c"src".as_ptr() as *const _);
                let loc_size =
                    gls::gl::GetUniformLocation(program, c"src_size".as_ptr() as *const _);
                (loc_src, loc_size)
            };

            // Fullscreen-quad VBO + VAO.
            #[rustfmt::skip]
            let quad: [f32; 16] = [
                -1.0,-1.0,  0.0, 0.0,
                 1.0,-1.0,  1.0, 0.0,
                -1.0, 1.0,  0.0, 1.0,
                 1.0, 1.0,  1.0, 1.0,
            ];
            let mut vbo = 0u32;
            let mut vao = 0u32;
            gls::gl::GenBuffers(1, &mut vbo);
            cleanup.vbo = Some(vbo);
            gls::gl::BindBuffer(gls::gl::ARRAY_BUFFER, vbo);
            gls::gl::BufferData(
                gls::gl::ARRAY_BUFFER,
                std::mem::size_of_val(&quad) as isize,
                quad.as_ptr() as *const _,
                gls::gl::STATIC_DRAW,
            );
            gls::gl::GenVertexArrays(1, &mut vao);
            cleanup.vao = Some(vao);
            gls::gl::BindVertexArray(vao);
            gls::gl::VertexAttribPointer(0, 2, gls::gl::FLOAT, 0, 16, std::ptr::null());
            gls::gl::EnableVertexAttribArray(0);
            gls::gl::VertexAttribPointer(1, 2, gls::gl::FLOAT, 0, 16, 8 as *const _);
            gls::gl::EnableVertexAttribArray(1);

            // FBO + two transient texture handles.
            let mut fbo = 0u32;
            let mut src_tex = 0u32;
            let mut dst_tex = 0u32;
            gls::gl::GenFramebuffers(1, &mut fbo);
            cleanup.fbo = Some(fbo);
            gls::gl::GenTextures(1, &mut src_tex);
            cleanup.src_tex = Some(src_tex);
            gls::gl::GenTextures(1, &mut dst_tex);
            cleanup.dst_tex = Some(dst_tex);
            for tex in [src_tex, dst_tex] {
                gls::gl::BindTexture(gls::gl::TEXTURE_2D, tex);
                gls::gl::TexParameteri(
                    gls::gl::TEXTURE_2D,
                    gls::gl::TEXTURE_MIN_FILTER,
                    gls::gl::NEAREST as i32,
                );
                gls::gl::TexParameteri(
                    gls::gl::TEXTURE_2D,
                    gls::gl::TEXTURE_MAG_FILTER,
                    gls::gl::NEAREST as i32,
                );
                gls::gl::TexParameteri(
                    gls::gl::TEXTURE_2D,
                    gls::gl::TEXTURE_WRAP_S,
                    gls::gl::CLAMP_TO_EDGE as i32,
                );
                gls::gl::TexParameteri(
                    gls::gl::TEXTURE_2D,
                    gls::gl::TEXTURE_WRAP_T,
                    gls::gl::CLAMP_TO_EDGE as i32,
                );
            }

            // Construction succeeded — disarm `cleanup` so its Drop
            // doesn't tear down the resources we're about to hand out.
            let program = cleanup.program.take().unwrap();
            let vbo = cleanup.vbo.take().unwrap();
            let vao = cleanup.vao.take().unwrap();
            let fbo = cleanup.fbo.take().unwrap();
            let src_tex = cleanup.src_tex.take().unwrap();
            let dst_tex = cleanup.dst_tex.take().unwrap();
            std::mem::forget(cleanup);

            Ok(Self {
                program_yuyv_to_rgba: program,
                uniform_src,
                uniform_src_size,
                vao,
                vbo,
                fbo,
                src_tex,
                dst_tex,
                pbuf_cache: Mutex::new(HashMap::new()),
            })
        }
    }

    /// Report whether the GL backend can perform `src_fmt → dst_fmt`.
    /// `ImageProcessor::convert` consults this to choose between this
    /// backend and the CPU fallback.
    ///
    /// The only supported pair today is `YUYV → RGBA`. `YUYV → BGRA` is
    /// intentionally excluded even though ANGLE accepts the `'BGRA'`
    /// IOSurface FourCC: the current shader emits `vec4(r, g, b, 1.0)`,
    /// so a CPU readback would see RGBA byte order whatever the FourCC.
    /// A dedicated BGRA shader (emitting `vec4(b, g, r, 1.0)`) must land
    /// before that pair can be added here.
    pub fn supports(src_fmt: PixelFormat, dst_fmt: PixelFormat) -> bool {
        let yuyv_source = matches!(src_fmt, PixelFormat::Yuyv);
        let rgba_target = matches!(dst_fmt, PixelFormat::Rgba);
        yuyv_source && rgba_target
    }

    /// The actual conversion path. Caller guarantees `supports(src_fmt, dst_fmt)`.
    fn convert_yuyv_to_rgba(
        &self,
        src: &TensorDyn,
        dst: &mut TensorDyn,
        src_fmt: PixelFormat,
        dst_fmt: PixelFormat,
    ) -> Result<()> {
        let _span = tracing::trace_span!(
            "image.convert",
            backend = "gl",
            platform = "macos",
            src_fmt = ?src_fmt,
            dst_fmt = ?dst_fmt,
        )
        .entered();

        let src_w = src
            .width()
            .ok_or_else(|| Error::InvalidShape("src width".into()))?;
        let src_h = src
            .height()
            .ok_or_else(|| Error::InvalidShape("src height".into()))?;
        let dst_w = dst
            .width()
            .ok_or_else(|| Error::InvalidShape("dst width".into()))?;
        let dst_h = dst
            .height()
            .ok_or_else(|| Error::InvalidShape("dst height".into()))?;

        // Validation: same-size only in this first cut. Resize support
        // is straightforward (just change the viewport + texture sample
        // ratio) but not in scope for the initial integration.
        if src_w != dst_w || src_h != dst_h {
            return Err(Error::NotImplemented(format!(
                "MacosGlProcessor: resize not yet supported (src {src_w}×{src_h} → dst {dst_w}×{dst_h}); CPU fallback handles this"
            )));
        }

        let src_u8 = src
            .as_u8()
            .ok_or_else(|| Error::NotSupported("GL backend requires u8 source tensor".into()))?;
        let dst_u8 = dst.as_u8_mut().ok_or_else(|| {
            Error::NotSupported("GL backend requires u8 destination tensor".into())
        })?;

        // Both tensors MUST be IOSurface-backed for the zero-copy path.
        let src_iosurface = src_u8.iosurface_ref().ok_or_else(|| {
            Error::NotSupported("GL convert: source tensor is not IOSurface-backed".into())
        })?;
        let dst_iosurface = dst_u8.iosurface_ref().ok_or_else(|| {
            Error::NotSupported("GL convert: destination tensor is not IOSurface-backed".into())
        })?;
        let src_id = src_u8.iosurface_id().unwrap_or(0);
        let dst_id = dst_u8.iosurface_id().unwrap_or(0);

        let d = shared_display()?;
        let _gl_guard = lock_gl();
        let _current = MakeCurrentGuard::new(d)?;

        // Look up (or create) the source/dest pbuffers in the cache.
        // Cache miss path calls `eglCreatePbufferFromClientBuffer` and
        // inserts; cache hit returns the existing surface.
        let src_pbuf =
            self.get_or_create_pbuffer(d, src_id, src_iosurface, src_fmt, src_w, src_h)?;
        let dst_pbuf =
            self.get_or_create_pbuffer(d, dst_id, dst_iosurface, dst_fmt, dst_w, dst_h)?;

        // SAFETY: GL mutex held; context current via `_current`. Each
        // pbuffer's tex-image binding is owned by a `BoundTexImage` RAII
        // guard so eglReleaseTexImage runs even on panic.
        unsafe {
            // Source texture binding.
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.src_tex);
            let _src_bound = BoundTexImage::bind(d, src_pbuf)?;

            // Destination texture binding + attach to FBO.
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.dst_tex);
            let _dst_bound = BoundTexImage::bind(d, dst_pbuf)?;
            gls::gl::BindFramebuffer(gls::gl::FRAMEBUFFER, self.fbo);
            gls::gl::FramebufferTexture2D(
                gls::gl::FRAMEBUFFER,
                gls::gl::COLOR_ATTACHMENT0,
                gls::gl::TEXTURE_2D,
                self.dst_tex,
                0,
            );
            let fbo_status = gls::gl::CheckFramebufferStatus(gls::gl::FRAMEBUFFER);
            if fbo_status != gls::gl::FRAMEBUFFER_COMPLETE {
                return Err(Error::Io(std::io::Error::other(format!(
                    "FBO incomplete: 0x{fbo_status:x}"
                ))));
            }

            // Render.
            gls::gl::Viewport(0, 0, dst_w as i32, dst_h as i32);
            gls::gl::UseProgram(self.program_yuyv_to_rgba);
            gls::gl::ActiveTexture(gls::gl::TEXTURE0);
            gls::gl::BindTexture(gls::gl::TEXTURE_2D, self.src_tex);
            gls::gl::Uniform1i(self.uniform_src, 0);
            gls::gl::Uniform2f(self.uniform_src_size, src_w as f32, src_h as f32);
            gls::gl::BindVertexArray(self.vao);
            gls::gl::DrawArrays(gls::gl::TRIANGLE_STRIP, 0, 4);
            gls::gl::Finish();

            // `_src_bound` and `_dst_bound` Drop release the tex-image
            // bindings here. The pbuffers themselves stay in the cache.
        }
        Ok(())
    }

    /// Return the EGL pbuffer wrapping the given IOSurface, creating it on
    /// a cache miss.
    ///
    /// Entries are keyed by `(iosurface_id, format_discriminant)`. Keying
    /// on the IOSurfaceID rather than `BufferIdentity` lets externally
    /// imported surfaces (via `Tensor::from_iosurface`) share a slot with
    /// internally allocated ones — same underlying surface, same pbuffer.
    ///
    /// An `iosurface_id` of `0` bypasses the cache entirely: nothing is
    /// looked up and the freshly created pbuffer is not stored. `0` is the
    /// fallback the caller uses when the tensor reports no IOSurfaceID.
    fn get_or_create_pbuffer(
        &self,
        d: &SharedAngleDisplay,
        iosurface_id: u32,
        surface_ref: *mut c_void,
        format: PixelFormat,
        width: usize,
        height: usize,
    ) -> Result<egl::Surface> {
        let cacheable = iosurface_id != 0;
        let key = PbufferCacheKey {
            iosurface_id,
            format_disc: pixel_format_discriminant(format),
        };

        if cacheable {
            // The lock guard is a temporary and is released at the end of
            // this statement, before any pbuffer creation below.
            let hit = self
                .pbuf_cache
                .lock()
                .unwrap_or_else(|p| p.into_inner())
                .get(&key)
                .copied();
            if let Some(existing) = hit {
                return Ok(existing);
            }
        }

        // SAFETY: surface_ref borrowed from a live tensor; config has
        // EGL_BIND_TO_TEXTURE_TARGET_ANGLE set.
        let created = unsafe {
            iosurface_import::create_iosurface_pbuffer(
                &d.egl,
                d.display,
                d.config,
                surface_ref,
                format,
                width,
                height,
            )?
        };

        if cacheable {
            self.pbuf_cache
                .lock()
                .unwrap_or_else(|p| p.into_inner())
                .insert(key, created);
        }
        Ok(created)
    }
}

/// Reduce a `PixelFormat` to the `u8` value stored in
/// `PbufferCacheKey::format_disc`.
fn pixel_format_discriminant(fmt: PixelFormat) -> u8 {
    // PixelFormat is #[repr(u8)] so the cast is a guaranteed
    // collision-free discriminant.
    fmt as u8
}

impl Drop for MacosGlProcessor {
    /// Release the per-instance GL objects and every cached EGL pbuffer.
    ///
    /// Cleanup is best-effort and never panics: if the shared ANGLE
    /// display is unavailable or its context cannot be made current,
    /// the method returns without touching GL (there is no context to
    /// issue the deletes against).
    fn drop(&mut self) {
        // Best-effort cleanup; Drop must not panic.
        let Ok(d) = shared_display() else {
            return; // ANGLE never initialised — nothing to clean up.
        };
        let _gl_guard = lock_gl();
        let _current = match MakeCurrentGuard::new(d) {
            Ok(g) => g,
            Err(_) => return,
        };

        // Take the cache out of the mutex. A poisoned mutex still holds
        // valid surface handles, so recover the map instead of treating
        // it as empty — otherwise every cached pbuffer would be leaked.
        // This mirrors the poison recovery used when populating the cache.
        let cache = std::mem::take(
            self.pbuf_cache
                .get_mut()
                .unwrap_or_else(|poisoned| poisoned.into_inner()),
        );

        unsafe {
            // Destroy cached pbuffers; individual failures are ignored
            // because there is nothing useful to do with them in Drop.
            for (_key, pbuf) in cache {
                let _ = d.egl.destroy_surface(d.display, pbuf);
            }
            // Per-instance GL resources.
            gls::gl::DeleteFramebuffers(1, &self.fbo);
            gls::gl::DeleteTextures(1, &self.src_tex);
            gls::gl::DeleteTextures(1, &self.dst_tex);
            gls::gl::DeleteBuffers(1, &self.vbo);
            gls::gl::DeleteVertexArrays(1, &self.vao);
            gls::gl::DeleteProgram(self.program_yuyv_to_rgba);
            // Shared EGL display/context/dummy_pbuffer outlive every
            // processor instance and are never destroyed — see the
            // module docstring for why.
        }
    }
}

impl ImageProcessorTrait for MacosGlProcessor {
    /// Convert `src` into `dst` on the GPU.
    ///
    /// Only the plain conversion path is implemented. Requests are
    /// rejected, in this order, when:
    ///
    /// * `rotation` or `flip` is anything other than `None`
    ///   (`Error::NotImplemented`);
    /// * `crop` carries a source or destination rectangle
    ///   (`Error::NotImplemented`);
    /// * either tensor has no pixel format (`Error::NotSupported`);
    /// * the format pair is rejected by `Self::supports`
    ///   (`Error::NotSupported`).
    ///
    /// Anything that passes these checks is handed to
    /// `convert_yuyv_to_rgba`, whose errors are propagated unchanged.
    fn convert(
        &mut self,
        src: &TensorDyn,
        dst: &mut TensorDyn,
        rotation: Rotation,
        flip: Flip,
        crop: Crop,
    ) -> Result<()> {
        if !matches!(rotation, Rotation::None) || !matches!(flip, Flip::None) {
            return Err(Error::NotImplemented(
                "MacosGlProcessor: rotation/flip not yet supported; CPU fallback handles this"
                    .into(),
            ));
        }
        if crop.src_rect.is_some() || crop.dst_rect.is_some() {
            return Err(Error::NotImplemented(
                "MacosGlProcessor: crop not yet supported; CPU fallback handles this".into(),
            ));
        }
        let (src_fmt, dst_fmt) = match (src.format(), dst.format()) {
            (Some(s), Some(d)) => (s, d),
            _ => {
                return Err(Error::NotSupported(
                    "MacosGlProcessor: untyped tensors (None format) not supported".into(),
                ));
            }
        };
        if !Self::supports(src_fmt, dst_fmt) {
            // Separate the two formats so the message reads as a
            // conversion pair instead of one run-together identifier.
            return Err(Error::NotSupported(format!(
                "MacosGlProcessor: {src_fmt:?} -> {dst_fmt:?} not in the initial GL coverage set"
            )));
        }
        self.convert_yuyv_to_rgba(src, dst, src_fmt, dst_fmt)
    }

    /// Not implemented on this backend; always returns
    /// `Error::NotImplemented` without touching `_dst`.
    fn draw_decoded_masks(
        &mut self,
        _dst: &mut TensorDyn,
        _detect: &[DetectBox],
        _segmentation: &[Segmentation],
        _overlay: MaskOverlay<'_>,
    ) -> Result<()> {
        Err(Error::NotImplemented(
            "MacosGlProcessor: draw_decoded_masks not yet ported (use CPU backend)".into(),
        ))
    }

    /// Not implemented on this backend; always returns
    /// `Error::NotImplemented` without touching `_dst`.
    fn draw_proto_masks(
        &mut self,
        _dst: &mut TensorDyn,
        _detect: &[DetectBox],
        _proto_data: &ProtoData,
        _overlay: MaskOverlay<'_>,
    ) -> Result<()> {
        Err(Error::NotImplemented(
            "MacosGlProcessor: draw_proto_masks not yet ported (use CPU backend)".into(),
        ))
    }

    /// Accept and ignore a class-color table; always returns `Ok(())`.
    fn set_class_colors(&mut self, _colors: &[[u8; 4]]) -> Result<()> {
        // Class-color lookup table is only used by mask rendering, which
        // currently falls back to CPU on macOS. Accepting the call as a
        // no-op keeps the API surface symmetric with Linux.
        Ok(())
    }
}

// ---------------------------------------------------------------------------
// Shader helpers
// ---------------------------------------------------------------------------

unsafe fn compile_program(vertex_src: &str, fragment_src: &str) -> Result<u32> {
    let vs = compile_shader(gls::gl::VERTEX_SHADER, vertex_src)?;
    // From this point on, `vs` and (later) `fs` and `program` are owned
    // by the helper and must be cleaned up on any error path. Track them
    // in an Option and let `ProgramBuild`'s Drop clean up whatever is
    // still present when we leave the function abnormally.
    struct ProgramBuild {
        vs: Option<u32>,
        fs: Option<u32>,
        program: Option<u32>,
    }
    impl Drop for ProgramBuild {
        fn drop(&mut self) {
            unsafe {
                if let Some(p) = self.program {
                    gls::gl::DeleteProgram(p);
                }
                if let Some(s) = self.fs {
                    gls::gl::DeleteShader(s);
                }
                if let Some(s) = self.vs {
                    gls::gl::DeleteShader(s);
                }
            }
        }
    }
    let mut state = ProgramBuild {
        vs: Some(vs),
        fs: None,
        program: None,
    };

    let fs = compile_shader(gls::gl::FRAGMENT_SHADER, fragment_src)?;
    state.fs = Some(fs);

    let program = gls::gl::CreateProgram();
    state.program = Some(program);
    gls::gl::AttachShader(program, vs);
    gls::gl::AttachShader(program, fs);
    gls::gl::LinkProgram(program);
    let mut ok = 0i32;
    gls::gl::GetProgramiv(program, gls::gl::LINK_STATUS, &mut ok);
    if ok == 0 {
        let mut log = [0u8; 4096];
        let mut len = 0i32;
        gls::gl::GetProgramInfoLog(
            program,
            log.len() as i32,
            &mut len,
            log.as_mut_ptr() as *mut _,
        );
        // `state` Drop deletes program + fs + vs as we return.
        return Err(Error::Internal(format!(
            "program link failed: {}",
            String::from_utf8_lossy(&log[..len.max(0) as usize])
        )));
    }

    // Success: detach shaders + delete them (GL drops them when
    // unreferenced by any program). Disarm state so it doesn't delete
    // the program we're returning.
    gls::gl::DeleteShader(state.vs.take().unwrap());
    gls::gl::DeleteShader(state.fs.take().unwrap());
    let program = state.program.take().unwrap();
    std::mem::forget(state);
    Ok(program)
}

/// Compile a single shader stage and return its GL object name.
///
/// `kind` is the GL shader-type enum passed straight to `CreateShader`
/// (the caller in this file passes `VERTEX_SHADER` / `FRAGMENT_SHADER`);
/// `src` is the shader source text.
///
/// # Errors
/// * `Error::Internal` if `src` contains an interior NUL byte.
/// * `Error::Internal` with the shader info log (read into a 4096-byte
///   buffer) if compilation fails. The shader object is deleted before
///   returning, so a failed call leaves no GL object behind.
///
/// # Safety
/// Issues raw GL calls. NOTE(review): presumably requires a GL context to
/// be current on the calling thread — confirm against callers.
unsafe fn compile_shader(kind: u32, src: &str) -> Result<u32> {
    // Validate the source before creating the GL object so that a bad
    // string cannot leak a shader name on the early return.
    let c = CString::new(src).map_err(|e| Error::Internal(format!("shader CString: {e}")))?;

    let shader = gls::gl::CreateShader(kind);
    let ptr = c.as_ptr();
    let len = src.len() as i32;
    gls::gl::ShaderSource(shader, 1, &ptr, &len);
    gls::gl::CompileShader(shader);

    let mut ok = 0i32;
    gls::gl::GetShaderiv(shader, gls::gl::COMPILE_STATUS, &mut ok);
    if ok == 0 {
        let mut log = [0u8; 4096];
        let mut len = 0i32;
        gls::gl::GetShaderInfoLog(
            shader,
            log.len() as i32,
            &mut len,
            log.as_mut_ptr() as *mut _,
        );
        // The caller never sees this name on failure, so it has to be
        // released here or it is leaked for the life of the context.
        gls::gl::DeleteShader(shader);
        // Clamp so a misreported length can never index past the buffer.
        let written = (len.max(0) as usize).min(log.len());
        return Err(Error::Internal(format!(
            "shader compile failed (kind=0x{kind:x}): {}",
            String::from_utf8_lossy(&log[..written])
        )));
    }
    Ok(shader)
}