edgefirst-tensor 0.24.2

Zero-copy tensor memory management with DMA, shared memory, and heap backends
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
// SPDX-FileCopyrightText: Copyright 2026 Au-Zone Technologies
// SPDX-License-Identifier: Apache-2.0

//! IOSurface-backed tensor storage for macOS.
//!
//! `IoSurfaceTensor<T>` is the macOS counterpart to `DmaTensor<T>` on
//! Linux: a zero-copy GPU↔CPU buffer that the OpenGL backend (ANGLE on
//! macOS) can import directly via `EGL_ANGLE_iosurface_client_buffer`.
//! Both fit into the `TensorMemory::Dma` slot — the variant name is
//! shared, the inner storage type differs per platform.
//!
//! ## Bindings approach
//!
//! Raw FFI to the IOSurface and CoreFoundation frameworks (linked via
//! `#[link]`). The `objc2-io-surface` crate's Obj-C wrappers want
//! `NSDictionary` for properties and don't accept CFDictionary cleanly
//! through their `initWithProperties` API; the C IOSurface API takes
//! `CFDictionaryRef` directly, which is what we have to hand and what
//! the spike at `spikes/angle_iosurface/` validates.
//!
//! ## CPU access
//!
//! `map()` returns `IoSurfaceMap<T>` which holds an `IOSurfaceLock` and
//! exposes the base address as a slice. `unmap()`/`Drop` calls the
//! matching unlock. IOSurface handles GPU↔CPU cache coherency
//! implicitly — no separate `DMA_BUF_IOCTL_SYNC` analog needed.
//!
//! ## Cross-process sharing
//!
//! IOSurfaces are identified by `IOSurfaceID` (u32) within a host. The
//! GL backend uses this id as part of the buffer-identity cache key so
//! repeated frames for the same IOSurface reuse the EGL pbuffer import.
//! Full Mach port passing is deferred.

#![cfg(target_os = "macos")]

use crate::{
    error::{Error, Result},
    BufferIdentity, PixelFormat, TensorMap, TensorMapTrait, TensorMemory, TensorTrait,
};
use log::trace;
use num_traits::Num;
use std::{
    ffi::c_void,
    fmt,
    marker::PhantomData,
    ops::{Deref, DerefMut},
    ptr::NonNull,
    sync::Arc,
};

// ---------------------------------------------------------------------------
// Raw FFI to IOSurface + CoreFoundation
// ---------------------------------------------------------------------------

/// Opaque `IOSurfaceRef` as seen across the C boundary.
type IOSurfaceRef = *mut c_void;
/// Opaque `CFDictionaryRef` (the properties dictionary handed to `IOSurfaceCreate`).
type CFDictionaryRef = *mut c_void;
/// Opaque `CFStringRef` (dictionary keys).
type CFStringRef = *mut c_void;
/// Opaque `CFNumberRef` (dictionary values).
type CFNumberRef = *mut c_void;

/// `kCFNumberLongType`: the value pointer given to `CFNumberCreate` is a C `long`.
const K_CF_NUMBER_LONG_TYPE: i32 = 10;
/// Encoding identifier passed to `CFStringCreateWithCString` for UTF-8 input.
const K_CF_STRING_ENCODING_UTF8: u32 = 0x0800_0100;

// IOSurface lock option bits.
/// Read-only lock option (bit 0).
const K_IOSURFACE_LOCK_READ_ONLY: u32 = 1 << 0;
/// Avoid-sync lock option (bit 1). Not referenced yet, hence the allow.
#[allow(dead_code)]
const K_IOSURFACE_LOCK_AVOID_SYNC: u32 = 1 << 1;

// IOSurface depends on CoreFoundation for the CFDictionary properties
// dict and CFNumber values. Both frameworks must be linked explicitly:
// clippy warns about the duplicate `kind` attribute but it's required —
// each framework is its own dylib.
//
// Every function below is a raw C entry point: callers are responsible
// for passing live, correctly-typed CoreFoundation / IOSurface objects.
#[allow(clippy::duplicated_attributes)]
#[link(name = "IOSurface", kind = "framework")]
#[link(name = "CoreFoundation", kind = "framework")]
extern "C" {
    // --- IOSurface ---------------------------------------------------------
    // Creates a surface from a properties dictionary; this file treats a
    // null return as allocation failure (see `OwnedIoSurface::from_created`).
    fn IOSurfaceCreate(properties: CFDictionaryRef) -> IOSurfaceRef;
    // Lock/unlock bracket CPU access. This file treats a zero return from
    // `IOSurfaceLock` as success and passes the same `options` to both calls.
    fn IOSurfaceLock(surface: IOSurfaceRef, options: u32, seed: *mut u32) -> i32;
    fn IOSurfaceUnlock(surface: IOSurfaceRef, options: u32, seed: *mut u32) -> i32;
    // Base address of the surface memory; queried only after a successful lock.
    fn IOSurfaceGetBaseAddress(surface: IOSurfaceRef) -> *mut c_void;
    // Total allocation size in bytes; stored as `buf_size` by the tensor types.
    fn IOSurfaceGetAllocSize(surface: IOSurfaceRef) -> usize;
    // Numeric surface identifier exposed through `surface_id()`.
    fn IOSurfaceGetID(surface: IOSurfaceRef) -> u32;

    // --- CoreFoundation reference counting ---------------------------------
    fn CFRetain(cf: *const c_void) -> *const c_void;
    fn CFRelease(cf: *const c_void);

    // --- CoreFoundation object construction (used only by `build_props`) ----
    fn CFDictionaryCreateMutable(
        allocator: *const c_void,
        capacity: isize,
        key_callbacks: *const c_void,
        value_callbacks: *const c_void,
    ) -> CFDictionaryRef;
    fn CFDictionarySetValue(dict: CFDictionaryRef, key: *const c_void, value: *const c_void);
    fn CFStringCreateWithCString(
        allocator: *const c_void,
        cstr: *const i8,
        encoding: u32,
    ) -> CFStringRef;
    fn CFNumberCreate(allocator: *const c_void, ty: i32, value_ptr: *const c_void) -> CFNumberRef;

    // Callback tables for dictionaries whose keys and values are CF objects.
    // Declared as opaque `c_void` because only their addresses are used.
    static kCFTypeDictionaryKeyCallBacks: c_void;
    static kCFTypeDictionaryValueCallBacks: c_void;
}

/// Owned IOSurface handle. Releases on Drop via `CFRelease`. Cloneable
/// via Arc — every clone shares the same underlying surface.
///
/// Cloning bumps the `Arc` count only; it does not call `CFRetain`. The
/// single reference held by the inner `IoSurfaceHandle` is released when
/// the last clone is dropped.
#[derive(Debug, Clone)]
pub(crate) struct OwnedIoSurface {
    /// Shared owner of the one retained `IOSurfaceRef`.
    inner: Arc<IoSurfaceHandle>,
}

/// Holder of exactly one reference on an `IOSurfaceRef`.
///
/// `OwnedIoSurface` keeps this behind an `Arc`, so however many clones
/// exist, `Drop` below runs once and the surface is released once.
#[derive(Debug)]
struct IoSurfaceHandle(IOSurfaceRef);

// The wrapped value is a raw pointer, so Send/Sync must be asserted by hand.
// NOTE(review): this relies on CoreFoundation retain/release and the
// IOSurface calls made through this handle being safe to issue from any
// thread — confirm against the framework documentation.
unsafe impl Send for IoSurfaceHandle {}
unsafe impl Sync for IoSurfaceHandle {}

impl Drop for IoSurfaceHandle {
    fn drop(&mut self) {
        let raw = self.0;
        if raw.is_null() {
            // Nothing was ever adopted; there is no reference to give back.
            return;
        }
        // Balances the reference taken at construction (create or retain).
        unsafe { CFRelease(raw as *const c_void) };
    }
}

impl OwnedIoSurface {
    /// Take ownership of an `IOSurfaceRef` returned by `IOSurfaceCreate`.
    /// The caller must not call `CFRelease` on the ref after this.
    fn from_created(ptr: IOSurfaceRef) -> Result<Self> {
        if ptr.is_null() {
            return Err(Error::IoError(std::io::Error::other(
                "IOSurfaceCreate returned null",
            )));
        }
        Ok(Self {
            inner: Arc::new(IoSurfaceHandle(ptr)),
        })
    }

    /// Wrap an externally-owned `IOSurfaceRef`. Calls `CFRetain` so the
    /// surface is kept alive while this `OwnedIoSurface` exists; the
    /// caller's reference is independent.
    pub(crate) fn from_external(ptr: IOSurfaceRef) -> Result<Self> {
        if ptr.is_null() {
            return Err(Error::InvalidArgument(
                "from_external: null IOSurfaceRef".into(),
            ));
        }
        unsafe { CFRetain(ptr as *const c_void) };
        Ok(Self {
            inner: Arc::new(IoSurfaceHandle(ptr)),
        })
    }

    pub(crate) fn as_ptr(&self) -> IOSurfaceRef {
        self.inner.0
    }
}

// ---------------------------------------------------------------------------
// IoSurfaceTensor — fits into TensorStorage::Dma on macOS.
// ---------------------------------------------------------------------------

/// Tensor whose storage is a single IOSurface.
///
/// Construct it with `TensorTrait::new`, `new_with_byte_size`,
/// `new_image`, or `from_surface`; obtain CPU access through `map()`.
#[derive(Debug)]
pub struct IoSurfaceTensor<T>
where
    T: Num + Clone + fmt::Debug + Send + Sync,
{
    /// Human-readable tensor name (caller-supplied or auto-generated).
    pub name: String,
    /// Retained handle to the backing IOSurface.
    pub(crate) surface: OwnedIoSurface,
    /// Logical dimensions. `map()` copies this into the mapping and the
    /// mapped slice length is the product of these dimensions. The field
    /// is public, so it can be changed without going through `reshape`.
    pub shape: Vec<usize>,
    /// Ties the element type `T` to the tensor without storing a `T`.
    pub _marker: PhantomData<T>,
    /// Identity token returned by `buffer_identity()`.
    identity: BufferIdentity,
    /// Total bytes allocated by the IOSurface (from `IOSurfaceGetAllocSize`).
    pub(crate) buf_size: usize,
    /// Whether this tensor was constructed from an externally-provided
    /// IOSurface via `from_surface`. Mirrors `DmaTensor::is_imported`
    /// and is reserved for diagnostic and C-API parity uses. Not yet
    /// consumed by any decision logic on macOS — IOSurface lifecycle is
    /// CFRetain/CFRelease symmetric regardless of import origin.
    #[allow(dead_code)]
    pub(crate) is_imported: bool,
}

// Manual Send/Sync assertions for the tensor type.
// NOTE(review): `OwnedIoSurface` is an `Arc` around a handle that already
// asserts Send + Sync, so these may be redundant unless `BufferIdentity`
// is not Send/Sync on its own — confirm before removing.
unsafe impl<T> Send for IoSurfaceTensor<T> where T: Num + Clone + fmt::Debug + Send + Sync {}
unsafe impl<T> Sync for IoSurfaceTensor<T> where T: Num + Clone + fmt::Debug + Send + Sync {}

impl<T> TensorTrait<T> for IoSurfaceTensor<T>
where
    T: Num + Clone + fmt::Debug + Send + Sync,
{
    fn new(shape: &[usize], name: Option<&str>) -> Result<Self> {
        let byte_size = shape
            .iter()
            .product::<usize>()
            .saturating_mul(std::mem::size_of::<T>());
        Self::new_with_byte_size(shape, byte_size, name)
    }

    fn from_fd(_fd: std::os::fd::OwnedFd, _shape: &[usize], _name: Option<&str>) -> Result<Self> {
        Err(Error::NotImplemented(
            "IoSurfaceTensor::from_fd: IOSurface is not fd-backed; use from_surface()".into(),
        ))
    }

    fn clone_fd(&self) -> Result<std::os::fd::OwnedFd> {
        Err(Error::NotImplemented(
            "IoSurfaceTensor::clone_fd: use surface_id() for cross-process sharing".into(),
        ))
    }

    fn memory(&self) -> TensorMemory {
        // Unified variant: macOS reports Dma, same as Linux. The variant
        // name is shared; the inner storage type differs per platform.
        TensorMemory::Dma
    }

    fn name(&self) -> String {
        self.name.clone()
    }

    fn shape(&self) -> &[usize] {
        &self.shape
    }

    fn reshape(&mut self, shape: &[usize]) -> Result<()> {
        let new_elems: usize = shape.iter().product();
        let cur_elems: usize = self.shape.iter().product();
        if new_elems != cur_elems {
            return Err(Error::InvalidShape(format!(
                "reshape: element count mismatch ({cur_elems}{new_elems})",
            )));
        }
        self.shape = shape.to_vec();
        Ok(())
    }

    fn map(&self) -> Result<TensorMap<T>> {
        let _span = tracing::trace_span!("tensor.map", memory = "iosurface",).entered();
        let m = IoSurfaceMap::new(self.surface.clone(), self.shape.clone(), self.buf_size)?;
        Ok(TensorMap::IoSurface(m))
    }

    fn buffer_identity(&self) -> &BufferIdentity {
        &self.identity
    }
}

impl<T> IoSurfaceTensor<T>
where
    T: Num + Clone + fmt::Debug + Send + Sync,
{
    /// Allocate a raw-byte IOSurface of at least `byte_size` bytes and
    /// label it with `shape`.
    ///
    /// The surface is described as a single row of one-byte `L008`
    /// elements; a request for zero bytes is rounded up to one. Surfaces
    /// with a real image layout (YUYV/NV12/BGRA FOURCC) are allocated
    /// separately by the GL backend in
    /// `crates/image/src/gl/iosurface_import.rs`.
    ///
    /// `shape` is stored as given; the recorded `buf_size` is whatever
    /// `IOSurfaceGetAllocSize` reports for the new surface.
    pub(crate) fn new_with_byte_size(
        shape: &[usize],
        byte_size: usize,
        name: Option<&str>,
    ) -> Result<Self> {
        // Span naming follows the project's `<crate>.<function>` convention
        // (`ARCHITECTURE.md § Span naming conventions`); the `memory` field
        // lets traces single out IOSurface allocations.
        let _span =
            tracing::trace_span!("tensor.alloc", memory = "iosurface", byte_size,).entered();

        let row_bytes = byte_size.max(1);
        // The dictionary only has to outlive `IOSurfaceCreate`; it is
        // released right after, whether or not creation succeeded.
        let dict = unsafe { build_props(row_bytes, 1, 1, FOURCC_L008) }?;
        let ptr = unsafe { IOSurfaceCreate(dict) };
        unsafe { CFRelease(dict as *const c_void) };

        let surface = OwnedIoSurface::from_created(ptr)?;
        let alloc = unsafe { IOSurfaceGetAllocSize(surface.as_ptr()) };

        let name = name.map_or_else(
            || format!("iosurface-{}", uuid::Uuid::new_v4()),
            str::to_owned,
        );

        trace!("IoSurfaceTensor::new: name={name} bytes={alloc} shape={shape:?}",);

        Ok(Self {
            name,
            surface,
            shape: shape.to_vec(),
            _marker: PhantomData,
            identity: BufferIdentity::new(),
            buf_size: alloc,
            is_imported: false,
        })
    }

    /// Allocate an image-formatted IOSurface — proper FourCC + per-pixel
    /// byte count + 2D dimensions, suitable for binding directly via
    /// `EGL_ANGLE_iosurface_client_buffer` on the GL backend.
    ///
    /// Unlike `new_with_byte_size`, the returned IOSurface has the
    /// format ANGLE expects when the GL backend later wraps it in a
    /// pbuffer. Used by `Tensor::image()` on macOS when the caller
    /// requests `TensorMemory::Dma`.
    pub(crate) fn new_image(
        width: usize,
        height: usize,
        format: PixelFormat,
        shape: &[usize],
        name: Option<&str>,
    ) -> Result<Self> {
        let _span =
            tracing::trace_span!("tensor.alloc", memory = "iosurface", width, height, ?format,)
                .entered();

        let (fourcc, bpe) = image_fourcc_and_bpe(format).ok_or_else(|| {
            Error::NotImplemented(format!(
                "IoSurfaceTensor::new_image: format {format:?} has no IOSurface FourCC mapping"
            ))
        })?;
        let dict = unsafe { build_props(width, height, bpe, fourcc) }?;
        let ptr = unsafe { IOSurfaceCreate(dict) };
        unsafe { CFRelease(dict) };
        let surface = OwnedIoSurface::from_created(ptr)?;
        let alloc = unsafe { IOSurfaceGetAllocSize(surface.as_ptr()) };

        let name = match name {
            Some(s) => s.to_owned(),
            None => format!("iosurface-img-{}", uuid::Uuid::new_v4()),
        };

        trace!(
            "IoSurfaceTensor::new_image: name={name} {width}x{height} {format:?} fourcc=0x{fourcc:08x} bytes={alloc}",
        );

        Ok(Self {
            name,
            surface,
            shape: shape.to_vec(),
            _marker: PhantomData,
            identity: BufferIdentity::new(),
            buf_size: alloc,
            is_imported: false,
        })
    }

    /// Build a tensor on top of an IOSurface that already exists, e.g.
    /// one produced by VideoToolbox or found through a cross-process
    /// `IOSurfaceID` lookup.
    ///
    /// The surface is retained for the lifetime of the tensor. The
    /// external owner keeps its own reference and releases it on its own
    /// schedule.
    ///
    /// Most callers should go through the public wrappers
    /// [`crate::TensorDyn::from_iosurface`] (type-erased) or
    /// [`crate::Tensor::from_iosurface`] (typed) instead of calling this
    /// constructor directly.
    ///
    /// # Errors
    ///
    /// * `InvalidArgument` if `surface_ref` is null.
    /// * `InvalidShape` if
    ///   `shape.iter().product::<usize>() * std::mem::size_of::<T>()`
    ///   overflows or exceeds the surface's allocated size
    ///   (`IOSurfaceGetAllocSize`). This guards against mismatches that
    ///   would otherwise turn into out-of-bounds access in
    ///   [`crate::Tensor::map`].
    ///
    /// # Safety
    ///
    /// `surface_ref` must be a valid, live `IOSurfaceRef`; a stale or
    /// invalid pointer is undefined behaviour. The size validation above
    /// does not relax this requirement.
    pub unsafe fn from_surface(
        surface_ref: *mut c_void,
        shape: &[usize],
        name: Option<&str>,
    ) -> Result<Self> {
        // Retain first: if validation fails below, dropping `surface`
        // gives the extra reference back.
        let surface = OwnedIoSurface::from_external(surface_ref)?;
        let alloc = IOSurfaceGetAllocSize(surface.as_ptr());

        let elem_size = std::mem::size_of::<T>();
        let elems: usize = shape.iter().product();
        let requested = match elems.checked_mul(elem_size) {
            Some(bytes) => bytes,
            None => {
                return Err(Error::InvalidShape(format!(
                    "from_surface: shape footprint overflows usize (shape={shape:?}, sizeof T={elem_size})",
                )));
            }
        };
        if alloc < requested {
            return Err(Error::InvalidShape(format!(
                "from_surface: shape requires {requested} bytes but IOSurface only \
                 has {alloc} (shape={shape:?}, sizeof T={elem_size})",
            )));
        }

        let name = name.map_or_else(
            || format!("iosurface-imported-{}", uuid::Uuid::new_v4()),
            str::to_owned,
        );

        Ok(Self {
            name,
            surface,
            shape: shape.to_vec(),
            _marker: PhantomData,
            identity: BufferIdentity::new(),
            buf_size: alloc,
            is_imported: true,
        })
    }

    /// `IOSurfaceID` of the backing surface, for cross-process sharing
    /// or import by the GL backend.
    pub fn surface_id(&self) -> u32 {
        let raw = self.surface.as_ptr();
        // `raw` comes from the handle this tensor owns, which keeps the
        // surface retained for as long as `self` is alive.
        unsafe { IOSurfaceGetID(raw) }
    }

    /// Raw `IOSurfaceRef` for the GL backend to pass to
    /// `eglCreatePbufferFromClientBuffer(EGL_IOSURFACE_ANGLE, ...)`.
    ///
    /// The pointer is borrowed: no additional reference is taken here, so
    /// it must not be used after the tensor (and every clone of its
    /// surface handle) has been dropped.
    pub fn surface_ref(&self) -> *mut c_void {
        self.surface.as_ptr()
    }
}

// ---------------------------------------------------------------------------
// Tensor<T> IOSurface accessors and import constructor — macOS-only.
//
// The GL backend (image crate) calls `iosurface_ref` when importing an
// IOSurface-backed tensor as an EGL pbuffer. The accessors return None
// for tensors backed by SHM/Mem/Pbo since they have no associated
// IOSurface.
// ---------------------------------------------------------------------------

impl<T> crate::Tensor<T>
where
    T: Num + Clone + fmt::Debug + Send + Sync,
{
    /// Raw `IOSurfaceRef` behind this tensor (macOS only).
    ///
    /// `Some(ptr)` when the storage is IOSurface-backed (the `Dma`
    /// storage variant on macOS), `None` for every other storage kind.
    /// The pointer is borrowed and is only valid while the tensor lives.
    pub fn iosurface_ref(&self) -> Option<*mut c_void> {
        if let crate::TensorStorage::Dma(io_tensor) = &self.storage {
            Some(io_tensor.surface.as_ptr())
        } else {
            None
        }
    }

    /// `IOSurfaceID` behind this tensor (macOS only), or `None` when the
    /// tensor is not IOSurface-backed.
    ///
    /// The ID stays the same for the lifetime of the IOSurface and may be
    /// sent to another process, which recovers the `IOSurfaceRef` with
    /// `IOSurfaceLookup(id)`.
    pub fn iosurface_id(&self) -> Option<u32> {
        let crate::TensorStorage::Dma(io_tensor) = &self.storage else {
            return None;
        };
        Some(io_tensor.surface_id())
    }

    /// Import an externally-allocated IOSurface as a tensor (macOS only).
    ///
    /// Intended for surfaces coming from VideoToolbox, AVFoundation or
    /// other producers, and for rebuilding a tensor from an IOSurfaceID
    /// received over a Mach port or XPC connection. The surface is
    /// retained for the tensor's lifetime; the external owner keeps its
    /// own reference and releases it independently.
    ///
    /// # Errors
    ///
    /// Returns `Err(InvalidShape)` unless
    /// `shape.iter().product::<usize>() * std::mem::size_of::<T>()` fits
    /// within the surface's allocated byte size
    /// (`IOSurfaceGetAllocSize`).
    ///
    /// # Safety
    ///
    /// `surface_ref` must be a valid live `IOSurfaceRef`. Passing a stale
    /// or invalid pointer is UB; the size check above does not cover
    /// pointer validity, which remains the caller's responsibility.
    pub unsafe fn from_iosurface(
        surface_ref: *mut c_void,
        shape: &[usize],
        name: Option<&str>,
    ) -> Result<Self>
    where
        T: num_traits::Num,
    {
        let inner = unsafe { IoSurfaceTensor::<T>::from_surface(surface_ref, shape, name) }?;
        let storage = crate::TensorStorage::Dma(inner);
        Ok(crate::Tensor::wrap(storage))
    }
}

// ---------------------------------------------------------------------------
// IoSurfaceMap — locked-for-CPU view.
// ---------------------------------------------------------------------------

/// CPU view of an IOSurface that is currently locked.
///
/// Created by `IoSurfaceTensor::map()`. The lock taken at construction is
/// released by `unmap()` or, at the latest, when the value is dropped.
#[derive(Debug)]
pub struct IoSurfaceMap<T>
where
    T: Num + Clone + fmt::Debug,
{
    /// Handle clone that keeps the surface alive while it is mapped.
    surface: OwnedIoSurface,
    /// Logical dimensions; their product is the length of the exposed slice.
    shape: Vec<usize>,
    /// Base address returned by `IOSurfaceGetBaseAddress` after locking.
    base_ptr: NonNull<c_void>,
    /// Allocation size in bytes of the mapped surface.
    buf_size: usize,
    /// Ties the element type `T` to the mapping without storing a `T`.
    _marker: PhantomData<T>,
    /// Lock options used at map time, replayed in unmap for symmetry.
    lock_options: u32,
    /// `true` until `unmap()` has released the lock; makes unmap idempotent.
    locked: bool,
}

// Manual Send/Sync assertions; required because `base_ptr` is a raw pointer.
// NOTE(review): assumes the locked base address may be used from any
// thread — confirm against the IOSurface documentation.
unsafe impl<T> Send for IoSurfaceMap<T> where T: Num + Clone + fmt::Debug {}
unsafe impl<T> Sync for IoSurfaceMap<T> where T: Num + Clone + fmt::Debug {}

impl<T> IoSurfaceMap<T>
where
    T: Num + Clone + fmt::Debug,
{
    fn new(surface: OwnedIoSurface, shape: Vec<usize>, buf_size: usize) -> Result<Self> {
        // Default to read-write (options = 0). The read-only path
        // (K_IOSURFACE_LOCK_READ_ONLY) skips a CPU cache flush when the
        // caller only reads — a measurable savings if it becomes a hot
        // path. Left as a future enhancement once we have call-site
        // information about read-vs-write intent.
        let options: u32 = 0;
        let mut seed: u32 = 0;
        let lock_rc = unsafe { IOSurfaceLock(surface.as_ptr(), options, &mut seed) };
        if lock_rc != 0 {
            return Err(Error::IoError(std::io::Error::other(format!(
                "IOSurfaceLock failed (rc={lock_rc})"
            ))));
        }
        let base = unsafe { IOSurfaceGetBaseAddress(surface.as_ptr()) };
        let base_ptr = NonNull::new(base).ok_or_else(|| {
            Error::IoError(std::io::Error::other(
                "IOSurfaceGetBaseAddress returned null after lock",
            ))
        })?;
        Ok(Self {
            surface,
            shape,
            base_ptr,
            buf_size,
            _marker: PhantomData,
            lock_options: options,
            locked: true,
        })
    }

    fn elem_count(&self) -> usize {
        self.shape.iter().product()
    }
}

impl<T> TensorMapTrait<T> for IoSurfaceMap<T>
where
    T: Num + Clone + fmt::Debug,
{
    fn shape(&self) -> &[usize] {
        &self.shape
    }

    fn unmap(&mut self) {
        if self.locked {
            let mut seed: u32 = 0;
            unsafe {
                IOSurfaceUnlock(self.surface.as_ptr(), self.lock_options, &mut seed);
            }
            self.locked = false;
        }
    }

    fn as_slice(&self) -> &[T] {
        self.deref()
    }

    fn as_mut_slice(&mut self) -> &mut [T] {
        self.deref_mut()
    }
}

impl<T> Deref for IoSurfaceMap<T>
where
    T: Num + Clone + fmt::Debug,
{
    type Target = [T];

    /// View the mapped surface as a slice of `elem_count()` elements.
    ///
    /// # Panics
    ///
    /// Panics if the shape's byte footprint overflows or exceeds
    /// `buf_size`. The check is a real `assert!`, not a `debug_assert!`:
    /// without it a release build would hand out a slice that extends
    /// past the allocation.
    fn deref(&self) -> &[T] {
        let len = self.elem_count();
        let elem_size = std::mem::size_of::<T>();
        assert!(
            len.checked_mul(elem_size)
                .is_some_and(|bytes| bytes <= self.buf_size),
            "IoSurfaceMap deref: {} elems × {} bytes > buf_size {}",
            len,
            elem_size,
            self.buf_size,
        );
        let ptr = self.base_ptr.as_ptr() as *const T;
        // The assertion above guarantees `len` elements of `T` lie inside
        // the `buf_size` bytes that start at `base_ptr`.
        unsafe { std::slice::from_raw_parts(ptr, len) }
    }
}

impl<T> DerefMut for IoSurfaceMap<T>
where
    T: Num + Clone + fmt::Debug,
{
    /// View the mapped surface as a mutable slice of `elem_count()`
    /// elements.
    ///
    /// # Panics
    ///
    /// Panics if the shape's byte footprint overflows or exceeds
    /// `buf_size`.
    fn deref_mut(&mut self) -> &mut [T] {
        let len = self.elem_count();
        let elem_size = std::mem::size_of::<T>();
        // Must be `assert!`: a `debug_assert!` is compiled out of release
        // builds, which is exactly where an oversized mutable write would
        // otherwise go unnoticed and corrupt memory past the allocation.
        assert!(
            len.checked_mul(elem_size)
                .is_some_and(|bytes| bytes <= self.buf_size),
            "IoSurfaceMap deref_mut: {} elems × {} bytes > buf_size {}",
            len,
            elem_size,
            self.buf_size,
        );
        let ptr = self.base_ptr.as_ptr() as *mut T;
        // The assertion above guarantees `len` elements of `T` lie inside
        // the `buf_size` bytes that start at `base_ptr`; `&mut self`
        // makes this the only view handed out through this mapping.
        unsafe { std::slice::from_raw_parts_mut(ptr, len) }
    }
}

impl<T> Drop for IoSurfaceMap<T>
where
    T: Num + Clone + fmt::Debug,
{
    fn drop(&mut self) {
        self.unmap();
    }
}

// ---------------------------------------------------------------------------
// CFDictionary builder for IOSurfaceCreate
// ---------------------------------------------------------------------------

/// FourCC `L008` (kCVPixelFormatType_OneComponent8): one 8-bit channel
/// per element. `IoSurfaceTensor::new_with_byte_size` uses it for
/// raw-byte allocations. The four ASCII bytes are packed big-endian.
const FOURCC_L008: u32 = u32::from_be_bytes([b'L', b'0', b'0', b'8']);

/// Map a `PixelFormat` to the IOSurface FourCC and bytes-per-element
/// used when allocating an image-formatted IOSurface.
///
/// The GL backend's `EGL_ANGLE_iosurface_client_buffer` import needs the
/// IOSurface pixel format to agree with the GL internal format / type:
/// ANGLE checks `IOSurfaceGetBytesPerElement` against the requested
/// `EGL_TEXTURE_INTERNAL_FORMAT_ANGLE` and answers mismatches with
/// `EGL_BAD_ATTRIBUTE`.
///
/// **This table is the single source of truth for
/// `PixelFormat → (FourCC, bpe)`.** The image crate's macOS GL backend
/// reads it through [`image_iosurface_layout`] when it builds the EGL
/// pbuffer attribute list; do not duplicate the table there.
///
/// Returns `None` for formats the GL backend does not support on macOS;
/// callers then fall back to SHM/Mem and a CPU code path.
fn image_fourcc_and_bpe(format: PixelFormat) -> Option<(u32, usize)> {
    let (tag, bytes_per_element): (&[u8; 4], usize) = match format {
        // YUYV is packed 4:2:2 at 2 bytes/pixel and is sampled as GL_RG
        // through FourCC '2C08' (kCVPixelFormatType_TwoComponent8).
        PixelFormat::Yuyv => (b"2C08", 2),
        // For the 4-byte formats the FourCC spells the in-memory byte
        // order: 'RGBA' for Rgba tensors and 'BGRA' for Bgra. ANGLE
        // handles both via `EGL_TEXTURE_INTERNAL_FORMAT_ANGLE = GL_RGBA`
        // / `GL_BGRA_EXT`. Using 'BGRA' for both would store Rgba tensors
        // in BGRA order and break the Rgba contract.
        PixelFormat::Rgba => (b"RGBA", 4),
        PixelFormat::Bgra => (b"BGRA", 4),
        _ => return None,
    };
    Some((u32::from_be_bytes(*tag), bytes_per_element))
}

/// Public re-export of the `PixelFormat → (FourCC, bytes-per-element)`
/// mapping for callers in other crates (specifically the `edgefirst-image`
/// macOS GL backend). The FourCC is the cross-crate identifier — the GL
/// backend maps it to the matching `EGL_TEXTURE_INTERNAL_FORMAT_ANGLE`
/// internally.
///
/// The image crate must use this function rather than duplicating the
/// table; a drift between the allocation and import sides produced a
/// silent R↔B swap during macOS bring-up (mapping Rgba to `'BGRA'`),
/// which is why the table now lives in one place.
///
/// The returned tuple is `(fourcc, bytes_per_element)`, with the FourCC
/// packed big-endian from its four ASCII characters.
///
/// Returns `None` when the format does not have a defined IOSurface
/// FourCC mapping in HAL (NV12, planar layouts, etc).
pub fn image_iosurface_layout(format: PixelFormat) -> Option<(u32, usize)> {
    // Thin forwarder: the private table stays the only place to edit.
    image_fourcc_and_bpe(format)
}

unsafe fn build_props(
    width: usize,
    height: usize,
    bytes_per_element: usize,
    fourcc: u32,
) -> Result<CFDictionaryRef> {
    let bytes_per_row = (width * bytes_per_element + 63) & !63;
    let alloc_size = bytes_per_row * height;

    let dict = CFDictionaryCreateMutable(
        std::ptr::null(),
        0,
        &kCFTypeDictionaryKeyCallBacks,
        &kCFTypeDictionaryValueCallBacks,
    );
    if dict.is_null() {
        return Err(Error::IoError(std::io::Error::other(
            "CFDictionaryCreateMutable returned null",
        )));
    }

    let set_num = |key: &str, value: i64| -> Result<()> {
        let key_c = std::ffi::CString::new(key)
            .map_err(|e| Error::InvalidArgument(format!("CString: {e}")))?;
        let key_cf =
            CFStringCreateWithCString(std::ptr::null(), key_c.as_ptr(), K_CF_STRING_ENCODING_UTF8);
        if key_cf.is_null() {
            return Err(Error::IoError(std::io::Error::other(
                "CFStringCreateWithCString returned null",
            )));
        }
        let value_cf = CFNumberCreate(
            std::ptr::null(),
            K_CF_NUMBER_LONG_TYPE,
            &value as *const i64 as *const c_void,
        );
        if value_cf.is_null() {
            CFRelease(key_cf as *const c_void);
            return Err(Error::IoError(std::io::Error::other(
                "CFNumberCreate returned null",
            )));
        }
        CFDictionarySetValue(dict, key_cf as *const c_void, value_cf as *const c_void);
        CFRelease(key_cf as *const c_void);
        CFRelease(value_cf as *const c_void);
        Ok(())
    };

    let result = (|| -> Result<()> {
        set_num("IOSurfaceWidth", width as i64)?;
        set_num("IOSurfaceHeight", height as i64)?;
        set_num("IOSurfaceBytesPerElement", bytes_per_element as i64)?;
        set_num("IOSurfacePixelFormat", fourcc as i64)?;
        set_num("IOSurfaceBytesPerRow", bytes_per_row as i64)?;
        set_num("IOSurfaceAllocSize", alloc_size as i64)?;
        Ok(())
    })();

    if let Err(e) = result {
        CFRelease(dict as *const c_void);
        return Err(e);
    }
    let _ = K_IOSURFACE_LOCK_READ_ONLY; // silence unused on bring-up
    Ok(dict)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Bytes written through one mapping must be visible through a later,
    /// independent mapping of the same tensor.
    #[test]
    fn alloc_map_write_read_roundtrip() {
        const LEN: usize = 256;

        let tensor = IoSurfaceTensor::<u8>::new(&[LEN], None).expect("alloc");
        assert!(tensor.buf_size >= LEN, "buf_size should accommodate shape");
        assert_eq!(tensor.memory(), TensorMemory::Dma);

        // First mapping: fill with a counting pattern, then release it.
        let mut writer = tensor.map().expect("map");
        let bytes = writer.as_mut_slice();
        assert!(bytes.len() >= LEN);
        bytes
            .iter_mut()
            .take(LEN)
            .enumerate()
            .for_each(|(i, b)| *b = (i & 0xff) as u8);
        drop(writer);

        // Second mapping: the pattern must read back unchanged.
        let reader = tensor.map().expect("remap");
        for (i, b) in reader.as_slice().iter().take(LEN).enumerate() {
            assert_eq!(*b, (i & 0xff) as u8, "byte {i} mismatch");
        }
    }

    /// A freshly allocated surface reports a non-zero ID.
    #[test]
    fn surface_id_is_nonzero() {
        let tensor = IoSurfaceTensor::<u8>::new(&[64], None).expect("alloc");
        let id = tensor.surface_id();
        assert!(id != 0, "IOSurface IDs should be nonzero");
    }

    /// `reshape` accepts an equal element count and rejects a different one.
    #[test]
    fn shape_reshape_roundtrip() {
        let mut tensor = IoSurfaceTensor::<u8>::new(&[16, 16], None).expect("alloc");
        assert_eq!(tensor.shape(), &[16, 16]);

        tensor.reshape(&[256]).expect("flatten");
        assert_eq!(tensor.shape(), &[256]);

        // 100 elements != 256 elements, so this must fail.
        let mismatch = tensor.reshape(&[100]);
        assert!(mismatch.is_err());
    }

    /// Importing a surface under a shape larger than its allocation fails
    /// with `InvalidShape`; a shape that fits is accepted.
    #[test]
    fn from_surface_rejects_shape_overflowing_alloc() {
        // Small backing surface that the imports below borrow.
        let backing = IoSurfaceTensor::<u8>::new(&[64], None).expect("alloc");
        let alloc = backing.buf_size;
        let surface_ref = backing.surface.as_ptr();

        // With u32 elements the request is (alloc + 1) * 4 bytes ≫ alloc.
        let bad_shape = [alloc + 1];
        let rejected =
            unsafe { IoSurfaceTensor::<u32>::from_surface(surface_ref, &bad_shape, None) };
        let err = rejected.expect_err("oversized shape must be rejected");
        match err {
            Error::InvalidShape(msg) => assert!(
                msg.contains("IOSurface only has"),
                "unexpected message: {msg}"
            ),
            other => panic!("expected InvalidShape, got {other:?}"),
        }

        // Sanity check: the very same surface accepts a shape that fits.
        let ok_shape = [alloc / std::mem::size_of::<u32>()];
        let accepted =
            unsafe { IoSurfaceTensor::<u32>::from_surface(surface_ref, &ok_shape, None) };
        accepted.expect("fitting shape should succeed");
    }
}