edgefirst_tensor/
lib.rs

1// SPDX-FileCopyrightText: Copyright 2025 Au-Zone Technologies
2// SPDX-License-Identifier: Apache-2.0
3
4/*!
5EdgeFirst HAL - Tensor Module
6
7The `edgefirst_tensor` crate provides a unified interface for managing multi-dimensional arrays (tensors)
8with support for different memory types, including Direct Memory Access (DMA), POSIX Shared Memory (Shm),
9and system memory. The crate defines traits and structures for creating, reshaping, and mapping tensors into memory.
10
11## Examples
12```rust
13use edgefirst_tensor::{Error, Tensor, TensorMemory, TensorTrait};
14# fn main() -> Result<(), Error> {
15let tensor = Tensor::<f32>::new(&[2, 3, 4], Some(TensorMemory::Mem), Some("test_tensor"))?;
16assert_eq!(tensor.memory(), TensorMemory::Mem);
17assert_eq!(tensor.name(), "test_tensor");
18#    Ok(())
19# }
20```
21
22## Overview
23The main structures and traits provided by the `edgefirst_tensor` crate are `TensorTrait` and `TensorMapTrait`,
24which define the behavior of Tensors and their memory mappings, respectively.
25The `Tensor<T>` struct wraps a backend-specific storage with optional image format metadata (`PixelFormat`),
26while the `TensorMap` enum provides access to the underlying data. The `TensorDyn` type-erased enum
27wraps `Tensor<T>` for runtime element-type dispatch.
28 */
29#[cfg(target_os = "android")]
30mod ahardwarebuffer;
31// Pure AHardwareBuffer layout logic (format table, descriptor geometry,
32// overflow-checked shape math) — cfg-free so it compiles and unit-tests
33// on every host; the android module above consumes it.
34#[allow(dead_code)]
35mod ahardwarebuffer_layout;
36pub mod colorimetry;
37pub mod covguard;
38mod cuda;
39#[cfg(target_os = "linux")]
40mod dma;
41#[cfg(target_os = "linux")]
42mod dmabuf;
43mod error;
44mod format;
45#[cfg(any(target_os = "macos", target_os = "ios"))]
46mod iosurface;
47mod mem;
48mod pbo;
49#[cfg(unix)]
50mod shm;
51mod tensor_dyn;
52pub use colorimetry::{
53    ColorEncoding, ColorRange, ColorSpace, ColorTransfer, Colorimetry, MatrixWeights, RangeScaling,
54};
55
/// Coverage-only constructor hook for this crate's instrumented test binary.
///
/// The static holds a function pointer placed in the `.init_array` section and
/// is marked `#[used]` so it is retained even though nothing references it by
/// name. The pointed-to function installs the flush-on-abort handler from
/// `covguard`. Compiled only under coverage on Linux (`.init_array` is
/// ELF-only; the i.MX flush is Linux-only).
#[cfg(all(coverage, target_os = "linux"))]
#[used]
#[link_section = ".init_array"]
static __EDGEFIRST_COV_INSTALL: extern "C" fn() = {
    // Thin C-ABI trampoline: `.init_array` entries must be plain
    // `extern "C" fn()` pointers.
    extern "C" fn install_cov_guard() {
        crate::covguard::install();
    }
    install_cov_guard
};
68
69// Backing tensor/map types are internal implementation details: callers
70// allocate `Tensor<T>` / `TensorDyn` and map them, never naming the per-memory
71// backing types directly. They are `pub(crate)` so they stay nameable for the
72// `TensorStorage` / `TensorMap` enums without leaking into the public API.
// Exceptions kept public: `Pbo*` is a GL extension point implemented by the
// image crate, and `image_ahardwarebuffer_layout` / `image_iosurface_layout`
// are public per-platform layout helpers.
75#[cfg(target_os = "android")]
76pub use crate::ahardwarebuffer::image_ahardwarebuffer_layout;
77#[cfg(target_os = "android")]
78pub(crate) use crate::ahardwarebuffer::{AHardwareBufferMap, AHardwareBufferTensor};
79#[cfg(target_os = "linux")]
80pub(crate) use crate::dma::{DmaMap, DmaTensor};
81#[cfg(any(target_os = "macos", target_os = "ios"))]
82pub use crate::iosurface::image_iosurface_layout;
83#[cfg(any(target_os = "macos", target_os = "ios"))]
84pub(crate) use crate::iosurface::{IoSurfaceMap, IoSurfaceTensor};
85pub(crate) use crate::mem::{MemMap, MemTensor};
86pub use crate::pbo::{PboMap, PboMapping, PboOps, PboTensor};
87#[cfg(unix)]
88pub(crate) use crate::shm::{ShmMap, ShmTensor};
89pub use cuda::{
90    gl_map_resource, gl_register_buffer, gl_unmap_resource, gl_unregister_resource,
91    is_cuda_available, memcpy_device_to_host, stream_create, stream_destroy, stream_synchronize,
92    CudaGlOps, CudaHandle, CudaMap, CudaStream,
93};
94pub use error::{Error, Result};
95pub use format::{ChromaLayout, PixelFormat, PixelLayout};
96use num_traits::Num;
97use serde::{Deserialize, Serialize};
98#[cfg(unix)]
99use std::os::fd::OwnedFd;
100use std::{
101    fmt,
102    ops::{Deref, DerefMut},
103    sync::{
104        atomic::{AtomicU64, Ordering},
105        Arc, Weak,
106    },
107};
108pub use tensor_dyn::TensorDyn;
109
/// Opaque keep-alive handle for a foreign-memory tensor (see
/// [`Tensor::from_foreign`] / [`TensorDyn::from_foreign_ptr`]).
///
/// The HAL borrows the foreign buffer without owning it; this handle co-owns
/// the *source* so the borrowed memory stays valid for the tensor's life. Its
/// `Drop` releases the source — e.g. a small struct that calls `cudaFreeHost`,
/// or a `Py<PyAny>` that decrements a NumPy array's refcount. Wrapping it in an
/// `Arc` (then boxing each clone) makes the release fire exactly once, after
/// the last sharing tensor/view/map drops, regardless of drop order.
///
/// The alias is type-erased: any `'static` value that is `Send + Sync` can be
/// boxed into it.
pub type ForeignOwner = Box<dyn std::any::Any + Send + Sync>;
120
121/// Re-export of `half::f16` so downstream crates can write
122/// `Tensor::<edgefirst_tensor::f16>::from_iosurface(…)` without
123/// adding `half` to their own dependency list. The version stays in
124/// lockstep with the `half` workspace dep.
125pub use half::f16;
126
127// =============================================================================
128// RGBA16F packed-layout geometry — single source of truth
129//
130// A `PlanarRgb` [3,H,W] or `PlanarRgba` [4,H,W] f16 tensor is represented
131// on the GPU as an RGBA16F surface (the only float format accepted by the
132// ANGLE IOSurface extension). Four contiguous f16 elements are packed into
133// each 8-byte RGBA16F texel, yielding a `(W/4, C*H)` surface.
134//
135// All call sites that need these dimensions must use `packed_rgba16f_layout`
136// so the rule lives in exactly one place. Currently consumed by:
137//  - `crates/tensor/src/iosurface.rs` `new_image` (macOS IOSurface alloc)
138//  - `crates/image/src/gl/iosurface_import.rs` (macOS GL IOSurface import)
139//  - `crates/image/src/gl/processor/float.rs` (Linux GL float render — PBO
140//    readback and DMA-BUF, also via the `dma_f16_packed_layout` wrapper)
141// =============================================================================
142
143/// Geometry of the RGBA16F-packed surface backing a planar F16 image tensor.
144///
145/// ANGLE only supports one float `(type, internal_format)` pair for IOSurface
146/// import: `(GL_HALF_FLOAT, GL_RGBA)` = RGBA16F (8 bytes/texel). To map a
147/// `[C, H, W]` f16 planar tensor onto such a surface, 4 contiguous f16
148/// elements are packed into each RGBA16F texel, yielding a surface of
149/// `(W/4, C*H)` texels at 8 bytes/texel. The byte stream is identical to a
150/// (nonexistent) R16F `(W, C*H)` surface and can be consumed as `&[f16]`
151/// with shape `[1, C, H, W]` without rearrangement.
152///
153/// Obtain via [`packed_rgba16f_layout`] — never construct directly.
154#[derive(Debug, Clone, Copy, PartialEq, Eq)]
155pub struct PackedRgba16fLayout {
156    /// Surface width in texels (`width / 4`).
157    pub surface_w: usize,
158    /// Surface height in texels (`planes * height`).
159    pub surface_h: usize,
160    /// Bytes per RGBA16F texel (always 8).
161    pub bytes_per_texel: usize,
162    /// Row pitch in bytes (`surface_w * 8`).
163    pub pitch: usize,
164}
165
166/// Canonical geometry for the RGBA16F-packed surface backing a planar F16
167/// image tensor.
168///
169/// Returns `Some(layout)` only when **all** of the following hold:
170///
171/// - `dtype == DType::F16`
172/// - `format` is `PixelFormat::PlanarRgb` (3 planes) or
173///   `PixelFormat::PlanarRgba` (4 planes)
174/// - `width % 4 == 0`
175///
176/// Returns `None` for any other `(format, dtype)` combination, misaligned
177/// width, or when the surface geometry would overflow `usize` — callers
178/// must fall back to a non-packed path or return a context-appropriate
179/// error.
180///
181/// # Examples
182///
183/// ```rust
184/// use edgefirst_tensor::{packed_rgba16f_layout, PixelFormat, DType};
185///
186/// let layout = packed_rgba16f_layout(PixelFormat::PlanarRgb, DType::F16, 640, 480).unwrap();
187/// assert_eq!(layout.surface_w, 160);
188/// assert_eq!(layout.surface_h, 1440);
189/// assert_eq!(layout.bytes_per_texel, 8);
190/// assert_eq!(layout.pitch, 1280);
191/// ```
192pub fn packed_rgba16f_layout(
193    format: PixelFormat,
194    dtype: DType,
195    width: usize,
196    height: usize,
197) -> Option<PackedRgba16fLayout> {
198    if dtype != DType::F16 {
199        return None;
200    }
201    let planes: usize = match format {
202        PixelFormat::PlanarRgb => 3,
203        PixelFormat::PlanarRgba => 4,
204        _ => return None,
205    };
206    if !width.is_multiple_of(4) {
207        return None;
208    }
209    let surface_w = width / 4;
210    // Checked arithmetic: a degenerate (height, width) could otherwise wrap
211    // and yield an under-sized layout, which downstream allocators trust for
212    // GPU/CPU buffer sizing. Overflow → None (handled like any other
213    // unsupported geometry).
214    let surface_h = planes.checked_mul(height)?;
215    let bytes_per_texel = 8;
216    let pitch = surface_w.checked_mul(bytes_per_texel)?;
217    Some(PackedRgba16fLayout {
218        surface_w,
219        surface_h,
220        bytes_per_texel,
221        pitch,
222    })
223}
224
/// Compute the geometry of the RGBA8888-packed surface backing a packed RGB
/// u8/i8 image tensor.
///
/// GPUs have no 3-channel renderable format, so the GL engine's two-pass
/// packed-RGB shader writes the tight `[H, W, 3]` byte stream into an
/// RGBA8888 surface: each texel carries 4 consecutive RGB bytes, giving a
/// `(W*3/4, H)` surface at 4 bytes/texel whose rows are byte-identical to
/// tight RGB — consumable flat as `[H, W, 3]` with no rearrangement (the
/// u8/i8 analog of [`packed_rgba16f_layout`]; i8 shares the layout since
/// INT8 quantization is a per-byte `^0x80` bias, not a format change).
///
/// Returns `Some(layout)` only when `width % 4 == 0` (so `W*3` bytes divide
/// into whole texels) and the geometry does not overflow `usize` — both the
/// row size `width * 3` and the total byte size `pitch * height` must fit.
///
/// # Examples
///
/// ```rust
/// use edgefirst_tensor::packed_rgb888_layout;
///
/// let layout = packed_rgb888_layout(640, 480).unwrap();
/// assert_eq!(layout.surface_w, 480); // 640*3/4
/// assert_eq!(layout.surface_h, 480);
/// assert_eq!(layout.bytes_per_texel, 4);
/// assert_eq!(layout.pitch, 1920); // 640*3
/// assert!(packed_rgb888_layout(641, 480).is_none());
/// ```
pub fn packed_rgb888_layout(width: usize, height: usize) -> Option<PackedRgb888Layout> {
    if !width.is_multiple_of(4) {
        return None;
    }
    let row_bytes = width.checked_mul(3)?;
    // Consumers size their buffers as `pitch * surface_h`; refuse geometries
    // whose total byte size would wrap instead of handing out a layout that
    // leads to an under-sized allocation.
    let _total_bytes = row_bytes.checked_mul(height)?;
    let surface_w = row_bytes / 4;
    Some(PackedRgb888Layout {
        surface_w,
        surface_h: height,
        bytes_per_texel: 4,
        pitch: row_bytes,
    })
}

/// Geometry of the RGBA8888 surface backing a packed RGB u8/i8 image tensor.
///
/// Obtain via [`packed_rgb888_layout`] — never construct directly. A layout
/// returned by that function guarantees that `pitch * surface_h` fits in
/// `usize`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PackedRgb888Layout {
    /// Surface width in texels (`width * 3 / 4`).
    pub surface_w: usize,
    /// Surface height in texels (`height`).
    pub surface_h: usize,
    /// Bytes per RGBA8888 texel (always 4).
    pub bytes_per_texel: usize,
    /// Row pitch in bytes (`surface_w * 4` = `width * 3`).
    pub pitch: usize,
}
279
280/// Per-plane DMA-BUF descriptor for external buffer import.
281///
282/// Owns a duplicated file descriptor plus optional stride and offset metadata.
283/// The fd is duplicated eagerly in [`new()`](Self::new) so that a bad fd is
284/// caught immediately. `import_image` consumes the descriptor and takes
285/// ownership of the duped fd — no further cleanup is needed by the caller.
286///
287/// # Examples
288///
289/// ```rust,no_run
290/// use edgefirst_tensor::PlaneDescriptor;
291/// use std::os::fd::BorrowedFd;
292///
293/// // SAFETY: fd 42 is hypothetical; real code must pass a valid fd.
294/// let pd = unsafe { PlaneDescriptor::new(BorrowedFd::borrow_raw(42)) }
295///     .unwrap()
296///     .with_stride(2048)
297///     .with_offset(0);
298/// ```
299#[cfg(unix)]
300pub struct PlaneDescriptor {
301    fd: OwnedFd,
302    stride: Option<usize>,
303    offset: Option<usize>,
304}
305
306#[cfg(unix)]
307impl PlaneDescriptor {
308    /// Create a new plane descriptor by duplicating the given file descriptor.
309    ///
310    /// The fd is duped immediately — a bad fd fails here rather than inside
311    /// `import_image`. The caller retains ownership of the original fd.
312    ///
313    /// # Errors
314    ///
315    /// Returns an error if the `dup()` syscall fails (e.g. invalid fd or
316    /// fd limit reached).
317    pub fn new(fd: std::os::fd::BorrowedFd<'_>) -> Result<Self> {
318        let owned = fd.try_clone_to_owned()?;
319        Ok(Self {
320            fd: owned,
321            stride: None,
322            offset: None,
323        })
324    }
325
326    /// Set the row stride in bytes (consuming builder).
327    pub fn with_stride(mut self, stride: usize) -> Self {
328        self.stride = Some(stride);
329        self
330    }
331
332    /// Set the plane offset in bytes (consuming builder).
333    pub fn with_offset(mut self, offset: usize) -> Self {
334        self.offset = Some(offset);
335        self
336    }
337
338    /// Consume the descriptor and return the owned file descriptor.
339    pub fn into_fd(self) -> OwnedFd {
340        self.fd
341    }
342
343    /// Row stride in bytes, if set.
344    pub fn stride(&self) -> Option<usize> {
345        self.stride
346    }
347
348    /// Plane offset in bytes, if set.
349    pub fn offset(&self) -> Option<usize> {
350        self.offset
351    }
352}
353
/// A rectangular sub-region of a tensor's leading spatial frame, in pixels.
///
/// `Region` is the single rectangle type in the workspace: the argument to
/// [`Tensor::view`], the source sampling window in the image crate's `Crop`,
/// and the geometry the image backend lowers to a `glViewport` (a destination
/// tile) or a sampling rectangle (a source). Coordinates are pixel/element
/// units of the leading spatial axes; byte addressing is derived from the
/// parent's row stride, not stored here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Region {
    /// Left edge of the region, in pixels.
    pub x: usize,
    /// Top edge of the region, in pixels.
    pub y: usize,
    /// Horizontal extent of the region, in pixels.
    pub width: usize,
    /// Vertical extent of the region, in pixels.
    pub height: usize,
}

impl Region {
    /// Create a region at `(x, y)` spanning `width` × `height` pixels.
    pub fn new(x: usize, y: usize, width: usize, height: usize) -> Self {
        Self {
            x,
            y,
            width,
            height,
        }
    }

    /// True when the region lies fully within a `width` × `height` frame.
    ///
    /// A region whose far edge (`x + width` or `y + height`) does not fit in
    /// `usize` never fits, whatever the frame size. A saturating add would
    /// clamp such an edge to `usize::MAX` and wrongly accept it against a
    /// frame dimension of `usize::MAX`, so the sum is checked instead.
    pub fn fits_within(&self, width: usize, height: usize) -> bool {
        // `origin + extent <= limit`, with arithmetic overflow treated as
        // "does not fit".
        let fits = |origin: usize, extent: usize, limit: usize| {
            origin
                .checked_add(extent)
                .is_some_and(|end| end <= limit)
        };
        fits(self.x, self.width, width) && fits(self.y, self.height, height)
    }
}
386
/// The parent image a [`view`](Tensor::view)/[`batch`](Tensor::batch) sub-region
/// was carved from, snapshotted at the time the view was created.
///
/// A view shares the parent's `BufferIdentity` and addresses a sub-rectangle of
/// it. The GL backend keys its EGLImage import on the **parent** geometry (so all
/// sibling views of one buffer collapse to a single import) and renders each
/// view as a `glViewport`+`glScissor` ROI at `(x, y, width, height)` within that
/// parent — the view is render state, never a distinct import. `parent_width`/
/// `parent_height` are the parent's logical pixel dimensions; `x`/`y` are this
/// view's top-left origin within the parent (pixels). Nested views compose:
/// the snapshot always names the **root** parent, with offsets accumulated.
///
/// This is plain `Copy` data (five `usize` fields); it holds no reference to
/// the parent tensor itself.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ViewOrigin {
    /// Logical width of the root parent image, in pixels.
    pub parent_width: usize,
    /// Logical height of the root parent image, in pixels. For a `batch(n)` view
    /// of an `[N, H, W, C]` tensor this is `N * H` (the tiles stack vertically in
    /// the shared buffer).
    pub parent_height: usize,
    /// The parent's row stride in **bytes**. The GL backend keys its EGLImage
    /// import/cache and pitch on this — NOT on the view's own `row_stride`, which
    /// a single-row view sets tight (for map-span safety). Using the parent
    /// stride keeps the import pitch parent-consistent so single-row and
    /// multi-row sibling views collapse onto the same parent import.
    pub parent_row_stride: usize,
    /// This view's top-left x origin within the root parent, in pixels.
    pub x: usize,
    /// This view's top-left y origin within the root parent, in pixels.
    pub y: usize,
}
417
/// Element type discriminant for runtime type identification.
///
/// `#[repr(u8)]` stores the discriminant in a single byte, and
/// `#[non_exhaustive]` means code outside this crate must include a wildcard
/// arm when matching, so new element types can be added later.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[repr(u8)]
#[non_exhaustive]
pub enum DType {
    /// Unsigned 8-bit integer (`u8`).
    U8,
    /// Signed 8-bit integer (`i8`).
    I8,
    /// Unsigned 16-bit integer (`u16`).
    U16,
    /// Signed 16-bit integer (`i16`).
    I16,
    /// Unsigned 32-bit integer (`u32`).
    U32,
    /// Signed 32-bit integer (`i32`).
    I32,
    /// Unsigned 64-bit integer (`u64`).
    U64,
    /// Signed 64-bit integer (`i64`).
    I64,
    /// 16-bit float (`half::f16`).
    F16,
    /// 32-bit float (`f32`).
    F32,
    /// 64-bit float (`f64`).
    F64,
}
435
436impl DType {
437    /// Size of one element in bytes.
438    pub const fn size(&self) -> usize {
439        match self {
440            Self::U8 | Self::I8 => 1,
441            Self::U16 | Self::I16 | Self::F16 => 2,
442            Self::U32 | Self::I32 | Self::F32 => 4,
443            Self::U64 | Self::I64 | Self::F64 => 8,
444        }
445    }
446
447    /// Short type name (e.g., "u8", "f32", "f16").
448    pub const fn name(&self) -> &'static str {
449        match self {
450            Self::U8 => "u8",
451            Self::I8 => "i8",
452            Self::U16 => "u16",
453            Self::I16 => "i16",
454            Self::U32 => "u32",
455            Self::I32 => "i32",
456            Self::U64 => "u64",
457            Self::I64 => "i64",
458            Self::F16 => "f16",
459            Self::F32 => "f32",
460            Self::F64 => "f64",
461        }
462    }
463}
464
465impl fmt::Display for DType {
466    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
467        f.write_str(self.name())
468    }
469}
470
471/// Map a static numeric type `T` to its `DType` discriminant, returning
472/// `None` for types that do not have a `DType` representation (e.g.
473/// user-defined wrappers in tests).
474///
475/// Runtime dtype of a static `Tensor<T>` element type — a safe `TypeId`
476/// allowlist of HAL's primitive numeric types (`Some` for `u8`..`i64` /
477/// `f16` / `f32` / `f64`, `None` otherwise). Used by the macOS IOSurface
478/// image constructors (FourCC / pixel-format lookup) and by the `Mem`
479/// backing's `alloc_zeroed` fast path (these types' `T::zero()` is the
480/// all-zeros bit pattern), so it is compiled on every target.
481pub(crate) fn dtype_of<T: 'static>() -> Option<DType> {
482    use std::any::TypeId;
483    let id = TypeId::of::<T>();
484    if id == TypeId::of::<u8>() {
485        Some(DType::U8)
486    } else if id == TypeId::of::<i8>() {
487        Some(DType::I8)
488    } else if id == TypeId::of::<u16>() {
489        Some(DType::U16)
490    } else if id == TypeId::of::<i16>() {
491        Some(DType::I16)
492    } else if id == TypeId::of::<u32>() {
493        Some(DType::U32)
494    } else if id == TypeId::of::<i32>() {
495        Some(DType::I32)
496    } else if id == TypeId::of::<u64>() {
497        Some(DType::U64)
498    } else if id == TypeId::of::<i64>() {
499        Some(DType::I64)
500    } else if id == TypeId::of::<half::f16>() {
501        Some(DType::F16)
502    } else if id == TypeId::of::<f32>() {
503        Some(DType::F32)
504    } else if id == TypeId::of::<f64>() {
505        Some(DType::F64)
506    } else {
507        None
508    }
509}
510
511// =============================================================================
512// Quantization metadata — type-gated to integer element types via sealed
513// `IntegerType` trait. Accessors on `Tensor<T>` only compile when `T` is
514// an integer type; calling them on `Tensor<f32>` / `Tensor<f16>` etc. is a
515// compile error, not a runtime one.
516// =============================================================================
517
mod sealed {
    /// Private marker supertrait. The module itself is private, so the set
    /// of implementors is fixed by the list below.
    pub trait Sealed {}

    // Stamps out one empty `Sealed` impl per listed type.
    macro_rules! seal {
        ($($int:ty),+ $(,)?) => {
            $(impl Sealed for $int {})+
        };
    }

    // Deliberately NOT implemented for f16 / f32 / f64.
    seal!(u8, i8, u16, i16, u32, i32, u64, i64);
}
530
531/// Integer element types that may carry quantization metadata.
532///
533/// Sealed trait: implemented for `u8`, `i8`, `u16`, `i16`, `u32`, `i32`,
534/// `u64`, `i64`. Cannot be implemented downstream. Float element types
535/// (`half::f16`, `f32`, `f64`) are explicitly excluded — quantization
536/// metadata does not apply to float tensors per the edgefirst.json spec.
537pub trait IntegerType: sealed::Sealed {}
538impl IntegerType for u8 {}
539impl IntegerType for i8 {}
540impl IntegerType for u16 {}
541impl IntegerType for i16 {}
542impl IntegerType for u32 {}
543impl IntegerType for i32 {}
544impl IntegerType for u64 {}
545impl IntegerType for i64 {}
546
/// Quantization parameters for an integer tensor.
///
/// Covers all four modes the edgefirst.json spec defines:
///
/// | Mode | `scale.len()` | `zero_point` | `axis` |
/// |---|---|---|---|
/// | Per-tensor symmetric | 1 | `None` | `None` |
/// | Per-tensor asymmetric | 1 | `Some(len == 1)` | `None` |
/// | Per-channel symmetric | >1 | `None` | `Some(c)` |
/// | Per-channel asymmetric | >1 | `Some(len == scale.len())` | `Some(c)` |
///
/// The quantized storage type is carried on the parent [`Tensor<T>`]; this
/// struct does not duplicate it. Construct via the four named constructors
/// (or `From<(f32, i32)>` for the per-tensor asymmetric case); the fields are
/// private, so callers cannot mutate them directly.
///
/// The type is also `Deserialize`, and a deserialized value is not checked
/// at parse time: `scale` and `zero_point` each accept a bare scalar or an
/// array (see the `deserialize_with` helpers on the fields). Inconsistent
/// combinations are rejected by `validate()`, which runs in
/// `Tensor::set_quantization()`.
///
/// Dequantization formula:
///
/// ```text
///   real_value = scale[c] × (quantized_value[c] - zero_point[c])
/// ```
///
/// where `c` is the channel index (always `0` for per-tensor).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Quantization {
    /// Per-tensor: `vec![scale]`. Per-channel: `vec![scale_0, scale_1, ...]`.
    #[serde(deserialize_with = "deserialize_scalar_or_vec_f32")]
    scale: Vec<f32>,

    /// `None` means symmetric (zero-point is 0). `Some(vec)` must have the
    /// same length as `scale`. Omitted from serialized output when `None`.
    #[serde(
        default,
        deserialize_with = "deserialize_opt_scalar_or_vec_i32",
        skip_serializing_if = "Option::is_none"
    )]
    zero_point: Option<Vec<i32>>,

    /// Channel axis for per-channel quantization. `Some(_)` iff
    /// `scale.len() > 1`. Validated against the parent tensor's shape at
    /// `set_quantization()` time. Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    axis: Option<usize>,
}
591
/// Semantic mode discriminant for hot-path kernel dispatch.
///
/// Obtain via [`Quantization::mode`] once at kernel entry; never inside a
/// pixel-level loop. The enum is borrow-based so the hot kernel receives
/// the scales / zero-points as slices without reallocation.
#[derive(Debug, Clone, Copy)]
pub enum QuantMode<'a> {
    /// One scale for the whole tensor, no zero-point (treated as 0).
    PerTensorSymmetric {
        /// The single scale factor.
        scale: f32,
    },
    /// One scale and one zero-point for the whole tensor.
    PerTensor {
        /// The single scale factor.
        scale: f32,
        /// The single zero-point.
        zero_point: i32,
    },
    /// One scale per channel along `axis`, no zero-points.
    PerChannelSymmetric {
        /// Per-channel scales, borrowed from the `Quantization`.
        scales: &'a [f32],
        /// Channel axis the scales apply along.
        axis: usize,
    },
    /// One scale and one zero-point per channel along `axis`.
    PerChannel {
        /// Per-channel scales, borrowed from the `Quantization`.
        scales: &'a [f32],
        /// Per-channel zero-points, borrowed from the `Quantization`.
        zero_points: &'a [i32],
        /// Channel axis the parameters apply along.
        axis: usize,
    },
}
616
617impl Quantization {
618    /// Per-tensor symmetric (zero_point = 0).
619    pub fn per_tensor_symmetric(scale: f32) -> Self {
620        Self {
621            scale: vec![scale],
622            zero_point: None,
623            axis: None,
624        }
625    }
626
627    /// Per-tensor asymmetric — the most common runtime shape.
628    pub fn per_tensor(scale: f32, zero_point: i32) -> Self {
629        Self {
630            scale: vec![scale],
631            zero_point: Some(vec![zero_point]),
632            axis: None,
633        }
634    }
635
636    /// Per-channel symmetric. Errors on empty `scales`.
637    pub fn per_channel_symmetric(scales: Vec<f32>, axis: usize) -> Result<Self> {
638        if scales.is_empty() {
639            return Err(Error::QuantizationInvalid {
640                field: "scale.len",
641                expected: "non-empty per-channel scales".to_string(),
642                got: "length 0".to_string(),
643            });
644        }
645        Ok(Self {
646            scale: scales,
647            zero_point: None,
648            axis: Some(axis),
649        })
650    }
651
652    /// Per-channel asymmetric. Errors on length mismatch between `scales`
653    /// and `zero_points`, or empty arrays.
654    pub fn per_channel(scales: Vec<f32>, zero_points: Vec<i32>, axis: usize) -> Result<Self> {
655        if scales.is_empty() {
656            return Err(Error::QuantizationInvalid {
657                field: "scale.len",
658                expected: "non-empty per-channel scales".to_string(),
659                got: "length 0".to_string(),
660            });
661        }
662        if scales.len() != zero_points.len() {
663            return Err(Error::QuantizationInvalid {
664                field: "zero_point.len",
665                expected: format!("length matches scale ({})", scales.len()),
666                got: format!("length {}", zero_points.len()),
667            });
668        }
669        Ok(Self {
670            scale: scales,
671            zero_point: Some(zero_points),
672            axis: Some(axis),
673        })
674    }
675
676    /// Borrow-based dispatch view. Match once at kernel entry.
677    pub fn mode(&self) -> QuantMode<'_> {
678        match (self.scale.len(), self.zero_point.as_deref(), self.axis) {
679            (1, None, _) => QuantMode::PerTensorSymmetric {
680                scale: self.scale[0],
681            },
682            (1, Some(zps), _) => QuantMode::PerTensor {
683                scale: self.scale[0],
684                zero_point: zps.first().copied().unwrap_or(0),
685            },
686            (_, None, Some(axis)) => QuantMode::PerChannelSymmetric {
687                scales: &self.scale,
688                axis,
689            },
690            (_, Some(zps), Some(axis)) => QuantMode::PerChannel {
691                scales: &self.scale,
692                zero_points: zps,
693                axis,
694            },
695            // The `validate()` path prevents constructing a
696            // per-channel Quantization without an axis, so the remaining
697            // pattern is unreachable in practice. Fall back to
698            // per-tensor symmetric using scale[0] to avoid panicking in
699            // release; debug builds assert.
700            _ => {
701                debug_assert!(
702                    false,
703                    "Quantization::mode: per-channel without axis is unreachable"
704                );
705                QuantMode::PerTensorSymmetric {
706                    scale: self.scale.first().copied().unwrap_or(1.0),
707                }
708            }
709        }
710    }
711
712    /// Returns `true` for per-tensor quantization (`scale.len() == 1`).
713    pub fn is_per_tensor(&self) -> bool {
714        self.scale.len() == 1
715    }
716
717    /// Returns `true` for per-channel quantization (`scale.len() > 1`).
718    pub fn is_per_channel(&self) -> bool {
719        self.scale.len() > 1
720    }
721
722    /// Returns `true` for symmetric quantization (no zero-point, or
723    /// zero-point vector of all zeros).
724    pub fn is_symmetric(&self) -> bool {
725        match &self.zero_point {
726            None => true,
727            Some(zps) => zps.iter().all(|&z| z == 0),
728        }
729    }
730
731    /// Borrow the scale array. Length 1 for per-tensor; `num_channels` for
732    /// per-channel.
733    pub fn scale(&self) -> &[f32] {
734        &self.scale
735    }
736
737    /// Borrow the zero-point array. `None` for symmetric.
738    pub fn zero_point(&self) -> Option<&[i32]> {
739        self.zero_point.as_deref()
740    }
741
742    /// Channel axis for per-channel quantization. `None` for per-tensor.
743    pub fn axis(&self) -> Option<usize> {
744        self.axis
745    }
746
747    /// Validate against a target tensor shape. Runs in
748    /// `Tensor::set_quantization()`. Catches:
749    ///   - empty `scale` (reject — must declare at least one factor)
750    ///   - `zero_point` length inconsistent with `scale` (reject —
751    ///     per-tensor must have len 1, per-channel must match `scale.len`)
752    ///   - `axis >= shape.len()` (axis out of range)
753    ///   - `scale.len() != shape[axis]` for per-channel
754    ///   - per-channel without axis (reject)
755    ///   - per-tensor with redundant axis (reject)
756    pub(crate) fn validate(&self, shape: &[usize]) -> Result<()> {
757        // `Quantization` is `Deserialize`, so malformed JSON like
758        // `{"scale": [], "zero_point": []}` could otherwise produce an
759        // ill-defined value that confuses `mode()` selection and the
760        // per-channel kernels' indexing.
761        if self.scale.is_empty() {
762            return Err(Error::QuantizationInvalid {
763                field: "scale.len",
764                expected: ">= 1".to_string(),
765                got: "0".to_string(),
766            });
767        }
768        if let Some(zps) = self.zero_point.as_ref() {
769            // Per-tensor: scale.len() == 1 and zero_point.len() must == 1.
770            // Per-channel: zero_point.len() must == scale.len().
771            let expected = if self.scale.len() == 1 {
772                1
773            } else {
774                self.scale.len()
775            };
776            if zps.len() != expected {
777                return Err(Error::QuantizationInvalid {
778                    field: "zero_point.len",
779                    expected: format!(
780                        "{expected} (matching {})",
781                        if self.scale.len() == 1 {
782                            "per-tensor scale"
783                        } else {
784                            "per-channel scale.len"
785                        }
786                    ),
787                    got: format!("length {}", zps.len()),
788                });
789            }
790        }
791
792        match (self.scale.len(), self.axis) {
793            (1, None) => Ok(()),
794            (1, Some(_)) => Err(Error::QuantizationInvalid {
795                field: "per_tensor_redundant_axis",
796                expected: "axis=None for per-tensor quantization".to_string(),
797                got: format!("axis={:?}", self.axis),
798            }),
799            (_, None) => Err(Error::QuantizationInvalid {
800                field: "per_channel_requires_axis",
801                expected: format!(
802                    "axis=Some(_) for per-channel quantization (scale.len={})",
803                    self.scale.len()
804                ),
805                got: "axis=None".to_string(),
806            }),
807            (n, Some(axis)) => {
808                if axis >= shape.len() {
809                    return Err(Error::QuantizationInvalid {
810                        field: "axis",
811                        expected: format!("axis < tensor rank ({})", shape.len()),
812                        got: format!("axis={axis}"),
813                    });
814                }
815                if shape[axis] != n {
816                    return Err(Error::QuantizationInvalid {
817                        field: "scale.len",
818                        expected: format!("length matches shape[{axis}] ({})", shape[axis]),
819                        got: format!("length {n}"),
820                    });
821                }
822                Ok(())
823            }
824        }
825    }
826}
827
828impl From<(f32, i32)> for Quantization {
829    /// Convenience construction from a `(scale, zero_point)` tuple. Matches
830    /// the legacy `QuantTuple` / `Quantization::new` calling convention so
831    /// existing `(0.1, -128).into()` sites keep working.
832    fn from((scale, zero_point): (f32, i32)) -> Self {
833        Self::per_tensor(scale, zero_point)
834    }
835}
836
837fn deserialize_scalar_or_vec_f32<'de, D: serde::Deserializer<'de>>(
838    de: D,
839) -> std::result::Result<Vec<f32>, D::Error> {
840    use serde::de::{self, Visitor};
841    struct V;
842    impl<'de> Visitor<'de> for V {
843        type Value = Vec<f32>;
844        fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
845            f.write_str("f32 or array of f32")
846        }
847        fn visit_f64<E: de::Error>(self, v: f64) -> std::result::Result<Self::Value, E> {
848            Ok(vec![v as f32])
849        }
850        #[allow(clippy::cast_possible_truncation)]
851        fn visit_i64<E: de::Error>(self, v: i64) -> std::result::Result<Self::Value, E> {
852            Ok(vec![v as f32])
853        }
854        #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
855        fn visit_u64<E: de::Error>(self, v: u64) -> std::result::Result<Self::Value, E> {
856            Ok(vec![v as f32])
857        }
858        fn visit_seq<A: de::SeqAccess<'de>>(
859            self,
860            mut seq: A,
861        ) -> std::result::Result<Self::Value, A::Error> {
862            let mut out = Vec::with_capacity(seq.size_hint().unwrap_or(1));
863            while let Some(x) = seq.next_element::<f32>()? {
864                out.push(x);
865            }
866            Ok(out)
867        }
868    }
869    de.deserialize_any(V)
870}
871
872fn deserialize_opt_scalar_or_vec_i32<'de, D: serde::Deserializer<'de>>(
873    de: D,
874) -> std::result::Result<Option<Vec<i32>>, D::Error> {
875    use serde::de::{self, Visitor};
876    struct V;
877    impl<'de> Visitor<'de> for V {
878        type Value = Option<Vec<i32>>;
879        fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
880            f.write_str("null, i32, or array of i32")
881        }
882        fn visit_none<E: de::Error>(self) -> std::result::Result<Self::Value, E> {
883            Ok(None)
884        }
885        fn visit_unit<E: de::Error>(self) -> std::result::Result<Self::Value, E> {
886            Ok(None)
887        }
888        fn visit_some<D2: serde::Deserializer<'de>>(
889            self,
890            de: D2,
891        ) -> std::result::Result<Self::Value, D2::Error> {
892            struct Inner;
893            impl<'de> Visitor<'de> for Inner {
894                type Value = Vec<i32>;
895                fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
896                    f.write_str("i32 or array of i32")
897                }
898                #[allow(clippy::cast_possible_truncation)]
899                fn visit_i64<E: de::Error>(self, v: i64) -> std::result::Result<Self::Value, E> {
900                    Ok(vec![v as i32])
901                }
902                #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
903                fn visit_u64<E: de::Error>(self, v: u64) -> std::result::Result<Self::Value, E> {
904                    Ok(vec![v as i32])
905                }
906                fn visit_seq<A: de::SeqAccess<'de>>(
907                    self,
908                    mut seq: A,
909                ) -> std::result::Result<Self::Value, A::Error> {
910                    let mut out = Vec::with_capacity(seq.size_hint().unwrap_or(1));
911                    while let Some(x) = seq.next_element::<i32>()? {
912                        out.push(x);
913                    }
914                    Ok(out)
915                }
916            }
917            de.deserialize_any(Inner).map(Some)
918        }
919        #[allow(clippy::cast_possible_truncation)]
920        fn visit_i64<E: de::Error>(self, v: i64) -> std::result::Result<Self::Value, E> {
921            Ok(Some(vec![v as i32]))
922        }
923        #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
924        fn visit_u64<E: de::Error>(self, v: u64) -> std::result::Result<Self::Value, E> {
925            Ok(Some(vec![v as i32]))
926        }
927        fn visit_seq<A: de::SeqAccess<'de>>(
928            self,
929            mut seq: A,
930        ) -> std::result::Result<Self::Value, A::Error> {
931            let mut out = Vec::with_capacity(seq.size_hint().unwrap_or(1));
932            while let Some(x) = seq.next_element::<i32>()? {
933                out.push(x);
934            }
935            Ok(Some(out))
936        }
937    }
938    de.deserialize_option(V)
939}
940
/// Monotonic counter for buffer identity IDs.
///
/// Seeded at 1. [`BufferIdentity::new`] draws from it with a relaxed
/// `fetch_add(1)`, so every freshly minted identity observes a distinct
/// value.
static NEXT_BUFFER_ID: AtomicU64 = AtomicU64::new(1);
943
/// Count of tensor maps requested beyond the buffer's declared
/// [`CpuAccess`] (including any CPU map of a `CpuAccess::None` buffer).
/// Undeclared CPU access is a pipeline smell: it forfeits layout
/// optimizations (write-combined mappings, tile compression) and may be
/// slow or refused on Android. Read via [`unplanned_cpu_access_count`];
/// each offending buffer also logs one warning.
///
/// Incremented by `note_unplanned_cpu_access` on every call, so repeat
/// offenses by an already-warned buffer still count.
static UNPLANNED_CPU_ACCESS: AtomicU64 = AtomicU64::new(0);

/// Number of tensor maps that exceeded the buffer's declared
/// [`CpuAccess`] since process start. A pipeline that declares its CPU
/// access correctly holds this flat; see [`CpuAccess`] for the contract.
///
/// The counter is read with `Ordering::Relaxed`, so the returned value is
/// a statistic and establishes no ordering with other memory operations.
pub fn unplanned_cpu_access_count() -> u64 {
    UNPLANNED_CPU_ACCESS.load(Ordering::Relaxed)
}
958
959/// Record an unplanned CPU access on `identity_id`, warning once per
960/// buffer (a steady-state pipeline maps the same buffer every frame — a
961/// per-map warn would flood the log; repeats count silently).
962pub(crate) fn note_unplanned_cpu_access(identity_id: u64, backend: &str, detail: &str) {
963    UNPLANNED_CPU_ACCESS.fetch_add(1, Ordering::Relaxed);
964    static WARNED: std::sync::OnceLock<std::sync::Mutex<std::collections::HashSet<u64>>> =
965        std::sync::OnceLock::new();
966    let warned = WARNED.get_or_init(Default::default);
967    if warned.lock().is_ok_and(|mut s| s.insert(identity_id)) {
968        log::warn!(
969            "unplanned CPU access on {backend} buffer (identity {identity_id}): {detail} — \
970             declare the intent at allocation (CpuAccess::Read/Write/ReadWrite) to make \
971             this a planned, optimized mapping"
972        );
973    }
974}
975
/// Shared guard against mutable access through a read-only map.
///
/// Every `TensorMap` backend calls this from `as_mut_slice`/`deref_mut`, so
/// misuse of a `map_read()` map fails the same way on all platforms rather
/// than passing on tolerant ones and exploding only on Android.
///
/// # Panics
/// Panics when `writable` is `false`; the message names `backend`.
#[inline]
pub(crate) fn assert_map_writable(writable: bool, backend: &str) {
    if writable {
        return;
    }
    panic!(
        "{backend} map is read-only (obtained via map_read()/CpuAccess::Read) — \
         use map_mut() or map_write() for mutable access"
    );
}
988
/// How much the CPU is expected to touch an image tensor, fixed at
/// allocation time.
///
/// The HAL assumes buffers are produced and consumed by hardware (ISP,
/// codec, GPU, NPU), which needs no declaration. CPU access is opt-in: the
/// declaration selects the CPU usage/mapping mode at allocation
/// (write-combined for `Write`, cached for `Read`) and, on Android, pins the
/// layout linear (vendor tile compression requires `None`).
///
/// Mapping beyond the declared access is best-effort but never silent: it
/// may be refused (`Error::NotSupported`) or take a slow path, and it always
/// increments [`unplanned_cpu_access_count`] with a once-per-buffer warning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CpuAccess {
    /// No CPU mapping declared: a hardware-only buffer. This is the default
    /// and the only declaration eligible for compression on platforms with
    /// vendor tile layouts.
    #[default]
    None,
    /// The CPU reads the buffer (verification, CPU consumers); cached
    /// mapping.
    Read,
    /// The CPU writes the buffer (decode targets); write-combined mapping
    /// where the platform supports it. Reading through a `Write` map counts
    /// as undeclared access.
    Write,
    /// The CPU both reads and writes: the implicit behavior from before
    /// `CpuAccess` existed.
    ReadWrite,
}

impl CpuAccess {
    /// Returns `true` when CPU reads are part of this declaration.
    pub fn reads(self) -> bool {
        match self {
            CpuAccess::Read | CpuAccess::ReadWrite => true,
            CpuAccess::None | CpuAccess::Write => false,
        }
    }

    /// Returns `true` when CPU writes are part of this declaration.
    pub fn writes(self) -> bool {
        match self {
            CpuAccess::Write | CpuAccess::ReadWrite => true,
            CpuAccess::None | CpuAccess::Read => false,
        }
    }

    /// Returns `true` when every direction `requested` needs is declared
    /// by `self`. A `requested` of `None` needs nothing and is always
    /// covered.
    pub fn covers(self, requested: CpuAccess) -> bool {
        let read_ok = self.reads() || !requested.reads();
        let write_ok = self.writes() || !requested.writes();
        read_ok && write_ok
    }
}
1033
/// Requested tile-compression behavior for an image allocation (set via
/// [`ImageDesc::with_compression`]).
///
/// Vendor GPUs store textures in proprietary compressed tile layouts
/// (UBWC, AFBC, PVRIC, DCC) that cut memory bandwidth; eligibility
/// requires a hardware-only buffer ([`CpuAccess::None`]) because CPU
/// mapping pins the layout linear. The request records best knowledge on
/// the tensor ([`Tensor::compression`]); it never changes what bytes a
/// consumer sees through the GPU/NPU.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must keep a
/// wildcard arm when matching on it, which leaves room for further request
/// kinds.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Compression {
    /// Let the platform use its native scheme when the format is
    /// eligible; otherwise allocate linear and count the fallback
    /// ([`compression_fallback_count`]). The right default for pipelines
    /// that want the bandwidth win without portability failures.
    Any,
    /// Require one specific scheme: allocation fails with
    /// [`Error::InvalidArgument`] when the device's scheme differs and
    /// [`Error::NotSupported`] on platforms without vendor tile
    /// compression. For consumers whose ABI names a layout (e.g. a
    /// QNN context binary declaring UBWC inputs).
    Scheme(CompressionScheme),
}
1058
/// Vendor tile-compression schemes the HAL recognizes and records.
///
/// Unrecognized platforms record `None` (linear) — there is deliberately
/// no `Unknown` variant; a scheme is only recorded when the vendor is
/// positively identified.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must keep a
/// wildcard arm when matching on it, which leaves room for further schemes.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionScheme {
    /// Qualcomm Adreno Universal Bandwidth Compression.
    Ubwc,
    /// Arm Mali/Immortalis Framebuffer Compression.
    Afbc,
    /// Imagination PowerVR Image Compression (Google Tensor G5+).
    Pvric,
    /// Samsung Xclipse (AMD RDNA) Delta Color Compression.
    Dcc,
}
1076
/// Count of image allocations that requested [`Compression::Any`] but
/// resolved to a linear layout (ineligible format/dtype, unrecognized
/// vendor, or a platform without vendor tile compression). Read via
/// [`compression_fallback_count`].
///
/// Incremented by `note_compression_fallback`, once per call.
static COMPRESSION_FALLBACKS: AtomicU64 = AtomicU64::new(0);

/// Number of [`Compression::Any`] requests since process start that
/// resolved to a linear layout instead of a vendor tile scheme. A
/// steady-state pipeline holds this flat after warmup; growth means an
/// allocation path keeps requesting compression it never gets.
///
/// The counter is read with `Ordering::Relaxed`, so the returned value is
/// a statistic and establishes no ordering with other memory operations.
pub fn compression_fallback_count() -> u64 {
    COMPRESSION_FALLBACKS.load(Ordering::Relaxed)
}

/// Record a `Compression::Any` request resolving linear.
///
/// Bumps the fallback counter and logs `detail` at debug level on every
/// call; unlike the unplanned-CPU-access path there is no per-buffer
/// de-duplication here.
pub(crate) fn note_compression_fallback(detail: &str) {
    COMPRESSION_FALLBACKS.fetch_add(1, Ordering::Relaxed);
    log::debug!("compression request fell back to linear: {detail}");
}
1096
/// Whether this platform can allocate `(format, dtype)` images in a
/// vendor tile-compressed layout. `true` requires an Android build, an
/// eligible format (RGBA8888 `u8`/`i8` initially), and a positively
/// identified GPU vendor; everywhere else the answer is `false` and
/// [`Compression::Any`] requests fall back to linear.
///
/// Exactly one of the two `cfg` blocks below is compiled, and it forms the
/// function's return value.
pub fn compression_support(format: PixelFormat, dtype: DType) -> bool {
    #[cfg(target_os = "android")]
    {
        // Both conditions must hold: the (format, dtype) pair is eligible
        // and the device reports a compression scheme.
        crate::ahardwarebuffer_layout::compression_eligible(format, dtype)
            && crate::ahardwarebuffer::device_compression_scheme().is_some()
    }
    #[cfg(not(target_os = "android"))]
    {
        // Touch the parameters so this build has no unused-variable warning.
        let _ = (format, dtype);
        false
    }
}
1114
1115/// Declarative image-allocation request — the full-featured front door
1116/// for image tensors ([`TensorDyn::image_desc`] and the image crate's
1117/// `ImageProcessor::create_image_desc`).
1118///
1119/// The classic constructors (`image`, `create_image`) cover the common
1120/// cases; the desc carries the optional requests — today the
1121/// [`Compression`] request — without another constructor-parameter
1122/// sweep. Fields are private and the builders consume/return by value,
1123/// so future options are non-breaking.
1124///
1125/// ```
1126/// use edgefirst_tensor::{Compression, CpuAccess, DType, ImageDesc, PixelFormat};
1127/// let desc = ImageDesc::new(640, 640, PixelFormat::Rgba, DType::U8)
1128///     .with_access(CpuAccess::None)
1129///     .with_compression(Compression::Any);
1130/// ```
1131#[derive(Debug, Clone)]
1132pub struct ImageDesc {
1133    width: usize,
1134    height: usize,
1135    format: PixelFormat,
1136    dtype: DType,
1137    memory: Option<TensorMemory>,
1138    access: CpuAccess,
1139    compression: Option<Compression>,
1140}
1141
1142impl ImageDesc {
1143    /// A new image request: auto-selected memory, [`CpuAccess::None`]
1144    /// (hardware-only), no compression request.
1145    pub fn new(width: usize, height: usize, format: PixelFormat, dtype: DType) -> Self {
1146        Self {
1147            width,
1148            height,
1149            format,
1150            dtype,
1151            memory: None,
1152            access: CpuAccess::None,
1153            compression: None,
1154        }
1155    }
1156
1157    /// Request a specific memory backing (`None` = auto-select).
1158    pub fn with_memory(mut self, memory: Option<TensorMemory>) -> Self {
1159        self.memory = memory;
1160        self
1161    }
1162
1163    /// Declare the CPU access (see [`CpuAccess`]). Any declaration other
1164    /// than `None` makes a compression request invalid.
1165    pub fn with_access(mut self, access: CpuAccess) -> Self {
1166        self.access = access;
1167        self
1168    }
1169
1170    /// Request a tile-compressed layout (see [`Compression`]).
1171    pub fn with_compression(mut self, compression: Compression) -> Self {
1172        self.compression = Some(compression);
1173        self
1174    }
1175
1176    /// Requested width in pixels.
1177    pub fn width(&self) -> usize {
1178        self.width
1179    }
1180
1181    /// Requested height in pixels.
1182    pub fn height(&self) -> usize {
1183        self.height
1184    }
1185
1186    /// Requested pixel format.
1187    pub fn format(&self) -> PixelFormat {
1188        self.format
1189    }
1190
1191    /// Requested element type.
1192    pub fn dtype(&self) -> DType {
1193        self.dtype
1194    }
1195
1196    /// Requested memory backing (`None` = auto-select).
1197    pub fn memory(&self) -> Option<TensorMemory> {
1198        self.memory
1199    }
1200
1201    /// Declared CPU access.
1202    pub fn access(&self) -> CpuAccess {
1203        self.access
1204    }
1205
1206    /// The compression request, if any.
1207    pub fn compression(&self) -> Option<Compression> {
1208        self.compression
1209    }
1210}
1211
1212/// Unique identity for a tensor's underlying buffer.
1213///
1214/// Created fresh on every buffer allocation or import. The `id` is a monotonic
1215/// u64 used as a cache key. The `guard` is an `Arc<()>` whose weak references
1216/// allow downstream caches to detect when the buffer has been dropped.
1217#[derive(Debug, Clone)]
1218pub struct BufferIdentity {
1219    id: u64,
1220    guard: Arc<()>,
1221}
1222
1223impl BufferIdentity {
1224    /// Create a new unique buffer identity.
1225    pub fn new() -> Self {
1226        Self {
1227            id: NEXT_BUFFER_ID.fetch_add(1, Ordering::Relaxed),
1228            guard: Arc::new(()),
1229        }
1230    }
1231
1232    /// Unique identifier for this buffer. Changes when the buffer changes.
1233    pub fn id(&self) -> u64 {
1234        self.id
1235    }
1236
1237    /// Returns a weak reference to the buffer guard. Goes dead when the
1238    /// owning Tensor is dropped (and no clones remain).
1239    pub fn weak(&self) -> Weak<()> {
1240        Arc::downgrade(&self.guard)
1241    }
1242
1243    /// Rebuild an identity from interned parts — crate-private: only the
1244    /// Android `AHardwareBuffer_getId` intern table may resurrect an
1245    /// existing identity (arbitrary construction would forge cache hits).
1246    // Only the Android AHardwareBuffer intern path uses these today.
1247    #[cfg_attr(not(target_os = "android"), allow(dead_code))]
1248    pub(crate) fn from_parts(id: u64, guard: Arc<()>) -> Self {
1249        Self { id, guard }
1250    }
1251
1252    /// The strong guard handle (for the intern table's mint path).
1253    #[cfg_attr(not(target_os = "android"), allow(dead_code))]
1254    pub(crate) fn guard_arc(&self) -> Arc<()> {
1255        Arc::clone(&self.guard)
1256    }
1257}
1258
1259impl Default for BufferIdentity {
1260    fn default() -> Self {
1261        Self::new()
1262    }
1263}
1264
1265#[cfg(target_os = "linux")]
1266use nix::sys::stat::{major, minor};
1267
/// Interface of a typed, shaped tensor buffer: construction, shape
/// queries, reshaping, CPU mapping, buffer identity and sub-region views.
///
/// `T` is the element type. Implementors must be `Send + Sync`. Methods
/// with a body are defaults that an implementor may override.
pub trait TensorTrait<T>: Send + Sync
where
    T: Num + Clone + fmt::Debug,
{
    /// Create a new tensor with the given shape and optional name. If no name
    /// is given, a random name will be generated.
    fn new(shape: &[usize], name: Option<&str>) -> Result<Self>
    where
        Self: Sized;

    #[cfg(unix)]
    /// Create a new tensor using the given file descriptor, shape, and optional
    /// name. If no name is given, a random name will be generated.
    ///
    /// On Linux: Inspects the fd to determine DMA vs SHM based on device major/minor.
    /// On other Unix (macOS): Always creates SHM tensor.
    ///
    /// The descriptor is passed as an `OwnedFd`, so ownership moves into
    /// the call. Only available on Unix targets.
    fn from_fd(fd: std::os::fd::OwnedFd, shape: &[usize], name: Option<&str>) -> Result<Self>
    where
        Self: Sized;

    #[cfg(unix)]
    /// Clone the file descriptor associated with this tensor.
    ///
    /// Returns a new `OwnedFd` owned by the caller. Only available on Unix
    /// targets.
    fn clone_fd(&self) -> Result<std::os::fd::OwnedFd>;

    /// Get the memory type of this tensor.
    fn memory(&self) -> TensorMemory;

    /// Get the name of this tensor.
    fn name(&self) -> String;

    /// Get the number of elements in this tensor.
    ///
    /// Computed as the product of all dimensions of [`shape`](Self::shape):
    /// an empty shape yields 1 and any zero-sized dimension yields 0.
    fn len(&self) -> usize {
        self.shape().iter().product()
    }

    /// Check if the tensor is empty.
    ///
    /// `true` exactly when [`len`](Self::len) is 0.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Get the size in bytes of this tensor.
    ///
    /// This is the logical size: [`len`](Self::len) times `size_of::<T>()`.
    fn size(&self) -> usize {
        self.len() * std::mem::size_of::<T>()
    }

    /// Get the shape of this tensor.
    fn shape(&self) -> &[usize];

    /// Reshape this tensor to the given shape. The total number of elements
    /// must remain the same.
    fn reshape(&mut self, shape: &[usize]) -> Result<()>;

    /// Bytes of the underlying allocation (>= the current logical `size()`).
    /// Defaults to the logical size for storages without spare capacity.
    fn capacity_bytes(&self) -> usize {
        self.size()
    }

    /// Set the logical shape to any shape whose byte size fits the allocation
    /// capacity, without the equal-size constraint of `reshape`.
    ///
    /// The default forwards to [`reshape`](Self::reshape) and therefore
    /// inherits its constraints, which is consistent with the default
    /// [`capacity_bytes`](Self::capacity_bytes) reporting no spare capacity.
    fn set_logical_shape(&mut self, shape: &[usize]) -> Result<()> {
        self.reshape(shape)
    }

    /// Map the tensor into memory with the given access direction and
    /// return a TensorMap for accessing the data.
    ///
    /// `access` selects the platform mapping mode (read-only IOSurface
    /// lock, dma-buf sync direction, AHardwareBuffer lock usage) and the
    /// map's mutability: a map obtained with [`CpuAccess::Read`] rejects
    /// `as_mut_slice`. [`CpuAccess::None`] is not a mappable direction
    /// and returns [`Error::InvalidArgument`].
    ///
    /// Prefer the typed wrappers [`map_read`](Self::map_read) /
    /// [`map_write`](Self::map_write) / [`map_mut`](Self::map_mut).
    fn map_with(&self, access: CpuAccess) -> Result<TensorMap<T>>;

    /// Map the tensor read-write (equivalent to
    /// `map_with(CpuAccess::ReadWrite)` — the historical `map()`
    /// behavior).
    fn map(&self) -> Result<TensorMap<T>> {
        self.map_with(CpuAccess::ReadWrite)
    }

    /// Map the tensor for CPU reading only. The returned map rejects
    /// `as_mut_slice`; on macOS this takes the read-only IOSurface lock
    /// (skips the unlock flush), on Linux the dma-buf read-direction
    /// sync.
    fn map_read(&self) -> Result<TensorMap<T>> {
        self.map_with(CpuAccess::Read)
    }

    /// Map the tensor for CPU writing (fill-only: reading through a
    /// write map may see write-combined memory — do not read the slice).
    fn map_write(&self) -> Result<TensorMap<T>> {
        self.map_with(CpuAccess::Write)
    }

    /// Map the tensor read-write (alias of [`map`](Self::map) with the
    /// intent spelled out).
    ///
    /// The default calls `map_with(CpuAccess::ReadWrite)` directly rather
    /// than going through `map`, so overriding `map` does not affect it.
    fn map_mut(&self) -> Result<TensorMap<T>> {
        self.map_with(CpuAccess::ReadWrite)
    }

    /// Get the buffer identity for cache keying and liveness tracking.
    fn buffer_identity(&self) -> &BufferIdentity;

    /// Create a zero-copy sub-region view of this backing that shares the
    /// underlying allocation **and** [`BufferIdentity`].
    ///
    /// The window is `[offset_bytes, offset_bytes + shape.product() *
    /// size_of::<T>())` measured from this tensor's own logical start, so a
    /// sub-view of a sub-view composes by adding offsets. Sharing the parent's
    /// identity is the contract that lets identity-keyed caches (e.g. the GL
    /// EGLImage import cache) treat offset-distinct windows as one buffer rather
    /// than unrelated allocations — `view` must never mint a fresh identity.
    ///
    /// Defaults to [`Error::NotImplemented`]; every backend that supports
    /// sub-views overrides it (`Mem`, `Shm`, Linux DMA, macOS IOSurface, `Pbo`).
    fn view(&self, offset_bytes: usize, shape: &[usize]) -> Result<Self>
    where
        Self: Sized,
    {
        // The default ignores its arguments; bind them to avoid
        // unused-variable warnings.
        let _ = (offset_bytes, shape);
        Err(Error::NotImplemented(
            "view (zero-copy sub-region) is not supported for this tensor backend".to_owned(),
        ))
    }
}
1397
1398pub trait TensorMapTrait<T>
1399where
1400    T: Num + Clone + fmt::Debug,
1401{
1402    /// Get the shape of this tensor map.
1403    fn shape(&self) -> &[usize];
1404
1405    /// Unmap the tensor from memory.
1406    fn unmap(&mut self);
1407
1408    /// Get the number of elements in this tensor map.
1409    fn len(&self) -> usize {
1410        self.shape().iter().product()
1411    }
1412
1413    /// Check if the tensor map is empty.
1414    fn is_empty(&self) -> bool {
1415        self.len() == 0
1416    }
1417
1418    /// Get the size in bytes of this tensor map.
1419    fn size(&self) -> usize {
1420        self.len() * std::mem::size_of::<T>()
1421    }
1422
1423    /// Get a slice to the data in this tensor map.
1424    fn as_slice(&self) -> &[T];
1425
1426    /// Get a mutable slice to the data in this tensor map.
1427    fn as_mut_slice(&mut self) -> &mut [T];
1428
1429    #[cfg(feature = "ndarray")]
1430    /// Get an ndarray ArrayView of the tensor data.
1431    fn view(&'_ self) -> Result<ndarray::ArrayView<'_, T, ndarray::Dim<ndarray::IxDynImpl>>> {
1432        Ok(ndarray::ArrayView::from_shape(
1433            self.shape(),
1434            self.as_slice(),
1435        )?)
1436    }
1437
1438    #[cfg(feature = "ndarray")]
1439    /// Get an ndarray ArrayViewMut of the tensor data.
1440    fn view_mut(
1441        &'_ mut self,
1442    ) -> Result<ndarray::ArrayViewMut<'_, T, ndarray::Dim<ndarray::IxDynImpl>>> {
1443        let shape = self.shape().to_vec();
1444        Ok(ndarray::ArrayViewMut::from_shape(
1445            shape,
1446            self.as_mut_slice(),
1447        )?)
1448    }
1449}
1450
/// Kind of memory backing a tensor.
///
/// Converts to its lowercase tag (`"dma"`, `"shm"`, `"mem"`, `"pbo"`) via
/// `String::from`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TensorMemory {
    /// Platform-native zero-copy GPU buffer.
    ///
    /// On Linux this is a DMA-BUF (`DmaTensor` in `crates/tensor/src/dma.rs`)
    /// allocated via the DRM/dma-heap subsystem. On macOS this is an
    /// IOSurface (`IoSurfaceTensor` in `crates/tensor/src/iosurface.rs`).
    /// On Android the same slot is backed by an AHardwareBuffer
    /// (`AHardwareBufferTensor`). All of them fit into the same
    /// `TensorStorage::Dma` slot at the trait level — the public C API
    /// discriminant (`HalTensorMemory::Dma=1`) works across platforms with
    /// no ABI break.
    ///
    /// Allows hardware-accelerated paths (OpenGL backend on Linux via
    /// `EGL_EXT_image_dma_buf_import`; macOS via
    /// `EGL_ANGLE_iosurface_client_buffer`). CPU access via `map()`
    /// incurs cache-coherency overhead on Linux DMA-BUF and is similar
    /// in cost on IOSurface; SHM/Mem are cheaper for CPU-only workloads.
    Dma,
    #[cfg(unix)]
    /// POSIX shared-memory allocation (Unix targets only). Suitable for
    /// inter-process communication, but not for hardware acceleration.
    Shm,

    /// Regular system memory allocation.
    Mem,

    /// OpenGL Pixel Buffer Object memory. Created by ImageProcessor
    /// when DMA-buf is unavailable but OpenGL is present.
    Pbo,
}

impl From<TensorMemory> for String {
    /// Returns the lowercase tag of the memory kind.
    fn from(memory: TensorMemory) -> Self {
        let tag: &str = match memory {
            TensorMemory::Dma => "dma",
            #[cfg(unix)]
            TensorMemory::Shm => "shm",
            TensorMemory::Mem => "mem",
            TensorMemory::Pbo => "pbo",
        };
        tag.to_owned()
    }
}
1492
1493impl TryFrom<&str> for TensorMemory {
1494    type Error = Error;
1495
1496    fn try_from(s: &str) -> Result<Self> {
1497        match s {
1498            "dma" => Ok(TensorMemory::Dma),
1499            #[cfg(unix)]
1500            "shm" => Ok(TensorMemory::Shm),
1501            "mem" => Ok(TensorMemory::Mem),
1502            "pbo" => Ok(TensorMemory::Pbo),
1503            _ => Err(Error::InvalidMemoryType(s.to_owned())),
1504        }
1505    }
1506}
1507
/// Backend-specific storage behind a tensor: one variant per memory kind,
/// each wrapping that backend's concrete tensor type. Which `Dma` payload
/// exists is decided per target by `cfg`.
#[derive(Debug)]
#[allow(dead_code)] // Variants are constructed by downstream crates via pub(crate) helpers
pub(crate) enum TensorStorage<T>
where
    T: Num + Clone + fmt::Debug + Send + Sync,
{
    /// Platform-native zero-copy GPU buffer. Inner type differs per
    /// target: `DmaTensor` on Linux (DMA-BUF fd), `IoSurfaceTensor` on
    /// macOS (CFRetained IOSurface). The shared variant name keeps the
    /// public `TensorMemory::Dma` discriminant stable across platforms.
    #[cfg(target_os = "linux")]
    Dma(DmaTensor<T>),
    /// macOS/iOS flavor of the `Dma` slot, backed by an IOSurface.
    #[cfg(any(target_os = "macos", target_os = "ios"))]
    Dma(IoSurfaceTensor<T>),
    /// Android flavor of the `Dma` slot, backed by an AHardwareBuffer.
    #[cfg(target_os = "android")]
    Dma(AHardwareBufferTensor<T>),
    /// Shared-memory backing (Unix targets only).
    #[cfg(unix)]
    Shm(ShmTensor<T>),
    /// System-memory backing.
    Mem(MemTensor<T>),
    /// OpenGL Pixel Buffer Object backing.
    Pbo(PboTensor<T>),
}
1529
1530impl<T> TensorStorage<T>
1531where
1532    T: Num + Clone + fmt::Debug + Send + Sync,
1533{
    /// The backing allocation's intrinsic physical row pitch in bytes, if it
    /// has one that is fixed independent of the logical shape. macOS IOSurface
    /// reports its 64-aligned `bytesPerRow`; other backings (Linux DMA, SHM,
    /// Mem, PBO) have no fixed pitch beyond the logical shape and return `None`.
    /// On Android the `Dma` variant (AHardwareBuffer) likewise defers to its
    /// own `image_backing_row_stride`.
    ///
    /// Used by `configure_image` to preserve the physical pitch when a reused
    /// pool tensor is reconfigured to a smaller logical image — so the decode
    /// writes rows at the surface's real stride and the GPU samples them with
    /// the same stride (the physical-grid / logical-ROI decoupling).
    ///
    /// Returns `Some(stride)` only from the platform-specific `Dma` arms
    /// below; every other variant (and every other target) yields `None`.
    pub(crate) fn backing_row_stride(&self) -> Option<usize> {
        match self {
            // Only genuine image-formatted IOSurfaces (height > 1) carry a real
            // per-row pitch; a generic byte-bag (height == 1) returns `None` so
            // `configure_image` does not adopt its whole-buffer "row" as a stride.
            #[cfg(any(target_os = "macos", target_os = "ios"))]
            TensorStorage::Dma(t) => t.image_backing_row_stride(),
            // Android AHardwareBuffer: same rule — only genuine 2D
            // image-formatted buffers (height > 1) carry a real row pitch.
            #[cfg(target_os = "android")]
            TensorStorage::Dma(t) => t.image_backing_row_stride(),
            // On targets where neither arm above is compiled (e.g. Linux)
            // this wildcard is the only arm.
            _ => None,
        }
    }
1557
1558    /// Create a new tensor storage with the given shape, memory type, and
1559    /// optional name. If no name is given, a random name will be generated.
1560    /// If no memory type is given, the best available memory type will be
1561    /// chosen based on the platform and environment variables.
1562    fn new(shape: &[usize], memory: Option<TensorMemory>, name: Option<&str>) -> Result<Self> {
1563        match memory {
1564            #[cfg(target_os = "linux")]
1565            Some(TensorMemory::Dma) => {
1566                DmaTensor::<T>::new(shape, name).map(TensorStorage::Dma)
1567            }
1568            #[cfg(any(target_os = "macos", target_os = "ios"))]
1569            Some(TensorMemory::Dma) => {
1570                IoSurfaceTensor::<T>::new(shape, name).map(TensorStorage::Dma)
1571            }
1572            #[cfg(target_os = "android")]
1573            Some(TensorMemory::Dma) => {
1574                AHardwareBufferTensor::<T>::new(shape, name).map(TensorStorage::Dma)
1575            }
1576            #[cfg(not(any(
1577                target_os = "linux",
1578                target_os = "macos",
1579                target_os = "ios",
1580                target_os = "android"
1581            )))]
1582            Some(TensorMemory::Dma) => Err(crate::error::Error::NotImplemented(
1583                "TensorMemory::Dma is only available on Linux (DMA-BUF), macOS/iOS (IOSurface), \
1584                 and Android (AHardwareBuffer)"
1585                    .to_owned(),
1586            )),
1587            #[cfg(unix)]
1588            Some(TensorMemory::Shm) => {
1589                ShmTensor::<T>::new(shape, name).map(TensorStorage::Shm)
1590            }
1591            Some(TensorMemory::Mem) => {
1592                MemTensor::<T>::new(shape, name).map(TensorStorage::Mem)
1593            }
1594            Some(TensorMemory::Pbo) => Err(crate::error::Error::NotImplemented(
1595                "PboTensor cannot be created via Tensor::new() — use ImageProcessor::create_image()".to_owned(),
1596            )),
1597            None => {
1598                if std::env::var("EDGEFIRST_TENSOR_FORCE_MEM")
1599                    .is_ok_and(|x| x != "0" && x.to_lowercase() != "false")
1600                {
1601                    MemTensor::<T>::new(shape, name).map(TensorStorage::Mem)
1602                } else {
1603                    // Auto-select priority: Dma > Mem. Shm is intentionally NOT
1604                    // auto-selected — it offers no advantage over Mem for an
1605                    // in-process tensor and Mem always succeeds, so Shm below Mem
1606                    // is effectively never reached. Request Shm explicitly via
1607                    // `TensorMemory::Shm` when cross-process sharing is needed.
1608                    // (PBO sits between Dma and Mem but is GL-backed and created
1609                    // only via `ImageProcessor::create_image`.)
1610                    #[cfg(target_os = "linux")]
1611                    {
1612                        // Linux: Try DMA -> Mem
1613                        match DmaTensor::<T>::new(shape, name) {
1614                            Ok(tensor) => Ok(TensorStorage::Dma(tensor)),
1615                            Err(_) => MemTensor::<T>::new(shape, name).map(TensorStorage::Mem),
1616                        }
1617                    }
1618                    #[cfg(any(target_os = "macos", target_os = "ios"))]
1619                    {
1620                        // macOS/iOS: Try IOSurface -> Mem. IOSurface is the
1621                        // GPU-shareable backend (zero-copy via ANGLE), filling the
1622                        // same role as DMA-BUF on Linux.
1623                        match IoSurfaceTensor::<T>::new(shape, name) {
1624                            Ok(tensor) => Ok(TensorStorage::Dma(tensor)),
1625                            Err(_) => MemTensor::<T>::new(shape, name).map(TensorStorage::Mem),
1626                        }
1627                    }
1628                    #[cfg(target_os = "android")]
1629                    {
1630                        // Android: Mem. Unlike macOS (where a byte-bag
1631                        // IOSurface is still GL-importable as R8) a generic
1632                        // BLOB AHardwareBuffer cannot back a GPU texture, so
1633                        // auto-selecting it buys nothing and costs plenty: a
1634                        // gralloc allocator-HAL ioctl per allocation (orders
1635                        // slower than malloc, ≥1 page + a dmabuf fd even for
1636                        // tiny tensors) and a lock/unlock cache-maintenance
1637                        // round trip on every map(). Zero-copy image tensors
1638                        // come from `Tensor::image(..)`; callers that want a
1639                        // BLOB (NNAPI handoff) request `TensorMemory::Dma`
1640                        // explicitly.
1641                        MemTensor::<T>::new(shape, name).map(TensorStorage::Mem)
1642                    }
1643                    #[cfg(all(
1644                        unix,
1645                        not(any(
1646                            target_os = "linux",
1647                            target_os = "macos",
1648                            target_os = "ios",
1649                            target_os = "android"
1650                        ))
1651                    ))]
1652                    {
1653                        // Other Unix (BSD): Mem only (no DMA; Shm is explicit-only)
1654                        MemTensor::<T>::new(shape, name).map(TensorStorage::Mem)
1655                    }
1656                    #[cfg(not(unix))]
1657                    {
1658                        // Windows/other: Mem only
1659                        MemTensor::<T>::new(shape, name).map(TensorStorage::Mem)
1660                    }
1661                }
1662            }
1663        }
1664    }
1665
1666    /// Create a DMA-backed tensor storage with an explicit byte size that
1667    /// may exceed `shape.product() * sizeof(T)`. Used for image tensors
1668    /// with row-padded layouts (see `DmaTensor::new_with_byte_size`).
1669    ///
1670    /// This is intentionally DMA-only: padding is only meaningful for
1671    /// buffers that will be imported as GPU textures via EGLImage. PBO,
1672    /// Shm, and Mem storage doesn't benefit from pitch alignment and
1673    /// shouldn't pay the memory cost.
1674    #[cfg(target_os = "linux")]
1675    pub(crate) fn new_dma_with_byte_size(
1676        shape: &[usize],
1677        byte_size: usize,
1678        name: Option<&str>,
1679    ) -> Result<Self> {
1680        DmaTensor::<T>::new_with_byte_size(shape, byte_size, name).map(TensorStorage::Dma)
1681    }
1682
1683    // No non-Linux stub: the only caller (`Tensor::image_with_stride`)
1684    // returns `NotImplemented` directly on non-Linux without ever
1685    // reaching the storage layer, so defining a stub here would be
1686    // dead code and fail the `-D warnings` clippy gate on macOS CI.
1687
1688    /// Create a Mem-backed tensor storage with an explicit byte size that may
1689    /// exceed `shape.product() * sizeof(T)`.  Used for image tensors with
1690    /// 64-byte-aligned row strides (see `MemTensor::with_capacity_bytes`).
1691    pub(crate) fn new_mem_with_byte_size(
1692        shape: &[usize],
1693        byte_size: usize,
1694        name: Option<&str>,
1695    ) -> Result<Self>
1696    where
1697        T: 'static,
1698    {
1699        MemTensor::<T>::with_capacity_bytes(shape, byte_size, name).map(TensorStorage::Mem)
1700    }
1701
1702    /// Create a Shm-backed tensor storage with an explicit byte size that may
1703    /// exceed `shape.product() * sizeof(T)`.  Used for image tensors with
1704    /// 64-byte-aligned row strides (see `ShmTensor::new_with_byte_size`).
1705    #[cfg(unix)]
1706    pub(crate) fn new_shm_with_byte_size(
1707        shape: &[usize],
1708        byte_size: usize,
1709        name: Option<&str>,
1710    ) -> Result<Self> {
1711        ShmTensor::<T>::new_with_byte_size(shape, byte_size, name).map(TensorStorage::Shm)
1712    }
1713
1714    /// Allocate an image-formatted IOSurface-backed storage (macOS).
1715    ///
1716    /// Used by `Tensor::image()` when the caller requests
1717    /// `TensorMemory::Dma` and the format has an IOSurface FourCC
1718    /// mapping (YUYV, RGBA, BGRA today). Falls back to `new_with_byte_size`
1719    /// otherwise.
1720    #[cfg(any(target_os = "macos", target_os = "ios"))]
1721    pub(crate) fn new_image_iosurface(
1722        width: usize,
1723        height: usize,
1724        format: PixelFormat,
1725        dtype: DType,
1726        shape: &[usize],
1727        name: Option<&str>,
1728    ) -> Result<Self> {
1729        IoSurfaceTensor::<T>::new_image(width, height, format, dtype, shape, name)
1730            .map(TensorStorage::Dma)
1731    }
1732
1733    /// Allocate an image-formatted AHardwareBuffer-backed storage (Android).
1734    ///
1735    /// Used by `Tensor::image()` when the caller requests
1736    /// `TensorMemory::Dma` and the format has an AHardwareBuffer format
1737    /// mapping (RGBA8 and the RGBA16F float paths today). Falls back to
1738    /// `new_with_byte_size` otherwise.
1739    #[cfg(target_os = "android")]
1740    pub(crate) fn new_image_ahardwarebuffer(
1741        width: usize,
1742        height: usize,
1743        format: PixelFormat,
1744        dtype: DType,
1745        shape: &[usize],
1746        name: Option<&str>,
1747        access: CpuAccess,
1748    ) -> Result<Self> {
1749        AHardwareBufferTensor::<T>::new_image(width, height, format, dtype, shape, name, access)
1750            .map(TensorStorage::Dma)
1751    }
1752
1753    /// Create a new tensor storage using the given file descriptor, shape,
1754    /// and optional name.
1755    #[cfg(unix)]
1756    fn from_fd(fd: OwnedFd, shape: &[usize], name: Option<&str>) -> Result<Self> {
1757        #[cfg(target_os = "linux")]
1758        {
1759            use nix::sys::stat::fstat;
1760
1761            let stat = fstat(&fd)?;
1762            let major = major(stat.st_dev);
1763            let minor = minor(stat.st_dev);
1764
1765            log::debug!("Creating tensor from fd: major={major}, minor={minor}");
1766
1767            if major != 0 {
1768                // Dma and Shm tensors are expected to have major number 0
1769                return Err(Error::UnknownDeviceType(major, minor));
1770            }
1771
1772            match minor {
1773                9 | 10 => {
1774                    // minor number 9 & 10 indicates DMA memory
1775                    DmaTensor::<T>::from_fd(fd, shape, name).map(TensorStorage::Dma)
1776                }
1777                _ => {
1778                    // other minor numbers are assumed to be shared memory
1779                    ShmTensor::<T>::from_fd(fd, shape, name).map(TensorStorage::Shm)
1780                }
1781            }
1782        }
1783        #[cfg(all(unix, not(target_os = "linux")))]
1784        {
1785            // On macOS/iOS/BSD, always use SHM (no DMA-BUF fd import)
1786            ShmTensor::<T>::from_fd(fd, shape, name).map(TensorStorage::Shm)
1787        }
1788    }
1789}
1790
1791impl<T> TensorTrait<T> for TensorStorage<T>
1792where
1793    T: Num + Clone + fmt::Debug + Send + Sync,
1794{
1795    fn new(shape: &[usize], name: Option<&str>) -> Result<Self> {
1796        Self::new(shape, None, name)
1797    }
1798
1799    #[cfg(unix)]
1800    fn from_fd(fd: OwnedFd, shape: &[usize], name: Option<&str>) -> Result<Self> {
1801        Self::from_fd(fd, shape, name)
1802    }
1803
1804    #[cfg(unix)]
1805    fn clone_fd(&self) -> Result<OwnedFd> {
1806        match self {
1807            #[cfg(any(
1808                target_os = "linux",
1809                target_os = "macos",
1810                target_os = "ios",
1811                target_os = "android"
1812            ))]
1813            TensorStorage::Dma(t) => t.clone_fd(),
1814            TensorStorage::Shm(t) => t.clone_fd(),
1815            TensorStorage::Mem(t) => t.clone_fd(),
1816            TensorStorage::Pbo(t) => t.clone_fd(),
1817        }
1818    }
1819
1820    fn memory(&self) -> TensorMemory {
1821        match self {
1822            #[cfg(any(
1823                target_os = "linux",
1824                target_os = "macos",
1825                target_os = "ios",
1826                target_os = "android"
1827            ))]
1828            TensorStorage::Dma(_) => TensorMemory::Dma,
1829            #[cfg(unix)]
1830            TensorStorage::Shm(_) => TensorMemory::Shm,
1831            TensorStorage::Mem(_) => TensorMemory::Mem,
1832            TensorStorage::Pbo(_) => TensorMemory::Pbo,
1833        }
1834    }
1835
1836    fn name(&self) -> String {
1837        match self {
1838            #[cfg(any(
1839                target_os = "linux",
1840                target_os = "macos",
1841                target_os = "ios",
1842                target_os = "android"
1843            ))]
1844            TensorStorage::Dma(t) => t.name(),
1845            #[cfg(unix)]
1846            TensorStorage::Shm(t) => t.name(),
1847            TensorStorage::Mem(t) => t.name(),
1848            TensorStorage::Pbo(t) => t.name(),
1849        }
1850    }
1851
1852    fn shape(&self) -> &[usize] {
1853        match self {
1854            #[cfg(any(
1855                target_os = "linux",
1856                target_os = "macos",
1857                target_os = "ios",
1858                target_os = "android"
1859            ))]
1860            TensorStorage::Dma(t) => t.shape(),
1861            #[cfg(unix)]
1862            TensorStorage::Shm(t) => t.shape(),
1863            TensorStorage::Mem(t) => t.shape(),
1864            TensorStorage::Pbo(t) => t.shape(),
1865        }
1866    }
1867
1868    fn reshape(&mut self, shape: &[usize]) -> Result<()> {
1869        match self {
1870            #[cfg(any(
1871                target_os = "linux",
1872                target_os = "macos",
1873                target_os = "ios",
1874                target_os = "android"
1875            ))]
1876            TensorStorage::Dma(t) => t.reshape(shape),
1877            #[cfg(unix)]
1878            TensorStorage::Shm(t) => t.reshape(shape),
1879            TensorStorage::Mem(t) => t.reshape(shape),
1880            TensorStorage::Pbo(t) => t.reshape(shape),
1881        }
1882    }
1883
1884    fn capacity_bytes(&self) -> usize {
1885        match self {
1886            #[cfg(any(
1887                target_os = "linux",
1888                target_os = "macos",
1889                target_os = "ios",
1890                target_os = "android"
1891            ))]
1892            TensorStorage::Dma(t) => t.capacity_bytes(),
1893            #[cfg(unix)]
1894            TensorStorage::Shm(t) => t.capacity_bytes(),
1895            TensorStorage::Mem(t) => t.capacity_bytes(),
1896            TensorStorage::Pbo(t) => t.capacity_bytes(),
1897        }
1898    }
1899
1900    fn set_logical_shape(&mut self, shape: &[usize]) -> Result<()> {
1901        match self {
1902            #[cfg(any(
1903                target_os = "linux",
1904                target_os = "macos",
1905                target_os = "ios",
1906                target_os = "android"
1907            ))]
1908            TensorStorage::Dma(t) => t.set_logical_shape(shape),
1909            #[cfg(unix)]
1910            TensorStorage::Shm(t) => t.set_logical_shape(shape),
1911            TensorStorage::Mem(t) => t.set_logical_shape(shape),
1912            TensorStorage::Pbo(t) => t.set_logical_shape(shape),
1913        }
1914    }
1915
1916    fn map_with(&self, access: CpuAccess) -> Result<TensorMap<T>> {
1917        match self {
1918            #[cfg(any(
1919                target_os = "linux",
1920                target_os = "macos",
1921                target_os = "ios",
1922                target_os = "android"
1923            ))]
1924            TensorStorage::Dma(t) => t.map_with(access),
1925            #[cfg(unix)]
1926            TensorStorage::Shm(t) => t.map_with(access),
1927            TensorStorage::Mem(t) => t.map_with(access),
1928            TensorStorage::Pbo(t) => t.map_with(access),
1929        }
1930    }
1931
1932    fn buffer_identity(&self) -> &BufferIdentity {
1933        match self {
1934            #[cfg(any(
1935                target_os = "linux",
1936                target_os = "macos",
1937                target_os = "ios",
1938                target_os = "android"
1939            ))]
1940            TensorStorage::Dma(t) => t.buffer_identity(),
1941            #[cfg(unix)]
1942            TensorStorage::Shm(t) => t.buffer_identity(),
1943            TensorStorage::Mem(t) => t.buffer_identity(),
1944            TensorStorage::Pbo(t) => t.buffer_identity(),
1945        }
1946    }
1947
1948    /// Forward a sub-region view to the active backend, re-wrapping the
1949    /// backend's view (which shares the parent's allocation and
1950    /// [`BufferIdentity`]) back into the matching `TensorStorage` variant. Each
1951    /// backend's `view` is its own `TensorTrait::view` override; this single
1952    /// match is the only per-variant dispatch (`Tensor::subview` calls through
1953    /// here rather than matching the storage itself).
1954    fn view(&self, offset_bytes: usize, shape: &[usize]) -> Result<Self> {
1955        match self {
1956            #[cfg(any(
1957                target_os = "linux",
1958                target_os = "macos",
1959                target_os = "ios",
1960                target_os = "android"
1961            ))]
1962            TensorStorage::Dma(t) => t.view(offset_bytes, shape).map(TensorStorage::Dma),
1963            #[cfg(unix)]
1964            TensorStorage::Shm(t) => t.view(offset_bytes, shape).map(TensorStorage::Shm),
1965            TensorStorage::Mem(t) => t.view(offset_bytes, shape).map(TensorStorage::Mem),
1966            TensorStorage::Pbo(t) => t.view(offset_bytes, shape).map(TensorStorage::Pbo),
1967        }
1968    }
1969}
1970
1971/// Multi-backend tensor with optional image format metadata.
1972///
1973/// When `format` is `Some`, this tensor represents an image. Width, height,
1974/// and channels are derived from `shape` + `format`. When `format` is `None`,
1975/// this is a raw tensor (identical to the pre-refactoring behavior).
1976#[derive(Debug)]
1977pub struct Tensor<T>
1978where
1979    T: Num + Clone + fmt::Debug + Send + Sync,
1980{
1981    /// CUDA registration for this tensor, if any. Set after creation by
1982    /// the image crate once a PBO is registered with CUDA interop.
1983    ///
1984    /// MUST be declared before `storage`: CUDA must unregister the GL buffer
1985    /// before storage's Drop deletes it (cudaGraphicsUnregisterResource before
1986    /// glDeleteBuffers). Rust drops fields in declaration order.
1987    cuda: Option<crate::cuda::CudaHandle>,
1988    pub(crate) storage: TensorStorage<T>,
1989    format: Option<PixelFormat>,
1990    chroma: Option<Box<Tensor<T>>>,
1991    /// Row stride in bytes for externally allocated buffers with row padding.
1992    /// `None` means tightly packed (stride == width * bytes_per_pixel).
1993    row_stride: Option<usize>,
1994    /// Byte offset within the DMA-BUF where image data starts.
1995    /// `None` means offset 0 (data starts at the beginning of the buffer).
1996    plane_offset: Option<usize>,
1997    /// Quantization metadata for integer-typed tensors. Public access is
1998    /// gated by the `IntegerType` trait — `Tensor<f32>` etc. carry the
1999    /// field for layout uniformity but have no way to read or write it.
2000    pub(crate) quantization: Option<Quantization>,
2001    /// Optional colorimetry metadata. `None` = undefined; never auto-filled.
2002    colorimetry: Option<crate::Colorimetry>,
2003    /// Declared CPU access (see [`CpuAccess`]). Image constructors set it
2004    /// from their `access` parameter; non-image tensors (`new`, imports,
2005    /// numpy) default to `ReadWrite` — they are CPU-centric by nature.
2006    /// `map_with` counts requests beyond this declaration as unplanned.
2007    cpu_access: CpuAccess,
2008    /// Recorded vendor tile-compression scheme — best knowledge from
2009    /// allocation time (see [`Compression`]). `Some` only for Android
2010    /// hardware-only AHardwareBuffers whose allocation requested
2011    /// compression on an eligible format with a recognized vendor. When
2012    /// set, the row-stride accessors describe no meaningful linear
2013    /// layout. `configure_image` preserves it (physical layout, unlike
2014    /// colorimetry); views inherit it.
2015    compression: Option<CompressionScheme>,
2016    /// Parent-image snapshot when this tensor is a [`view`](Self::view)/
2017    /// [`batch`](Self::batch) sub-region; `None` for a whole tensor. Lets the GL
2018    /// backend key its import on the parent and render the view as a
2019    /// `glViewport`/`glScissor` ROI. See [`ViewOrigin`].
2020    view_origin: Option<ViewOrigin>,
2021}
2022
2023impl<T> Tensor<T>
2024where
2025    T: Num + Clone + fmt::Debug + Send + Sync,
2026{
2027    /// Wrap a TensorStorage in a Tensor with no image metadata.
2028    pub(crate) fn wrap(storage: TensorStorage<T>) -> Self {
2029        Self {
2030            storage,
2031            format: None,
2032            chroma: None,
2033            row_stride: None,
2034            plane_offset: None,
2035            quantization: None,
2036            cuda: None,
2037            colorimetry: None,
2038            cpu_access: CpuAccess::ReadWrite,
2039            compression: None,
2040            view_origin: None,
2041        }
2042    }
2043
2044    /// Construct a tensor from a row-major element slice + shape. Allocates a
2045    /// new buffer (`TensorMemory::Mem`) and memcpys the contents; caller
2046    /// retains ownership of the input slice.
2047    ///
2048    /// # Errors
2049    ///
2050    /// - [`Error::InvalidShape`] if `values.len() != shape.iter().product()`.
2051    /// - Propagates any allocation error from [`Self::new`].
2052    pub fn from_slice(values: &[T], shape: &[usize]) -> Result<Self>
2053    where
2054        T: Copy,
2055    {
2056        let expected: usize = shape.iter().product();
2057        if values.len() != expected {
2058            return Err(Error::InvalidShape(format!(
2059                "from_slice: values.len()={} but shape product={expected} (shape={shape:?})",
2060                values.len()
2061            )));
2062        }
2063        let t = Self::new(shape, Some(TensorMemory::Mem), None)?;
2064        {
2065            let mut m = t.map()?;
2066            m.as_mut_slice().copy_from_slice(values);
2067        }
2068        Ok(t)
2069    }
2070
2071    /// Wrap externally-owned memory as a tensor without copying. The tensor
2072    /// borrows `[ptr, ptr + shape.product() * size_of::<T>())` as
2073    /// [`TensorMemory::Mem`]; `owner`, when `Some`, co-owns the source so it
2074    /// outlives the tensor (and all derived views/maps). See [`ForeignOwner`].
2075    ///
2076    /// The canonical use is CUDA zero-copy: allocate host-coherent memory
2077    /// (`cudaHostAlloc`), wrap the host pointer here, and bind the matching
2078    /// device pointer to the inference engine — reads and writes hit the same
2079    /// physical buffer with no host copy. The identical primitive backs the
2080    /// Python `Tensor.from_numpy` zero-copy borrow (owner = the NumPy object).
2081    ///
2082    /// # Safety
2083    ///
2084    /// `ptr` must be non-null, aligned to `align_of::<T>()`, and valid for
2085    /// `shape.product()` elements of `T` for as long as the returned tensor —
2086    /// and every view/map sharing its backing — is alive. Pass an `owner` that
2087    /// co-owns the source to uphold that contract.
2088    ///
2089    /// # Errors
2090    ///
2091    /// [`Error::InvalidSize`] if `shape` is empty.
2092    pub unsafe fn from_foreign(
2093        ptr: *mut T,
2094        shape: &[usize],
2095        owner: Option<crate::ForeignOwner>,
2096        name: Option<&str>,
2097    ) -> Result<Self> {
2098        if shape.is_empty() {
2099            return Err(Error::InvalidSize(0));
2100        }
2101        if ptr.is_null() {
2102            return Err(Error::InvalidArgument(
2103                "from_foreign: ptr must be non-null".to_owned(),
2104            ));
2105        }
2106        shape
2107            .iter()
2108            .copied()
2109            .try_fold(1usize, |acc, dim| acc.checked_mul(dim))
2110            .ok_or_else(|| {
2111                Error::InvalidArgument(format!(
2112                    "from_foreign: shape.product() overflows usize (shape={shape:?})"
2113                ))
2114            })?;
2115        let mem = MemTensor::<T>::from_foreign(ptr, shape, owner, name);
2116        Ok(Self::wrap(TensorStorage::Mem(mem)))
2117    }
2118
2119    /// Construct a tensor from a 3-D ndarray view. Respects strides — one
2120    /// copy in all cases; contiguous views take a memcpy fast path.
2121    ///
2122    /// Only available when the `ndarray` feature is enabled.
2123    #[cfg(feature = "ndarray")]
2124    pub fn from_arrayview3(view: ndarray::ArrayView3<'_, T>) -> Result<Self>
2125    where
2126        T: Copy,
2127    {
2128        let (h, w, c) = view.dim();
2129        let t = Self::new(&[h, w, c], Some(TensorMemory::Mem), None)?;
2130        {
2131            let mut m = t.map()?;
2132            let dst = m.as_mut_slice();
2133            if let Some(src) = view.as_slice() {
2134                dst.copy_from_slice(src);
2135            } else {
2136                for (d, &s) in dst.iter_mut().zip(view.iter()) {
2137                    *d = s;
2138                }
2139            }
2140        }
2141        Ok(t)
2142    }
2143
2144    /// Create a new tensor with the given shape, memory type, and optional
2145    /// name. If no name is given, a random name will be generated. If no
2146    /// memory type is given, the best available memory type will be chosen
2147    /// based on the platform and environment variables.
2148    ///
2149    /// On Linux platforms, the order of preference is: Dma -> Shm -> Mem.
2150    /// On other Unix platforms (macOS), the order is: Shm -> Mem.
2151    /// On non-Unix platforms, only Mem is available.
2152    ///
2153    /// # Environment Variables
2154    /// - `EDGEFIRST_TENSOR_FORCE_MEM`: If set to a non-zero and non-false
2155    ///   value, forces the use of regular system memory allocation
2156    ///   (`TensorMemory::Mem`) regardless of platform capabilities.
2157    ///
2158    /// # Example
2159    /// ```rust
2160    /// use edgefirst_tensor::{Error, Tensor, TensorMemory, TensorTrait};
2161    /// # fn main() -> Result<(), Error> {
2162    /// let tensor = Tensor::<f32>::new(&[2, 3, 4], Some(TensorMemory::Mem), Some("test_tensor"))?;
2163    /// assert_eq!(tensor.memory(), TensorMemory::Mem);
2164    /// assert_eq!(tensor.name(), "test_tensor");
2165    /// #    Ok(())
2166    /// # }
2167    /// ```
2168    pub fn new(shape: &[usize], memory: Option<TensorMemory>, name: Option<&str>) -> Result<Self> {
2169        let _span = tracing::trace_span!(
2170            "tensor.alloc",
2171            ?shape,
2172            memory = ?memory,
2173            dtype = std::any::type_name::<T>(),
2174        )
2175        .entered();
2176        #[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
2177        let mut t = TensorStorage::new(shape, memory, name).map(Self::wrap)?;
2178        // Best-effort: attach a CUDA ExternalMemory handle for DMA tensors on
2179        // CUDA-capable hosts. Never blocks tensor creation on failure.
2180        // RUNTIME-UNVALIDATED: no CUDA+dma_heap test platform available; ABI
2181        // layout-asserted vs. CUDA 12.6 driver_types.h; mechanism proven by
2182        // gpu-probe O5 on Orin.
2183        #[cfg(target_os = "linux")]
2184        t.try_init_dma_cuda();
2185        Ok(t)
2186    }
2187
2188    /// Create an image tensor with the given format.
2189    /// Allocate an image tensor from a declarative request — the
2190    /// full-featured constructor behind [`Self::image`] and friends.
2191    ///
2192    /// Adds the [`Compression`] request to the classic parameters:
2193    ///
2194    /// - a request with any CPU access other than [`CpuAccess::None`] is
2195    ///   [`Error::InvalidArgument`] (CPU mapping pins the layout linear);
2196    /// - [`Compression::Scheme`] fails with [`Error::NotImplemented`] on
2197    ///   platforms without vendor tile compression, and with
2198    ///   [`Error::InvalidArgument`] when the device's native scheme or
2199    ///   the format eligibility doesn't match;
2200    /// - [`Compression::Any`] never fails for compression reasons: it
2201    ///   records the scheme when the allocation is eligible and
2202    ///   otherwise resolves linear, incrementing
2203    ///   [`compression_fallback_count`].
2204    ///
2205    /// The recorded outcome is readable via [`Tensor::compression`].
2206    pub fn image_desc(desc: &ImageDesc) -> Result<Self>
2207    where
2208        T: 'static,
2209    {
2210        let Some(t_dtype) = dtype_of::<T>() else {
2211            return Err(Error::InvalidArgument(
2212                "image_desc: element type has no DType mapping".into(),
2213            ));
2214        };
2215        if t_dtype != desc.dtype {
2216            return Err(Error::InvalidArgument(format!(
2217                "image_desc: desc.dtype is {:?} but the tensor element type is {t_dtype:?}",
2218                desc.dtype
2219            )));
2220        }
2221
2222        // Compression-request guards. CPU access pins the layout linear,
2223        // so a request combined with any declared access is a
2224        // contradiction the caller should hear about immediately.
2225        if desc.compression.is_some() && desc.access != CpuAccess::None {
2226            return Err(Error::InvalidArgument(format!(
2227                "image_desc: a compression request requires CpuAccess::None                  (declared {:?}) — CPU mapping pins the layout linear",
2228                desc.access
2229            )));
2230        }
2231        if let Some(Compression::Scheme(requested)) = desc.compression {
2232            #[cfg(not(target_os = "android"))]
2233            {
2234                return Err(Error::NotImplemented(format!(
2235                    "image_desc: Compression::Scheme({requested:?}) — no vendor tile                      compression on this platform (request Compression::Any for a                      portable fallback)"
2236                )));
2237            }
2238            #[cfg(target_os = "android")]
2239            {
2240                if matches!(
2241                    desc.memory,
2242                    Some(TensorMemory::Mem) | Some(TensorMemory::Shm) | Some(TensorMemory::Pbo)
2243                ) {
2244                    return Err(Error::InvalidArgument(format!(
2245                        "image_desc: Compression::Scheme({requested:?}) requires                          hardware memory (TensorMemory::Dma or auto-select), got {:?}",
2246                        desc.memory
2247                    )));
2248                }
2249                if !crate::ahardwarebuffer_layout::compression_eligible(desc.format, desc.dtype) {
2250                    return Err(Error::InvalidArgument(format!(
2251                        "image_desc: ({:?}, {:?}) is not compression-eligible                          (RGBA8888 u8/i8 initially)",
2252                        desc.format, desc.dtype
2253                    )));
2254                }
2255                let device = crate::ahardwarebuffer::device_compression_scheme();
2256                if device != Some(requested) {
2257                    return Err(Error::InvalidArgument(format!(
2258                        "image_desc: Compression::Scheme({requested:?}) but the device's                          native scheme is {device:?}"
2259                    )));
2260                }
2261            }
2262        }
2263
2264        // A compression request implies a hardware pipeline, so auto
2265        // memory promotes to the platform's zero-copy allocation first
2266        // (`Tensor::image` only takes the AHardwareBuffer/IOSurface path
2267        // under an explicit Dma request). `Scheme` propagates the Dma
2268        // failure — the caller demanded a layout only that allocator can
2269        // produce; `Any` falls back to plain auto-select.
2270        #[allow(unused_mut)]
2271        let mut t = match (desc.memory, desc.compression) {
2272            (None, Some(request)) => {
2273                match Self::image(
2274                    desc.width,
2275                    desc.height,
2276                    desc.format,
2277                    Some(TensorMemory::Dma),
2278                    desc.access,
2279                ) {
2280                    Ok(t) => t,
2281                    Err(e) if matches!(request, Compression::Scheme(_)) => return Err(e),
2282                    Err(_) => Self::image(desc.width, desc.height, desc.format, None, desc.access)?,
2283                }
2284            }
2285            (memory, _) => Self::image(desc.width, desc.height, desc.format, memory, desc.access)?,
2286        };
2287
2288        // Record best knowledge / count fallbacks. Only an Android
2289        // hardware-only AHardwareBuffer allocation can actually hold a
2290        // vendor tile layout; everywhere else an Any request resolves
2291        // linear and is counted.
2292        if let Some(request) = desc.compression {
2293            #[cfg(target_os = "android")]
2294            {
2295                let eligible =
2296                    crate::ahardwarebuffer_layout::compression_eligible(desc.format, desc.dtype);
2297                let scheme = crate::ahardwarebuffer::device_compression_scheme();
2298                let is_ahb = t.memory() == TensorMemory::Dma;
2299                match (request, scheme) {
2300                    (_, Some(s)) if eligible && is_ahb => {
2301                        t.set_compression_unchecked(Some(s));
2302                    }
2303                    (Compression::Scheme(requested), _) => {
2304                        // Pre-validated eligible + scheme match, so the only
2305                        // way here is the allocation resolving off-AHB.
2306                        return Err(Error::InvalidOperation(format!(
2307                            "image_desc: Compression::Scheme({requested:?}) requested but                              the allocation resolved to {:?} (not an AHardwareBuffer)",
2308                            t.memory()
2309                        )));
2310                    }
2311                    (Compression::Any, _) => {
2312                        note_compression_fallback(&format!(
2313                            "({:?}, {:?}) {}x{}: eligible={eligible}, scheme={scheme:?},                              memory={:?}",
2314                            desc.format,
2315                            desc.dtype,
2316                            desc.width,
2317                            desc.height,
2318                            t.memory()
2319                        ));
2320                    }
2321                }
2322            }
2323            #[cfg(not(target_os = "android"))]
2324            {
2325                debug_assert!(matches!(request, Compression::Any));
2326                let _ = request;
2327                note_compression_fallback(&format!(
2328                    "({:?}, {:?}) {}x{}: no vendor tile compression on this platform",
2329                    desc.format, desc.dtype, desc.width, desc.height
2330                ));
2331            }
2332        }
2333        Ok(t)
2334    }
2335
2336    pub fn image(
2337        width: usize,
2338        height: usize,
2339        format: PixelFormat,
2340        memory: Option<TensorMemory>,
2341        access: CpuAccess,
2342    ) -> Result<Self>
2343    where
2344        T: 'static,
2345    {
2346        // Shape comes from the shared `PixelFormat::image_shape` helper (packed /
2347        // planar / semi-planar NV12·NV16). NV12 supports odd dimensions via the
2348        // `H + ceil(H/2)` combined-plane height.
2349        // The `T: 'static` bound is required by the macOS IOSurface path below.
2350        let shape = format.image_shape(width, height).ok_or_else(|| {
2351            Error::InvalidArgument(format!(
2352                "invalid dimensions {width}x{height} for format {format:?}"
2353            ))
2354        })?;
2355
2356        // macOS Dma path: allocate a format-aware IOSurface (FourCC +
2357        // 2D dimensions) so the GL backend can bind it via
2358        // `EGL_ANGLE_iosurface_client_buffer`. Without this, the IOSurface
2359        // would default to a generic byte buffer (FourCC 'L008') and
2360        // ANGLE would reject the import with `EGL_BAD_ATTRIBUTE`.
2361        //
2362        // Guard: IOSurface rounds `bytes_per_row` up to 64-byte alignment.
2363        // If the natural row pitch (`width * channels * sizeof(T)`) is not
2364        // already 64-byte aligned, the padded allocation cannot be mapped
2365        // as a contiguous packed tensor — CPU reads/writes would use the
2366        // wrong stride.
2367        //
2368        // Explicit-Dma contract: when the caller passes
2369        // `Some(TensorMemory::Dma)` they have asked for an
2370        // **image-formatted IOSurface**. Silently downgrading to the
2371        // generic 'L008' byte-bag when alignment fails buries the
2372        // mismatch — the caller only finds out hours later when ANGLE
2373        // (or any GL importer) rejects the bind with
2374        // `EGL_BAD_ATTRIBUTE`. Same anti-pattern bit us previously on
2375        // Mali GPUs with DMA-BUF padding. The right behaviour is to
2376        // fail loudly here with the alignment requirement spelled out
2377        // so the caller can either pick aligned dimensions, request
2378        // SHM/Mem explicitly, or pass `memory=None` for auto-select.
2379        #[cfg(any(target_os = "macos", target_os = "ios"))]
2380        if matches!(memory, Some(TensorMemory::Dma)) {
2381            // For planar formats the IOSurface stacks channels
2382            // vertically (channels * height rows), so the row stride is
2383            // single-channel width * sizeof(T). Packed formats keep the
2384            // natural width * channels * sizeof(T) stride.
2385            let natural_row_bytes = match format.layout() {
2386                PixelLayout::Planar => width * std::mem::size_of::<T>(),
2387                _ => width * format.channels() * std::mem::size_of::<T>(),
2388            };
2389            // A format with a real IOSurface FourCC (RGBA/BGRA/YUYV packed,
2390            // GREY/NV12/NV16/NV24 as R8) tolerates a non-64-aligned natural
2391            // pitch: the surface is allocated with its own 64-aligned
2392            // `bytes_per_row`, the tensor records that stride below, and a CPU
2393            // map iterates rows correctly via the strided-map path while the GL
2394            // import uses the surface's pitch directly — fully zero-copy.
2395            // Planar (the F16 RGBA16F packing) is consumed flat as
2396            // `[1, C, H, W]` with no stride, so it still requires an aligned
2397            // pitch; and formats without a FourCC would fall through to a
2398            // generic byte-bag GL can't bind. Both fail loudly rather than
2399            // silently downgrade.
2400            let has_image_fourcc = dtype_of::<T>()
2401                .and_then(|dt| crate::iosurface::image_iosurface_layout(format, dt))
2402                .is_some();
2403            let padded_ok = has_image_fourcc && format.layout() != PixelLayout::Planar;
2404            if !natural_row_bytes.is_multiple_of(64) && !padded_ok {
2405                let elem_size = std::mem::size_of::<T>();
2406                let per_pixel_bytes = match format.layout() {
2407                    PixelLayout::Planar => elem_size.max(1),
2408                    _ => format.channels().max(1) * elem_size.max(1),
2409                };
2410                // Compute the next 64-byte-aligned width by rounding the
2411                // natural row pitch up to the next multiple of 64 and
2412                // dividing back by per-pixel bytes. This handles every
2413                // `per_pixel_bytes` value correctly:
2414                //
2415                //   * Divisors of 64 (1/2/4/8/16/32/64) → the suggestion
2416                //     is always 64-byte aligned.
2417                //   * Non-divisors of 64 (e.g. RGB u8 with 3 B/pixel) →
2418                //     the next aligned row pitch may not be an integer
2419                //     multiple of per_pixel_bytes (3 doesn't divide 64
2420                //     in any way), so a "pad width to N" suggestion is
2421                //     structurally impossible — omit the suggestion
2422                //     instead of printing a wrong number.
2423                //   * per_pixel_bytes > 64 → same situation, also
2424                //     omitted; the previous formula divided by zero.
2425                //
2426                // The error always names the alignment requirement
2427                // verbatim and lists the two non-DMA alternatives so
2428                // the caller has at least one always-applicable fix.
2429                let aligned_row_bytes = natural_row_bytes.next_multiple_of(64);
2430                let pad_hint =
2431                    if per_pixel_bytes > 0 && aligned_row_bytes.is_multiple_of(per_pixel_bytes) {
2432                        let w = aligned_row_bytes / per_pixel_bytes;
2433                        format!("Pad width to {w} (the next 64-byte-aligned stride), ")
2434                    } else {
2435                        String::new()
2436                    };
2437                return Err(Error::InvalidArgument(format!(
2438                    "Tensor::image: {format:?} {width}x{height} with element \
2439                     size {elem_size} produces a {natural_row_bytes}-byte natural \
2440                     row pitch, which is not 64-byte aligned. \
2441                     IOSurface rounds bytes_per_row up to 64 bytes, so a \
2442                     contiguous CPU map of this tensor would read garbage. \
2443                     {pad_hint}pass memory=None to auto-fall-back to SHM, or \
2444                     pass memory=Some(TensorMemory::Shm) or \
2445                     Some(TensorMemory::Mem) explicitly."
2446                )));
2447            }
2448            // Alignment OK. Explicit-Dma contract: the caller asked for an
2449            // **image-formatted, GL-importable** IOSurface, so every failure
2450            // from here on is loud. The old behaviour fell through to the
2451            // generic 'L008' byte-bag `Some(other)` arm below, and the caller
2452            // only found out when the GL import rejected the bind with
2453            // `EGL_BAD_ATTRIBUTE` — the same silent-downgrade anti-pattern as
2454            // the alignment case above. (Semi-planar/Grey u8 map to 'L008'
2455            // *by design* in `image_iosurface_layout` — the R8-plane
2456            // representation the YUV shaders sample — so they return through
2457            // the mapped path and never reach these errors.)
2458            let dtype = dtype_of::<T>().ok_or_else(|| {
2459                Error::InvalidArgument(format!(
2460                    "Tensor::image: element type {} has no DType, so no \
2461                     image-formatted IOSurface exists. Pass memory=None or \
2462                     Some(TensorMemory::Mem) for a CPU tensor.",
2463                    std::any::type_name::<T>()
2464                ))
2465            })?;
2466            if crate::iosurface::image_iosurface_layout(format, dtype).is_none() {
2467                return Err(Error::InvalidArgument(format!(
2468                    "Tensor::image: no zero-copy IOSurface mapping exists for \
2469                     {format:?}/{dtype:?} on macOS/iOS (supported: \
2470                     Rgba/Rgb @ U8/I8, Bgra/Yuyv/Grey/Nv12/Nv16/Nv24 @ U8, \
2471                     Rgba/PlanarRgb/PlanarRgba @ F16). Pass memory=None to \
2472                     auto-select, or Some(TensorMemory::Mem) explicitly, for \
2473                     a CPU tensor."
2474                )));
2475            }
2476            // Packed RGB u8/i8 rides an RGBA8888 surface at (W*3/4, H) —
2477            // reject a width the texel packing cannot express up front
2478            // (mirrors the Android pre-guard).
2479            if format == PixelFormat::Rgb
2480                && matches!(dtype, DType::U8 | DType::I8)
2481                && packed_rgb888_layout(width, height).is_none()
2482            {
2483                return Err(Error::InvalidArgument(format!(
2484                    "Tensor::image: Rgb {dtype:?} requires width%4==0 for the RGBA8888 \
2485                     IOSurface packing (got width={width}). Pad the width, or pass \
2486                     memory=Some(TensorMemory::Mem) for a CPU tensor."
2487                )));
2488            }
2489            let storage = TensorStorage::<T>::new_image_iosurface(
2490                width, height, format, dtype, &shape, None,
2491            )?;
2492            let mut t = Self::wrap(storage);
2493            t.format = Some(format);
2494            // IOSurface rounds `bytes_per_row` up to 64 bytes. When that
2495            // pitch exceeds the natural packed/planar row stride, record
2496            // it so CPU consumers iterate rows correctly (the GL import
2497            // already uses the surface's own pitch). For 64-aligned rows
2498            // — the common model-input case — the two match and no stride
2499            // is stored, leaving the flat mapping unchanged.
2500            if let TensorStorage::Dma(ref io) = t.storage {
2501                let bpr = io.bytes_per_row();
2502                if let Some(natural) = t.effective_row_stride() {
2503                    if bpr > natural {
2504                        t.set_row_stride_unchecked(bpr);
2505                    }
2506                }
2507            }
2508            t.cpu_access = access;
2509            return Ok(t);
2510        }
2511
2512        // Android Dma path: allocate a format-aware AHardwareBuffer so the
2513        // GL backend can import it as an EGLImage
2514        // (`eglGetNativeClientBufferANDROID` → `eglCreateImageKHR`).
2515        //
2516        // Geometry: gralloc chooses the row pitch (`desc.stride`) at
2517        // allocation, and pads freely (validated on the Galaxy S26 Ultra,
2518        // where SnapAlloc pads the planar-F16 RGBA16F surface). A padded
2519        // pitch is recorded on the tensor so CPU maps iterate rows via the
2520        // strided-map path; the GL render uses the buffer's own pitch
2521        // through the EGLImage either way, so the GPU path stays fully
2522        // zero-copy. Consumers needing a FLAT layout (the future NPU
2523        // handoff's `[1, C, H, W]` contract) must check `row_stride()` and
2524        // repack when set — flatness is a per-device property here, unlike
2525        // macOS where IOSurface's 64-BYTE alignment keeps model-sized F16
2526        // surfaces naturally flat.
2527        #[cfg(target_os = "android")]
2528        if matches!(memory, Some(TensorMemory::Dma)) {
2529            // Explicit-Dma contract (mirrors the macOS block above): the
2530            // caller asked for an image-formatted, GL-importable
2531            // AHardwareBuffer, so an unmapped combination or a gralloc
2532            // refusal errors here instead of falling through to a BLOB
2533            // byte-bag the GL backend can never import — that downgrade
2534            // only surfaced as a silent per-frame CPU upload.
2535            let dtype = dtype_of::<T>().ok_or_else(|| {
2536                Error::InvalidArgument(format!(
2537                    "Tensor::image: element type {} has no DType, so no \
2538                     image-formatted AHardwareBuffer exists. Pass memory=None \
2539                     or Some(TensorMemory::Mem) for a CPU tensor.",
2540                    std::any::type_name::<T>()
2541                ))
2542            })?;
2543            // Planar F16 requires width % 4 == 0 for the RGBA16F
2544            // packing — reject up front with the requirement spelled
2545            // out (mirrors the macOS pre-allocation guard) instead of
2546            // falling through to a byte-bag GL cannot bind.
2547            if format.layout() == PixelLayout::Planar
2548                && dtype == DType::F16
2549                && packed_rgba16f_layout(format, dtype, width, height).is_none()
2550            {
2551                return Err(Error::InvalidArgument(format!(
2552                    "Tensor::image: {format:?} F16 requires width%4==0 for the RGBA16F \
2553                     AHardwareBuffer packing (got width={width}). Pad the width, or pass \
2554                     memory=Some(TensorMemory::Mem) for a CPU tensor."
2555                )));
2556            }
2557            // Packed RGB u8/i8 rides an RGBA8888 surface at (W*3/4, H) —
2558            // the same whole-texel constraint as the F16 packing above.
2559            if format == PixelFormat::Rgb
2560                && matches!(dtype, DType::U8 | DType::I8)
2561                && packed_rgb888_layout(width, height).is_none()
2562            {
2563                return Err(Error::InvalidArgument(format!(
2564                    "Tensor::image: Rgb {dtype:?} requires width%4==0 for the RGBA8888 \
2565                     AHardwareBuffer packing (got width={width}). Pad the width, or pass \
2566                     memory=Some(TensorMemory::Mem) for a CPU tensor."
2567                )));
2568            }
2569            if crate::ahardwarebuffer::image_ahardwarebuffer_layout(format, dtype).is_none() {
2570                return Err(Error::InvalidArgument(format!(
2571                    "Tensor::image: no zero-copy AHardwareBuffer mapping exists \
2572                     for {format:?}/{dtype:?} on Android (Grey/NV* need \
2573                     R8-format buffers, API 29+; the HAL floor is API 26 — see \
2574                     `image_ahardwarebuffer_layout`). Pass memory=None to \
2575                     auto-select, or Some(TensorMemory::Mem) explicitly, for a \
2576                     CPU tensor; camera NV12 stays zero-copy by wrapping the \
2577                     camera's own AHardwareBuffer instead of allocating one."
2578                )));
2579            }
2580            let storage = TensorStorage::<T>::new_image_ahardwarebuffer(
2581                width, height, format, dtype, &shape, None, access,
2582            )?;
2583            let mut t = Self::wrap(storage);
2584            t.format = Some(format);
2585            if let TensorStorage::Dma(ref ahb) = t.storage {
2586                // gralloc chooses the row pitch (Qualcomm's
2587                // SnapAlloc pads e.g. the 160-px-wide RGBA16F
2588                // surface of a 640-wide planar F16 target).
2589                // When it exceeds the natural stride, record
2590                // it so CPU consumers iterate rows correctly
2591                // via `effective_row_stride()` — the GPU
2592                // renders through the EGLImage at the
2593                // buffer's real pitch regardless, so the
2594                // render stays fully zero-copy. Consumers
2595                // that need the FLAT `[1, C, H, W]` layout
2596                // (the future NPU handoff) must check
2597                // `row_stride()` is unset and repack — or
2598                // pick an aligned width — rather than assume
2599                // flatness (see the module docs).
2600                let bpr = ahb.bytes_per_row();
2601                if let Some(natural) = t.effective_row_stride() {
2602                    if bpr > natural {
2603                        log::debug!(
2604                            "Tensor::image: gralloc padded the {format:?} \
2605                             AHardwareBuffer pitch to {bpr} bytes (natural \
2606                             {natural}); recording row stride"
2607                        );
2608                        t.set_row_stride_unchecked(bpr);
2609                    }
2610                }
2611            }
2612            t.cpu_access = access;
2613            return Ok(t);
2614        }
2615
2616        // Compute the **64-byte-aligned** row stride for every image layout.
2617        //
2618        // Embedded GPUs reject `eglCreateImage` DMA-BUF imports whose row pitch
2619        // is not 64-byte aligned: Mali returns `EGL_BAD_ALLOC`, Vivante
2620        // `EGL_BAD_ACCESS`. This bit packed RGBA/RGB destinations at odd widths
2621        // AND at even non-multiple-of-16 widths (e.g. 321→1284, 322→1288 bytes —
2622        // neither divisible by 64), so an odd-source → RGBA convert failed on
2623        // imx95/imx8mp while succeeding on V3D/Tegra. Semi-planar already aligned
2624        // here; we now align packed and planar identically so every image()
2625        // allocation is GPU-importable regardless of width.
2626        //
2627        // The per-layout natural pitch and total row count:
2628        //   * SemiPlanar `[total_h, width]`     — pitch = even(width)·elem, rows = total_h
2629        //   * Packed     `[height, width, ch]`  — pitch = width·ch·elem,    rows = height
2630        //   * Planar     `[ch, height, width]`  — pitch = width·elem,       rows = ch·height
2631        // Allocation byte size = `aligned_stride · total_rows` (NOT the shape
2632        // product, which reflects only the logical width and under-allocates the
2633        // padding on odd / unaligned widths).
2634        let elem = std::mem::size_of::<T>();
2635        let channels = format.channels();
2636        let (natural_stride, total_rows) = match format.layout() {
2637            PixelLayout::SemiPlanar => (width.next_multiple_of(2) * elem, shape[0]),
2638            PixelLayout::Packed => (width * channels * elem, height),
2639            PixelLayout::Planar => (width * elem, channels * height),
2640        };
2641        let aligned_stride = natural_stride.next_multiple_of(64);
2642        let semi = format.layout() == PixelLayout::SemiPlanar;
2643
2644        // DMA buffers MUST carry a 64-aligned row pitch — Mali/Vivante reject a
2645        // DMA-BUF EGLImage whose pitch is not 64-aligned. Semi-planar also needs
2646        // the aligned pitch on every backend (its chroma-plane offset math
2647        // assumes it). Packed/planar on host-only memory (Mem/Shm) keep the
2648        // natural tight pitch so the many flat CPU consumers are unaffected.
2649        let host_stride = if semi { aligned_stride } else { natural_stride };
2650        let host_byte_size = host_stride * total_rows;
2651        #[cfg(target_os = "linux")]
2652        let dma_byte_size = aligned_stride * total_rows;
2653
2654        // `used_stride` is the actual row pitch of the storage created below.
2655        let (storage, used_stride) = match memory {
2656            #[cfg(target_os = "linux")]
2657            Some(TensorMemory::Dma) => (
2658                TensorStorage::<T>::new_dma_with_byte_size(&shape, dma_byte_size, None)?,
2659                aligned_stride,
2660            ),
2661            #[cfg(unix)]
2662            Some(TensorMemory::Shm) => (
2663                TensorStorage::<T>::new_shm_with_byte_size(&shape, host_byte_size, None)?,
2664                host_stride,
2665            ),
2666            Some(TensorMemory::Mem) => (
2667                TensorStorage::<T>::new_mem_with_byte_size(&shape, host_byte_size, None)?,
2668                host_stride,
2669            ),
2670            #[allow(unused_variables)]
2671            Some(other) => {
2672                // PBO and any future variants: fall through to standard new().
2673                return {
2674                    let mut t = Self::new(&shape, Some(other), None)?;
2675                    t.format = Some(format);
2676                    t.cpu_access = access;
2677                    Ok(t)
2678                };
2679            }
2680            None => {
2681                // Auto-select priority: DMA → Mem (DMA gets the 64-aligned
2682                // pitch; the Mem fallback keeps the tight host pitch, so the
2683                // recorded stride matches the storage used). Shm is NOT
2684                // auto-selected — it offers no advantage over Mem for an
2685                // in-process image and Mem always succeeds, so it sits below Mem
2686                // and is reached only via an explicit `TensorMemory::Shm`.
2687                #[cfg(target_os = "linux")]
2688                {
2689                    match TensorStorage::<T>::new_dma_with_byte_size(&shape, dma_byte_size, None) {
2690                        Ok(s) => (s, aligned_stride),
2691                        Err(_) => (
2692                            TensorStorage::<T>::new_mem_with_byte_size(
2693                                &shape,
2694                                host_byte_size,
2695                                None,
2696                            )?,
2697                            host_stride,
2698                        ),
2699                    }
2700                }
2701                #[cfg(not(target_os = "linux"))]
2702                {
2703                    (
2704                        TensorStorage::<T>::new_mem_with_byte_size(&shape, host_byte_size, None)?,
2705                        host_stride,
2706                    )
2707                }
2708            }
2709        };
2710
2711        let mut t = Self::wrap(storage);
2712        t.format = Some(format);
2713        // Record the row stride when it exceeds the natural tight pitch (padding
2714        // is present — DMA packed/planar at an unaligned width, or always for
2715        // semi-planar), mirroring the IOSurface path above. Aligned-width and
2716        // host-only packed/planar images keep their flat layout with no explicit
2717        // stride; `effective_row_stride()` then falls back to the identical
2718        // computed pitch. When padding IS present, consumers must iterate rows by
2719        // `effective_row_stride()` to skip it.
2720        if semi || used_stride > natural_stride {
2721            t.set_row_stride_unchecked(used_stride);
2722        }
2723        debug_assert!(
2724            t.row_stride.is_some() || !semi,
2725            "image() must always set row_stride for semi-planar tensors"
2726        );
2727        t.cpu_access = access;
2728        #[cfg(target_os = "linux")]
2729        t.try_init_dma_cuda();
2730        Ok(t)
2731    }
2732
2733    /// Create a DMA-backed image tensor with an explicit row stride that
2734    /// may exceed the natural `width * channels * sizeof(T)` pitch.
2735    ///
2736    /// Used for image tensors that need GPU pitch alignment padding: the
2737    /// underlying DMA-BUF is sized to `row_stride * height` bytes, but
2738    /// the tensor's logical shape stays at `[height, width, channels]`.
2739    /// `width()` / `height()` / `shape()` continue to report the
2740    /// user-requested values; the padding is visible only via
2741    /// `row_stride()` / `effective_row_stride()` and is automatically
2742    /// propagated to the GL backend's EGLImage import so Mali Valhall
2743    /// accepts the buffer.
2744    ///
2745    /// # Supported formats
2746    ///
2747    /// Currently only **packed** pixel layouts (RGBA8, BGRA8, RGB888,
2748    /// Grey, etc.) are supported — the formats the GL backend uses as
2749    /// render destinations. Semi-planar formats (NV12, NV16) come from
2750    /// external allocators (camera capture, video decoders) and are
2751    /// imported via `TensorDyn::from_fd` + `set_row_stride`, which
2752    /// already supports padded strides.
2753    ///
2754    /// # Supported memory
2755    ///
2756    /// Currently only `TensorMemory::Dma` is supported. PBO and Mem
2757    /// storage don't go through EGLImage import so they don't need
2758    /// pitch alignment; if you pass any other memory type this returns
2759    /// `NotImplemented`. `None` (auto-select) is treated as `Dma`.
2760    ///
2761    /// # Errors
2762    ///
2763    /// - `InvalidArgument` if `row_stride_bytes < width * channels * sizeof(T)`
2764    ///   (the requested stride would not fit a single row)
2765    /// - `NotImplemented` for non-packed formats or non-DMA memory
2766    /// - `IoError` if the DMA-heap allocation fails (propagated from
2767    ///   `DmaTensor::new_with_byte_size`)
2768    pub fn image_with_stride(
2769        width: usize,
2770        height: usize,
2771        format: PixelFormat,
2772        row_stride_bytes: usize,
2773        memory: Option<TensorMemory>,
2774        access: CpuAccess,
2775    ) -> Result<Self> {
2776        #[cfg(not(target_os = "linux"))]
2777        let _ = access;
2778        // DMA backing (the only thing this constructor produces) is
2779        // Linux-only. On macOS/BSD/Windows the non-Linux block below is
2780        // the only compiled body and returns `NotImplemented` directly;
2781        // on Linux the non-Linux block is cfg-removed and the function
2782        // falls through to the real validation + allocation path. Each
2783        // target compiles exactly one of the two blocks, and the block
2784        // serves as the function's tail expression in both cases — so
2785        // neither needs an explicit `return` (avoids
2786        // `clippy::needless_return` on the macOS CI gate).
2787        #[cfg(not(target_os = "linux"))]
2788        {
2789            let _ = (width, height, format, row_stride_bytes, memory);
2790            Err(Error::NotImplemented(
2791                "image_with_stride requires DMA support (Linux only)".to_owned(),
2792            ))
2793        }
2794
2795        #[cfg(target_os = "linux")]
2796        {
2797            if format.layout() != PixelLayout::Packed {
2798                return Err(Error::NotImplemented(format!(
2799                    "Tensor::image_with_stride only supports packed pixel layouts, got {format:?}"
2800                )));
2801            }
2802            let elem = std::mem::size_of::<T>();
2803            let min_stride = width
2804                .checked_mul(format.channels())
2805                .and_then(|p| p.checked_mul(elem))
2806                .ok_or_else(|| {
2807                    Error::InvalidArgument(format!(
2808                        "image_with_stride: width {width} × channels {} × sizeof::<T>={elem} \
2809                         overflows usize",
2810                        format.channels()
2811                    ))
2812                })?;
2813            if row_stride_bytes < min_stride {
2814                return Err(Error::InvalidArgument(format!(
2815                    "image_with_stride: row_stride {row_stride_bytes} < minimum {min_stride} \
2816                     ({width} px × {} ch × {elem} B)",
2817                    format.channels()
2818                )));
2819            }
2820            let total_byte_size = row_stride_bytes.checked_mul(height).ok_or_else(|| {
2821                Error::InvalidArgument(format!(
2822                    "image_with_stride: row_stride {row_stride_bytes} × height {height} overflows usize"
2823                ))
2824            })?;
2825
2826            let shape = vec![height, width, format.channels()];
2827
2828            let storage = match memory {
2829                Some(TensorMemory::Dma) | None => {
2830                    TensorStorage::<T>::new_dma_with_byte_size(&shape, total_byte_size, None)?
2831                }
2832                Some(other) => {
2833                    return Err(Error::NotImplemented(format!(
2834                        "image_with_stride: only TensorMemory::Dma is supported, got {other:?}"
2835                    )));
2836                }
2837            };
2838
2839            let mut t = Self::wrap(storage);
2840            t.format = Some(format);
2841            t.row_stride = Some(row_stride_bytes);
2842            t.cpu_access = access;
2843            // Match new()/from_fd(): a DMA tensor must attempt CUDA external-
2844            // memory import so a strided DMA buffer is also zero-copy
2845            // CUDA-mappable (no-op when libcudart is absent).
2846            t.try_init_dma_cuda();
2847            Ok(t)
2848        }
2849    }
2850
2851    /// Attach format metadata to an existing tensor.
2852    ///
2853    /// # Arguments
2854    ///
2855    /// * `format` - The pixel format to attach
2856    ///
2857    /// # Returns
2858    ///
2859    /// `Ok(())` on success, with the format stored as metadata on the tensor.
2860    ///
2861    /// # Errors
2862    ///
2863    /// Returns `Error::InvalidShape` if the tensor shape is incompatible with
2864    /// the format's layout (packed expects `[H, W, C]`, planar expects
2865    /// `[C, H, W]`, semi-planar expects `[H*k, W]` with format-specific
2866    /// height constraints).
2867    pub fn set_format(&mut self, format: PixelFormat) -> Result<()> {
2868        let shape = self.shape();
2869        match format.layout() {
2870            PixelLayout::Packed => {
2871                if shape.len() != 3 || shape[2] != format.channels() {
2872                    return Err(Error::InvalidShape(format!(
2873                        "packed format {format:?} expects [H, W, {}], got {shape:?}",
2874                        format.channels()
2875                    )));
2876                }
2877            }
2878            PixelLayout::Planar => {
2879                if shape.len() != 3 || shape[0] != format.channels() {
2880                    return Err(Error::InvalidShape(format!(
2881                        "planar format {format:?} expects [{}, H, W], got {shape:?}",
2882                        format.channels()
2883                    )));
2884                }
2885            }
2886            PixelLayout::SemiPlanar => {
2887                if shape.len() != 2 {
2888                    return Err(Error::InvalidShape(format!(
2889                        "semi-planar format {format:?} expects [H*k, W], got {shape:?}"
2890                    )));
2891                }
2892                match format {
2893                    // Combined-plane height is `H + ceil(H/2)` (luma + chroma
2894                    // rows). For even H that is `3H/2` (≡ 0 mod 3); for odd H it
2895                    // is `(3H+1)/2` (≡ 2 mod 3). Only totals ≡ 1 mod 3 are
2896                    // unreachable, so reject just those — odd-height NV12 is
2897                    // valid (e.g. 725 rows for a 483-tall image).
2898                    PixelFormat::Nv12 if shape[0] % 3 == 1 => {
2899                        return Err(Error::InvalidShape(format!(
2900                            "NV12 contiguous shape[0] must be H + ceil(H/2) for some height; \
2901                             {} is unreachable (≡ 1 mod 3)",
2902                            shape[0]
2903                        )));
2904                    }
2905                    PixelFormat::Nv16 if !shape[0].is_multiple_of(2) => {
2906                        return Err(Error::InvalidShape(format!(
2907                            "NV16 contiguous shape[0] must be even, got {}",
2908                            shape[0]
2909                        )));
2910                    }
2911                    // NV24 (4:4:4): combined-plane height is 3H (Y + 2H chroma).
2912                    PixelFormat::Nv24 if !shape[0].is_multiple_of(3) => {
2913                        return Err(Error::InvalidShape(format!(
2914                            "NV24 contiguous shape[0] must be a multiple of 3 (= 3H), got {}",
2915                            shape[0]
2916                        )));
2917                    }
2918                    _ => {}
2919                }
2920            }
2921        }
2922        // Clear stored stride/offset when format changes — they may be invalid
2923        // for the new format. Caller must re-set after changing format.
2924        if self.format != Some(format) {
2925            self.row_stride = None;
2926            self.plane_offset = None;
2927            match self.storage {
2928                TensorStorage::Mem(ref mut m) => m.set_offset(0),
2929                #[cfg(target_os = "linux")]
2930                TensorStorage::Dma(ref mut dma) => dma.mmap_offset = 0,
2931                _ => {}
2932            }
2933        }
2934        self.format = Some(format);
2935        Ok(())
2936    }
2937
2938    /// Set this tensor's logical dimensions and pixel format to a decoded
2939    /// image, reusing the existing allocation. The shape is derived from the
2940    /// format layout; fails with `Error::InsufficientCapacity` if the
2941    /// allocation cannot hold `width`×`height` in `format`, or
2942    /// `Error::InvalidArgument` if the dimensions are invalid for the format.
2943    ///
2944    /// For NV12/NV16/NV24 the buffer width is rounded up to even (a chroma-plane
2945    /// interleaving requirement); the true odd width is reported by the decoder
2946    /// in `ImageInfo` and trimmed by a `convert()` crop. See
2947    /// [`PixelFormat::image_shape`].
2948    ///
2949    /// When the backing has a fixed physical row pitch (an IOSurface's
2950    /// 64-aligned `bytesPerRow`) that exceeds the new format's natural row
2951    /// stride — i.e. a reused max-sized pool tensor reconfigured to a smaller
2952    /// image — the physical pitch is preserved as the tensor's `row_stride`.
2953    /// This keeps the **physical grid** (allocation stride/surface) fixed while
2954    /// the **logical ROI** (this image's W×H) changes, so the decode writes rows
2955    /// at the surface's real stride and the GPU samples them at the same stride.
2956    /// Exact-sized buffers (pitch == natural) stay tightly packed unchanged.
2957    pub fn configure_image(
2958        &mut self,
2959        width: usize,
2960        height: usize,
2961        format: PixelFormat,
2962    ) -> Result<()> {
2963        let shape = format.image_shape(width, height).ok_or_else(|| {
2964            Error::InvalidArgument(format!(
2965                "invalid dimensions {width}x{height} for format {format:?}"
2966            ))
2967        })?;
2968        // Capture the pre-existing row stride before `set_format` clears it.
2969        // For pool tensors that were allocated at a larger width (e.g. 1920-wide
2970        // pool decoding a 789-wide image), this preserves the backing pitch so
2971        // rows are still written at the correct physical stride.
2972        let prior_stride = self.row_stride;
2973
2974        self.storage.set_logical_shape(&shape)?;
2975        self.set_format(format)?; // clears any stale row_stride
2976
2977        // Restore the correct row pitch for the new geometry. A DMA buffer of
2978        // ANY layout, and a semi-planar buffer on any backing, MUST carry a
2979        // 64-byte-aligned pitch: Mali/Vivante reject a DMA-BUF EGLImage whose
2980        // row pitch is not 64-aligned (`EGL_BAD_ALLOC` / `EGL_BAD_ACCESS`), and
2981        // semi-planar chroma-offset math assumes it. This mirrors the pitch
2982        // `Tensor::image()` allocates, so a recycled pool buffer imports
2983        // identically to a fresh one — without it a `configure_image()`'d packed
2984        // buffer (e.g. Y800 96-wide → tight pitch 96) failed convert() on
2985        // imx95/imx8mp via the texture-upload fallback while the fresh oracle
2986        // (aligned 128) succeeded. Packed/planar on host-only memory (Mem/Shm)
2987        // keep the tight pitch so flat CPU consumers stay unaffected — matching
2988        // `image()`'s `host_stride` rule.
2989        let elem = std::mem::size_of::<T>();
2990        let channels = format.channels();
2991        let (min_stride, total_rows) = match format.layout() {
2992            PixelLayout::SemiPlanar => (width.next_multiple_of(2) * elem, shape[0]),
2993            PixelLayout::Packed => (width * channels * elem, height),
2994            PixelLayout::Planar => (width * elem, channels * height),
2995        };
2996        let needs_align = self.storage.memory() == TensorMemory::Dma
2997            || format.layout() == PixelLayout::SemiPlanar;
2998
2999        let active_stride = if let Some(pitch) = self.storage.backing_row_stride() {
3000            // macOS IOSurface: use the surface's native pitch.
3001            let natural = self.effective_row_stride().unwrap_or(0);
3002            if pitch > natural {
3003                self.set_row_stride_unchecked(pitch);
3004                pitch
3005            } else {
3006                natural
3007            }
3008        } else if needs_align {
3009            // Priority:
3010            //   1. Prior stride (pool reuse): if the pre-existing stride is
3011            //      64-aligned, >= this layout's minimum row, and fits the
3012            //      allocation, keep it. This is the hot-loop reuse case (large
3013            //      pool, small image).
3014            //   2. Compute a fresh 64-aligned stride for the current width.
3015            let aligned = min_stride.next_multiple_of(64);
3016            let capacity = self.storage.capacity_bytes();
3017
3018            let candidate = if let Some(ps) = prior_stride {
3019                if ps >= min_stride && ps % 64 == 0 && ps * total_rows <= capacity {
3020                    ps
3021                } else {
3022                    aligned
3023                }
3024            } else {
3025                aligned
3026            };
3027
3028            if candidate * total_rows <= capacity {
3029                self.set_row_stride_unchecked(candidate);
3030                candidate
3031            } else {
3032                // Shouldn't happen for legitimate pools, but don't crash.
3033                self.effective_row_stride().unwrap_or(0)
3034            }
3035        } else {
3036            self.effective_row_stride().unwrap_or(0)
3037        };
3038
3039        // Ensure the active stride fits the allocation. A pool reconfigured to a
3040        // wider image than its backing would silently SIGBUS on any subsequent
3041        // map/write — catch it here instead.
3042        if needs_align && active_stride > 0 {
3043            let needed = active_stride * total_rows;
3044            let capacity = self.storage.capacity_bytes();
3045            if needed > capacity {
3046                return Err(Error::InsufficientCapacity { needed, capacity });
3047            }
3048        }
3049        Ok(())
3050    }
3051
3052    /// Allocate an image tensor sized to hold up to `width`×`height` in
3053    /// `format`, reusable for any smaller image via `configure_image`.
3054    pub fn image_with_capacity(
3055        width: usize,
3056        height: usize,
3057        format: PixelFormat,
3058        memory: Option<TensorMemory>,
3059        access: CpuAccess,
3060    ) -> Result<Self>
3061    where
3062        T: 'static,
3063    {
3064        Self::image(width, height, format, memory, access)
3065    }
3066
    /// Pixel format attached to this tensor, or `None` when no format
    /// metadata has been set (i.e. the tensor is not treated as an image).
    pub fn format(&self) -> Option<PixelFormat> {
        self.format
    }
3071
3072    /// Image width (None if not an image).
3073    pub fn width(&self) -> Option<usize> {
3074        let fmt = self.format?;
3075        let shape = self.shape();
3076        match fmt.layout() {
3077            PixelLayout::Packed => Some(shape[1]),
3078            PixelLayout::Planar => Some(shape[2]),
3079            PixelLayout::SemiPlanar => Some(shape[1]),
3080        }
3081    }
3082
3083    /// Image height (None if not an image).
3084    ///
3085    /// For semi-planar formats the combined-plane shape row count is divided
3086    /// by the format's luma-to-total ratio to recover logical height. This
3087    /// returns the exact logical height (including odd heights) only because
3088    /// the logical dimensions are tracked separately from the physical shape —
3089    /// `configure_image` stores the actual `(width, height)` in the format's
3090    /// `image_shape`, which round-trips losslessly via these accessors.
3091    pub fn height(&self) -> Option<usize> {
3092        let fmt = self.format?;
3093        let shape = self.shape();
3094        match fmt.layout() {
3095            PixelLayout::Packed => Some(shape[0]),
3096            PixelLayout::Planar => Some(shape[1]),
3097            PixelLayout::SemiPlanar => {
3098                if self.is_multiplane() {
3099                    Some(shape[0])
3100                } else {
3101                    match fmt {
3102                        PixelFormat::Nv12 => Some(shape[0] * 2 / 3),
3103                        PixelFormat::Nv16 => Some(shape[0] / 2),
3104                        PixelFormat::Nv24 => Some(shape[0] / 3),
3105                        _ => None,
3106                    }
3107                }
3108            }
3109        }
3110    }
3111
3112    /// Create from separate Y and UV planes (multiplane NV12/NV16).
3113    pub fn from_planes(luma: Tensor<T>, chroma: Tensor<T>, format: PixelFormat) -> Result<Self> {
3114        if format.layout() != PixelLayout::SemiPlanar {
3115            return Err(Error::InvalidArgument(format!(
3116                "from_planes requires a semi-planar format, got {format:?}"
3117            )));
3118        }
3119        if chroma.format.is_some() || chroma.chroma.is_some() {
3120            return Err(Error::InvalidArgument(
3121                "chroma tensor must be a raw tensor (no format or chroma metadata)".into(),
3122            ));
3123        }
3124        let luma_shape = luma.shape();
3125        let chroma_shape = chroma.shape();
3126        if luma_shape.len() != 2 || chroma_shape.len() != 2 {
3127            return Err(Error::InvalidArgument(format!(
3128                "from_planes expects 2D shapes, got luma={luma_shape:?} chroma={chroma_shape:?}"
3129            )));
3130        }
3131        if luma_shape[1] != chroma_shape[1] {
3132            return Err(Error::InvalidArgument(format!(
3133                "luma width {} != chroma width {}",
3134                luma_shape[1], chroma_shape[1]
3135            )));
3136        }
3137        match format {
3138            PixelFormat::Nv12 => {
3139                if luma_shape[0] % 2 != 0 {
3140                    return Err(Error::InvalidArgument(format!(
3141                        "NV12 requires even luma height, got {}",
3142                        luma_shape[0]
3143                    )));
3144                }
3145                if chroma_shape[0] != luma_shape[0] / 2 {
3146                    return Err(Error::InvalidArgument(format!(
3147                        "NV12 chroma height {} != luma height / 2 ({})",
3148                        chroma_shape[0],
3149                        luma_shape[0] / 2
3150                    )));
3151                }
3152            }
3153            PixelFormat::Nv16 => {
3154                if chroma_shape[0] != luma_shape[0] {
3155                    return Err(Error::InvalidArgument(format!(
3156                        "NV16 chroma height {} != luma height {}",
3157                        chroma_shape[0], luma_shape[0]
3158                    )));
3159                }
3160            }
3161            // NV24's chroma plane is full-resolution (2×-wide interleaved UV),
3162            // which the equal-width plane check above doesn't model. Multiplane
3163            // NV24 is unused (the JPEG decoder emits a contiguous NV24 buffer),
3164            // so it's not supported here yet.
3165            _ => {
3166                return Err(Error::InvalidArgument(format!(
3167                    "from_planes only supports NV12 and NV16 (NV24 multiplane not yet \
3168                     supported — use a contiguous NV24 tensor), got {format:?}"
3169                )));
3170            }
3171        }
3172
3173        Ok(Tensor {
3174            storage: luma.storage,
3175            format: Some(format),
3176            chroma: Some(Box::new(chroma)),
3177            row_stride: luma.row_stride,
3178            plane_offset: luma.plane_offset,
3179            quantization: luma.quantization,
3180            // A multiplane tensor spans two DMA-BUFs (luma + chroma); CUDA
3181            // external-memory import is per-fd, so there is no single device
3182            // pointer for the composite. Any CUDA handle the luma plane carried
3183            // is intentionally dropped — consumers needing CUDA access to
3184            // multiplane data must import each plane independently.
3185            cuda: None,
3186            colorimetry: luma.colorimetry,
3187            cpu_access: luma.cpu_access,
3188            compression: luma.compression,
3189            // A composed multiplane tensor is a whole image, not a sub-view.
3190            view_origin: None,
3191        })
3192    }
3193
    /// Whether this tensor uses separate plane allocations, i.e. it carries
    /// its own chroma tensor (as built by [`from_planes`](Self::from_planes)).
    pub fn is_multiplane(&self) -> bool {
        self.chroma.is_some()
    }
3198
    /// Shared access to the chroma plane of a multiplane semi-planar image.
    ///
    /// Returns `None` when the tensor has no separate chroma tensor.
    pub fn chroma(&self) -> Option<&Tensor<T>> {
        self.chroma.as_deref()
    }
3203
    /// Mutable access to the chroma plane of a multiplane semi-planar image.
    ///
    /// Returns `None` when the tensor has no separate chroma tensor.
    pub fn chroma_mut(&mut self) -> Option<&mut Tensor<T>> {
        self.chroma.as_deref_mut()
    }
3208
    /// Explicitly stored row stride in bytes (`None` = tightly packed, no
    /// stride recorded).
    ///
    /// Use [`effective_row_stride`](Self::effective_row_stride) to obtain a
    /// value that falls back to the format-derived minimum.
    pub fn row_stride(&self) -> Option<usize> {
        self.row_stride
    }
3213
3214    /// Effective row stride in bytes: the stored stride if set, otherwise the
3215    /// minimum stride computed from the format, width, and element size.
3216    /// Returns `None` only when no format is set and no explicit stride was
3217    /// stored via [`set_row_stride`](Self::set_row_stride).
3218    ///
3219    /// **GREY note:** `effective_row_stride()` for a GREY tensor returns the
3220    /// tight `width` bytes (no padding), which is what `normalize_to_numpy` and
3221    /// the CPU convert path expect. The codec's internal `native_row_stride`
3222    /// (64-byte-aligned) is used only during decoding and is not propagated to
3223    /// the tensor's stored stride, so callers reading via
3224    /// `effective_row_stride()` always see the tight value for GREY.
3225    pub fn effective_row_stride(&self) -> Option<usize> {
3226        if let Some(s) = self.row_stride {
3227            return Some(s);
3228        }
3229        let fmt = self.format?;
3230        let w = self.width()?;
3231        let elem = std::mem::size_of::<T>();
3232        Some(match fmt.layout() {
3233            PixelLayout::Packed => w * fmt.channels() * elem,
3234            PixelLayout::Planar => w * elem,
3235            // Semi-planar: minimum stride must cover the even width so the
3236            // interleaved chroma columns are byte-aligned on odd-width images.
3237            PixelLayout::SemiPlanar => w.next_multiple_of(2) * elem,
3238        })
3239    }
3240
3241    /// Copy the tensor's logical bytes into `dst`, compacting away any
3242    /// recorded row-stride padding.
3243    ///
3244    /// The flatness helper for NPU handoff: consumers that need a FLAT
3245    /// layout (e.g. `[1, C, H, W]` for NNAPI/LiteRT when the runtime cannot
3246    /// take a padded pitch) call this when [`row_stride`](Self::row_stride)
3247    /// is `Some` — on a tight tensor it degenerates to one memcpy, so it is
3248    /// safe to call unconditionally. `dst.len()` must equal the tight byte
3249    /// footprint (`shape` product × element size). Zero-copy consumers
3250    /// should prefer the buffer handle + [`effective_row_stride`]
3251    /// (Self::effective_row_stride) and skip this copy entirely.
3252    pub fn copy_to_flat(&self, dst: &mut [u8]) -> Result<()> {
3253        let tight_bytes = crate::ahardwarebuffer_layout::checked_shape_bytes::<T>(self.shape())?;
3254        if dst.len() != tight_bytes {
3255            return Err(Error::InvalidArgument(format!(
3256                "copy_to_flat: dst is {} bytes but the tensor's tight \
3257                 footprint is {tight_bytes} bytes (shape {:?})",
3258                dst.len(),
3259                self.shape()
3260            )));
3261        }
3262        let map = self.map()?;
3263        // SAFETY: T is a plain numeric type (crate-wide bound); viewing the
3264        // mapped elements as bytes is sound.
3265        let src: &[u8] = unsafe {
3266            std::slice::from_raw_parts(
3267                map.as_slice().as_ptr() as *const u8,
3268                std::mem::size_of_val(map.as_slice()),
3269            )
3270        };
3271        let Some(stride) = self.row_stride else {
3272            // Tight layout: the mapped window is exactly the logical bytes.
3273            let got = src.len().min(tight_bytes);
3274            if got < tight_bytes {
3275                return Err(Error::InvalidOperation(format!(
3276                    "copy_to_flat: mapped {got} bytes < tight footprint {tight_bytes}"
3277                )));
3278            }
3279            dst.copy_from_slice(&src[..tight_bytes]);
3280            return Ok(());
3281        };
3282        // Strided: row count follows the strided-map convention (planar
3283        // stacks C planes of H rows; packed/semi-planar use shape[0]), and
3284        // the logical row is tight_bytes / rows for every layout.
3285        let rows = match self.format.map(|f| f.layout()) {
3286            Some(PixelLayout::Planar) => {
3287                let s = self.shape();
3288                if s.len() < 2 {
3289                    return Err(Error::InvalidOperation(
3290                        "copy_to_flat: strided planar tensor requires [C, H, W] shape".into(),
3291                    ));
3292                }
3293                s[0].checked_mul(s[1]).ok_or_else(|| {
3294                    Error::InvalidOperation(format!(
3295                        "copy_to_flat: planar rows {} × {} overflows usize",
3296                        s[0], s[1]
3297                    ))
3298                })?
3299            }
3300            _ => *self.shape().first().ok_or_else(|| {
3301                Error::InvalidOperation("copy_to_flat: tensor has an empty shape".into())
3302            })?,
3303        };
3304        if rows == 0 || !tight_bytes.is_multiple_of(rows) {
3305            return Err(Error::InvalidOperation(format!(
3306                "copy_to_flat: tight footprint {tight_bytes} does not divide \
3307                 into {rows} rows"
3308            )));
3309        }
3310        let row_bytes = tight_bytes / rows;
3311        let need = (rows - 1)
3312            .checked_mul(stride)
3313            .and_then(|b| b.checked_add(row_bytes))
3314            .ok_or_else(|| {
3315                Error::InvalidOperation(format!(
3316                    "copy_to_flat: stride {stride} × rows {rows} overflows usize"
3317                ))
3318            })?;
3319        if src.len() < need {
3320            return Err(Error::InvalidOperation(format!(
3321                "copy_to_flat: mapped {} bytes but strided rows need {need}",
3322                src.len()
3323            )));
3324        }
3325        for r in 0..rows {
3326            dst[r * row_bytes..(r + 1) * row_bytes]
3327                .copy_from_slice(&src[r * stride..r * stride + row_bytes]);
3328        }
3329        Ok(())
3330    }
3331
3332    /// Set the row stride in bytes for externally allocated buffers with
3333    /// row padding (e.g. V4L2 or GStreamer allocators).
3334    ///
3335    /// The stride is propagated to the EGL DMA-BUF import attributes so
3336    /// the GPU interprets the padded buffer layout correctly. Must be
3337    /// called after [`set_format`](Self::set_format) and before the tensor
3338    /// is first passed to [`ImageProcessor::convert`]. The stored stride
3339    /// is cleared automatically if the pixel format is later changed.
3340    ///
3341    /// No stride-vs-buffer-size validation is performed because the
3342    /// backing allocation size is not reliably known: external DMA-BUFs
3343    /// may be over-allocated by the allocator, and internal tensors store
3344    /// a logical (unpadded) shape. An incorrect stride will be caught by
3345    /// the EGL driver at import time.
3346    ///
3347    /// # Arguments
3348    ///
3349    /// * `stride` - Row stride in bytes. Must be >= the minimum stride for
3350    ///   the format (width * channels * sizeof(T) for packed,
3351    ///   width * sizeof(T) for planar/semi-planar).
3352    ///
3353    /// # Errors
3354    ///
3355    /// * `InvalidArgument` if no pixel format is set on this tensor
3356    /// * `InvalidArgument` if `stride` is less than the minimum for the
3357    ///   format and width
3358    pub fn set_row_stride(&mut self, stride: usize) -> Result<()> {
3359        let fmt = self.format.ok_or_else(|| {
3360            Error::InvalidArgument("cannot set row_stride without a pixel format".into())
3361        })?;
3362        let w = self.width().ok_or_else(|| {
3363            Error::InvalidArgument("cannot determine width for row_stride validation".into())
3364        })?;
3365        let elem = std::mem::size_of::<T>();
3366        let min_stride = match fmt.layout() {
3367            PixelLayout::Packed => w * fmt.channels() * elem,
3368            PixelLayout::Planar => w * elem,
3369            // Semi-planar: minimum must cover even width for chroma alignment.
3370            PixelLayout::SemiPlanar => w.next_multiple_of(2) * elem,
3371        };
3372        if stride < min_stride {
3373            return Err(Error::InvalidArgument(format!(
3374                "row_stride {stride} < minimum {min_stride} for {fmt:?} at width {w}"
3375            )));
3376        }
3377        self.row_stride = Some(stride);
3378        Ok(())
3379    }
3380
    /// Store `stride` (in bytes) as the row stride without any validation.
    ///
    /// Unlike [`set_row_stride`](Self::set_row_stride) this neither requires
    /// a pixel format nor checks the value against a minimum. Use it for raw
    /// sub-tensors (e.g. chroma planes) that don't carry format metadata. The
    /// caller is responsible for ensuring the stride is valid.
    pub fn set_row_stride_unchecked(&mut self, stride: usize) {
        self.row_stride = Some(stride);
    }
3389
3390    /// Builder-style variant of [`set_row_stride`](Self::set_row_stride),
3391    /// consuming and returning `self`.
3392    ///
3393    /// # Errors
3394    ///
3395    /// Same conditions as [`set_row_stride`](Self::set_row_stride).
3396    pub fn with_row_stride(mut self, stride: usize) -> Result<Self> {
3397        self.set_row_stride(stride)?;
3398        Ok(self)
3399    }
3400
    /// Byte offset within the DMA-BUF where image data starts (`None` = 0,
    /// i.e. no offset has been recorded).
    ///
    /// Set via [`set_plane_offset`](Self::set_plane_offset).
    pub fn plane_offset(&self) -> Option<usize> {
        self.plane_offset
    }
3405
    /// The parent-image snapshot if this tensor is a [`view`](Self::view)/
    /// [`batch`](Self::batch) sub-region; `None` for a whole tensor.
    ///
    /// The GL backend keys its import on the parent geometry and renders this
    /// view as a `glViewport`/`glScissor` ROI at `(x, y, width, height)`. See
    /// [`ViewOrigin`].
    pub fn view_origin(&self) -> Option<ViewOrigin> {
        self.view_origin
    }
3413
3414    /// Set the byte offset within the DMA-BUF where image data starts.
3415    ///
3416    /// Propagated to `EGL_DMA_BUF_PLANE0_OFFSET_EXT` on GPU import.
3417    /// Unlike [`set_row_stride`](Self::set_row_stride), no format is required
3418    /// since the offset is format-independent.
3419    pub fn set_plane_offset(&mut self, offset: usize) {
3420        self.plane_offset = Some(offset);
3421        // The offset consulted by `map()` lives inside the storage variant.
3422        // Keep it in sync with the wrapper field for every backing that
3423        // honors it (DMA and Mem); see also the clear sites in `set_format`
3424        // and `reshape`.
3425        match self.storage {
3426            TensorStorage::Mem(ref mut m) => m.set_offset(offset),
3427            #[cfg(target_os = "linux")]
3428            TensorStorage::Dma(ref mut dma) => dma.mmap_offset = offset,
3429            _ => {}
3430        }
3431    }
3432
    /// The CPU access declared for this tensor at allocation (see
    /// [`CpuAccess`]). Views share their parent's declaration.
    pub fn cpu_access(&self) -> CpuAccess {
        self.cpu_access
    }
3439
    /// Overwrite the declared CPU access without re-allocating.
    ///
    /// Crate-private: the declaration must reflect the underlying
    /// allocation's real capabilities (constructors and importers set it;
    /// arbitrary widening would defeat the contract). No validation is
    /// performed here.
    // Only the Android AHardwareBuffer importer derives a declaration from
    // an existing allocation's usage bits today.
    #[cfg_attr(not(target_os = "android"), allow(dead_code))]
    pub(crate) fn set_cpu_access_unchecked(&mut self, access: CpuAccess) {
        self.cpu_access = access;
    }
3450
    /// The vendor tile-compression scheme recorded at allocation, or
    /// `None` for a linear layout.
    ///
    /// `Some` means the pixels live in a proprietary tile order: the
    /// row-stride accessors describe no meaningful linear layout and CPU maps
    /// are best-effort (see [`Compression`]). Only Android hardware-only
    /// allocations that requested compression record a scheme.
    pub fn compression(&self) -> Option<CompressionScheme> {
        self.compression
    }
3460
    /// Record (or clear, with `None`) the compression scheme.
    ///
    /// Crate-private: recording is an allocation-time fact
    /// ([`Tensor::image_desc`] sets it; arbitrary mutation would misdescribe
    /// the physical layout). No validation is performed here.
    // Only the Android allocation path records a scheme today.
    #[cfg_attr(not(target_os = "android"), allow(dead_code))]
    pub(crate) fn set_compression_unchecked(&mut self, scheme: Option<CompressionScheme>) {
        self.compression = scheme;
    }
3469
    /// Colorimetry metadata attached to this tensor (`None` = undefined;
    /// never auto-filled by this accessor).
    pub fn colorimetry(&self) -> Option<crate::Colorimetry> {
        self.colorimetry
    }
3473
    /// Attach colorimetry metadata (`Some`) or clear it (`None`).
    ///
    /// The previous value is replaced unconditionally.
    pub fn set_colorimetry(&mut self, c: Option<crate::Colorimetry>) {
        self.colorimetry = c;
    }
3478
3479    /// Builder-style colorimetry attach.
3480    pub fn with_colorimetry(mut self, c: crate::Colorimetry) -> Self {
3481        self.colorimetry = Some(c);
3482        self
3483    }
3484
3485    /// Create a zero-copy sub-region view of this tensor's backing buffer.
3486    ///
3487    /// The returned tensor shares this tensor's allocation (no copy) and maps
3488    /// the window `[offset_bytes, offset_bytes + shape.product()*size_of::<T>())`
3489    /// measured from this tensor's own logical start. N sub-views into one
3490    /// parent can be written independently, enabling batched assembly into a
3491    /// single buffer. Identical semantics across `Mem` (shared `Arc`) and
3492    /// `Dma` (shared fd) backings.
3493    ///
3494    /// # Disjointness
3495    ///
3496    /// Independent writes are sound *only* when the windows do not overlap. The
3497    /// shared backing uses interior mutability (`UnsafeCell` cells), so two
3498    /// sub-views whose byte ranges intersect alias the same cells: writing one
3499    /// while reading or writing the other is a data race and therefore
3500    /// **undefined behaviour**. The caller is responsible for keeping the
3501    /// windows disjoint; this method does not check for overlap.
3502    ///
3503    /// # Errors
3504    ///
3505    /// - [`Error::InvalidOperation`] if the backing is not `Mem` or `Dma`, or
3506    ///   if `offset_bytes` is not a multiple of `align_of::<T>()`.
3507    /// - [`Error::InsufficientCapacity`] / [`Error::InvalidSize`] if the window
3508    ///   exceeds the parent allocation.
3509    pub(crate) fn subview(&self, offset_bytes: usize, shape: &[usize]) -> Result<Tensor<T>> {
3510        // Offset is absolute into the backing allocation: a sub-view of a
3511        // sub-view composes by adding this tensor's own offset.
3512        let abs_offset = self
3513            .plane_offset
3514            .unwrap_or(0)
3515            .checked_add(offset_bytes)
3516            .ok_or(Error::InvalidSize(offset_bytes))?;
3517        // Every backend exposes `view(offset, shape)` via `TensorTrait`, sharing
3518        // the resource AND `BufferIdentity` (unlike `from_fd`/`from_surface`,
3519        // which mint a fresh identity). The GL backend keys the import on the
3520        // shared identity so offset-distinct sub-views of one buffer reuse a
3521        // single import and address their window via `glViewport`. `Mem`/`Shm`
3522        // share via the allocation `Arc` / a cloned fd; `Pbo` via the GL-buffer
3523        // `Arc`; Linux DMA-BUF / macOS IOSurface via the shared fd / CFRetain.
3524        // `TensorStorage::view` performs the one remaining per-variant dispatch.
3525        let mut t = Tensor::wrap(self.storage.view(offset_bytes, shape)?);
3526        // Inherit the parent's image metadata so the view is a ready-to-use
3527        // sub-image (e.g. a `convert()` destination). The offset is applied
3528        // LAST because `set_format` deliberately clears it — the offset is a
3529        // structural property of the sub-region, not format-dependent metadata.
3530        if let Some(fmt) = self.format {
3531            t.set_format(fmt)?;
3532        }
3533        if let Some(rs) = self.row_stride {
3534            t.set_row_stride_unchecked(rs);
3535        }
3536        t.quantization = self.quantization.clone();
3537        // A sub-region of an image carries the parent's colorimetry — it is the
3538        // same pixels, same color encoding. Inherit it like the other image
3539        // metadata above so a sub-view is a faithful convert() source/target.
3540        t.set_colorimetry(self.colorimetry);
3541        // The declared CPU access is a property of the underlying
3542        // allocation, so every view shares the parent's declaration.
3543        t.cpu_access = self.cpu_access;
3544        // Likewise the recorded compression scheme: the view shares the
3545        // parent's physical layout.
3546        t.compression = self.compression;
3547        if abs_offset > 0 {
3548            t.set_plane_offset(abs_offset);
3549        }
3550        Ok(t)
3551    }
3552
3553    /// Borrow batch element `n` of a batched tensor as a zero-copy view.
3554    ///
3555    /// A batched tensor prepends `N` as the leading dimension over the
3556    /// per-element image layout (`[N, H, W, C]` packed or `[N, C, H, W]`
3557    /// planar) — `N` is over the whole per-element block regardless of
3558    /// `HWC`/`CHW`. `batch(n)` returns element `n`: the contiguous per-element
3559    /// region at byte offset `n * element_size`, sharing the parent's
3560    /// `BufferIdentity` and inheriting its format / row stride / colorimetry.
3561    /// `batch(0)` on a tensor with `N == 1` is equivalent to the whole tensor.
3562    ///
3563    /// # Errors
3564    ///
3565    /// - [`Error::BatchIndexOutOfBounds`] if `n >= N`.
3566    /// - [`Error::InvalidShape`] if the tensor is not batched (a formatted
3567    ///   tensor whose rank lacks the leading `N`, or an empty shape).
3568    pub fn batch(&self, n: usize) -> Result<Tensor<T>> {
3569        let shape = self.shape();
3570        // With a format we know the exact per-element rank, so a missing leading
3571        // `N` is a misuse we reject rather than silently treating a spatial dim
3572        // as the batch. Raw tensors take shape[0] as `N` by contract.
3573        if let Some(fmt) = self.format {
3574            let elem_rank = match fmt.layout() {
3575                PixelLayout::SemiPlanar => 2,
3576                _ => 3,
3577            };
3578            if shape.len() != elem_rank + 1 {
3579                return Err(Error::InvalidShape(format!(
3580                    "batch(): tensor is not batched ({fmt:?} expects a leading N over a \
3581                     {elem_rank}-D element, got shape {shape:?})"
3582                )));
3583            }
3584        }
3585        let batch = *shape
3586            .first()
3587            .ok_or_else(|| Error::InvalidShape("batch(): empty shape".into()))?;
3588        if n >= batch {
3589            return Err(Error::BatchIndexOutOfBounds { index: n, batch });
3590        }
3591        let elem_shape: Vec<usize> = shape[1..].to_vec();
3592        let elem_count: usize = elem_shape.iter().product();
3593        let elem_bytes = elem_count
3594            .checked_mul(std::mem::size_of::<T>())
3595            .ok_or(Error::InvalidSize(elem_count))?;
3596        let offset = n.checked_mul(elem_bytes).ok_or(Error::InvalidSize(n))?;
3597        // For a packed `[N, H, W, C]` tensor the N tiles stack vertically in the
3598        // shared buffer, so the GL import sees one `(W, N*H)` parent and each
3599        // tile is the row-band at `y = n*H`. Snapshot that parent so the backend
3600        // imports once and renders the tile via `glViewport`. Non-packed
3601        // (planar/semi-planar) batching keeps the per-slot path for now (planar
3602        // NCHW tiling is a separate step); raw tensors have no pixel geometry.
3603        let view_origin = match self.format.map(|f| f.layout()) {
3604            Some(PixelLayout::Packed) => {
3605                let tile_h = elem_shape[0];
3606                let tile_w = elem_shape[1];
3607                // Per-row pitch of the tall `(W, N*H)` parent — padded stride if
3608                // set, else the tight row width. The GL import keys on this.
3609                let bpp = elem_shape[2] * std::mem::size_of::<T>();
3610                let parent_stride = self.effective_row_stride().unwrap_or(tile_w * bpp);
3611                Some(self.compose_view_origin(tile_w, batch * tile_h, parent_stride, 0, n * tile_h))
3612            }
3613            _ => None,
3614        };
3615        let mut t = self.subview(offset, &elem_shape)?;
3616        t.view_origin = view_origin;
3617        Ok(t)
3618    }
3619
3620    /// Borrow a rectangular spatial sub-region of an image tensor as a
3621    /// zero-copy view — the **destination/source crop** primitive.
3622    ///
3623    /// `region` is in pixels of the image's leading frame. The returned view
3624    /// shares the parent's `BufferIdentity` and addresses the sub-rectangle by
3625    /// offset + the **parent's** row pitch (so each row lands at the correct
3626    /// columns). `convert(src, &mut dst.view(rect), …)` renders into that
3627    /// sub-rectangle of `dst`; a letterbox fit then clears the view and renders
3628    /// the aspect-preserved content into its inner region. `view`/`batch`/the
3629    /// whole tensor are the one coherent destination model — there is no
3630    /// separate `dst_rect`.
3631    ///
3632    /// # Errors
3633    ///
3634    /// - [`Error::RegionOutOfBounds`] if `region` exceeds the image bounds.
3635    /// - [`Error::InvalidOperation`] if the tensor is not a packed-format image
3636    ///   (planar/semi-planar spatial sub-rects are not a single strided window;
3637    ///   use [`batch`](Self::batch) for batched planar tensors).
3638    pub fn view(&self, region: Region) -> Result<Tensor<T>> {
3639        let fmt = self.format.ok_or_else(|| {
3640            Error::InvalidOperation("view() requires a formatted image tensor".into())
3641        })?;
3642        if fmt.layout() != PixelLayout::Packed {
3643            return Err(Error::InvalidOperation(format!(
3644                "view() supports packed formats only (got {fmt:?}); use batch(n) for batched \
3645                 planar tensors"
3646            )));
3647        }
3648        let w = self
3649            .width()
3650            .ok_or_else(|| Error::InvalidOperation("view(): tensor has no image width".into()))?;
3651        let h = self
3652            .height()
3653            .ok_or_else(|| Error::InvalidOperation("view(): tensor has no image height".into()))?;
3654        if !region.fits_within(w, h) {
3655            return Err(Error::RegionOutOfBounds {
3656                region,
3657                bounds: (w, h),
3658            });
3659        }
3660        let elem = std::mem::size_of::<T>();
3661        let bpp = fmt.channels() * elem;
3662        let stride = self.effective_row_stride().unwrap_or(w * bpp);
3663        let offset = region
3664            .y
3665            .checked_mul(stride)
3666            .and_then(|yo| yo.checked_add(region.x.checked_mul(bpp)?))
3667            .ok_or(Error::InvalidSize(region.y))?;
3668        let sub_shape = fmt
3669            .image_shape(region.width, region.height)
3670            .ok_or_else(|| Error::InvalidShape(format!("view(): invalid shape for {fmt:?}")))?;
3671        let mut t = self.subview(offset, &sub_shape)?;
3672        // A multi-row sub-rect must advance rows by the PARENT pitch so each row
3673        // addresses the correct columns. A single-row view uses its own tight
3674        // stride — the parent pitch would make the strided `map()` (which exposes
3675        // `stride × rows`) expose a trailing row that runs past the buffer tail
3676        // for an offset (x>0 / bottom) view. The GL backend does NOT rely on this
3677        // (single-row-tight) `row_stride`: it reads the parent pitch from
3678        // `view_origin.parent_row_stride` so its import/cache pitch stays
3679        // parent-consistent for views of any height — see `ViewOrigin`.
3680        let view_stride = if region.height > 1 {
3681            stride
3682        } else {
3683            region.width * bpp
3684        };
3685        t.set_row_stride_unchecked(view_stride);
3686        // Snapshot the parent `(w, h, row_stride)` so the GL backend imports the
3687        // parent once (keyed on the parent pitch, not this view's possibly-tight
3688        // single-row stride) and renders this sub-rect as a `glViewport`/
3689        // `glScissor` ROI at `(region.x, region.y)`. Composes when viewing an
3690        // existing view.
3691        t.view_origin = Some(self.compose_view_origin(w, h, stride, region.x, region.y));
3692        Ok(t)
3693    }
3694
3695    /// Build the [`ViewOrigin`] for a new sub-region of `self`. When `self` is a
3696    /// whole tensor the snapshot names `self` as the parent; when `self` is
3697    /// already a view, the snapshot keeps the **root** parent and accumulates the
3698    /// local origin so nested views still resolve to one import.
3699    fn compose_view_origin(
3700        &self,
3701        parent_width: usize,
3702        parent_height: usize,
3703        parent_row_stride: usize,
3704        x: usize,
3705        y: usize,
3706    ) -> ViewOrigin {
3707        match self.view_origin {
3708            Some(root) => ViewOrigin {
3709                parent_width: root.parent_width,
3710                parent_height: root.parent_height,
3711                parent_row_stride: root.parent_row_stride,
3712                x: root.x.saturating_add(x),
3713                y: root.y.saturating_add(y),
3714            },
3715            None => ViewOrigin {
3716                parent_width,
3717                parent_height,
3718                parent_row_stride,
3719                x,
3720                y,
3721            },
3722        }
3723    }
3724
3725    /// Downcast to PBO tensor reference (for GL backends).
3726    pub fn as_pbo(&self) -> Option<&PboTensor<T>> {
3727        match &self.storage {
3728            TensorStorage::Pbo(p) => Some(p),
3729            _ => None,
3730        }
3731    }
3732
3733    /// Downcast to DMA tensor reference (for EGL import, G2D).
3734    #[cfg(target_os = "linux")]
3735    pub fn as_dma(&self) -> Option<&DmaTensor<T>> {
3736        match &self.storage {
3737            TensorStorage::Dma(d) => Some(d),
3738            _ => None,
3739        }
3740    }
3741
3742    /// Borrow the DMA-BUF file descriptor backing this tensor.
3743    ///
3744    /// # Returns
3745    ///
3746    /// A borrowed reference to the DMA-BUF file descriptor, tied to `self`'s
3747    /// lifetime.
3748    ///
3749    /// # Errors
3750    ///
3751    /// Returns `Error::NotImplemented` if the tensor is not DMA-backed.
3752    #[cfg(target_os = "linux")]
3753    pub fn dmabuf(&self) -> Result<std::os::fd::BorrowedFd<'_>> {
3754        use std::os::fd::AsFd;
3755        match &self.storage {
3756            TensorStorage::Dma(dma) => Ok(dma.fd.as_fd()),
3757            _ => Err(Error::NotImplemented(format!(
3758                "dmabuf requires DMA-backed tensor, got {:?}",
3759                self.storage.memory()
3760            ))),
3761        }
3762    }
3763
3764    /// Construct a Tensor from a PBO tensor (for GL backends that allocate PBOs).
3765    pub fn from_pbo(pbo: PboTensor<T>) -> Self {
3766        Self {
3767            storage: TensorStorage::Pbo(pbo),
3768            format: None,
3769            chroma: None,
3770            row_stride: None,
3771            plane_offset: None,
3772            quantization: None,
3773            cuda: None,
3774            colorimetry: None,
3775            cpu_access: CpuAccess::ReadWrite,
3776            compression: None,
3777            view_origin: None,
3778        }
3779    }
3780
    /// The CUDA registration for this tensor, if any (set at creation on CUDA devices).
    ///
    /// Returns a borrow of the stored handle, or `None` when no handle has
    /// been attached.
    pub fn cuda(&self) -> Option<&crate::cuda::CudaHandle> {
        self.cuda.as_ref()
    }
3785
    /// Attach a CUDA handle (called by ImageProcessor::create_image after registering a PBO).
    ///
    /// Any handle that was already attached is replaced.
    pub fn set_cuda_handle(&mut self, h: crate::cuda::CudaHandle) {
        self.cuda = Some(h);
    }
3790
3791    /// Fast-fail CUDA map: None (no GL routing) when no handle; else map (PBO routes to the GL worker).
3792    ///
3793    /// Returns a scoped [`CudaMap`](crate::cuda::CudaMap) guard holding the raw CUDA device pointer
3794    /// for the duration of the mapping. For GL-buffer-backed tensors the unmap is deferred until the
3795    /// guard drops, freeing the PBO for the next `convert()` call. When no CUDA handle is attached
3796    /// (the common case for plain `Mem`/`DMA` tensors without CUDA registration), returns `None`
3797    /// immediately — no GL routing, no allocation.
3798    ///
3799    /// # Example — zero-copy CUDA input with host fallback
3800    ///
3801    /// ```no_run
3802    /// use edgefirst_tensor::{Tensor, TensorMemory, TensorTrait};
3803    /// # fn feed_tensorrt(_dptr: *mut std::ffi::c_void, _bytes: usize) {}
3804    /// # fn demo(t: &Tensor<f32>) {
3805    /// // Try the zero-copy CUDA device pointer first.
3806    /// if let Some(cuda) = t.cuda_map() {
3807    ///     feed_tensorrt(cuda.device_ptr(), cuda.len());
3808    ///     // `cuda` (a CudaMap guard) unmaps when it goes out of scope, freeing
3809    ///     // the GPU buffer for the next convert().
3810    /// } else {
3811    ///     // Fall back to the host mapping when no CUDA handle is attached.
3812    ///     let _host = t.map().expect("host map fallback must succeed");
3813    ///     // `_host` is a TensorMap<f32> that derefs to &[f32].
3814    /// }
3815    /// # }
3816    /// ```
3817    pub fn cuda_map(&self) -> Option<crate::cuda::CudaMap<'_>> {
3818        self.cuda.as_ref()?.map()
3819    }
3820
3821    /// Attempt to attach a CUDA `ExternalMemory` handle for DMA-backed tensors.
3822    ///
3823    /// On a CUDA-capable host, imports the DMA-BUF fd via
3824    /// `cudaImportExternalMemory(OpaqueFd)` and maps it to a device pointer.
3825    /// Sets `self.cuda` to a persistent `ExternalMem` handle on success. No-op
3826    /// if CUDA is unavailable, the tensor is not DMA-backed, or a handle is
3827    /// already set. Import failure is silently ignored — the tensor remains
3828    /// usable without a CUDA handle.
3829    ///
3830    /// # RUNTIME-UNVALIDATED
3831    ///
3832    /// No test platform has both `/dev/dma_heap` and a CUDA device. ABI is
3833    /// layout-asserted vs. CUDA 12.6 `driver_types.h`; the mechanism is proven
3834    /// by gpu-probe O5 on Orin. Best-effort: tensor creation never fails here.
3835    #[cfg(target_os = "linux")]
3836    pub fn try_init_dma_cuda(&mut self) {
3837        // Fast-path: already imported, CUDA not available, or not a DMA tensor.
3838        if self.cuda.is_some() || !crate::cuda::is_cuda_available() {
3839            return;
3840        }
3841        let (raw_fd, buf_size) = match &self.storage {
3842            TensorStorage::Dma(dma) => {
3843                use std::os::fd::AsRawFd;
3844                (dma.fd.as_raw_fd(), dma.buf_size)
3845            }
3846            _ => return,
3847        };
3848        if let Some((ext, dptr)) = crate::cuda::import_dma_fd(raw_fd, buf_size) {
3849            self.cuda = Some(crate::cuda::CudaHandle::new_external(ext, dptr, buf_size));
3850        }
3851    }
3852}
3853
3854// Quantization accessors — type-gated to integer element types via the
3855// sealed `IntegerType` trait. Calling `.quantization()` on a `Tensor<f32>`
3856// produces a compile error, not a runtime one.
3857impl<T> Tensor<T>
3858where
3859    T: IntegerType + Num + Clone + fmt::Debug + Send + Sync,
3860{
3861    /// Quantization metadata for this tensor, if set.
3862    pub fn quantization(&self) -> Option<&Quantization> {
3863        self.quantization.as_ref()
3864    }
3865
3866    /// Attach quantization metadata to this tensor. Validates against the
3867    /// tensor's shape — returns [`Error::QuantizationInvalid`] on any
3868    /// inconsistency (mismatched scale/zp lengths, out-of-range axis, etc.).
3869    pub fn set_quantization(&mut self, q: Quantization) -> Result<()> {
3870        q.validate(self.shape())?;
3871        self.quantization = Some(q);
3872        Ok(())
3873    }
3874
3875    /// Builder-style variant of [`Self::set_quantization`]. Consumes `self`
3876    /// and returns `Result<Self>` — on success yields the tensor with the
3877    /// attached quantization; on validation failure returns
3878    /// [`Error::QuantizationInvalid`] and drops `self` (the tensor is not
3879    /// returned in the error arm).
3880    pub fn with_quantization(mut self, q: Quantization) -> Result<Self> {
3881        self.set_quantization(q)?;
3882        Ok(self)
3883    }
3884
3885    /// Clear any quantization metadata on this tensor.
3886    pub fn clear_quantization(&mut self) {
3887        self.quantization = None;
3888    }
3889}
3890
3891impl<T> TensorTrait<T> for Tensor<T>
3892where
3893    T: Num + Clone + fmt::Debug + Send + Sync,
3894{
    /// Trait constructor: create a tensor of `shape` with an optional `name`.
    fn new(shape: &[usize], name: Option<&str>) -> Result<Self>
    where
        Self: Sized,
    {
        // The three-argument call resolves to the inherent `Tensor::new`
        // (shape, memory, name); `None` leaves the memory type unspecified.
        Self::new(shape, None, name)
    }
3901
3902    #[cfg(unix)]
3903    fn from_fd(fd: std::os::fd::OwnedFd, shape: &[usize], name: Option<&str>) -> Result<Self>
3904    where
3905        Self: Sized,
3906    {
3907        #[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
3908        let mut t = Self::wrap(TensorStorage::from_fd(fd, shape, name)?);
3909        // Best-effort CUDA external memory import for DMA-backed tensors.
3910        // RUNTIME-UNVALIDATED: see try_init_dma_cuda().
3911        #[cfg(target_os = "linux")]
3912        t.try_init_dma_cuda();
3913        Ok(t)
3914    }
3915
    /// Obtain an owned file descriptor for this tensor's backing; delegates to
    /// the storage backend and propagates its error.
    #[cfg(unix)]
    fn clone_fd(&self) -> Result<std::os::fd::OwnedFd> {
        self.storage.clone_fd()
    }
3920
    /// The memory type of the underlying storage.
    fn memory(&self) -> TensorMemory {
        self.storage.memory()
    }
3924
    /// The tensor's name, as reported by the underlying storage.
    fn name(&self) -> String {
        self.storage.name()
    }
3928
    /// The tensor's current shape, as reported by the underlying storage.
    fn shape(&self) -> &[usize] {
        self.storage.shape()
    }
3932
3933    fn reshape(&mut self, shape: &[usize]) -> Result<()> {
3934        if self.chroma.is_some() {
3935            return Err(Error::InvalidOperation(
3936                "cannot reshape a multiplane tensor — decompose planes first".into(),
3937            ));
3938        }
3939        self.storage.reshape(shape)?;
3940        self.format = None;
3941        self.row_stride = None;
3942        self.plane_offset = None;
3943        match self.storage {
3944            TensorStorage::Mem(ref mut m) => m.set_offset(0),
3945            #[cfg(target_os = "linux")]
3946            TensorStorage::Dma(ref mut dma) => dma.mmap_offset = 0,
3947            _ => {}
3948        }
3949        Ok(())
3950    }
3951
    /// Map the tensor's data for CPU access in the requested direction.
    ///
    /// The steps, in order:
    ///
    /// 1. `CpuAccess::None` is rejected with [`Error::InvalidArgument`].
    /// 2. A request that exceeds the declared `cpu_access` is reported through
    ///    `note_unplanned_cpu_access`; it is not refused here.
    /// 3. If a `row_stride` is recorded, the mapping spans `row_stride × rows`
    ///    bytes after a size check against the backing, or fails for backings
    ///    that cannot be checked (imported Linux DMA-BUF).
    /// 4. Otherwise the mapping is delegated to the storage backend's
    ///    `map_with`.
    fn map_with(&self, access: CpuAccess) -> Result<TensorMap<T>> {
        let _span = tracing::trace_span!(
            "tensor.map",
            memory = ?self.storage.memory(),
            ?access,
        )
        .entered();
        if access == CpuAccess::None {
            return Err(Error::InvalidArgument(
                "map_with(CpuAccess::None) is not a mappable direction — use \
                 map_read()/map_write()/map_mut()"
                    .into(),
            ));
        }
        // Declared-vs-requested telemetry (all platforms): mapping beyond
        // the allocation-time declaration is best-effort — tolerated where
        // the backing is CPU-mappable regardless (Mem/Shm/dma-buf/
        // IOSurface), refused by the Android backend for CpuAccess::None
        // buffers — but always loud and counted, never silent.
        if !self.cpu_access.covers(access) {
            note_unplanned_cpu_access(
                self.buffer_identity().id(),
                &format!("{:?}", self.storage.memory()),
                "map access exceeds the declared CpuAccess",
            );
        }
        // CPU mapping of a strided tensor exposes the full padded buffer
        // (`row_stride × rows`) so callers can iterate rows via
        // `effective_row_stride()` without running past the slice. This is sound
        // only when the span can be size-checked against the backing:
        //
        //   * Mem / Shm tensors (any platform) — checked against
        //     `capacity_bytes()` below.
        //   * Self-allocated DMA tensors (Linux) — pitch padding from
        //     `image_with_stride()`; checked against the DMA-BUF `buf_size`.
        //   * IOSurface (macOS/iOS) and AHardwareBuffer (Android) — the row
        //     pitch is known from the platform API; checked against the
        //     buffer size minus the view offset.
        //   * PBO tensors (any platform with GL) — the GL buffer may carry
        //     64-byte row padding; the JPEG decoder mmaps it and convert()
        //     reads it, both iterating by `row_stride`. Checked against the
        //     PBO capacity below.
        //
        // Imported Linux DMA-BUFs (`from_fd()` + `set_row_stride()`, the V4L2 /
        // GStreamer case) are rejected: their layout comes from an external
        // allocator / GPU driver the HAL cannot validate for a strided CPU
        // view, and they are intended for the GPU path. (Earlier this
        // rejected *all* non-Linux strided maps with "DMA backing is Linux-only"
        // — that was an unimplemented path, not a platform limit.)
        if let Some(stride) = self.row_stride {
            // Rows sit at `stride`-byte spacing. The row count is the first
            // shape dim for packed `[H, W, C]` and semi-planar `[H*k, W]`,
            // but planar `[C, H, W]` stacks C planes of H rows — its surface
            // row count is `C × H` (`shape[0]` alone would expose a 3-row
            // window and truncate the map; first hit by Android planar-F16
            // AHardwareBuffers, whose gralloc pads the pitch — macOS/Linux
            // planar pitches happen to be naturally aligned so no stride was
            // ever recorded there).
            let rows = match self.format.map(|f| f.layout()) {
                Some(PixelLayout::Planar) => {
                    let s = self.shape();
                    if s.len() < 2 {
                        return Err(Error::InvalidOperation(
                            "Tensor::map: strided planar mapping requires [C, H, W] shape".into(),
                        ));
                    }
                    s[0].checked_mul(s[1]).ok_or_else(|| {
                        Error::InvalidOperation(format!(
                            "Tensor::map: planar rows {} × {} overflows usize",
                            s[0], s[1]
                        ))
                    })?
                }
                _ => *self.shape().first().ok_or_else(|| {
                    Error::InvalidOperation(
                        "Tensor::map: strided mapping requires a non-empty shape".into(),
                    )
                })?,
            };
            // Total span exposed by the strided mapping.
            let total_bytes = stride.checked_mul(rows).ok_or_else(|| {
                Error::InvalidOperation(format!(
                    "Tensor::map: row_stride {stride} × rows {rows} overflows usize"
                ))
            })?;

            // Every arm below returns: the strided path never falls through to
            // the unstrided mapping at the end of this function.
            match &self.storage {
                #[cfg(target_os = "linux")]
                TensorStorage::Dma(dma) if !dma.is_imported => {
                    // `set_row_stride()` only validates `stride >= min_stride`,
                    // not that `stride × rows` fits the DMA-BUF, so re-check
                    // here — mapping past `buf_size` would SIGBUS on access.
                    let available_bytes = dma.buf_size.saturating_sub(dma.mmap_offset);
                    if total_bytes > available_bytes {
                        return Err(Error::InvalidOperation(format!(
                            "Tensor::map: strided mapping needs {total_bytes} bytes \
                             but DMA buffer only has {available_bytes} available \
                             (buf_size={}, mmap_offset={}, stride={stride}, rows={rows}); \
                             the row_stride was likely set larger than the original allocation",
                            dma.buf_size, dma.mmap_offset
                        )));
                    }
                    return dma
                        .map_with_byte_size(total_bytes, access)
                        .map(TensorMap::Dma);
                }
                TensorStorage::Mem(mem) => {
                    let capacity = self.storage.capacity_bytes();
                    if total_bytes > capacity {
                        return Err(Error::InsufficientCapacity {
                            needed: total_bytes,
                            capacity,
                        });
                    }
                    return mem.map_with_byte_size(total_bytes, access);
                }
                #[cfg(unix)]
                TensorStorage::Shm(shm) => {
                    let capacity = self.storage.capacity_bytes();
                    if total_bytes > capacity {
                        return Err(Error::InsufficientCapacity {
                            needed: total_bytes,
                            capacity,
                        });
                    }
                    return shm.map_with_byte_size(total_bytes, access);
                }
                // macOS/iOS: `TensorStorage::Dma` is the IOSurface. The lock yields
                // the full surface base address, and the row pitch
                // (`IOSurfaceGetBytesPerRow`) is known from the API for both
                // self-allocated and imported surfaces — unlike a foreign
                // DMA-BUF — so a strided CPU view is sound and zero-copy.
                #[cfg(any(target_os = "macos", target_os = "ios"))]
                TensorStorage::Dma(io) => {
                    // A sub-view's window is `buf_size − view_offset`; the strided
                    // span must fit the window, not the whole surface.
                    let available = io.buf_size.saturating_sub(io.view_offset);
                    if total_bytes > available {
                        return Err(Error::InsufficientCapacity {
                            needed: total_bytes,
                            capacity: available,
                        });
                    }
                    return io.map_with_byte_size(total_bytes, access);
                }
                // Android: `TensorStorage::Dma` is the AHardwareBuffer. The lock
                // yields the full buffer base address, and the row pitch is
                // known from the allocator-filled descriptor — so a strided CPU
                // view is sound and zero-copy, same as IOSurface.
                #[cfg(target_os = "android")]
                TensorStorage::Dma(ahb) => {
                    // A sub-view's window is `buf_size − view_offset`; the strided
                    // span must fit the window, not the whole buffer.
                    let available = ahb.buf_size.saturating_sub(ahb.view_offset);
                    if total_bytes > available {
                        return Err(Error::InsufficientCapacity {
                            needed: total_bytes,
                            capacity: available,
                        });
                    }
                    return ahb.map_with_byte_size(total_bytes, access);
                }
                TensorStorage::Pbo(pbo) => {
                    // PBO: the GPU-side allocation may have a padded row stride
                    // (e.g. 64-byte aligned). Expose the full padded buffer so a
                    // CPU producer (JPEG decoder) and a strided convert source
                    // can iterate rows via `effective_row_stride()` without
                    // running past the slice — the logical `pbo.map()` view would
                    // stop after `shape.product()` and lose bytes past row 0.
                    // A sub-view's window is `capacity − view_offset`.
                    let available = pbo.capacity_bytes().saturating_sub(pbo.view_offset);
                    if total_bytes > available {
                        return Err(Error::InsufficientCapacity {
                            needed: total_bytes,
                            capacity: available,
                        });
                    }
                    return pbo.map_with_byte_size(total_bytes, access);
                }
                // Reachable on Linux for an IMPORTED DMA-BUF (the `Dma` arm above
                // is guarded `if !dma.is_imported`). On macOS/Windows every
                // storage variant is matched explicitly, so this catch-all is
                // unreachable there — allow it rather than cfg-gating per platform.
                #[allow(unreachable_patterns)]
                _ => {
                    return Err(Error::InvalidOperation(
                        "CPU mapping of strided tensors is supported only for HAL-allocated \
                         Mem/Shm (any platform), self-allocated DMA (Linux), IOSurface \
                         (macOS), and PBO; imported DMA-BUF without self-allocation is \
                         GPU-path only"
                            .into(),
                    ));
                }
            }
        }
        // Offset tensors are supported for storages that apply the offset
        // inside their own `map()`: DMA (`DmaMap`/IOSurface adjust the mapped
        // base), Mem (`MemMap` adjusts the slice base), Shm (`ShmMap` adjusts
        // the slice base), and PBO (the staged copy starts at the offset). Every
        // self-allocated backing now carries a sub-region concept via `view`, so
        // a non-zero offset is honoured rather than rejected.
        if self.plane_offset.is_some_and(|o| o > 0) {
            let supported = matches!(self.storage, TensorStorage::Mem(_) | TensorStorage::Pbo(_));
            // macOS `Dma` is the IOSurface; Linux `Dma` is the DMA-BUF; Android
            // `Dma` is the AHardwareBuffer — all apply the offset in their map.
            // (`Dma` is the same variant name on each, hence one `cfg(any(...))`
            // arm rather than three.)
            #[cfg(any(
                target_os = "linux",
                target_os = "macos",
                target_os = "ios",
                target_os = "android"
            ))]
            let supported = supported || matches!(self.storage, TensorStorage::Dma(_));
            #[cfg(unix)]
            let supported = supported || matches!(self.storage, TensorStorage::Shm(_));
            if !supported {
                return Err(Error::InvalidOperation(
                    "plane offset only supported for DMA, Mem, Shm, and PBO tensors".into(),
                ));
            }
        }
        // Unstrided case: the storage backend performs the mapping itself.
        self.storage.map_with(access)
    }
4174
    /// Returns the [`BufferIdentity`] of this tensor.
    ///
    /// The tensor keeps no identity of its own here: the call is forwarded
    /// unchanged to the underlying storage's `buffer_identity()`.
    fn buffer_identity(&self) -> &BufferIdentity {
        self.storage.buffer_identity()
    }
4178}
4179
/// CPU mapping of a tensor's data, with one variant per storage backend.
///
/// Each variant wraps the backend-specific map type. The platform-specific
/// variants only exist on the targets named in their `cfg` attribute, while
/// `Mem` and `Pbo` are available everywhere. The `TensorMapTrait`, `Deref`
/// and `DerefMut` implementations forward to whichever variant is active.
pub enum TensorMap<T>
where
    T: Num + Clone + fmt::Debug,
{
    /// Mapping backed by `DmaMap` (Linux only).
    #[cfg(target_os = "linux")]
    Dma(DmaMap<T>),
    /// Mapping backed by `IoSurfaceMap` (macOS/iOS only).
    #[cfg(any(target_os = "macos", target_os = "ios"))]
    IoSurface(IoSurfaceMap<T>),
    /// Mapping backed by `AHardwareBufferMap` (Android only).
    #[cfg(target_os = "android")]
    HardwareBuffer(AHardwareBufferMap<T>),
    /// Mapping backed by `ShmMap` (Unix targets only).
    #[cfg(unix)]
    Shm(ShmMap<T>),
    /// Mapping backed by `MemMap`.
    Mem(MemMap<T>),
    /// Mapping backed by `PboMap`.
    Pbo(PboMap<T>),
}
4195
4196impl<T> TensorMapTrait<T> for TensorMap<T>
4197where
4198    T: Num + Clone + fmt::Debug,
4199{
4200    fn shape(&self) -> &[usize] {
4201        match self {
4202            #[cfg(target_os = "linux")]
4203            TensorMap::Dma(map) => map.shape(),
4204            #[cfg(any(target_os = "macos", target_os = "ios"))]
4205            TensorMap::IoSurface(map) => map.shape(),
4206            #[cfg(target_os = "android")]
4207            TensorMap::HardwareBuffer(map) => map.shape(),
4208            #[cfg(unix)]
4209            TensorMap::Shm(map) => map.shape(),
4210            TensorMap::Mem(map) => map.shape(),
4211            TensorMap::Pbo(map) => map.shape(),
4212        }
4213    }
4214
4215    fn unmap(&mut self) {
4216        match self {
4217            #[cfg(target_os = "linux")]
4218            TensorMap::Dma(map) => map.unmap(),
4219            #[cfg(any(target_os = "macos", target_os = "ios"))]
4220            TensorMap::IoSurface(map) => map.unmap(),
4221            #[cfg(target_os = "android")]
4222            TensorMap::HardwareBuffer(map) => map.unmap(),
4223            #[cfg(unix)]
4224            TensorMap::Shm(map) => map.unmap(),
4225            TensorMap::Mem(map) => map.unmap(),
4226            TensorMap::Pbo(map) => map.unmap(),
4227        }
4228    }
4229
4230    fn as_slice(&self) -> &[T] {
4231        match self {
4232            #[cfg(target_os = "linux")]
4233            TensorMap::Dma(map) => map.as_slice(),
4234            #[cfg(any(target_os = "macos", target_os = "ios"))]
4235            TensorMap::IoSurface(map) => map.deref(),
4236            #[cfg(target_os = "android")]
4237            TensorMap::HardwareBuffer(map) => map.deref(),
4238            #[cfg(unix)]
4239            TensorMap::Shm(map) => map.as_slice(),
4240            TensorMap::Mem(map) => map.as_slice(),
4241            TensorMap::Pbo(map) => map.as_slice(),
4242        }
4243    }
4244
4245    fn as_mut_slice(&mut self) -> &mut [T] {
4246        match self {
4247            #[cfg(target_os = "linux")]
4248            TensorMap::Dma(map) => map.as_mut_slice(),
4249            #[cfg(any(target_os = "macos", target_os = "ios"))]
4250            TensorMap::IoSurface(map) => map.deref_mut(),
4251            #[cfg(target_os = "android")]
4252            TensorMap::HardwareBuffer(map) => map.deref_mut(),
4253            #[cfg(unix)]
4254            TensorMap::Shm(map) => map.as_mut_slice(),
4255            TensorMap::Mem(map) => map.as_mut_slice(),
4256            TensorMap::Pbo(map) => map.as_mut_slice(),
4257        }
4258    }
4259}
4260
4261impl<T> Deref for TensorMap<T>
4262where
4263    T: Num + Clone + fmt::Debug,
4264{
4265    type Target = [T];
4266
4267    fn deref(&self) -> &[T] {
4268        match self {
4269            #[cfg(target_os = "linux")]
4270            TensorMap::Dma(map) => map.deref(),
4271            #[cfg(any(target_os = "macos", target_os = "ios"))]
4272            TensorMap::IoSurface(map) => map.deref(),
4273            #[cfg(target_os = "android")]
4274            TensorMap::HardwareBuffer(map) => map.deref(),
4275            #[cfg(unix)]
4276            TensorMap::Shm(map) => map.deref(),
4277            TensorMap::Mem(map) => map.deref(),
4278            TensorMap::Pbo(map) => map.deref(),
4279        }
4280    }
4281}
4282
4283impl<T> DerefMut for TensorMap<T>
4284where
4285    T: Num + Clone + fmt::Debug,
4286{
4287    fn deref_mut(&mut self) -> &mut [T] {
4288        match self {
4289            #[cfg(target_os = "linux")]
4290            TensorMap::Dma(map) => map.deref_mut(),
4291            #[cfg(any(target_os = "macos", target_os = "ios"))]
4292            TensorMap::IoSurface(map) => map.deref_mut(),
4293            #[cfg(target_os = "android")]
4294            TensorMap::HardwareBuffer(map) => map.deref_mut(),
4295            #[cfg(unix)]
4296            TensorMap::Shm(map) => map.deref_mut(),
4297            TensorMap::Mem(map) => map.deref_mut(),
4298            TensorMap::Pbo(map) => map.deref_mut(),
4299        }
4300    }
4301}
4302
4303// ============================================================================
4304// Platform availability helpers
4305// ============================================================================
4306
4307/// Cached result of the Linux DMA-BUF availability probe.
4308#[cfg(target_os = "linux")]
4309static DMA_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
4310/// Cached result of the macOS/iOS IOSurface availability probe.
4311#[cfg(any(target_os = "macos", target_os = "ios"))]
4312static IOSURFACE_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
4313
4314/// Check if Linux DMA-BUF allocation is available on this system.
4315///
4316/// Linux-specific availability check (typically requires `/dev/dma_heap`
4317/// access — running as root or membership in a video/render group). For
4318/// portable code that wants "any zero-copy GPU buffer", use
4319/// [`is_gpu_buffer_available`] which also covers IOSurface on macOS.
4320///
4321/// This function caches its result after the first call.
4322#[cfg(target_os = "linux")]
4323pub fn is_dma_available() -> bool {
4324    *DMA_AVAILABLE.get_or_init(|| Tensor::<u8>::new(&[64], Some(TensorMemory::Dma), None).is_ok())
4325}
4326
4327/// Always returns `false` on non-Linux platforms.
4328#[cfg(not(target_os = "linux"))]
4329pub fn is_dma_available() -> bool {
4330    false
4331}
4332
/// Reports whether macOS/iOS IOSurface allocation works on this system.
///
/// IOSurface ships with the OS and is essentially always present; the probe
/// exists to catch degraded situations such as memory pressure or sandboxed
/// contexts where `IOSurfaceCreate` fails. The probe runs once and its answer
/// is cached for later calls.
#[cfg(any(target_os = "macos", target_os = "ios"))]
pub fn is_iosurface_available() -> bool {
    // `TensorMemory::Dma` is the shared request path; on macOS/iOS it routes
    // through `IoSurfaceTensor::new`, so a successful trial allocation means
    // IOSurface is usable.
    let probe = || Tensor::<u8>::new(&[64], Some(TensorMemory::Dma), None).is_ok();
    *IOSURFACE_AVAILABLE.get_or_init(probe)
}

/// Stub for non-Apple platforms: always returns `false`.
#[cfg(not(any(target_os = "macos", target_os = "ios")))]
pub fn is_iosurface_available() -> bool {
    false
}
4354
/// Memoised outcome of the Android AHardwareBuffer allocation probe, filled in
/// by [`is_ahardwarebuffer_available`].
#[cfg(target_os = "android")]
static AHARDWAREBUFFER_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();

/// Reports whether Android AHardwareBuffer allocation works on this system.
///
/// AHardwareBuffer is part of the Android OS (public NDK ABI since API 26)
/// and is essentially always present; the probe exists to catch degraded
/// situations such as memory pressure or gralloc failures. The probe runs
/// once and its answer is cached for later calls.
#[cfg(target_os = "android")]
pub fn is_ahardwarebuffer_available() -> bool {
    let cached = AHARDWAREBUFFER_AVAILABLE.get_or_init(|| {
        // `TensorMemory::Dma` is the shared request path; on Android it
        // routes through `AHardwareBufferTensor::new`.
        let trial = Tensor::<u8>::new(&[64], Some(TensorMemory::Dma), None);
        trial.is_ok()
    });
    *cached
}

/// Stub for non-Android platforms: always returns `false`.
#[cfg(not(target_os = "android"))]
pub fn is_ahardwarebuffer_available() -> bool {
    false
}
4379
4380/// Portable probe for the platform's native zero-copy GPU buffer
4381/// allocator (DMA-BUF on Linux, IOSurface on macOS/iOS, AHardwareBuffer on
4382/// Android). Returns `false` on
4383/// Windows and other platforms with no equivalent. Use this when writing
4384/// cross-platform code that cares whether the `Dma` tensor variant will
4385/// work, not which underlying mechanism is used.
4386pub fn is_gpu_buffer_available() -> bool {
4387    #[cfg(target_os = "linux")]
4388    {
4389        is_dma_available()
4390    }
4391    #[cfg(any(target_os = "macos", target_os = "ios"))]
4392    {
4393        is_iosurface_available()
4394    }
4395    #[cfg(target_os = "android")]
4396    {
4397        is_ahardwarebuffer_available()
4398    }
4399    #[cfg(not(any(
4400        target_os = "linux",
4401        target_os = "macos",
4402        target_os = "ios",
4403        target_os = "android"
4404    )))]
4405    {
4406        false
4407    }
4408}
4409
4410/// Check if POSIX shared memory allocation is available on this system.
4411///
4412/// Returns `true` on Unix systems (Linux, macOS, BSD) where POSIX shared memory
4413/// is supported. Always returns `false` on non-Unix platforms (Windows).
4414///
4415/// This function caches its result after the first call for efficiency.
4416#[cfg(unix)]
4417static SHM_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
4418
4419/// Check if POSIX shared memory allocation is available on this system.
4420#[cfg(unix)]
4421pub fn is_shm_available() -> bool {
4422    *SHM_AVAILABLE.get_or_init(|| Tensor::<u8>::new(&[64], Some(TensorMemory::Shm), None).is_ok())
4423}
4424
4425/// Check if POSIX shared memory allocation is available on this system.
4426///
4427/// Always returns `false` on non-Unix platforms since POSIX SHM is Unix-specific.
4428#[cfg(not(unix))]
4429pub fn is_shm_available() -> bool {
4430    false
4431}
4432
/// Unit tests for the `DType` element-type descriptor.
#[cfg(test)]
mod dtype_tests {
    use super::*;

    /// Every `DType` must report its element width in bytes.
    #[test]
    fn dtype_size() {
        assert_eq!(DType::U8.size(), 1);
        assert_eq!(DType::I8.size(), 1);
        assert_eq!(DType::U16.size(), 2);
        assert_eq!(DType::I16.size(), 2);
        assert_eq!(DType::U32.size(), 4);
        assert_eq!(DType::I32.size(), 4);
        assert_eq!(DType::U64.size(), 8);
        assert_eq!(DType::I64.size(), 8);
        assert_eq!(DType::F16.size(), 2);
        assert_eq!(DType::F32.size(), 4);
        assert_eq!(DType::F64.size(), 8);
    }

    /// Spot-checks the lowercase names reported by `DType::name`.
    #[test]
    fn dtype_name() {
        assert_eq!(DType::U8.name(), "u8");
        assert_eq!(DType::F16.name(), "f16");
        assert_eq!(DType::F32.name(), "f32");
    }

    /// A `DType` serialized to JSON must deserialize back to the same value.
    #[test]
    fn dtype_serde_roundtrip() {
        // `serde_json` is addressed by its crate path, which resolves without
        // a function-local `use serde_json;` in edition 2018 and later.
        let dt = DType::F16;
        let json = serde_json::to_string(&dt).unwrap();
        let back: DType = serde_json::from_str(&json).unwrap();
        assert_eq!(dt, back);
    }
}
4468
4469#[cfg(test)]
4470mod image_tests {
4471    use super::*;
4472
4473    #[test]
4474    fn image_shape_per_layout() {
4475        assert_eq!(
4476            PixelFormat::Rgb.image_shape(640, 480),
4477            Some(vec![480, 640, 3])
4478        );
4479        assert_eq!(
4480            PixelFormat::Grey.image_shape(640, 480),
4481            Some(vec![480, 640, 1])
4482        );
4483        assert_eq!(
4484            PixelFormat::Nv12.image_shape(640, 480),
4485            Some(vec![720, 640])
4486        );
4487        // Odd height: combined-plane height is `481 + ceil(481/2)` = 481 + 241
4488        // = 722 rows. Logical height is recovered as `722 * 2 / 3` = 481.
4489        assert_eq!(
4490            PixelFormat::Nv12.image_shape(640, 481),
4491            Some(vec![722, 640])
4492        );
4493        // Odd width: shape carries the LOGICAL width (641).
4494        // The 64-aligned stride (>= 642) is stored separately on the Tensor.
4495        assert_eq!(
4496            PixelFormat::Nv12.image_shape(641, 480),
4497            Some(vec![720, 641])
4498        );
4499        // NV16 odd width: same — logical width in shape, stride separate.
4500        assert_eq!(
4501            PixelFormat::Nv16.image_shape(641, 480),
4502            Some(vec![960, 641])
4503        );
4504        assert_eq!(
4505            PixelFormat::PlanarRgb.image_shape(640, 480),
4506            Some(vec![3, 480, 640])
4507        );
4508        assert_eq!(
4509            PixelFormat::Nv16.image_shape(640, 480),
4510            Some(vec![960, 640])
4511        );
4512    }
4513
4514    #[test]
4515    fn raw_tensor_has_no_format() {
4516        let t = Tensor::<u8>::new(&[480, 640, 3], None, None).unwrap();
4517        assert!(t.format().is_none());
4518        assert!(t.width().is_none());
4519        assert!(t.height().is_none());
4520        assert!(!t.is_multiplane());
4521        assert!(t.chroma().is_none());
4522    }
4523
4524    #[test]
4525    fn image_tensor_packed() {
4526        let t = Tensor::<u8>::image(
4527            640,
4528            480,
4529            PixelFormat::Rgba,
4530            None,
4531            crate::CpuAccess::ReadWrite,
4532        )
4533        .unwrap();
4534        assert_eq!(t.format(), Some(PixelFormat::Rgba));
4535        assert_eq!(t.width(), Some(640));
4536        assert_eq!(t.height(), Some(480));
4537        assert_eq!(t.shape(), &[480, 640, 4]);
4538        assert!(!t.is_multiplane());
4539    }
4540
4541    #[test]
4542    fn image_tensor_planar() {
4543        let t = Tensor::<u8>::image(
4544            640,
4545            480,
4546            PixelFormat::PlanarRgb,
4547            None,
4548            crate::CpuAccess::ReadWrite,
4549        )
4550        .unwrap();
4551        assert_eq!(t.format(), Some(PixelFormat::PlanarRgb));
4552        assert_eq!(t.width(), Some(640));
4553        assert_eq!(t.height(), Some(480));
4554        assert_eq!(t.shape(), &[3, 480, 640]);
4555    }
4556
4557    #[test]
4558    #[cfg(target_os = "macos")]
4559    fn image_tensor_dma_non_aligned_packed_width_pads_zero_copy() {
4560        // RGBA u8 at width=4 → 4*4 = 16 bytes/row, not 64-byte aligned. RGBA has
4561        // a real IOSurface FourCC, so an explicit `Some(TensorMemory::Dma)`
4562        // request now allocates a padded image IOSurface (64-aligned
4563        // `bytes_per_row`) and records the stride — a fully zero-copy buffer GL
4564        // can bind and the CPU can map via the strided path. (Previously this
4565        // failed loudly to avoid an 'L008' byte-bag downgrade; with a real
4566        // FourCC surface that concern no longer applies.)
4567        let t = Tensor::<u8>::image(
4568            4,
4569            4,
4570            PixelFormat::Rgba,
4571            Some(TensorMemory::Dma),
4572            crate::CpuAccess::ReadWrite,
4573        )
4574        .expect("padded RGBA IOSurface should allocate");
4575        assert_eq!(t.format(), Some(PixelFormat::Rgba));
4576        assert_eq!(t.width(), Some(4));
4577        assert_eq!(t.height(), Some(4));
4578        let stride = t.effective_row_stride().expect("stride");
4579        assert_eq!(stride % 64, 0, "padded to 64-byte row alignment");
4580        assert!(stride >= 16);
4581        // A CPU map exposes the full padded surface for strided iteration.
4582        let m = t.map().expect("strided IOSurface map");
4583        assert_eq!(m.as_slice().len(), stride * 4);
4584    }
4585
4586    /// `per_pixel_bytes` that doesn't divide 64 evenly (e.g. RGB u8 with
4587    /// 3 B/pixel) makes a "Pad width to N" suggestion structurally
4588    /// impossible — there is no integer width whose `width * 3` is a
4589    /// multiple of 64. The error must still fire (no silent SHM
4590    /// fallback for explicit-DMA requests) and must spell out the
4591    /// alignment requirement; it just omits the misleading "pad to N"
4592    /// hint instead of printing a number whose row pitch still won't
4593    /// align.
4594    #[test]
4595    #[cfg(target_os = "macos")]
4596    fn image_tensor_dma_rejects_indivisible_pixel_pitch_without_pad_hint() {
4597        // Width=10 RGB f32 → 120 B/row, not 64-byte aligned, and (Rgb,
4598        // F32) has no IOSurface mapping so the padded-stride tolerance
4599        // does not apply. The next 64-multiple (128 B) isn't an integer
4600        // multiple of 12 B/pixel, so the "pad width to N" hint can't
4601        // produce a valid number and must be omitted. (Rgb u8 used to be
4602        // this test's subject but now has a real RGBA8888 mapping with
4603        // padded-stride tolerance — see the test below.)
4604        let err = Tensor::<f32>::image(
4605            10,
4606            10,
4607            PixelFormat::Rgb,
4608            Some(TensorMemory::Dma),
4609            crate::CpuAccess::ReadWrite,
4610        )
4611        .expect_err("RGB f32 with 12 B/pixel and non-aligned width must be rejected");
4612        match err {
4613            Error::InvalidArgument(msg) => {
4614                assert!(
4615                    msg.contains("64-byte aligned"),
4616                    "error must still name the alignment requirement: {msg}"
4617                );
4618                assert!(
4619                    !msg.contains("Pad width"),
4620                    "indivisible per-pixel pitch makes a width suggestion impossible; \
4621                     hint must be omitted, got: {msg}"
4622                );
4623                assert!(
4624                    msg.contains("memory=None") && msg.contains("TensorMemory::Mem"),
4625                    "error must still list the always-applicable alternatives: {msg}"
4626                );
4627            }
4628            other => panic!("expected InvalidArgument, got {other:?}"),
4629        }
4630    }
4631
4632    #[test]
4633    #[cfg(target_os = "macos")]
4634    fn image_tensor_dma_packed_rgb_u8_contract() {
4635        // Packed RGB u8 @Dma is a designed RGBA8888 mapping at
4636        // (W*3/4, H) — the INT8 NPU input layout, shared with Android.
4637        // width%4 != 0 cannot form whole texels → loud InvalidArgument…
4638        let err = Tensor::<u8>::image(
4639            10,
4640            10,
4641            PixelFormat::Rgb,
4642            Some(TensorMemory::Dma),
4643            crate::CpuAccess::ReadWrite,
4644        )
4645        .expect_err("Rgb u8 width%4!=0 must be rejected");
4646        assert!(
4647            matches!(&err, Error::InvalidArgument(m) if m.contains("width%4==0")),
4648            "got {err:?}"
4649        );
4650        // …width%4 == 0 with a non-64-aligned pitch allocates PADDED
4651        // (36 B rows → 64 B surface pitch, recorded on the tensor)…
4652        let t = Tensor::<u8>::image(
4653            12,
4654            4,
4655            PixelFormat::Rgb,
4656            Some(TensorMemory::Dma),
4657            crate::CpuAccess::ReadWrite,
4658        )
4659        .expect("width 12 Rgb u8 must allocate padded");
4660        assert_eq!(t.memory(), TensorMemory::Dma);
4661        assert!(
4662            t.row_stride()
4663                .is_some_and(|s| s >= 64 && s.is_multiple_of(64)),
4664            "padded pitch must be recorded: {:?}",
4665            t.row_stride()
4666        );
4667        // …and the aligned model-input width stays flat (640*3 = 1920 is
4668        // 64-aligned → no recorded stride, the buffer IS [H, W, 3]).
4669        let t = Tensor::<u8>::image(
4670            640,
4671            8,
4672            PixelFormat::Rgb,
4673            Some(TensorMemory::Dma),
4674            crate::CpuAccess::ReadWrite,
4675        )
4676        .expect("width 640 Rgb u8 must allocate flat");
4677        assert_eq!(t.row_stride(), None);
4678        // I8 shares the layout (INT8 shader bias, not a format change).
4679        let t = Tensor::<i8>::image(
4680            640,
4681            8,
4682            PixelFormat::Rgb,
4683            Some(TensorMemory::Dma),
4684            crate::CpuAccess::ReadWrite,
4685        )
4686        .expect("Rgb i8 shares the RGBA8888 mapping");
4687        assert_eq!(t.memory(), TensorMemory::Dma);
4688    }
4689
4690    #[test]
4691    #[cfg(target_os = "macos")]
4692    fn image_tensor_dma_planar_f16_alignment() {
4693        // PlanarRgb F16 uses single-channel row pitch (width * 2 bytes).
4694        // Width=16 → 32 bytes/row (not aligned); width=32 → 64 bytes/row (aligned).
4695        let err = Tensor::<half::f16>::image(
4696            16,
4697            16,
4698            PixelFormat::PlanarRgb,
4699            Some(TensorMemory::Dma),
4700            crate::CpuAccess::ReadWrite,
4701        )
4702        .expect_err("width=16 PlanarRgb F16 is 32-byte row, must reject");
4703        assert!(matches!(err, Error::InvalidArgument(_)), "got {err:?}");
4704        // 32 wide should work.
4705        let t = Tensor::<half::f16>::image(
4706            32,
4707            8,
4708            PixelFormat::PlanarRgb,
4709            Some(TensorMemory::Dma),
4710            crate::CpuAccess::ReadWrite,
4711        )
4712        .expect("width=32 PlanarRgb F16 is 64-byte row, must succeed");
4713        assert_eq!(t.format(), Some(PixelFormat::PlanarRgb));
4714    }
4715
4716    #[test]
4717    fn image_tensor_semi_planar_contiguous() {
4718        let t = Tensor::<u8>::image(
4719            640,
4720            480,
4721            PixelFormat::Nv12,
4722            None,
4723            crate::CpuAccess::ReadWrite,
4724        )
4725        .unwrap();
4726        assert_eq!(t.format(), Some(PixelFormat::Nv12));
4727        assert_eq!(t.width(), Some(640));
4728        assert_eq!(t.height(), Some(480));
4729        // NV12: H*3/2 = 720
4730        assert_eq!(t.shape(), &[720, 640]);
4731        assert!(!t.is_multiplane());
4732    }
4733
4734    #[test]
4735    #[cfg(target_os = "linux")]
4736    fn image_tensor_with_stride_preserves_logical_width() {
4737        // Skip if DMA not available (e.g. sandboxed CI lacking dma_heap access).
4738        if !is_dma_available() {
4739            eprintln!("SKIPPED: DMA heap not available");
4740            return;
4741        }
4742        // 3004×1688 RGBA8: natural pitch 12016, padded to 12032 (64-aligned).
4743        let stride = 12032;
4744        let t = Tensor::<u8>::image_with_stride(
4745            3004,
4746            1688,
4747            PixelFormat::Rgba,
4748            stride,
4749            Some(TensorMemory::Dma),
4750            crate::CpuAccess::ReadWrite,
4751        )
4752        .unwrap();
4753        // Logical dimensions unchanged by padding — this is the contract.
4754        assert_eq!(t.width(), Some(3004));
4755        assert_eq!(t.height(), Some(1688));
4756        assert_eq!(t.shape(), &[1688, 3004, 4]);
4757        // Stride is carried separately and reports the padded pitch.
4758        assert_eq!(t.effective_row_stride(), Some(stride));
4759        // Buffer is sized to stride × height so the full padded layout fits,
4760        // and CPU map() works for self-allocated strided DMA tensors.
4761        use crate::TensorMapTrait;
4762        {
4763            let map = t.map().unwrap();
4764            assert!(
4765                map.as_slice().len() >= stride * 1688,
4766                "mapped buffer {} bytes < expected {}",
4767                map.as_slice().len(),
4768                stride * 1688
4769            );
4770        }
4771        // CPU write access works too — iterate rows using the padded stride,
4772        // touch only the active `width × bpp` region, verify it round-trips.
4773        {
4774            let mut map = t.map().unwrap();
4775            let slice = map.as_mut_slice();
4776            for y in 0..1688 {
4777                let row_start = y * stride;
4778                for x in 0..3004 {
4779                    let p = row_start + x * 4;
4780                    slice[p] = (y & 0xFF) as u8;
4781                    slice[p + 1] = (x & 0xFF) as u8;
4782                    slice[p + 2] = 0x42;
4783                    slice[p + 3] = 0xFF;
4784                }
4785            }
4786        }
4787        {
4788            let map = t.map().unwrap();
4789            let slice = map.as_slice();
4790            // Sample a few pixels to confirm the round-trip.
4791            assert_eq!(slice[0], 0x00);
4792            assert_eq!(slice[1], 0x00);
4793            assert_eq!(slice[2], 0x42);
4794            assert_eq!(slice[3], 0xFF);
4795            let mid = 100 * stride + 50 * 4;
4796            assert_eq!(slice[mid], 100);
4797            assert_eq!(slice[mid + 1], 50);
4798            assert_eq!(slice[mid + 2], 0x42);
4799        }
4800    }
4801
4802    #[test]
4803    #[cfg(target_os = "linux")]
4804    fn image_tensor_with_stride_rejects_foreign_strided_map() {
4805        // A FOREIGN (imported via from_fd) DMA tensor with row_stride set
4806        // should still refuse CPU mapping — external allocator owns the
4807        // layout. This protects the V4L2 / GStreamer use case.
4808        //
4809        // We simulate a foreign import by wrapping our own allocation's
4810        // fd via `from_fd` and calling set_row_stride manually. The
4811        // `is_imported` flag on from_fd is true by construction.
4812        if !is_dma_available() {
4813            eprintln!("SKIPPED: DMA heap not available");
4814            return;
4815        }
4816        // Allocate a backing buffer large enough for a 320×240 BGRA8 image.
4817        let backing = Tensor::<u8>::new(&[240 * 320 * 4], Some(TensorMemory::Dma), None).unwrap();
4818        let fd = backing.clone_fd().unwrap();
4819        // Import it via from_fd — this marks is_imported=true.
4820        let shape = [240usize, 320, 4];
4821        let storage = TensorStorage::<u8>::from_fd(fd, &shape, None).unwrap();
4822        let mut t = Tensor::<u8>::wrap(storage);
4823        t.set_format(PixelFormat::Bgra).unwrap();
4824        t.set_row_stride(320 * 4).unwrap(); // natural, but still marks it as strided
4825        let err = t.map();
4826        assert!(
4827            matches!(err, Err(Error::InvalidOperation(_))),
4828            "foreign strided map should error"
4829        );
4830    }
4831
4832    #[test]
4833    #[cfg(target_os = "linux")]
4834    fn image_tensor_with_stride_map_rejects_tampered_stride() {
4835        // Round-3 PR feedback (C1): `set_row_stride` is public and only
4836        // validates `stride >= min_stride`, not that the new stride × height
4837        // fits the underlying buffer. A caller that tampers with the stride
4838        // after allocation must not be able to coerce `Tensor::map()` into
4839        // returning a slice larger than the backing mmap (that would be UB
4840        // in `DmaMap::as_slice`).
4841        if !is_dma_available() {
4842            eprintln!("SKIPPED: DMA heap not available");
4843            return;
4844        }
4845        // Allocate a 640×480 RGBA8 padded canvas (stride = 3072 = 768 px).
4846        // Backing buffer is 3072 × 480 = 1,474,560 bytes.
4847        let mut t = Tensor::<u8>::image_with_stride(
4848            640,
4849            480,
4850            PixelFormat::Rgba,
4851            3072,
4852            Some(TensorMemory::Dma),
4853            crate::CpuAccess::ReadWrite,
4854        )
4855        .unwrap();
4856        // Tamper: push the stride up to 4 × the original. This is >=
4857        // min_stride (2560), so `set_row_stride` accepts it.
4858        t.set_row_stride(12288).unwrap();
4859        // Map must now refuse — 12288 × 480 = 5,898,240 > 1,474,560.
4860        let err = t.map();
4861        assert!(
4862            matches!(err, Err(Error::InvalidOperation(_))),
4863            "map() with oversized stride must return InvalidOperation"
4864        );
4865    }
4866
4867    #[test]
4868    fn dma_tensor_new_with_byte_size_rejects_shape_overflow() {
4869        // Round-3 PR feedback (C3): shape.product() * sizeof(T) must use
4870        // checked arithmetic so a pathological shape can't wrap usize and
4871        // make the byte_size-vs-logical-size comparison incorrect.
4872        //
4873        // This test only exercises the overflow rejection path, which is
4874        // pure-Rust and doesn't touch dma_heap — safe to run on any target.
4875        #[cfg(target_os = "linux")]
4876        {
4877            let err = crate::dma::DmaTensor::<u64>::new_with_byte_size(
4878                &[usize::MAX, 2, 2],
4879                usize::MAX,
4880                None,
4881            );
4882            assert!(
4883                matches!(err, Err(Error::InvalidArgument(_))),
4884                "new_with_byte_size must detect shape.product() overflow"
4885            );
4886        }
4887    }
4888
4889    #[test]
4890    #[cfg(target_os = "linux")]
4891    fn image_tensor_with_stride_rejects_too_small_stride() {
4892        // 640×480 RGBA8 natural pitch = 2560, request 2400 → should error.
4893        let err = Tensor::<u8>::image_with_stride(
4894            640,
4895            480,
4896            PixelFormat::Rgba,
4897            2400,
4898            Some(TensorMemory::Dma),
4899            crate::CpuAccess::ReadWrite,
4900        );
4901        assert!(matches!(err, Err(Error::InvalidArgument(_))));
4902    }
4903
4904    #[test]
4905    #[cfg(target_os = "linux")]
4906    fn image_tensor_with_stride_rejects_non_packed() {
4907        // NV12 is SemiPlanar → not supported. (Linux-only because
4908        // `TensorMemory::Dma` itself is a Linux-only enum variant.)
4909        let err = Tensor::<u8>::image_with_stride(
4910            640,
4911            480,
4912            PixelFormat::Nv12,
4913            640,
4914            Some(TensorMemory::Dma),
4915            crate::CpuAccess::ReadWrite,
4916        );
4917        assert!(matches!(err, Err(Error::NotImplemented(_))));
4918    }
4919
4920    #[test]
4921    fn set_format_valid() {
4922        let mut t = Tensor::<u8>::new(&[480, 640, 3], None, None).unwrap();
4923        assert!(t.format().is_none());
4924        t.set_format(PixelFormat::Rgb).unwrap();
4925        assert_eq!(t.format(), Some(PixelFormat::Rgb));
4926        assert_eq!(t.width(), Some(640));
4927        assert_eq!(t.height(), Some(480));
4928    }
4929
4930    #[test]
4931    fn set_format_invalid_shape() {
4932        let mut t = Tensor::<u8>::new(&[480, 640, 4], None, None).unwrap();
4933        // RGB expects 3 channels, not 4
4934        let err = t.set_format(PixelFormat::Rgb);
4935        assert!(err.is_err());
4936        // Original tensor is unmodified
4937        assert!(t.format().is_none());
4938    }
4939
4940    #[test]
4941    fn reshape_clears_format() {
4942        let mut t = Tensor::<u8>::image(
4943            640,
4944            480,
4945            PixelFormat::Rgba,
4946            None,
4947            crate::CpuAccess::ReadWrite,
4948        )
4949        .unwrap();
4950        assert_eq!(t.format(), Some(PixelFormat::Rgba));
4951        // Reshape to flat — format cleared
4952        t.reshape(&[480 * 640 * 4]).unwrap();
4953        assert!(t.format().is_none());
4954    }
4955
4956    #[test]
4957    fn from_planes_nv12() {
4958        let y = Tensor::<u8>::new(&[480, 640], None, None).unwrap();
4959        let uv = Tensor::<u8>::new(&[240, 640], None, None).unwrap();
4960        let img = Tensor::from_planes(y, uv, PixelFormat::Nv12).unwrap();
4961        assert_eq!(img.format(), Some(PixelFormat::Nv12));
4962        assert!(img.is_multiplane());
4963        assert!(img.chroma().is_some());
4964        assert_eq!(img.width(), Some(640));
4965        assert_eq!(img.height(), Some(480));
4966    }
4967
4968    #[test]
4969    fn from_planes_rejects_non_semiplanar() {
4970        let y = Tensor::<u8>::new(&[480, 640], None, None).unwrap();
4971        let uv = Tensor::<u8>::new(&[240, 640], None, None).unwrap();
4972        let err = Tensor::from_planes(y, uv, PixelFormat::Rgb);
4973        assert!(err.is_err());
4974    }
4975
4976    #[test]
4977    fn reshape_multiplane_errors() {
4978        let y = Tensor::<u8>::new(&[480, 640], None, None).unwrap();
4979        let uv = Tensor::<u8>::new(&[240, 640], None, None).unwrap();
4980        let mut img = Tensor::from_planes(y, uv, PixelFormat::Nv12).unwrap();
4981        let err = img.reshape(&[480 * 640 + 240 * 640]);
4982        assert!(err.is_err());
4983    }
4984}
4985
#[cfg(test)]
mod compression_tests {
    use super::*;

    /// Each `with_*` setter on `ImageDesc` is visible through its getter,
    /// and a bare descriptor reports the default memory/access/compression.
    #[test]
    fn desc_builder_roundtrips() {
        let configured = ImageDesc::new(640, 480, PixelFormat::Rgba, DType::U8)
            .with_memory(Some(TensorMemory::Mem))
            .with_access(CpuAccess::Read)
            .with_compression(Compression::Any);

        // Constructor arguments.
        assert_eq!(configured.width(), 640);
        assert_eq!(configured.height(), 480);
        assert_eq!(configured.format(), PixelFormat::Rgba);
        assert_eq!(configured.dtype(), DType::U8);
        // Builder-supplied options.
        assert_eq!(configured.memory(), Some(TensorMemory::Mem));
        assert_eq!(configured.access(), CpuAccess::Read);
        assert_eq!(configured.compression(), Some(Compression::Any));

        // Untouched descriptor: no memory preference, no CPU access, no
        // compression request.
        let bare = ImageDesc::new(2, 2, PixelFormat::Grey, DType::U8);
        assert_eq!(bare.memory(), None);
        assert_eq!(bare.access(), CpuAccess::None);
        assert_eq!(bare.compression(), None);
    }

    /// A descriptor whose dtype differs from the tensor element type is an
    /// `InvalidArgument` whose message mentions "dtype".
    #[test]
    fn desc_dtype_must_match_element_type() {
        let f32_desc = ImageDesc::new(4, 4, PixelFormat::Rgba, DType::F32);
        let outcome = Tensor::<u8>::image_desc(&f32_desc);
        match outcome {
            Err(Error::InvalidArgument(msg)) => assert!(msg.contains("dtype")),
            other => panic!("expected InvalidArgument, got {other:?}"),
        }
    }

    /// Requesting compression together with CPU access is an
    /// `InvalidArgument` whose message names `CpuAccess::None`.
    #[test]
    fn compression_with_cpu_access_is_invalid() {
        let conflicting = ImageDesc::new(4, 4, PixelFormat::Rgba, DType::U8)
            .with_access(CpuAccess::ReadWrite)
            .with_compression(Compression::Any);
        let outcome = Tensor::<u8>::image_desc(&conflicting);
        match outcome {
            Err(Error::InvalidArgument(msg)) => assert!(msg.contains("CpuAccess::None")),
            other => panic!("expected InvalidArgument, got {other:?}"),
        }
    }

    /// Off Android, asking for a specific scheme (UBWC) must fail with
    /// `NotImplemented` and name the scheme in the message.
    #[cfg(not(target_os = "android"))]
    #[test]
    fn scheme_request_off_android_is_not_implemented() {
        let ubwc_request = Compression::Scheme(CompressionScheme::Ubwc);
        let desc =
            ImageDesc::new(4, 4, PixelFormat::Rgba, DType::U8).with_compression(ubwc_request);
        let outcome = Tensor::<u8>::image_desc(&desc);
        match outcome {
            Err(Error::NotImplemented(msg)) => assert!(msg.contains("Ubwc")),
            other => panic!("expected NotImplemented, got {other:?}"),
        }
    }

    /// Off Android, `Compression::Any` yields an uncompressed tensor and
    /// bumps the fallback counter.
    #[cfg(not(target_os = "android"))]
    #[test]
    fn any_request_off_android_resolves_linear_and_counts() {
        let fallbacks_before = compression_fallback_count();

        let desc = ImageDesc::new(64, 64, PixelFormat::Rgba, DType::U8)
            .with_memory(Some(TensorMemory::Mem))
            .with_compression(Compression::Any);
        let tensor = Tensor::<u8>::image_desc(&desc).unwrap();

        assert_eq!(tensor.compression(), None);
        let fallbacks_after = compression_fallback_count();
        assert!(fallbacks_after > fallbacks_before);
    }

    /// A descriptor without a compression request produces an ordinary,
    /// CPU-mappable image tensor.
    #[test]
    fn desc_without_request_matches_classic_constructor() {
        let desc = ImageDesc::new(32, 32, PixelFormat::Rgba, DType::U8)
            .with_memory(Some(TensorMemory::Mem))
            .with_access(CpuAccess::ReadWrite);
        let tensor = Tensor::<u8>::image_desc(&desc).unwrap();

        assert_eq!(tensor.compression(), None);
        assert_eq!(tensor.cpu_access(), CpuAccess::ReadWrite);
        assert_eq!(tensor.width(), Some(32));

        // The mapping must cover the full 32×32 RGBA payload.
        let mapping = tensor.map_read().unwrap();
        assert_eq!(mapping.as_slice().len(), 32 * 32 * 4);
    }

    /// A recorded compression scheme survives `configure_image` and is
    /// carried over to sub-views.
    #[test]
    fn configure_image_preserves_compression_and_views_inherit() {
        let mut tensor = Tensor::<u8>::image(
            64,
            64,
            PixelFormat::Rgba,
            Some(TensorMemory::Mem),
            CpuAccess::None,
        )
        .unwrap();

        // Host platforms never record a scheme themselves, so set the field
        // by hand to emulate a compressed allocation; reconfiguring the
        // logical image does not alter the physical layout.
        tensor.compression = Some(CompressionScheme::Ubwc);

        tensor.configure_image(32, 32, PixelFormat::Rgba).unwrap();
        assert_eq!(tensor.compression(), Some(CompressionScheme::Ubwc));

        let window = tensor.subview(0, &[16, 32, 4]).unwrap();
        assert_eq!(window.compression(), Some(CompressionScheme::Ubwc));
    }

    /// `TensorDyn::image_desc` picks the variant matching the descriptor's
    /// dtype and forwards the compression query.
    #[test]
    fn tensor_dyn_dispatches_desc_and_compression() {
        let desc = ImageDesc::new(16, 16, PixelFormat::Rgba, DType::U8)
            .with_memory(Some(TensorMemory::Mem))
            .with_access(CpuAccess::ReadWrite);
        let erased = TensorDyn::image_desc(&desc).unwrap();

        assert!(matches!(erased, TensorDyn::U8(_)));
        assert_eq!(erased.compression(), None);
    }
}
5098
#[cfg(test)]
mod cpu_access_tests {
    use super::*;

    /// Pins the full `CpuAccess::covers` relation: a declaration covers
    /// requests that are equal or narrower, never wider ones.
    #[test]
    fn covers_matrix() {
        // NOTE: the glob import makes `None` below mean `CpuAccess::None`.
        use CpuAccess::*;

        // `None` is covered by everything; `ReadWrite` covers everything.
        for level in [None, Read, Write, ReadWrite] {
            assert!(level.covers(None), "{level:?} must cover None");
            assert!(ReadWrite.covers(level), "ReadWrite must cover {level:?}");
        }
        // Reflexive cases for the single-direction levels.
        assert!(Read.covers(Read));
        assert!(Write.covers(Write));

        // A declaration never covers a wider or unrelated request.
        assert!(!None.covers(Read));
        assert!(!None.covers(Write));
        assert!(!Read.covers(Write));
        assert!(!Read.covers(ReadWrite));
        assert!(!Write.covers(Read));
        assert!(!Write.covers(ReadWrite));
    }

    /// Mapping with `CpuAccess::None` is rejected as `InvalidArgument`.
    #[test]
    fn map_with_none_is_invalid() {
        let tensor = Tensor::<u8>::new(&[16], Some(TensorMemory::Mem), None).unwrap();
        let outcome = tensor.map_with(CpuAccess::None);
        match outcome {
            Err(Error::InvalidArgument(_)) => {}
            Err(other) => panic!("expected InvalidArgument, got {other:?}"),
            Ok(_) => panic!("map_with(CpuAccess::None) must not succeed"),
        }
    }

    /// On the `Mem` backend a `map_read` view reads correctly and panics
    /// when its mutable accessor is used.
    #[test]
    fn read_map_rejects_mutation_uniformly() {
        let tensor = Tensor::<u8>::new(&[8], Some(TensorMemory::Mem), None).unwrap();
        tensor
            .map_mut()
            .unwrap()
            .as_mut_slice()
            .copy_from_slice(&[7; 8]);

        // The read view observes the data written above; the scope releases
        // the mapping before the next one is taken.
        {
            let read_only = tensor.map_read().unwrap();
            assert_eq!(read_only.as_slice(), &[7; 8]);
        }

        let attempt = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            let mut read_only = tensor.map_read().unwrap();
            let _ = read_only.as_mut_slice();
        }));
        assert!(attempt.is_err(), "as_mut_slice through map_read must panic");
    }

    /// `map_write` and `map` both hand out mutable views; the last write wins.
    #[test]
    fn write_and_rw_maps_stay_mutable() {
        let tensor = Tensor::<u8>::new(&[4], Some(TensorMemory::Mem), None).unwrap();

        tensor
            .map_write()
            .unwrap()
            .as_mut_slice()
            .copy_from_slice(&[1; 4]);
        tensor
            .map()
            .unwrap()
            .as_mut_slice()
            .copy_from_slice(&[2; 4]);

        assert_eq!(tensor.map_read().unwrap().as_slice(), &[2; 4]);
    }

    /// Read-only lock path for IOSurface-backed tensors: `map_read` must see
    /// bytes written through an earlier mutable mapping, repeated read maps
    /// must keep seeing them, and mutation through a read map must panic.
    /// Skipped (with a message) when the allocation itself fails.
    #[test]
    #[cfg(target_os = "macos")]
    fn iosurface_read_only_lock_roundtrip() {
        let Ok(tensor) = Tensor::<u8>::new(&[64], Some(TensorMemory::Dma), None) else {
            eprintln!("SKIPPED: IOSurface unavailable");
            return;
        };

        // Fill with a position-dependent pattern, then release the mapping.
        {
            let mut writable = tensor.map_mut().unwrap();
            for (idx, byte) in writable.as_mut_slice().iter_mut().enumerate() {
                *byte = (idx * 3) as u8;
            }
        }

        // Two consecutive read maps: the second proves the first unlock left
        // the contents intact.
        for _ in 0..2 {
            let read_only = tensor.map_read().unwrap();
            for (idx, byte) in read_only.as_slice().iter().enumerate() {
                assert_eq!(*byte, (idx * 3) as u8, "byte {idx} through read-only lock");
            }
        }

        let attempt = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            let mut read_only = tensor.map_read().unwrap();
            let _ = read_only.as_mut_slice();
        }));
        assert!(attempt.is_err(), "IOSurface read map must reject mutation");
    }
}
5188
5189#[cfg(test)]
5190mod tests {
5191    #[cfg(target_os = "linux")]
5192    use nix::unistd::{access, AccessFlags};
5193    #[cfg(target_os = "linux")]
5194    use std::io::Write as _;
5195    use std::sync::RwLock;
5196
5197    use super::*;
5198
5199    #[ctor::ctor(unsafe)]
5200    fn init() {
5201        env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
5202    }
5203
5204    /// Macro to get the current function name for logging in tests.
5205    #[cfg(target_os = "linux")]
5206    macro_rules! function {
5207        () => {{
5208            fn f() {}
5209            fn type_name_of<T>(_: T) -> &'static str {
5210                std::any::type_name::<T>()
5211            }
5212            let name = type_name_of(f);
5213
5214            // Find and cut the rest of the path
5215            match &name[..name.len() - 3].rfind(':') {
5216                Some(pos) => &name[pos + 1..name.len() - 3],
5217                None => &name[..name.len() - 3],
5218            }
5219        }};
5220    }
5221
5222    #[test]
5223    #[cfg(target_os = "linux")]
5224    fn test_tensor() {
5225        let _lock = FD_LOCK.read().unwrap();
5226        let shape = vec![1];
5227        let tensor = DmaTensor::<f32>::new(&shape, Some("dma_tensor"));
5228        let dma_enabled = tensor.is_ok();
5229
5230        let tensor = Tensor::<f32>::new(&shape, None, None).expect("Failed to create tensor");
5231        // Auto-select priority is Dma > Mem; Shm is never auto-selected.
5232        match dma_enabled {
5233            true => assert_eq!(tensor.memory(), TensorMemory::Dma),
5234            false => assert_eq!(tensor.memory(), TensorMemory::Mem),
5235        }
5236    }
5237
5238    #[test]
5239    #[cfg(any(target_os = "macos", target_os = "ios"))]
5240    fn test_tensor() {
5241        let shape = vec![1];
5242        let tensor = Tensor::<f32>::new(&shape, None, None).expect("Failed to create tensor");
5243        // macOS/iOS auto-fallback chain: IOSurface (Dma) → Mem. Healthy systems
5244        // return Dma; Mem only appears under memory pressure or sandboxed
5245        // contexts where IOSurfaceCreate fails. Shm is never auto-selected.
5246        let m = tensor.memory();
5247        assert!(
5248            matches!(m, TensorMemory::Dma | TensorMemory::Mem),
5249            "Unexpected auto-fallback result on macOS/iOS: {m:?}"
5250        );
5251    }
5252
5253    #[test]
5254    #[cfg(all(
5255        unix,
5256        not(any(target_os = "linux", target_os = "macos", target_os = "ios"))
5257    ))]
5258    fn test_tensor() {
5259        let shape = vec![1];
5260        let tensor = Tensor::<f32>::new(&shape, None, None).expect("Failed to create tensor");
5261        // Other Unix (BSD): no DMA, so auto-selection is Mem (Shm is
5262        // explicit-only, never auto-selected).
5263        assert_eq!(tensor.memory(), TensorMemory::Mem);
5264    }
5265
5266    #[test]
5267    #[cfg(not(unix))]
5268    fn test_tensor() {
5269        let shape = vec![1];
5270        let tensor = Tensor::<f32>::new(&shape, None, None).expect("Failed to create tensor");
5271        assert_eq!(tensor.memory(), TensorMemory::Mem);
5272    }
5273
5274    #[test]
5275    #[cfg(target_os = "linux")]
5276    fn test_dma_tensor() {
5277        let _lock = FD_LOCK.read().unwrap();
5278        match access(
5279            "/dev/dma_heap/linux,cma",
5280            AccessFlags::R_OK | AccessFlags::W_OK,
5281        ) {
5282            Ok(_) => println!("/dev/dma_heap/linux,cma is available"),
5283            Err(_) => match access(
5284                "/dev/dma_heap/system",
5285                AccessFlags::R_OK | AccessFlags::W_OK,
5286            ) {
5287                Ok(_) => println!("/dev/dma_heap/system is available"),
5288                Err(e) => {
5289                    writeln!(
5290                        &mut std::io::stdout(),
5291                        "[WARNING] DMA Heap is unavailable: {e}"
5292                    )
5293                    .unwrap();
5294                    return;
5295                }
5296            },
5297        }
5298
5299        let shape = vec![2, 3, 4];
5300        let tensor =
5301            DmaTensor::<f32>::new(&shape, Some("test_tensor")).expect("Failed to create tensor");
5302
5303        const DUMMY_VALUE: f32 = 12.34;
5304
5305        assert_eq!(tensor.memory(), TensorMemory::Dma);
5306        assert_eq!(tensor.name(), "test_tensor");
5307        assert_eq!(tensor.shape(), &shape);
5308        assert_eq!(tensor.size(), 2 * 3 * 4 * std::mem::size_of::<f32>());
5309        assert_eq!(tensor.len(), 2 * 3 * 4);
5310
5311        {
5312            let mut tensor_map = tensor.map().expect("Failed to map DMA memory");
5313            tensor_map.fill(42.0);
5314            assert!(tensor_map.iter().all(|&x| x == 42.0));
5315        }
5316
5317        {
5318            let shared = Tensor::<f32>::from_fd(
5319                tensor
5320                    .clone_fd()
5321                    .expect("Failed to duplicate tensor file descriptor"),
5322                &shape,
5323                Some("test_tensor_shared"),
5324            )
5325            .expect("Failed to create tensor from fd");
5326
5327            assert_eq!(shared.memory(), TensorMemory::Dma);
5328            assert_eq!(shared.name(), "test_tensor_shared");
5329            assert_eq!(shared.shape(), &shape);
5330
5331            let mut tensor_map = shared.map().expect("Failed to map DMA memory from fd");
5332            tensor_map.fill(DUMMY_VALUE);
5333            assert!(tensor_map.iter().all(|&x| x == DUMMY_VALUE));
5334        }
5335
5336        {
5337            let tensor_map = tensor.map().expect("Failed to map DMA memory");
5338            assert!(tensor_map.iter().all(|&x| x == DUMMY_VALUE));
5339        }
5340
5341        let mut tensor = DmaTensor::<u8>::new(&shape, None).expect("Failed to create tensor");
5342        assert_eq!(tensor.shape(), &shape);
5343        let new_shape = vec![3, 4, 4];
5344        assert!(
5345            tensor.reshape(&new_shape).is_err(),
5346            "Reshape should fail due to size mismatch"
5347        );
5348        assert_eq!(tensor.shape(), &shape, "Shape should remain unchanged");
5349
5350        let new_shape = vec![2, 3, 4];
5351        tensor.reshape(&new_shape).expect("Reshape should succeed");
5352        assert_eq!(
5353            tensor.shape(),
5354            &new_shape,
5355            "Shape should be updated after successful reshape"
5356        );
5357
5358        {
5359            let mut tensor_map = tensor.map().expect("Failed to map DMA memory");
5360            tensor_map.fill(1);
5361            assert!(tensor_map.iter().all(|&x| x == 1));
5362        }
5363
5364        {
5365            let mut tensor_map = tensor.map().expect("Failed to map DMA memory");
5366            tensor_map[2] = 42;
5367            assert_eq!(tensor_map[1], 1, "Value at index 1 should be 1");
5368            assert_eq!(tensor_map[2], 42, "Value at index 2 should be 42");
5369        }
5370    }
5371
5372    #[test]
5373    #[cfg(unix)]
5374    fn test_shm_tensor() {
5375        let _lock = FD_LOCK.read().unwrap();
5376        let shape = vec![2, 3, 4];
5377        let tensor =
5378            ShmTensor::<f32>::new(&shape, Some("test_tensor")).expect("Failed to create tensor");
5379        assert_eq!(tensor.shape(), &shape);
5380        assert_eq!(tensor.size(), 2 * 3 * 4 * std::mem::size_of::<f32>());
5381        assert_eq!(tensor.name(), "test_tensor");
5382
5383        const DUMMY_VALUE: f32 = 12.34;
5384        {
5385            let mut tensor_map = tensor.map().expect("Failed to map shared memory");
5386            tensor_map.fill(42.0);
5387            assert!(tensor_map.iter().all(|&x| x == 42.0));
5388        }
5389
5390        {
5391            let shared = Tensor::<f32>::from_fd(
5392                tensor
5393                    .clone_fd()
5394                    .expect("Failed to duplicate tensor file descriptor"),
5395                &shape,
5396                Some("test_tensor_shared"),
5397            )
5398            .expect("Failed to create tensor from fd");
5399
5400            assert_eq!(shared.memory(), TensorMemory::Shm);
5401            assert_eq!(shared.name(), "test_tensor_shared");
5402            assert_eq!(shared.shape(), &shape);
5403
5404            let mut tensor_map = shared.map().expect("Failed to map shared memory from fd");
5405            tensor_map.fill(DUMMY_VALUE);
5406            assert!(tensor_map.iter().all(|&x| x == DUMMY_VALUE));
5407        }
5408
5409        {
5410            let tensor_map = tensor.map().expect("Failed to map shared memory");
5411            assert!(tensor_map.iter().all(|&x| x == DUMMY_VALUE));
5412        }
5413
5414        let mut tensor = ShmTensor::<u8>::new(&shape, None).expect("Failed to create tensor");
5415        assert_eq!(tensor.shape(), &shape);
5416        let new_shape = vec![3, 4, 4];
5417        assert!(
5418            tensor.reshape(&new_shape).is_err(),
5419            "Reshape should fail due to size mismatch"
5420        );
5421        assert_eq!(tensor.shape(), &shape, "Shape should remain unchanged");
5422
5423        let new_shape = vec![2, 3, 4];
5424        tensor.reshape(&new_shape).expect("Reshape should succeed");
5425        assert_eq!(
5426            tensor.shape(),
5427            &new_shape,
5428            "Shape should be updated after successful reshape"
5429        );
5430
5431        {
5432            let mut tensor_map = tensor.map().expect("Failed to map shared memory");
5433            tensor_map.fill(1);
5434            assert!(tensor_map.iter().all(|&x| x == 1));
5435        }
5436
5437        {
5438            let mut tensor_map = tensor.map().expect("Failed to map shared memory");
5439            tensor_map[2] = 42;
5440            assert_eq!(tensor_map[1], 1, "Value at index 1 should be 1");
5441            assert_eq!(tensor_map[2], 42, "Value at index 2 should be 42");
5442        }
5443    }
5444
5445    #[test]
5446    fn mem_subview_partitions_parent_buffer() {
5447        // One heap [2,4] u8 parent (8 bytes). Two [1,4] sub-views at byte
5448        // offsets 0 and 4 must share the parent allocation (zero-copy) and be
5449        // independently writable: view 0 owns bytes [0,4), view 1 owns [4,8).
5450        // Today this is impossible — heap offset is rejected and there is no
5451        // shared sub-view constructor.
5452        let parent = Tensor::<u8>::new(&[2, 4], Some(TensorMemory::Mem), None).unwrap();
5453        let view0 = parent.subview(0, &[1, 4]).expect("subview at offset 0");
5454        let view1 = parent.subview(4, &[1, 4]).expect("subview at offset 4");
5455
5456        view1
5457            .map()
5458            .unwrap()
5459            .as_mut_slice()
5460            .copy_from_slice(&[10, 20, 30, 40]);
5461        view0
5462            .map()
5463            .unwrap()
5464            .as_mut_slice()
5465            .copy_from_slice(&[1, 2, 3, 4]);
5466
5467        // Each view sees only its own window.
5468        assert_eq!(view0.map().unwrap().as_slice(), &[1, 2, 3, 4]);
5469        assert_eq!(view1.map().unwrap().as_slice(), &[10, 20, 30, 40]);
5470        // The parent buffer is correctly partitioned (shared, zero-copy).
5471        assert_eq!(
5472            parent.map().unwrap().as_slice(),
5473            &[1, 2, 3, 4, 10, 20, 30, 40]
5474        );
5475    }
5476
5477    #[test]
5478    fn batch_partitions_leading_dim() {
5479        // Raw [4,2,2,3] u8 batched tensor: 4 elements of 12 bytes each. batch(n)
5480        // yields element n at offset n*12, sharing the parent buffer (zero-copy).
5481        let parent = Tensor::<u8>::new(&[4, 2, 2, 3], Some(TensorMemory::Mem), None).unwrap();
5482        for i in 0..4u8 {
5483            let e = parent.batch(i as usize).expect("batch element");
5484            assert_eq!(e.shape(), &[2, 2, 3]);
5485            // A batch element shares the parent's BufferIdentity.
5486            assert_eq!(e.buffer_identity().id(), parent.buffer_identity().id());
5487            for b in e.map().unwrap().as_mut_slice() {
5488                *b = i + 1;
5489            }
5490        }
5491        // Each element occupies its own 12-byte band of the parent.
5492        let whole = parent.map().unwrap();
5493        let s = whole.as_slice();
5494        for i in 0..4usize {
5495            assert!(
5496                s[i * 12..(i + 1) * 12].iter().all(|&b| b == (i as u8 + 1)),
5497                "band {i} not partitioned: {:?}",
5498                &s[i * 12..(i + 1) * 12]
5499            );
5500        }
5501    }
5502
5503    #[test]
5504    fn view_origin_snapshots_parent_and_composes() {
5505        // view() on a whole image snapshots the parent dims + the view's origin.
5506        let parent = Tensor::<u8>::image(
5507            100,
5508            80,
5509            PixelFormat::Rgba,
5510            Some(TensorMemory::Mem),
5511            crate::CpuAccess::ReadWrite,
5512        )
5513        .unwrap();
5514        assert_eq!(
5515            parent.view_origin(),
5516            None,
5517            "whole tensor has no view_origin"
5518        );
5519        let v = parent.view(Region::new(10, 20, 30, 40)).unwrap();
5520        assert_eq!(
5521            v.view_origin(),
5522            Some(ViewOrigin {
5523                parent_width: 100,
5524                parent_height: 80,
5525                parent_row_stride: 100 * 4, // tight RGBA pitch
5526                x: 10,
5527                y: 20
5528            })
5529        );
5530        // A view of a view keeps the ROOT parent and accumulates the origin.
5531        let v2 = v.view(Region::new(5, 5, 10, 10)).unwrap();
5532        assert_eq!(
5533            v2.view_origin(),
5534            Some(ViewOrigin {
5535                parent_width: 100,
5536                parent_height: 80,
5537                parent_row_stride: 100 * 4,
5538                x: 15,
5539                y: 25
5540            }),
5541            "nested view composes onto the root parent"
5542        );
5543    }
5544
5545    #[test]
5546    fn view_origin_none_for_raw_batch() {
5547        // A raw (unformatted) batched tensor has no pixel geometry, so batch()
5548        // leaves view_origin None (the per-slot path, not the one-import pivot).
5549        let parent = Tensor::<u8>::new(&[4, 2, 2, 3], Some(TensorMemory::Mem), None).unwrap();
5550        assert_eq!(parent.batch(2).unwrap().view_origin(), None);
5551    }
5552
5553    #[test]
5554    fn batch_rejects_out_of_bounds_index() {
5555        let parent = Tensor::<u8>::new(&[4, 2, 2, 3], Some(TensorMemory::Mem), None).unwrap();
5556        match parent.batch(4) {
5557            Err(Error::BatchIndexOutOfBounds { index, batch }) => {
5558                assert_eq!((index, batch), (4, 4));
5559            }
5560            other => panic!("expected BatchIndexOutOfBounds, got {other:?}"),
5561        }
5562    }
5563
5564    #[test]
5565    fn batch_zero_on_unit_n_is_whole() {
5566        // N == 1: batch(0) is the whole per-element block at offset 0 (no plane_offset).
5567        let parent = Tensor::<u8>::new(&[1, 2, 2, 3], Some(TensorMemory::Mem), None).unwrap();
5568        let e = parent.batch(0).unwrap();
5569        assert_eq!(e.shape(), &[2, 2, 3]);
5570        assert_eq!(e.plane_offset(), None);
5571        assert_eq!(e.buffer_identity().id(), parent.buffer_identity().id());
5572    }
5573
5574    #[test]
5575    fn mem_subview_rejects_unaligned_offset() {
5576        // f32 has align 4; a byte offset of 2 cannot back a valid `*const f32`.
5577        let parent = Tensor::<f32>::new(&[8], Some(TensorMemory::Mem), None).unwrap();
5578        assert!(parent.subview(2, &[1]).is_err());
5579        // A correctly aligned offset is accepted.
5580        assert!(parent.subview(4, &[1]).is_ok());
5581    }
5582
5583    #[test]
5584    fn mem_subview_rejects_out_of_bounds() {
5585        let parent = Tensor::<u8>::new(&[8], Some(TensorMemory::Mem), None).unwrap();
5586        // offset 6 + 4 bytes = 10 exceeds the 8-byte allocation.
5587        assert!(parent.subview(6, &[4]).is_err());
5588    }
5589
5590    /// Regression guard for the `TensorTrait::view` promotion (R2): a `subview`
5591    /// must share the parent's `BufferIdentity` on **every** backend, not mint a
5592    /// fresh one. Identity-keyed caches (the GL EGLImage import) rely on this to
5593    /// treat offset-distinct windows of one buffer as a single import; a fresh
5594    /// identity would silently break that and regress zero-copy import reuse.
5595    ///
5596    /// Runs each backend that can be allocated without a GPU/GL context on the
5597    /// test host: `Mem` (always); `Shm` (when POSIX shm is available); the
5598    /// platform-native zero-copy buffer `Dma` (DMA-BUF on Linux / IOSurface on
5599    /// macOS, when available). `Pbo` shares its identity the same way (see
5600    /// `pbo.rs` `view`) but needs a live GL context, so it is exercised by the
5601    /// image-crate GL tests rather than here.
5602    #[test]
5603    fn subview_shares_buffer_identity_all_backends() {
5604        // u8 has align 1, so every byte offset is valid for the alignment check;
5605        // this isolates the identity-sharing contract from alignment concerns.
5606        let assert_shares = |memory: TensorMemory, label: &str| {
5607            let parent = Tensor::<u8>::new(&[64], Some(memory), None)
5608                .unwrap_or_else(|e| panic!("{label}: parent alloc failed: {e:?}"));
5609            let parent_id = parent.buffer_identity().id();
5610            // Two offset-distinct windows must both carry the parent's identity.
5611            let v0 = parent
5612                .subview(0, &[16])
5613                .unwrap_or_else(|e| panic!("{label}: subview(0) failed: {e:?}"));
5614            let v1 = parent
5615                .subview(16, &[16])
5616                .unwrap_or_else(|e| panic!("{label}: subview(16) failed: {e:?}"));
5617            assert_eq!(
5618                v0.buffer_identity().id(),
5619                parent_id,
5620                "{label}: subview(0) minted a fresh BufferIdentity"
5621            );
5622            assert_eq!(
5623                v1.buffer_identity().id(),
5624                parent_id,
5625                "{label}: subview(16) minted a fresh BufferIdentity"
5626            );
5627        };
5628
5629        assert_shares(TensorMemory::Mem, "Mem");
5630
5631        #[cfg(unix)]
5632        if crate::is_shm_available() {
5633            assert_shares(TensorMemory::Shm, "Shm");
5634        }
5635
5636        // Dma == DMA-BUF on Linux, IOSurface on macOS; same public variant.
5637        if crate::is_gpu_buffer_available() {
5638            assert_shares(TensorMemory::Dma, "Dma");
5639        }
5640    }
5641
5642    #[test]
5643    fn mem_subview_four_views_no_aliasing() {
5644        // One [4,3] f32 parent; four [1,3] views at 12-byte strides, each
5645        // written independently. Exercises a multi-byte element type (offsets
5646        // must stay element-aligned) and N-way zero-copy sharing.
5647        let parent = Tensor::<f32>::new(&[4, 3], Some(TensorMemory::Mem), None).unwrap();
5648        let frame = 3 * std::mem::size_of::<f32>();
5649        for i in 0..4 {
5650            let v = parent.subview(i * frame, &[1, 3]).unwrap();
5651            let val = i as f32 + 1.0;
5652            v.map()
5653                .unwrap()
5654                .as_mut_slice()
5655                .copy_from_slice(&[val, val, val]);
5656        }
5657        assert_eq!(
5658            parent.map().unwrap().as_slice(),
5659            &[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0]
5660        );
5661    }
5662
5663    #[test]
5664    fn mem_subview_inherits_format_and_row_stride() {
5665        // A sub-view is a ready-to-use sub-image: it inherits the parent's
5666        // pixel format and (crucially) its padded row stride, so a strided
5667        // parent yields strided windows. Set a stride wider than the tight row
5668        // to exercise the row_stride inheritance path specifically.
5669        let mut parent = Tensor::<u8>::image(
5670            100,
5671            100,
5672            PixelFormat::Rgba,
5673            Some(TensorMemory::Mem),
5674            crate::CpuAccess::ReadWrite,
5675        )
5676        .unwrap();
5677        parent.set_row_stride_unchecked(512); // padded stride (> 100*4)
5678        let view = parent.subview(4096, &[10, 10, 4]).unwrap();
5679        assert_eq!(view.format(), Some(PixelFormat::Rgba), "format inherited");
5680        assert_eq!(view.row_stride(), Some(512), "row_stride inherited");
5681    }
5682
5683    #[test]
5684    fn mem_strided_subview_maps_offset_and_byte_size() {
5685        // Integration of the sub-region offset (PR #89) and the strided-map
5686        // `byte_size_override` (PR #90): a strided sub-view exposes its full
5687        // padded window (`row_stride × rows`) starting at the view's byte
5688        // offset, mapped zero-copy into the parent.
5689        let parent = Tensor::<u8>::new(&[2048], Some(TensorMemory::Mem), None).unwrap();
5690        let mut view = parent.subview(128, &[8, 16]).unwrap(); // 8 rows × 16 @ off 128
5691        assert_eq!(view.plane_offset(), Some(128));
5692        view.set_row_stride_unchecked(32); // padded stride (> 16)
5693
5694        {
5695            let mut m = view.map().unwrap();
5696            let s = m.as_mut_slice();
5697            // Strided map exposes the padded window: stride(32) × rows(8) = 256.
5698            assert_eq!(
5699                s.len(),
5700                256,
5701                "strided map exposes the full padded byte window"
5702            );
5703            s[0] = 0xAA; // row 0, col 0
5704            s[32] = 0xBB; // row 1, col 0 (one stride in)
5705        }
5706
5707        // Zero-copy: the writes land in the parent at the view's offset.
5708        let p = parent.map().unwrap();
5709        let pb = p.as_slice();
5710        assert_eq!(pb[128], 0xAA, "row 0 writes at parent offset 128");
5711        assert_eq!(
5712            pb[128 + 32],
5713            0xBB,
5714            "row 1 writes at parent offset 128 + stride"
5715        );
5716    }
5717
5718    #[test]
5719    #[cfg(unix)]
5720    fn shm_subview_partitions_parent_buffer() {
5721        // Mirrors `mem_subview_partitions_parent_buffer` for Shm: one [2,4] u8
5722        // parent shared segment (8 bytes); two [1,4] sub-views at byte offsets 0
5723        // and 4 must share the segment (zero-copy, via cloned fd) and be
5724        // independently writable — view 0 owns [0,4), view 1 owns [4,8).
5725        if !crate::is_shm_available() {
5726            eprintln!("SKIPPED: shm not available");
5727            return;
5728        }
5729        let parent = Tensor::<u8>::new(&[2, 4], Some(TensorMemory::Shm), None).unwrap();
5730        let view0 = parent.subview(0, &[1, 4]).expect("shm subview at offset 0");
5731        let view1 = parent.subview(4, &[1, 4]).expect("shm subview at offset 4");
5732
5733        view1
5734            .map()
5735            .unwrap()
5736            .as_mut_slice()
5737            .copy_from_slice(&[10, 20, 30, 40]);
5738        view0
5739            .map()
5740            .unwrap()
5741            .as_mut_slice()
5742            .copy_from_slice(&[1, 2, 3, 4]);
5743
5744        assert_eq!(view0.map().unwrap().as_slice(), &[1, 2, 3, 4]);
5745        assert_eq!(view1.map().unwrap().as_slice(), &[10, 20, 30, 40]);
5746        // The parent sees the full partitioned segment (shared, zero-copy).
5747        assert_eq!(
5748            parent.map().unwrap().as_slice(),
5749            &[1, 2, 3, 4, 10, 20, 30, 40]
5750        );
5751        // A sub-view of a sub-view composes the offset.
5752        let nested = view1.subview(2, &[1, 2]).expect("nested shm subview");
5753        assert_eq!(nested.map().unwrap().as_slice(), &[30, 40]);
5754    }
5755
5756    #[test]
5757    #[cfg(unix)]
5758    fn shm_subview_rejects_unaligned_and_oob() {
5759        if !crate::is_shm_available() {
5760            eprintln!("SKIPPED: shm not available");
5761            return;
5762        }
5763        // f32 align 4: a 2-byte offset cannot back a valid `*const f32`.
5764        let parent = Tensor::<f32>::new(&[8], Some(TensorMemory::Shm), None).unwrap();
5765        assert!(parent.subview(2, &[1]).is_err());
5766        assert!(parent.subview(4, &[1]).is_ok());
5767        // Out of bounds: offset 6 + 4 bytes = 10 > 8-byte (u8) segment.
5768        let p2 = Tensor::<u8>::new(&[8], Some(TensorMemory::Shm), None).unwrap();
5769        assert!(p2.subview(6, &[4]).is_err());
5770    }
5771
5772    #[test]
5773    #[cfg(target_os = "linux")]
5774    fn dma_subview_matches_mem_subview() {
5775        // Serialize against the fd-leak tests: this test opens DMA fds (alloc +
5776        // clone_fd), which would otherwise perturb their fd counts.
5777        let _lock = FD_LOCK.read().unwrap();
5778        // Identical sub-view semantics across Dma (shared fd) and Mem (shared
5779        // Arc): same offsets → same logical windows → same partition.
5780        let dma = match Tensor::<u8>::new(&[8], Some(TensorMemory::Dma), None) {
5781            Ok(t) => t,
5782            Err(_) => {
5783                eprintln!("SKIPPED: DMA not available");
5784                return;
5785            }
5786        };
5787        let mem = Tensor::<u8>::new(&[8], Some(TensorMemory::Mem), None).unwrap();
5788        for parent in [&dma, &mem] {
5789            let v0 = parent.subview(0, &[4]).unwrap();
5790            let v1 = parent.subview(4, &[4]).unwrap();
5791            v0.map()
5792                .unwrap()
5793                .as_mut_slice()
5794                .copy_from_slice(&[1, 2, 3, 4]);
5795            v1.map()
5796                .unwrap()
5797                .as_mut_slice()
5798                .copy_from_slice(&[5, 6, 7, 8]);
5799            assert_eq!(parent.map().unwrap().as_slice(), &[1, 2, 3, 4, 5, 6, 7, 8]);
5800        }
5801    }
5802
5803    #[test]
5804    #[cfg(target_os = "linux")]
5805    fn dma_strided_subview_maps_padded_window() {
5806        // The strided-map path differs by backing: DMA maps through
5807        // `mmap_offset` + the `byte_size_override`, not the Mem `Arc` slice. A
5808        // padded sub-view of a DMA buffer must still expose its full
5809        // `row_stride × rows` window zero-copy at the view's offset (the GPU
5810        // batched-render-to-DMA case). Mirrors
5811        // `mem_strided_subview_maps_offset_and_byte_size` on a Dma parent.
5812        let _lock = FD_LOCK.read().unwrap();
5813        let parent = match Tensor::<u8>::new(&[2048], Some(TensorMemory::Dma), None) {
5814            Ok(t) => t,
5815            Err(_) => {
5816                eprintln!("SKIPPED: DMA not available");
5817                return;
5818            }
5819        };
5820        let mut view = parent.subview(128, &[8, 16]).unwrap();
5821        assert_eq!(view.plane_offset(), Some(128));
5822        view.set_row_stride_unchecked(32); // padded stride (> 16)
5823
5824        {
5825            let mut m = view.map().unwrap();
5826            let s = m.as_mut_slice();
5827            assert_eq!(s.len(), 256, "strided DMA map exposes stride(32) × rows(8)");
5828            s[0] = 0xAA; // row 0, col 0
5829            s[32] = 0xBB; // row 1, col 0 (one stride in)
5830        }
5831
5832        let p = parent.map().unwrap();
5833        let pb = p.as_slice();
5834        assert_eq!(pb[128], 0xAA, "row 0 writes at parent offset 128");
5835        assert_eq!(
5836            pb[128 + 32],
5837            0xBB,
5838            "row 1 writes at parent offset 128 + stride"
5839        );
5840    }
5841
5842    #[test]
5843    #[cfg(target_os = "linux")]
5844    fn view_single_row_snapshots_parent_stride() {
5845        // A single-row `view()` keeps a TIGHT `row_stride` for map-span safety,
5846        // but its `view_origin` snapshots the PARENT row stride — the GL backend
5847        // keys its EGLImage import/pitch on that snapshot (not the view's tight
5848        // stride), so single-row and multi-row sibling views collapse onto the
5849        // same parent import.
5850        let _lock = FD_LOCK.read().unwrap();
5851        // 8x4 RGBA with a padded 64-byte row stride (tight row = 8*4 = 32).
5852        let parent = match Tensor::<u8>::image_with_stride(
5853            8,
5854            4,
5855            PixelFormat::Rgba,
5856            64,
5857            Some(TensorMemory::Dma),
5858            crate::CpuAccess::ReadWrite,
5859        ) {
5860            Ok(t) => t,
5861            Err(_) => {
5862                eprintln!("SKIPPED: DMA not available");
5863                return;
5864            }
5865        };
5866        assert_eq!(parent.effective_row_stride(), Some(64));
5867        // Bottom row (y=3) at x>0 — the case the tight single-row stride guards.
5868        let row = parent.view(Region::new(2, 3, 4, 1)).unwrap();
5869        // The view's own stride is tight (4*4 = 16) so its strided map stays in
5870        // bounds; the GL-facing parent pitch (64) lives in `view_origin`.
5871        assert_eq!(row.effective_row_stride(), Some(16));
5872        let vo = row.view_origin().expect("a view carries a view_origin");
5873        assert_eq!(
5874            vo.parent_row_stride, 64,
5875            "GL keys/pitches a view on the parent stride, not its tight one"
5876        );
5877        // The tight stride keeps map() in-bounds for the bottom / x>0 single row.
5878        assert_eq!(row.map().unwrap().as_slice().len(), 16);
5879    }
5880
5881    #[test]
5882    fn test_mem_tensor() {
5883        let shape = vec![2, 3, 4];
5884        let tensor =
5885            MemTensor::<f32>::new(&shape, Some("test_tensor")).expect("Failed to create tensor");
5886        assert_eq!(tensor.shape(), &shape);
5887        assert_eq!(tensor.size(), 2 * 3 * 4 * std::mem::size_of::<f32>());
5888        assert_eq!(tensor.name(), "test_tensor");
5889
5890        {
5891            let mut tensor_map = tensor.map().expect("Failed to map memory");
5892            tensor_map.fill(42.0);
5893            assert!(tensor_map.iter().all(|&x| x == 42.0));
5894        }
5895
5896        let mut tensor = MemTensor::<u8>::new(&shape, None).expect("Failed to create tensor");
5897        assert_eq!(tensor.shape(), &shape);
5898        let new_shape = vec![3, 4, 4];
5899        assert!(
5900            tensor.reshape(&new_shape).is_err(),
5901            "Reshape should fail due to size mismatch"
5902        );
5903        assert_eq!(tensor.shape(), &shape, "Shape should remain unchanged");
5904
5905        let new_shape = vec![2, 3, 4];
5906        tensor.reshape(&new_shape).expect("Reshape should succeed");
5907        assert_eq!(
5908            tensor.shape(),
5909            &new_shape,
5910            "Shape should be updated after successful reshape"
5911        );
5912
5913        {
5914            let mut tensor_map = tensor.map().expect("Failed to map memory");
5915            tensor_map.fill(1);
5916            assert!(tensor_map.iter().all(|&x| x == 1));
5917        }
5918
5919        {
5920            let mut tensor_map = tensor.map().expect("Failed to map memory");
5921            tensor_map[2] = 42;
5922            assert_eq!(tensor_map[1], 1, "Value at index 1 should be 1");
5923            assert_eq!(tensor_map[2], 42, "Value at index 2 should be 42");
5924        }
5925    }
5926
5927    #[test]
5928    #[cfg(target_os = "linux")]
5929    fn test_dma_no_fd_leaks() {
5930        let _lock = FD_LOCK.write().unwrap();
5931        if !is_dma_available() {
5932            log::warn!(
5933                "SKIPPED: {} - DMA memory allocation not available (permission denied or no DMA-BUF support)",
5934                function!()
5935            );
5936            return;
5937        }
5938
5939        let proc = procfs::process::Process::myself()
5940            .expect("Failed to get current process using /proc/self");
5941
5942        let start_open_fds = proc
5943            .fd_count()
5944            .expect("Failed to get open file descriptor count");
5945
5946        for _ in 0..100 {
5947            let tensor = Tensor::<u8>::new(&[100, 100], Some(TensorMemory::Dma), None)
5948                .expect("Failed to create tensor");
5949            let mut map = tensor.map().unwrap();
5950            map.as_mut_slice().fill(233);
5951        }
5952
5953        let end_open_fds = proc
5954            .fd_count()
5955            .expect("Failed to get open file descriptor count");
5956
5957        assert_eq!(
5958            start_open_fds, end_open_fds,
5959            "File descriptor leak detected: {} -> {}",
5960            start_open_fds, end_open_fds
5961        );
5962    }
5963
5964    #[test]
5965    #[cfg(target_os = "linux")]
5966    fn test_dma_from_fd_no_fd_leaks() {
5967        let _lock = FD_LOCK.write().unwrap();
5968        if !is_dma_available() {
5969            log::warn!(
5970                "SKIPPED: {} - DMA memory allocation not available (permission denied or no DMA-BUF support)",
5971                function!()
5972            );
5973            return;
5974        }
5975
5976        let proc = procfs::process::Process::myself()
5977            .expect("Failed to get current process using /proc/self");
5978
5979        let start_open_fds = proc
5980            .fd_count()
5981            .expect("Failed to get open file descriptor count");
5982
5983        let orig = Tensor::<u8>::new(&[100, 100], Some(TensorMemory::Dma), None).unwrap();
5984
5985        for _ in 0..100 {
5986            let tensor =
5987                Tensor::<u8>::from_fd(orig.clone_fd().unwrap(), orig.shape(), None).unwrap();
5988            let mut map = tensor.map().unwrap();
5989            map.as_mut_slice().fill(233);
5990        }
5991        drop(orig);
5992
5993        let end_open_fds = proc.fd_count().unwrap();
5994
5995        assert_eq!(
5996            start_open_fds, end_open_fds,
5997            "File descriptor leak detected: {} -> {}",
5998            start_open_fds, end_open_fds
5999        );
6000    }
6001
6002    #[test]
6003    #[cfg(target_os = "linux")]
6004    fn test_shm_no_fd_leaks() {
6005        let _lock = FD_LOCK.write().unwrap();
6006        if !is_shm_available() {
6007            log::warn!(
6008                "SKIPPED: {} - SHM memory allocation not available (permission denied or no SHM support)",
6009                function!()
6010            );
6011            return;
6012        }
6013
6014        let proc = procfs::process::Process::myself()
6015            .expect("Failed to get current process using /proc/self");
6016
6017        let start_open_fds = proc
6018            .fd_count()
6019            .expect("Failed to get open file descriptor count");
6020
6021        for _ in 0..100 {
6022            let tensor = Tensor::<u8>::new(&[100, 100], Some(TensorMemory::Shm), None)
6023                .expect("Failed to create tensor");
6024            let mut map = tensor.map().unwrap();
6025            map.as_mut_slice().fill(233);
6026        }
6027
6028        let end_open_fds = proc
6029            .fd_count()
6030            .expect("Failed to get open file descriptor count");
6031
6032        assert_eq!(
6033            start_open_fds, end_open_fds,
6034            "File descriptor leak detected: {} -> {}",
6035            start_open_fds, end_open_fds
6036        );
6037    }
6038
6039    #[test]
6040    #[cfg(target_os = "linux")]
6041    fn test_shm_from_fd_no_fd_leaks() {
6042        let _lock = FD_LOCK.write().unwrap();
6043        if !is_shm_available() {
6044            log::warn!(
6045                "SKIPPED: {} - SHM memory allocation not available (permission denied or no SHM support)",
6046                function!()
6047            );
6048            return;
6049        }
6050
6051        let proc = procfs::process::Process::myself()
6052            .expect("Failed to get current process using /proc/self");
6053
6054        let start_open_fds = proc
6055            .fd_count()
6056            .expect("Failed to get open file descriptor count");
6057
6058        let orig = Tensor::<u8>::new(&[100, 100], Some(TensorMemory::Shm), None).unwrap();
6059
6060        for _ in 0..100 {
6061            let tensor =
6062                Tensor::<u8>::from_fd(orig.clone_fd().unwrap(), orig.shape(), None).unwrap();
6063            let mut map = tensor.map().unwrap();
6064            map.as_mut_slice().fill(233);
6065        }
6066        drop(orig);
6067
6068        let end_open_fds = proc.fd_count().unwrap();
6069
6070        assert_eq!(
6071            start_open_fds, end_open_fds,
6072            "File descriptor leak detected: {} -> {}",
6073            start_open_fds, end_open_fds
6074        );
6075    }
6076
6077    #[cfg(feature = "ndarray")]
6078    #[test]
6079    fn test_ndarray() {
6080        let _lock = FD_LOCK.read().unwrap();
6081        let shape = vec![2, 3, 4];
6082        let tensor = Tensor::<f32>::new(&shape, None, None).expect("Failed to create tensor");
6083
6084        let mut tensor_map = tensor.map().expect("Failed to map tensor memory");
6085        tensor_map.fill(1.0);
6086
6087        let view = tensor_map.view().expect("Failed to get ndarray view");
6088        assert_eq!(view.shape(), &[2, 3, 4]);
6089        assert!(view.iter().all(|&x| x == 1.0));
6090
6091        let mut view_mut = tensor_map
6092            .view_mut()
6093            .expect("Failed to get mutable ndarray view");
6094        view_mut[[0, 0, 0]] = 42.0;
6095        assert_eq!(view_mut[[0, 0, 0]], 42.0);
6096        assert_eq!(tensor_map[0], 42.0, "Value at index 0 should be 42");
6097    }
6098
6099    #[test]
6100    fn test_buffer_identity_unique() {
6101        let id1 = BufferIdentity::new();
6102        let id2 = BufferIdentity::new();
6103        assert_ne!(
6104            id1.id(),
6105            id2.id(),
6106            "Two identities should have different ids"
6107        );
6108    }
6109
6110    #[test]
6111    fn test_buffer_identity_clone_shares_guard() {
6112        let id1 = BufferIdentity::new();
6113        let weak = id1.weak();
6114        assert!(
6115            weak.upgrade().is_some(),
6116            "Weak should be alive while original exists"
6117        );
6118
6119        let id2 = id1.clone();
6120        assert_eq!(id1.id(), id2.id(), "Cloned identity should have same id");
6121
6122        drop(id1);
6123        assert!(
6124            weak.upgrade().is_some(),
6125            "Weak should still be alive (clone holds Arc)"
6126        );
6127
6128        drop(id2);
6129        assert!(
6130            weak.upgrade().is_none(),
6131            "Weak should be dead after all clones dropped"
6132        );
6133    }
6134
6135    #[test]
6136    fn test_tensor_buffer_identity() {
6137        let t1 = Tensor::<u8>::new(&[100], Some(TensorMemory::Mem), Some("t1")).unwrap();
6138        let t2 = Tensor::<u8>::new(&[100], Some(TensorMemory::Mem), Some("t2")).unwrap();
6139        assert_ne!(
6140            t1.buffer_identity().id(),
6141            t2.buffer_identity().id(),
6142            "Different tensors should have different buffer ids"
6143        );
6144    }
6145
6146    // ------------------------------------------------------------------------
6147    // Quantization — constructor validation + accessor correctness.
6148    // ------------------------------------------------------------------------
6149
6150    #[test]
6151    fn test_quantization_per_tensor_constructors() {
6152        let q = Quantization::per_tensor(0.1, -5);
6153        assert!(q.is_per_tensor());
6154        assert!(!q.is_per_channel());
6155        assert!(!q.is_symmetric());
6156        assert_eq!(q.scale(), &[0.1]);
6157        assert_eq!(q.zero_point(), Some(&[-5][..]));
6158
6159        let qs = Quantization::per_tensor_symmetric(0.05);
6160        assert!(qs.is_per_tensor());
6161        assert!(qs.is_symmetric());
6162        assert_eq!(qs.zero_point(), None);
6163    }
6164
6165    #[test]
6166    fn test_quantization_per_channel_constructors() {
6167        let q = Quantization::per_channel(vec![0.1, 0.2, 0.3], vec![0, -1, 1], 2).unwrap();
6168        assert!(q.is_per_channel());
6169        assert!(!q.is_symmetric());
6170        assert_eq!(q.axis(), Some(2));
6171        assert_eq!(q.scale().len(), 3);
6172
6173        let qs = Quantization::per_channel_symmetric(vec![0.054, 0.089, 0.195], 0).unwrap();
6174        assert!(qs.is_per_channel());
6175        assert!(qs.is_symmetric());
6176        assert_eq!(qs.axis(), Some(0));
6177    }
6178
6179    #[test]
6180    fn test_quantization_per_channel_length_mismatch_rejected() {
6181        // len(scales) != len(zero_points) → rejected at construction.
6182        let err = Quantization::per_channel(vec![0.1, 0.2], vec![0, 0, 0], 0).unwrap_err();
6183        assert!(matches!(err, Error::QuantizationInvalid { .. }));
6184    }
6185
6186    #[test]
6187    fn test_quantization_per_channel_empty_rejected() {
6188        let err = Quantization::per_channel_symmetric(vec![], 0).unwrap_err();
6189        assert!(matches!(err, Error::QuantizationInvalid { .. }));
6190    }
6191
6192    /// Constructors guard scale/zero_point length invariants, but
6193    /// `Quantization` is `Deserialize`, so malformed JSON (e.g. an
6194    /// empty `scale` array, or `zero_point` length that disagrees with
6195    /// `scale`) bypasses the constructor checks. `set_quantization`
6196    /// must reject these via `validate()` so they don't poison
6197    /// downstream `mode()` selection or per-channel kernel indexing.
6198    #[test]
6199    fn test_quantization_validate_rejects_malformed_deserialize() {
6200        let mut t = Tensor::<i8>::new(&[1, 1, 4], Some(TensorMemory::Mem), None).unwrap();
6201
6202        // Empty scale array: must be rejected.
6203        let q: Quantization = serde_json::from_str(r#"{"scale": []}"#).unwrap();
6204        assert!(matches!(
6205            t.set_quantization(q).unwrap_err(),
6206            Error::QuantizationInvalid { .. }
6207        ));
6208
6209        // Per-tensor with multi-element zero_point: must be rejected.
6210        let q: Quantization =
6211            serde_json::from_str(r#"{"scale": 0.1, "zero_point": [0, 0, 0]}"#).unwrap();
6212        assert!(matches!(
6213            t.set_quantization(q).unwrap_err(),
6214            Error::QuantizationInvalid { .. }
6215        ));
6216
6217        // Per-channel zero_point length != scale length: must be rejected.
6218        let q: Quantization = serde_json::from_str(
6219            r#"{"scale": [0.1, 0.2, 0.3, 0.4], "zero_point": [0, 0], "axis": 2}"#,
6220        )
6221        .unwrap();
6222        assert!(matches!(
6223            t.set_quantization(q).unwrap_err(),
6224            Error::QuantizationInvalid { .. }
6225        ));
6226    }
6227
6228    #[test]
6229    fn test_quantization_mode_dispatch() {
6230        let pt = Quantization::per_tensor(0.1, -5);
6231        assert!(matches!(
6232            pt.mode(),
6233            QuantMode::PerTensor { scale, zero_point } if scale == 0.1 && zero_point == -5
6234        ));
6235
6236        let pts = Quantization::per_tensor_symmetric(0.05);
6237        assert!(matches!(
6238            pts.mode(),
6239            QuantMode::PerTensorSymmetric { scale } if scale == 0.05
6240        ));
6241
6242        let pc = Quantization::per_channel(vec![0.1, 0.2], vec![0, -1], 2).unwrap();
6243        assert!(matches!(pc.mode(), QuantMode::PerChannel { axis: 2, .. }));
6244
6245        let pcs = Quantization::per_channel_symmetric(vec![0.1, 0.2], 0).unwrap();
6246        assert!(matches!(
6247            pcs.mode(),
6248            QuantMode::PerChannelSymmetric { axis: 0, .. }
6249        ));
6250    }
6251
6252    #[test]
6253    fn test_tensor_quantization_roundtrip_integer() {
6254        let mut t = Tensor::<i8>::new(&[2, 3, 4], Some(TensorMemory::Mem), None).unwrap();
6255        assert!(t.quantization().is_none());
6256        t.set_quantization(Quantization::per_tensor(0.1, -5))
6257            .unwrap();
6258        let q = t.quantization().unwrap();
6259        assert_eq!(q.scale(), &[0.1]);
6260        t.clear_quantization();
6261        assert!(t.quantization().is_none());
6262    }
6263
6264    #[test]
6265    fn test_tensor_with_quantization_builder() {
6266        let t = Tensor::<i8>::new(&[4, 4], Some(TensorMemory::Mem), None)
6267            .unwrap()
6268            .with_quantization(Quantization::per_tensor_symmetric(0.05))
6269            .unwrap();
6270        assert!(t.quantization().is_some());
6271    }
6272
6273    #[test]
6274    fn test_tensor_dyn_quantization_float_arm_returns_none() {
6275        let t = Tensor::<f32>::new(&[2, 2], Some(TensorMemory::Mem), None).unwrap();
6276        let td = TensorDyn::F32(t);
6277        assert!(td.quantization().is_none());
6278    }
6279
6280    #[test]
6281    fn test_tensor_dyn_set_quantization_float_arm_errors() {
6282        let t = Tensor::<f32>::new(&[2, 2], Some(TensorMemory::Mem), None).unwrap();
6283        let mut td = TensorDyn::F32(t);
6284        let err = td
6285            .set_quantization(Quantization::per_tensor(0.1, 0))
6286            .unwrap_err();
6287        // float path returns a QuantizationInvalid error.
6288        assert!(matches!(err, Error::QuantizationInvalid { .. }));
6289    }
6290
    // NOTE(review): this item sits among the unit tests; if the enclosing
    // module is gated on `#[cfg(test)]`, rustdoc may never collect the
    // `compile_fail` block below, leaving the invariant unchecked — confirm
    // that `cargo test --doc` actually reports this doctest.
    /// Compile-time type gate — calling `Tensor::<f32>::quantization()` must
    /// fail to compile (the `IntegerType` trait bound is not satisfied by
    /// `f32`). This doctest anchors the invariant.
    ///
    /// ```compile_fail
    /// use edgefirst_tensor::{Tensor, TensorMemory};
    /// let t = Tensor::<f32>::new(&[2, 2], Some(TensorMemory::Mem), None).unwrap();
    /// let _ = t.quantization(); // compile error: f32 not IntegerType
    /// ```
    fn _compile_fail_doctest_anchor() {}
6301
    // Serializes tests around the process-wide file-descriptor count.
    // Any test that asserts on the fd count must take this lock exclusively
    // (`write()`). Any test that changes the fd count by opening or closing
    // fds must take it shared (`read()`).
    pub static FD_LOCK: RwLock<()> = RwLock::new(());
6306
6307    /// Test that DMA is NOT available on non-Linux platforms.
6308    /// This verifies the cross-platform behavior of is_dma_available().
6309    #[test]
6310    #[cfg(not(target_os = "linux"))]
6311    fn test_dma_not_available_on_non_linux() {
6312        assert!(
6313            !is_dma_available(),
6314            "DMA memory allocation should NOT be available on non-Linux platforms"
6315        );
6316    }
6317
6318    #[test]
6319    fn colorimetry_defaults_none_and_roundtrips_without_auto_fill() {
6320        use crate::{ColorEncoding, ColorRange, Colorimetry, PixelFormat, TensorMemory};
6321        let mut t = Tensor::<u8>::image(
6322            1280,
6323            720,
6324            PixelFormat::Nv12,
6325            Some(TensorMemory::Mem),
6326            crate::CpuAccess::ReadWrite,
6327        )
6328        .unwrap();
6329        assert_eq!(t.colorimetry(), None); // default undefined
6330        let c = Colorimetry::default()
6331            .with_encoding(ColorEncoding::Bt709)
6332            .with_range(ColorRange::Limited);
6333        t.set_colorimetry(Some(c));
6334        assert_eq!(t.colorimetry(), Some(c));
6335        // configure_image must NOT touch colorimetry
6336        t.configure_image(640, 480, PixelFormat::Grey).unwrap();
6337        assert_eq!(t.colorimetry(), Some(c));
6338    }
6339
6340    #[test]
6341    fn configure_image_within_capacity() {
6342        let mut t = Tensor::<u8>::image_with_capacity(
6343            640,
6344            480,
6345            PixelFormat::Rgb,
6346            None,
6347            crate::CpuAccess::ReadWrite,
6348        )
6349        .unwrap();
6350        t.configure_image(320, 240, PixelFormat::Nv12).unwrap();
6351        assert_eq!(t.format(), Some(PixelFormat::Nv12));
6352        assert_eq!(t.width(), Some(320));
6353        assert_eq!(t.height(), Some(240));
6354        assert_eq!(t.shape(), &[360, 320]); // 240*3/2
6355    }
6356
6357    #[test]
6358    fn configure_image_too_large_errors() {
6359        let mut t = Tensor::<u8>::image_with_capacity(
6360            64,
6361            64,
6362            PixelFormat::Grey,
6363            None,
6364            crate::CpuAccess::ReadWrite,
6365        )
6366        .unwrap();
6367        let err = t
6368            .configure_image(1920, 1080, PixelFormat::Nv12)
6369            .unwrap_err();
6370        assert!(matches!(err, Error::InsufficientCapacity { .. }));
6371    }
6372
6373    /// A reused max-sized IOSurface pool keeps its physical `bytesPerRow` when
6374    /// reconfigured to a smaller logical image (physical-grid / logical-ROI
6375    /// decoupling), instead of collapsing to the frame's natural row stride.
6376    #[test]
6377    #[cfg(target_os = "macos")]
6378    fn configure_image_preserves_iosurface_physical_stride() {
6379        // Pool: GREY/R8 IOSurface 100 wide → bytesPerRow padded to 128.
6380        let mut pool = Tensor::<u8>::image(
6381            100,
6382            64,
6383            PixelFormat::Grey,
6384            Some(TensorMemory::Dma),
6385            crate::CpuAccess::ReadWrite,
6386        )
6387        .unwrap();
6388        let pitch = pool.effective_row_stride().unwrap();
6389        assert!(
6390            pitch >= 128 && pitch.is_multiple_of(64),
6391            "padded bytesPerRow, got {pitch}"
6392        );
6393
6394        // Reconfigure to a smaller NV12 frame; the physical pitch must survive
6395        // (natural would be 32, but the surface stride is the 128-padded pitch).
6396        pool.configure_image(32, 16, PixelFormat::Nv12).unwrap();
6397        assert_eq!(pool.format(), Some(PixelFormat::Nv12));
6398        assert_eq!(pool.width(), Some(32));
6399        assert_eq!(pool.height(), Some(16));
6400        assert_eq!(
6401            pool.effective_row_stride(),
6402            Some(pitch),
6403            "configure_image must preserve the IOSurface physical bytesPerRow"
6404        );
6405
6406        // Reconfigure again to NV24 — pitch still preserved.
6407        pool.configure_image(32, 16, PixelFormat::Nv24).unwrap();
6408        assert_eq!(pool.effective_row_stride(), Some(pitch));
6409    }
6410
6411    /// `configure_image` on a Mem backing reconfigures to the format's
6412    /// **64-byte-aligned** row stride (the odd-dim contract: every image tensor
6413    /// carries a 64-aligned `row_stride`). For NV12 32×16 the minimum is
6414    /// `even(32)=32`, rounded up to the 64-byte alignment → 64. The capacity
6415    /// (64×64×4 RGBA = 16 KiB) easily holds the 24×64 = 1.5 KiB NV12 layout.
6416    #[test]
6417    fn configure_image_mem_aligns_stride() {
6418        let mut t = Tensor::<u8>::image_with_capacity(
6419            64,
6420            64,
6421            PixelFormat::Rgba,
6422            Some(TensorMemory::Mem),
6423            crate::CpuAccess::ReadWrite,
6424        )
6425        .unwrap();
6426        t.configure_image(32, 16, PixelFormat::Nv12).unwrap();
6427        let s = t.effective_row_stride().unwrap();
6428        assert_eq!(s % 64, 0, "stride must be 64-aligned");
6429        assert!(s >= 32, "stride must cover the even-width minimum");
6430        assert_eq!(s, 64);
6431    }
6432
6433    #[test]
6434    fn strided_mem_tensor_cpu_maps_full_padded_buffer() {
6435        // A packed RGBA image with row padding (GPU-pitch style): logical width
6436        // 8 px (32 B/row) but a 48-byte row stride. Over-allocate capacity (for
6437        // 16 px), narrow the logical width, then record the padded stride.
6438        // Previously `map()` rejected this on non-Linux with
6439        // "DMA backing is Linux-only"; HAL-owned Mem is now mappable.
6440        let mut t = Tensor::<u8>::image_with_capacity(
6441            16,
6442            3,
6443            PixelFormat::Rgba,
6444            Some(TensorMemory::Mem),
6445            crate::CpuAccess::ReadWrite,
6446        )
6447        .unwrap(); // capacity 3 × 16 × 4 = 192 B
6448        t.configure_image(8, 3, PixelFormat::Rgba).unwrap(); // logical [3, 8, 4] = 96 B
6449        t.set_row_stride(48).unwrap(); // padded stride (>= 32 B min)
6450
6451        let map = t.map().expect("strided Mem tensor should CPU-map");
6452        // Full padded buffer (stride 48 × 3 rows = 144 B), not the 96 B logical
6453        // view — callers iterate rows via `effective_row_stride()`.
6454        assert_eq!(map.as_slice().len(), 144);
6455        // Logical shape is still reported for shape-aware consumers.
6456        assert_eq!(map.shape(), &[3, 8, 4]);
6457    }
6458
6459    #[test]
6460    fn strided_mem_tensor_over_capacity_errors() {
6461        // Stride larger than the allocation: 64 B × 3 rows = 192 B > 96 B cap.
6462        let mut t = Tensor::<u8>::new(&[3, 8, 4], Some(TensorMemory::Mem), None).unwrap();
6463        t.set_format(PixelFormat::Rgba).unwrap();
6464        t.set_row_stride(64).unwrap();
6465        assert!(matches!(t.map(), Err(Error::InsufficientCapacity { .. })));
6466    }
6467
6468    /// Test that SHM memory allocation is available and usable on Unix systems.
6469    /// This is a basic functional test; Linux has additional FD leak tests using procfs.
6470    #[test]
6471    #[cfg(unix)]
6472    fn test_shm_available_and_usable() {
6473        assert!(
6474            is_shm_available(),
6475            "SHM memory allocation should be available on Unix systems"
6476        );
6477
6478        // Create a tensor with SHM backing
6479        let tensor = Tensor::<u8>::new(&[100, 100], Some(TensorMemory::Shm), None)
6480            .expect("Failed to create SHM tensor");
6481
6482        // Verify we can map and write to it
6483        let mut map = tensor.map().expect("Failed to map SHM tensor");
6484        map.as_mut_slice().fill(0xAB);
6485
6486        // Verify the data was written correctly
6487        assert!(
6488            map.as_slice().iter().all(|&b| b == 0xAB),
6489            "SHM tensor data should be writable and readable"
6490        );
6491    }
6492
6493    // =========================================================================
6494    // packed_rgba16f_layout — host-runnable geometry unit tests (TDD)
6495    // =========================================================================
6496
6497    #[test]
6498    fn packed_rgba16f_layout_planar_rgb_f16() {
6499        let layout =
6500            packed_rgba16f_layout(PixelFormat::PlanarRgb, DType::F16, 640, 640).expect("Some");
6501        assert_eq!(layout.surface_w, 160);
6502        assert_eq!(layout.surface_h, 1920);
6503        assert_eq!(layout.bytes_per_texel, 8);
6504        assert_eq!(layout.pitch, 1280);
6505    }
6506
6507    #[test]
6508    fn packed_rgba16f_layout_planar_rgba_f16() {
6509        let layout =
6510            packed_rgba16f_layout(PixelFormat::PlanarRgba, DType::F16, 640, 640).expect("Some");
6511        assert_eq!(layout.surface_w, 160);
6512        assert_eq!(layout.surface_h, 2560); // 4 planes
6513        assert_eq!(layout.bytes_per_texel, 8);
6514        assert_eq!(layout.pitch, 1280);
6515    }
6516
6517    #[test]
6518    fn packed_rgba16f_layout_rejects_misaligned() {
6519        assert!(packed_rgba16f_layout(PixelFormat::PlanarRgb, DType::F16, 642, 640).is_none());
6520    }
6521
6522    #[test]
6523    fn packed_rgba16f_layout_rejects_non_f16() {
6524        // Non-F16 dtype with planar RGB
6525        assert!(packed_rgba16f_layout(PixelFormat::PlanarRgb, DType::U8, 640, 640).is_none());
6526        // Non-planar format with F32
6527        assert!(packed_rgba16f_layout(PixelFormat::Rgb, DType::F32, 640, 640).is_none());
6528        // Packed Rgba with F16 is not a planar format → None
6529        assert!(packed_rgba16f_layout(PixelFormat::Rgba, DType::F16, 640, 640).is_none());
6530    }
6531
6532    #[test]
6533    fn cuda_map_fast_fails_to_none_without_handle() {
6534        let t = Tensor::<f32>::new(&[4], Some(TensorMemory::Mem), None).unwrap();
6535        assert!(t.cuda().is_none());
6536        assert!(t.cuda_map().is_none()); // pure local check, no GL routing
6537    }
6538
6539    #[test]
6540    fn cuda_returns_none_without_handle() {
6541        // A plain Mem-backed tensor has no CUDA handle attached.
6542        let t = Tensor::<f32>::new(&[2, 2], Some(TensorMemory::Mem), None).unwrap();
6543        assert!(t.cuda().is_none(), "no CUDA handle on a Mem tensor");
6544        assert!(t.cuda_map().is_none(), "fast-fail map → None");
6545    }
6546
6547    #[test]
6548    fn cuda_map_then_host_map_fallback() {
6549        // The documented client pattern: try cuda_map() first; when it is None
6550        // (no CUDA handle — the case for a plain Mem tensor), fall back to map().
6551        let t = Tensor::<f32>::new(&[2, 2], Some(TensorMemory::Mem), None).unwrap();
6552        // Bind to a named variable so the CudaMap guard (and its borrow of `t`)
6553        // is dropped at the end of this statement, before the else branch borrows `t` again.
6554        let cuda = t.cuda_map();
6555        if let Some(_c) = cuda {
6556            // On a CUDA-registered tensor we'd use the device ptr here.
6557            unreachable!("a Mem tensor has no CUDA handle");
6558        } else {
6559            let host = t.map().expect("host map fallback must succeed");
6560            // TensorMapTrait::len() returns the element count (not bytes).
6561            assert_eq!(host.len(), 4); // 2*2 f32 elements
6562        }
6563    }
6564
6565    // -------------------------------------------------------------------------
6566    // Tensor::from_foreign — public API tests at the Tensor<T> layer.
6567    //
6568    // The low-level MemTensor::from_foreign mechanics (owner-drop, view sharing)
6569    // are covered in mem.rs.  These tests exercise the Tensor<T> guard paths
6570    // (null ptr, empty shape, size overflow) and the basic wrap+readback
6571    // contract, confirming the public unsafe API wires through correctly.
6572    // -------------------------------------------------------------------------
6573
6574    #[test]
6575    fn from_foreign_valid_wrap_and_readback() {
6576        // The canonical CUDA zero-copy shape: wrap a caller allocation as a
6577        // Mem tensor and verify the tensor reads the exact same bytes.
6578        let mut buf: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
6579        let ptr = buf.as_mut_ptr();
6580        let t = unsafe { Tensor::<f32>::from_foreign(ptr, &[2, 3], None, Some("test_foreign")) }
6581            .expect("valid from_foreign must succeed");
6582        assert_eq!(t.shape(), &[2, 3]);
6583        assert_eq!(t.memory(), TensorMemory::Mem);
6584        assert_eq!(t.name(), "test_foreign");
6585        let m = t.map().unwrap();
6586        // The tensor is a zero-copy borrow — it sees the caller's data.
6587        assert_eq!(m.as_slice(), &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0]);
6588    }
6589
6590    #[test]
6591    fn from_foreign_write_visible_in_caller_allocation() {
6592        // Writes through the tensor's map land in the caller's buffer (zero-copy).
6593        let mut buf: Vec<u8> = vec![0u8; 6];
6594        let ptr = buf.as_mut_ptr();
6595        let t = unsafe { Tensor::<u8>::from_foreign(ptr, &[2, 3], None, None) }.unwrap();
6596        {
6597            let mut m = t.map().unwrap();
6598            m.as_mut_slice().copy_from_slice(&[10, 20, 30, 40, 50, 60]);
6599        }
6600        drop(t);
6601        // Mutations are visible in the original Vec — same physical buffer.
6602        assert_eq!(buf, vec![10, 20, 30, 40, 50, 60]);
6603    }
6604
6605    #[test]
6606    fn from_foreign_rejects_null_ptr() {
6607        let err = unsafe { Tensor::<u8>::from_foreign(std::ptr::null_mut(), &[4], None, None) }
6608            .unwrap_err();
6609        assert!(
6610            matches!(err, Error::InvalidArgument(ref m) if m.contains("non-null")),
6611            "expected InvalidArgument(non-null), got {err:?}"
6612        );
6613    }
6614
6615    #[test]
6616    fn from_foreign_rejects_empty_shape() {
6617        let mut dummy: u8 = 0;
6618        let err = unsafe { Tensor::<u8>::from_foreign(&mut dummy, &[], None, None) }.unwrap_err();
6619        assert!(
6620            matches!(err, Error::InvalidSize(0)),
6621            "expected InvalidSize(0) for empty shape, got {err:?}"
6622        );
6623    }
6624
6625    #[test]
6626    fn from_foreign_rejects_overflow_shape() {
6627        // Two dimensions whose product overflows usize — the overflow guard must
6628        // fire before any pointer arithmetic is attempted.
6629        let mut dummy: u8 = 0;
6630        let huge = [usize::MAX / 2 + 1, 2];
6631        let err = unsafe { Tensor::<u8>::from_foreign(&mut dummy, &huge, None, None) }.unwrap_err();
6632        assert!(
6633            matches!(err, Error::InvalidArgument(ref m) if m.contains("overflow")),
6634            "expected InvalidArgument(overflow), got {err:?}"
6635        );
6636    }
6637
6638    #[test]
6639    fn from_foreign_owner_keeps_allocation_alive() {
6640        // When `owner` is `Some`, dropping the Tensor must not free the backing
6641        // before the owner is also gone — the owner's Drop fires on last ref.
6642        use std::sync::atomic::{AtomicBool, Ordering};
6643        let flag = std::sync::Arc::new(AtomicBool::new(false));
6644        let flag2 = flag.clone();
6645        struct Guard(std::sync::Arc<AtomicBool>);
6646        impl Drop for Guard {
6647            fn drop(&mut self) {
6648                self.0.store(true, Ordering::SeqCst);
6649            }
6650        }
6651        let mut buf: Vec<u32> = vec![42u32; 4];
6652        let ptr = buf.as_mut_ptr();
6653        let owner: ForeignOwner = Box::new(Guard(flag2));
6654        let t = unsafe { Tensor::<u32>::from_foreign(ptr, &[4], Some(owner), None) }.unwrap();
6655        // Map co-owns the backing Arc; the owner must stay alive while the map lives.
6656        let m = t.map().unwrap();
6657        assert_eq!(m.as_slice()[0], 42);
6658        drop(t); // tensor dropped while map is still live
6659        assert!(
6660            !flag.load(Ordering::SeqCst),
6661            "owner must not drop while a map shares the backing"
6662        );
6663        drop(m);
6664        assert!(
6665            flag.load(Ordering::SeqCst),
6666            "owner Drop must fire when the last Arc reference is released"
6667        );
6668    }
6669}
edgefirst_tensor/lib.rs

edgefirst_tensor/
lib.rs