// j2k_core/backend.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use core::sync::atomic::{AtomicU8, Ordering};
4
/// Runtime backend that executes codec work.
///
/// Each variant names one concrete execution target. The type is a plain
/// `Copy` tag that can be compared, hashed and printed with `{:?}`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BackendKind {
    /// Portable CPU implementation.
    Cpu,
    /// Apple Metal implementation.
    Metal,
    /// NVIDIA CUDA implementation.
    Cuda,
}
15
/// Caller preference for backend selection.
///
/// The `Default` value is [`BackendRequest::Auto`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum BackendRequest {
    /// Let the codec choose the best available backend.
    #[default]
    Auto,
    /// Force the portable CPU backend.
    Cpu,
    /// Force Metal and fail if unavailable.
    Metal,
    /// Force CUDA and fail if unavailable.
    Cuda,
}

impl BackendRequest {
    /// Adaptive accelerated route: let the codec choose CPU and device stages
    /// for benchmark-approved workload shapes.
    ///
    /// Alias for [`BackendRequest::Auto`].
    pub const ACCELERATED: Self = Self::Auto;

    /// Explicit portable CPU route.
    ///
    /// Alias for [`BackendRequest::Cpu`].
    pub const CPU_ONLY: Self = Self::Cpu;

    /// Strict Metal route; fail when Metal is unavailable or unsupported.
    ///
    /// Alias for [`BackendRequest::Metal`].
    pub const STRICT_METAL: Self = Self::Metal;

    /// Strict CUDA route; fail when CUDA is unavailable or unsupported.
    ///
    /// Alias for [`BackendRequest::Cuda`].
    pub const STRICT_CUDA: Self = Self::Cuda;
}
41
42/// CPU SIMD feature flags detected for the current host.
43#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
44pub struct CpuFeatures {
45    /// True when AVX2 is available and enabled by the OS.
46    pub avx2: bool,
47    /// True when SSE4.1 is available.
48    pub sse41: bool,
49    /// True when NEON is available.
50    pub neon: bool,
51}
52
53impl CpuFeatures {
54    /// Detect CPU SIMD features once and reuse the cached result.
55    pub fn detect() -> Self {
56        static DETECTED: AtomicU8 = AtomicU8::new(0);
57
58        let cached = DETECTED.load(Ordering::Acquire);
59        if cached != 0 {
60            return Self::from_cache_byte(cached);
61        }
62
63        let detected = Self::detect_uncached();
64        let encoded = detected.to_cache_byte();
65        let _ = DETECTED.compare_exchange(0, encoded, Ordering::AcqRel, Ordering::Acquire);
66        Self::from_cache_byte(DETECTED.load(Ordering::Acquire))
67    }
68
69    fn detect_uncached() -> Self {
70        #[cfg(target_arch = "x86_64")]
71        {
72            Self {
73                avx2: detect_x86_avx2(),
74                sse41: detect_x86_sse41(),
75                neon: false,
76            }
77        }
78
79        #[cfg(target_arch = "aarch64")]
80        {
81            Self {
82                avx2: false,
83                sse41: false,
84                neon: true,
85            }
86        }
87
88        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
89        {
90            Self::default()
91        }
92    }
93
94    const fn to_cache_byte(self) -> u8 {
95        let mut encoded = 1_u8;
96        if self.avx2 {
97            encoded |= 1 << 1;
98        }
99        if self.sse41 {
100            encoded |= 1 << 2;
101        }
102        if self.neon {
103            encoded |= 1 << 3;
104        }
105        encoded
106    }
107
108    const fn from_cache_byte(encoded: u8) -> Self {
109        let bits = encoded.saturating_sub(1);
110        Self {
111            avx2: (bits & (1 << 1)) != 0,
112            sse41: (bits & (1 << 2)) != 0,
113            neon: (bits & (1 << 3)) != 0,
114        }
115    }
116}
117
/// Report whether the host CPU advertises SSE4.1.
///
/// Queries CPUID leaf 1 and tests bit 19 of ECX.
#[cfg(target_arch = "x86_64")]
// `__cpuid` is a safe function on newer toolchains and an `unsafe fn` on
// older ones; the explicit block keeps both building without a warning.
#[allow(unused_unsafe)]
fn detect_x86_sse41() -> bool {
    const LEAF1_ECX_SSE41: u32 = 1 << 19;

    // SAFETY: the CPUID instruction is available on every x86_64 CPU.
    let leaf1 = unsafe { core::arch::x86_64::__cpuid(1) };
    (leaf1.ecx & LEAF1_ECX_SSE41) != 0
}
123
/// Report whether AVX2 is usable on the host: supported by the CPU and with
/// the required register state enabled by the OS.
///
/// The probe runs in this order and returns `false` at the first failure:
/// 1. CPUID leaf 1 must report both OSXSAVE (ECX bit 27) and AVX (ECX bit 28).
/// 2. XCR0 must have bits 1 and 2 set (XMM and YMM state enabled).
/// 3. The maximum basic CPUID leaf must be at least 7.
/// 4. CPUID leaf 7, sub-leaf 0 must report AVX2 (EBX bit 5).
#[cfg(target_arch = "x86_64")]
// `__cpuid`/`__cpuid_count` are safe functions on newer toolchains and
// `unsafe fn` on older ones; the explicit blocks keep both building cleanly.
#[allow(unused_unsafe)]
fn detect_x86_avx2() -> bool {
    const LEAF1_ECX_OSXSAVE: u32 = 1 << 27;
    const LEAF1_ECX_AVX: u32 = 1 << 28;
    const XCR0_XMM_STATE: u64 = 0b10;
    const XCR0_YMM_STATE: u64 = 0b100;
    const LEAF7_EBX_AVX2: u32 = 1 << 5;

    // SAFETY: the CPUID instruction is available on every x86_64 CPU.
    let leaf1_ecx = unsafe { core::arch::x86_64::__cpuid(1) }.ecx;
    let needs_leaf1 = LEAF1_ECX_OSXSAVE | LEAF1_ECX_AVX;
    if (leaf1_ecx & needs_leaf1) != needs_leaf1 {
        return false;
    }

    // SAFETY: XGETBV is only executed after CPUID reports OSXSAVE support.
    let xcr0 = unsafe { core::arch::x86_64::_xgetbv(0) };
    let needs_xcr0 = XCR0_XMM_STATE | XCR0_YMM_STATE;
    if (xcr0 & needs_xcr0) != needs_xcr0 {
        return false;
    }

    // Leaf 7 may only be interpreted when the CPU reports it as a valid leaf.
    // SAFETY: the CPUID instruction is available on every x86_64 CPU.
    let max_basic_leaf = unsafe { core::arch::x86_64::__cpuid(0) }.eax;
    if max_basic_leaf < 7 {
        return false;
    }

    // SAFETY: the CPUID instruction is available on every x86_64 CPU.
    let leaf7_ebx = unsafe { core::arch::x86_64::__cpuid_count(7, 0) }.ebx;
    (leaf7_ebx & LEAF7_EBX_AVX2) != 0
}
149
150/// Backend availability for a codec/runtime combination.
151#[derive(Debug, Clone, Copy, PartialEq, Eq)]
152pub struct BackendCapabilities {
153    /// Host CPU feature set.
154    pub cpu: CpuFeatures,
155    /// True when Metal is available to this crate.
156    pub metal: bool,
157    /// True when CUDA is available to this crate.
158    pub cuda: bool,
159}
160
161impl BackendCapabilities {
162    /// Return default capabilities implied by the current build target.
163    ///
164    /// This does not probe GPU devices or runtime libraries. Codec facades and
165    /// adapters must further gate the returned device flags by their compiled
166    /// features and runtime availability.
167    #[must_use]
168    pub fn compile_time_defaults() -> Self {
169        Self {
170            cpu: CpuFeatures::detect(),
171            metal: cfg!(target_os = "macos"),
172            cuda: false,
173        }
174    }
175
176    /// Return whether a backend request can be satisfied.
177    #[must_use]
178    pub const fn supports(self, request: BackendRequest) -> bool {
179        match request {
180            BackendRequest::Auto | BackendRequest::Cpu => true,
181            BackendRequest::Metal => self.metal,
182            BackendRequest::Cuda => self.cuda,
183        }
184    }
185
186    /// Resolve a backend request to the concrete backend that should run.
187    ///
188    /// `Auto` resolves to CPU here. Workload-aware device promotion belongs in
189    /// codec-specific route planners that have benchmark evidence for the
190    /// requested operation.
191    #[must_use]
192    pub fn resolve(self, request: BackendRequest) -> Option<BackendKind> {
193        match request {
194            BackendRequest::Auto | BackendRequest::Cpu => Some(BackendKind::Cpu),
195            BackendRequest::Metal if self.metal => Some(BackendKind::Metal),
196            BackendRequest::Cuda if self.cuda => Some(BackendKind::Cuda),
197            BackendRequest::Metal | BackendRequest::Cuda => None,
198        }
199    }
200
201    /// Return an available accelerator backend without implying it should be
202    /// selected for a workload.
203    #[must_use]
204    pub const fn first_available_accelerator(self) -> Option<BackendKind> {
205        if self.metal {
206            Some(BackendKind::Metal)
207        } else if self.cuda {
208            Some(BackendKind::Cuda)
209        } else {
210            None
211        }
212    }
213}