1use core::sync::atomic::{AtomicU8, Ordering};
4
/// A concrete backend that a request has been resolved to.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum BackendKind {
    /// Execution on the host CPU.
    Cpu,
    /// Execution on the Metal accelerator backend.
    Metal,
    /// Execution on the CUDA accelerator backend.
    Cuda,
}
15
/// What the caller asks for when selecting a backend.
///
/// The default request is [`BackendRequest::Auto`].
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
pub enum BackendRequest {
    /// Let the library choose the backend.
    #[default]
    Auto,
    /// Request the CPU backend.
    Cpu,
    /// Request the Metal backend.
    Metal,
    /// Request the CUDA backend.
    Cuda,
}
29
30impl BackendRequest {
31 pub const ACCELERATED: Self = Self::Auto;
34 pub const CPU_ONLY: Self = Self::Cpu;
36 pub const STRICT_METAL: Self = Self::Metal;
38 pub const STRICT_CUDA: Self = Self::Cuda;
40}
41
/// SIMD instruction-set flags for the CPU.
///
/// `Default` yields a value with every flag cleared.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
pub struct CpuFeatures {
    /// AVX2 is available.
    pub avx2: bool,
    /// SSE4.1 is available.
    pub sse41: bool,
    /// NEON is available.
    pub neon: bool,
}
52
53impl CpuFeatures {
54 pub fn detect() -> Self {
56 static DETECTED: AtomicU8 = AtomicU8::new(0);
57
58 let cached = DETECTED.load(Ordering::Acquire);
59 if cached != 0 {
60 return Self::from_cache_byte(cached);
61 }
62
63 let detected = Self::detect_uncached();
64 let encoded = detected.to_cache_byte();
65 let _ = DETECTED.compare_exchange(0, encoded, Ordering::AcqRel, Ordering::Acquire);
66 Self::from_cache_byte(DETECTED.load(Ordering::Acquire))
67 }
68
69 fn detect_uncached() -> Self {
70 #[cfg(target_arch = "x86_64")]
71 {
72 Self {
73 avx2: detect_x86_avx2(),
74 sse41: detect_x86_sse41(),
75 neon: false,
76 }
77 }
78
79 #[cfg(target_arch = "aarch64")]
80 {
81 Self {
82 avx2: false,
83 sse41: false,
84 neon: true,
85 }
86 }
87
88 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
89 {
90 Self::default()
91 }
92 }
93
94 const fn to_cache_byte(self) -> u8 {
95 let mut encoded = 1_u8;
96 if self.avx2 {
97 encoded |= 1 << 1;
98 }
99 if self.sse41 {
100 encoded |= 1 << 2;
101 }
102 if self.neon {
103 encoded |= 1 << 3;
104 }
105 encoded
106 }
107
108 const fn from_cache_byte(encoded: u8) -> Self {
109 let bits = encoded.saturating_sub(1);
110 Self {
111 avx2: (bits & (1 << 1)) != 0,
112 sse41: (bits & (1 << 2)) != 0,
113 neon: (bits & (1 << 3)) != 0,
114 }
115 }
116}
117
/// Reports whether the CPU advertises SSE4.1 (CPUID leaf 1, ECX bit 19).
#[cfg(target_arch = "x86_64")]
fn detect_x86_sse41() -> bool {
    const SSE41: u32 = 1 << 19;

    // `__cpuid` is an `unsafe fn` on older toolchains and a safe fn on newer
    // ones; the block (redundant on the latter) keeps this building on both.
    // SAFETY: the CPUID instruction is always available on x86_64.
    #[allow(unused_unsafe)]
    let leaf1 = unsafe { core::arch::x86_64::__cpuid(1) };

    leaf1.ecx & SSE41 != 0
}
123
/// Reports whether AVX2 can be used on this machine.
///
/// All of the following must hold:
/// 1. CPUID leaf 1 reports both OSXSAVE (ECX bit 27) and AVX (ECX bit 28).
/// 2. XCR0 has the SSE (bit 1) and AVX (bit 2) state bits enabled.
/// 3. The highest supported basic CPUID leaf is at least 7.
/// 4. CPUID leaf 7, sub-leaf 0 reports AVX2 (EBX bit 5).
// `__cpuid`/`__cpuid_count` are `unsafe fn` on older toolchains and safe fns on
// newer ones; the `unsafe` blocks around them are redundant on the latter.
#[cfg(target_arch = "x86_64")]
#[allow(unused_unsafe)]
fn detect_x86_avx2() -> bool {
    const LEAF1_OSXSAVE: u32 = 1 << 27;
    const LEAF1_AVX: u32 = 1 << 28;
    const XCR0_SSE_STATE: u64 = 1 << 1;
    const XCR0_AVX_STATE: u64 = 1 << 2;
    const LEAF7_AVX2: u32 = 1 << 5;

    // SAFETY: the CPUID instruction is always available on x86_64.
    let leaf1 = unsafe { core::arch::x86_64::__cpuid(1) };
    let required_leaf1 = LEAF1_OSXSAVE | LEAF1_AVX;
    if leaf1.ecx & required_leaf1 != required_leaf1 {
        return false;
    }

    // SAFETY: OSXSAVE was verified above, i.e. the OS has enabled XSAVE, so
    // XGETBV is executable and reading XCR0 (index 0) cannot fault.
    let xcr0 = unsafe { core::arch::x86_64::_xgetbv(0) };
    let required_state = XCR0_SSE_STATE | XCR0_AVX_STATE;
    if xcr0 & required_state != required_state {
        return false;
    }

    // SAFETY: the CPUID instruction is always available on x86_64.
    let max_leaf = unsafe { core::arch::x86_64::__cpuid(0) }.eax;
    if max_leaf < 7 {
        return false;
    }

    // SAFETY: the CPUID instruction is always available on x86_64, and leaf 7
    // was just confirmed to be supported.
    let leaf7 = unsafe { core::arch::x86_64::__cpuid_count(7, 0) };
    leaf7.ebx & LEAF7_AVX2 != 0
}
149
150#[derive(Debug, Clone, Copy, PartialEq, Eq)]
152pub struct BackendCapabilities {
153 pub cpu: CpuFeatures,
155 pub metal: bool,
157 pub cuda: bool,
159}
160
161impl BackendCapabilities {
162 #[must_use]
168 pub fn compile_time_defaults() -> Self {
169 Self {
170 cpu: CpuFeatures::detect(),
171 metal: cfg!(target_os = "macos"),
172 cuda: false,
173 }
174 }
175
176 #[must_use]
178 pub const fn supports(self, request: BackendRequest) -> bool {
179 match request {
180 BackendRequest::Auto | BackendRequest::Cpu => true,
181 BackendRequest::Metal => self.metal,
182 BackendRequest::Cuda => self.cuda,
183 }
184 }
185
186 #[must_use]
192 pub fn resolve(self, request: BackendRequest) -> Option<BackendKind> {
193 match request {
194 BackendRequest::Auto | BackendRequest::Cpu => Some(BackendKind::Cpu),
195 BackendRequest::Metal if self.metal => Some(BackendKind::Metal),
196 BackendRequest::Cuda if self.cuda => Some(BackendKind::Cuda),
197 BackendRequest::Metal | BackendRequest::Cuda => None,
198 }
199 }
200
201 #[must_use]
204 pub const fn first_available_accelerator(self) -> Option<BackendKind> {
205 if self.metal {
206 Some(BackendKind::Metal)
207 } else if self.cuda {
208 Some(BackendKind::Cuda)
209 } else {
210 None
211 }
212 }
213}