//! Universal AI hardware accelerator detection and workload planning.
//!
//! `ai-hwaccel` provides a unified view of every AI-capable accelerator on the
//! system — GPUs, TPUs, NPUs, and cloud inference chips — through a single
//! detection call. It answers three questions:
//!
//! 1. **What hardware is available?** ([`AcceleratorRegistry::detect`])
//! 2. **What quantisation fits?** ([`AcceleratorRegistry::suggest_quantization`])
//! 3. **How should the model be distributed?** ([`AcceleratorRegistry::plan_sharding`])
//!
//! Zero runtime dependencies beyond `serde` (for serialisation) and `tracing`
//! (for debug diagnostics). Detection probes sysfs, `/dev`, and `$PATH` tools —
//! no vendor SDKs required at compile time.
//!
//! # Supported Hardware
//!
//! | Family | Variant | Detection method |
//! |--------|---------|------------------|
//! | NVIDIA CUDA | GeForce / Tesla / A100 / H100 | `nvidia-smi` on PATH |
//! | AMD ROCm | MI250 / MI300 / RX 7900 | sysfs `/sys/class/drm` |
//! | Apple Metal | M1–M4 GPU | `system_profiler` or device-tree |
//! | Apple ANE | Neural Engine | `system_profiler` or device-tree |
//! | Intel NPU | Meteor Lake+ NPU | `/sys/class/misc/intel_npu` |
//! | AMD XDNA | Ryzen AI NPU | `/sys/class/accel/accel*/device/driver` → `amdxdna` |
//! | Google TPU | v4 / v5e / v5p | `/dev/accel*` + sysfs version |
//! | Intel Gaudi | Gaudi 2 / 3 (Habana HPU) | `hl-smi` on PATH |
//! | AWS Inferentia | inf1 / inf2 | `neuron-ls` or `/dev/neuron*` |
//! | AWS Trainium | trn1 | `neuron-ls` or `/dev/neuron*` + sysfs |
//! | Qualcomm Cloud AI | AI 100 | `/dev/qaic_*` or `/sys/class/qaic` |
//! | Vulkan Compute | Any Vulkan 1.1+ device | `vulkaninfo` on PATH |
//! | CPU | Always present | `/proc/meminfo` or 16 GiB fallback |
//!
//! # Quick start
//!
//! ```rust,no_run
//! use ai_hwaccel::{AcceleratorRegistry, QuantizationLevel};
//!
//! let registry = AcceleratorRegistry::detect();
//! println!("Best device: {}", registry.best_available().unwrap());
//!
//! // What quantisation for a 7B-parameter model?
//! let quant = registry.suggest_quantization(7_000_000_000);
//! println!("Recommended: {quant}");
//!
//! // How to shard a 70B model at BF16?
//! let plan = registry.plan_sharding(70_000_000_000, &QuantizationLevel::BFloat16);
//! println!("Strategy: {}, est. {:.0} tok/s",
//!     plan.strategy,
//!     plan.estimated_tokens_per_sec.unwrap_or(0.0));
//! ```
//!
//! # Guide
//!
//! ## Step 1: Detect hardware
//!
//! Call [`AcceleratorRegistry::detect`] to probe the system. All backends
//! run in parallel and detection is best-effort — missing tools or drivers
//! are skipped, not fatal.
//!
//! ```rust
//! use ai_hwaccel::AcceleratorRegistry;
//!
//! let registry = AcceleratorRegistry::detect();
//! for profile in registry.all_profiles() {
//!     println!("{}", profile);
//! }
//! // Check for warnings (tool failures, parse errors, etc.)
//! for w in registry.warnings() {
//!     eprintln!("warning: {}", w);
//! }
//! ```
//!
//! Use [`DetectBuilder`] to control which backends run:
//!
//! ```rust,no_run
//! use ai_hwaccel::AcceleratorRegistry;
//!
//! let registry = AcceleratorRegistry::builder()
//!     .with_cuda()
//!     .with_tpu()
//!     .detect();
//! ```
//!
//! Or disable backends at compile time with cargo features:
//!
//! ```toml
//! [dependencies]
//! ai-hwaccel = { version = "0.19", default-features = false, features = ["cuda", "tpu"] }
//! ```
//!
//! ## Step 2: Query capabilities
//!
//! The registry provides several query methods:
//!
//! ```rust
//! use ai_hwaccel::{AcceleratorRegistry, AcceleratorProfile, AcceleratorFamily,
//!                  AcceleratorRequirement};
//!
//! let registry = AcceleratorRegistry::from_profiles(vec![
//!     AcceleratorProfile::cpu(64 * 1024 * 1024 * 1024),
//!     AcceleratorProfile::cuda(0, 24 * 1024 * 1024 * 1024),
//! ]);
//!
//! // Best single device
//! let best = registry.best_available().unwrap();
//!
//! // Filter by family
//! let gpus = registry.by_family(AcceleratorFamily::Gpu);
//!
//! // Filter by workload requirement
//! let matches = registry.satisfying(&AcceleratorRequirement::Gpu);
//!
//! // Memory totals
//! let total = registry.total_memory();
//! let accel = registry.total_accelerator_memory();
//! ```
//!
//! ## Step 3: Plan model deployment
//!
//! Given a model's parameter count, the registry can suggest a quantisation
//! level and generate a sharding plan:
//!
//! ```rust
//! use ai_hwaccel::{AcceleratorRegistry, AcceleratorProfile, QuantizationLevel};
//!
//! let registry = AcceleratorRegistry::from_profiles(vec![
//!     AcceleratorProfile::cpu(64 * 1024 * 1024 * 1024),
//!     AcceleratorProfile::cuda(0, 80 * 1024 * 1024 * 1024),
//!     AcceleratorProfile::cuda(1, 80 * 1024 * 1024 * 1024),
//! ]);
//!
//! // Suggest quantisation for available hardware
//! let quant = registry.suggest_quantization(70_000_000_000);
//!
//! // Generate a sharding plan
//! let plan = registry.plan_sharding(70_000_000_000, &quant);
//! print!("{}", plan); // human-readable summary
//! ```
//!
//! ## Step 4: Estimate training memory
//!
//! For fine-tuning workloads, estimate per-component memory usage:
//!
//! ```rust
//! use ai_hwaccel::{estimate_training_memory, TrainingMethod, TrainingTarget};
//!
//! let est = estimate_training_memory(7000, TrainingMethod::LoRA, TrainingTarget::Gpu);
//! println!("Model: {:.1} GB, Optimizer: {:.1} GB, Activations: {:.1} GB",
//!     est.model_gb, est.optimizer_gb, est.activation_gb);
//! println!("Total: {:.1} GB", est.total_gb);
//! ```
//!
//! ## Step 5: Inspect system I/O
//!
//! After detection, the registry includes system-level I/O information:
//!
//! ```rust,no_run
//! use ai_hwaccel::AcceleratorRegistry;
//!
//! let registry = AcceleratorRegistry::detect();
//! let sio = registry.system_io();
//!
//! for ic in &sio.interconnects {
//!     println!("{} ({}) — {:.1} GB/s", ic.name, ic.kind, ic.bandwidth_gbps);
//! }
//! for dev in &sio.storage {
//!     println!("{} ({}) — {:.1} GB/s", dev.name, dev.kind, dev.bandwidth_gbps);
//! }
//!
//! // Estimate how long to load a 100 GiB dataset from local storage
//! if let Some(secs) = sio.estimate_ingestion_secs(100 * 1024 * 1024 * 1024) {
//!     println!("Estimated ingestion time: {:.0}s", secs);
//! }
//! ```
//!
//! # Error handling
//!
//! Detection is best-effort. Errors are collected as warnings, not panics:
//!
//! ```rust,no_run
//! use ai_hwaccel::{AcceleratorRegistry, DetectionError};
//!
//! let registry = AcceleratorRegistry::detect();
//! for w in registry.warnings() {
//!     match w {
//!         DetectionError::ToolNotFound { tool } => {
//!             // Tool not installed — expected on systems without that hardware.
//!             eprintln!("skipped: {} not found", tool);
//!         }
//!         DetectionError::Timeout { tool, timeout_secs } => {
//!             // Tool hung — may want to retry with a longer timeout.
//!             eprintln!("{} timed out after {:.0}s", tool, timeout_secs);
//!         }
//!         DetectionError::ToolFailed { tool, exit_code, stderr } => {
//!             eprintln!("{} failed (exit {}): {}", tool,
//!                 exit_code.unwrap_or(-1), stderr);
//!         }
//!         _ => eprintln!("warning: {}", w),
//!     }
//! }
//! ```
//!
//! # Custom backends
//!
//! Build profiles manually and add them to a registry for hardware that
//! isn't auto-detected:
//!
//! ```rust
//! use ai_hwaccel::{AcceleratorProfile, AcceleratorRegistry, AcceleratorType};
//!
//! let mut registry = AcceleratorRegistry::detect();
//!
//! // Add a device from an external detection system
//! let mut custom = AcceleratorProfile::cuda(4, 80 * 1024 * 1024 * 1024);
//! custom.compute_capability = Some("9.0".into());
//! custom.memory_bandwidth_gbps = Some(3350.0);
//! registry.add_profile(custom);
//! ```
//!
//! # Serde integration
//!
//! The registry and all sub-types implement `Serialize`/`Deserialize`.
//! Use [`CachedRegistry`] for disk persistence with TTL-based invalidation:
//!
//! ```rust,no_run
//! use ai_hwaccel::CachedRegistry;
//! use std::time::Duration;
//!
//! let cache = CachedRegistry::new(Duration::from_secs(300));
//! let registry = cache.get(); // detects on first call, caches for 5 min
//! let registry2 = cache.get(); // returns cached result
//! ```
//!
//! The [`SCHEMA_VERSION`] constant tracks the JSON schema. Bumps indicate
//! new fields or structural changes. Old JSON (lower version) can still be
//! deserialized — new fields use `#[serde(default)]`.
//!
//! # Cargo features
//!
//! Each hardware backend can be individually enabled or disabled:
//!
//! | Feature | Backend | Default |
//! |---------|---------|---------|
//! | `cuda` | NVIDIA CUDA | yes |
//! | `rocm` | AMD ROCm | yes |
//! | `apple` | Apple Metal + ANE | yes |
//! | `vulkan` | Vulkan Compute | yes |
//! | `intel-npu` | Intel NPU | yes |
//! | `amd-xdna` | AMD XDNA NPU | yes |
//! | `tpu` | Google TPU | yes |
//! | `gaudi` | Intel Gaudi | yes |
//! | `aws-neuron` | AWS Inferentia/Trainium | yes |
//! | `intel-oneapi` | Intel oneAPI | yes |
//! | `qualcomm` | Qualcomm Cloud AI | yes |
//! | `all-backends` | All of the above | yes |
//!
//! To include only specific backends:
//!
//! ```toml
//! [dependencies]
//! ai-hwaccel = { version = "0.19", default-features = false, features = ["cuda"] }
//! ```

264mod async_detect;
265pub mod cache;
266pub mod cost;
267pub mod detect;
268pub mod error;
269pub mod ffi;
270#[cfg(feature = "fuzz")]
271#[doc(hidden)]
272pub mod fuzz_helpers;
273pub mod hardware;
274pub mod lazy;
275pub mod plan;
276pub mod profile;
277pub mod quantization;
278pub mod registry;
279pub mod requirement;
280pub mod sharding;
281pub mod system_io;
282pub mod training;
283pub mod units;
284
285pub use cache::{CachedRegistry, DiskCachedRegistry};
286pub use cost::{CloudGpuInstance, CloudProvider, InstanceRecommendation};
287pub use detect::TimedDetection;
288pub use error::DetectionError;
289pub use hardware::{
290    AcceleratorFamily, AcceleratorType, GaudiGeneration, NeuronChipType, TpuVersion,
291};
292pub use lazy::LazyRegistry;
293pub use profile::AcceleratorProfile;
294pub use quantization::QuantizationLevel;
295pub use registry::{AcceleratorRegistry, Backend, DetectBuilder, SCHEMA_VERSION};
296pub use requirement::AcceleratorRequirement;
297pub use sharding::{ModelShard, ShardingPlan, ShardingStrategy};
298pub use system_io::{
299    CloudInstanceMeta, Interconnect, InterconnectKind, RuntimeEnvironment, StorageDevice,
300    StorageKind, SystemIo,
301};
302pub use training::{MemoryEstimate, TrainingMethod, TrainingTarget, estimate_training_memory};
303
304#[cfg(test)]
305mod tests;