all_smi/client.rs
1// Copyright 2025 Lablup Inc. and Jeongkyu Shin
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! High-level client API for all-smi library.
16//!
//! This module provides the main [`AllSmi`] struct, which offers a simple,
//! ergonomic interface for querying GPU/NPU, CPU, memory, process, chassis,
//! and storage information across all supported platforms.
20//!
21//! # Example
22//!
23//! ```rust,no_run
24//! use all_smi::{AllSmi, Result};
25//!
26//! fn main() -> Result<()> {
27//! // Initialize with auto-detection
28//! let smi = AllSmi::new()?;
29//!
30//! // Get all GPU/NPU information
31//! let gpus = smi.get_gpu_info();
32//! for gpu in &gpus {
33//! println!("{}: {}% utilization, {:.1}W",
34//! gpu.name, gpu.utilization, gpu.power_consumption);
35//! }
36//!
37//! // Get CPU information
38//! let cpus = smi.get_cpu_info();
39//! for cpu in &cpus {
40//! println!("{}: {:.1}% utilization", cpu.cpu_model, cpu.utilization);
41//! }
42//!
43//! // Get memory information
44//! let memory = smi.get_memory_info();
45//! for mem in &memory {
46//! println!("Memory: {:.1}% used", mem.utilization);
47//! }
48//!
49//! Ok(())
50//! }
51//! ```
52
53use crate::device::{
54 create_chassis_reader, get_cpu_readers, get_gpu_readers, get_memory_readers, ChassisInfo,
55 ChassisReader, CpuInfo, CpuReader, GpuInfo, GpuReader, MemoryInfo, MemoryReader, ProcessInfo,
56};
57use crate::error::Result;
58use crate::storage::{create_storage_reader, StorageInfo, StorageReader};
59
60#[cfg(target_os = "macos")]
61use crate::device::macos_native::{
62 initialize_native_metrics_manager, shutdown_native_metrics_manager,
63};
64
65#[cfg(target_os = "linux")]
66use crate::device::hlsmi::{initialize_hlsmi_manager, shutdown_hlsmi_manager};
67
68#[cfg(target_os = "linux")]
69use crate::device::platform_detection::has_gaudi;
70
/// Kinds of accelerator hardware that can be monitored.
///
/// The [`Display`](std::fmt::Display) implementation yields the
/// human-readable product name of each variant (for example `"NVIDIA GPU"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DeviceType {
    /// NVIDIA GPU
    NvidiaGpu,
    /// AMD GPU
    AmdGpu,
    /// Apple Silicon GPU
    AppleSiliconGpu,
    /// NVIDIA Jetson
    NvidiaJetson,
    /// Intel Gaudi NPU
    IntelGaudi,
    /// Furiosa NPU
    FuriosaNpu,
    /// Rebellions NPU
    RebellionsNpu,
    /// Tenstorrent NPU
    TenstorrentNpu,
    /// Google TPU
    GoogleTpu,
}

impl std::fmt::Display for DeviceType {
    /// Writes the human-readable name of the device type.
    ///
    /// The name is written verbatim; width, fill, and precision flags on the
    /// formatter are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Resolve the label first so there is a single write site.
        let label = match *self {
            Self::NvidiaGpu => "NVIDIA GPU",
            Self::AmdGpu => "AMD GPU",
            Self::AppleSiliconGpu => "Apple Silicon GPU",
            Self::NvidiaJetson => "NVIDIA Jetson",
            Self::IntelGaudi => "Intel Gaudi",
            Self::FuriosaNpu => "Furiosa NPU",
            Self::RebellionsNpu => "Rebellions NPU",
            Self::TenstorrentNpu => "Tenstorrent NPU",
            Self::GoogleTpu => "Google TPU",
        };
        f.write_str(label)
    }
}
109
/// Main client for accessing hardware monitoring information.
///
/// `AllSmi` provides a high-level API for querying GPU, NPU, CPU, memory,
/// chassis, and storage information across all supported platforms. It
/// handles platform-specific initialization and cleanup automatically: the
/// constructor starts the platform managers and `Drop` shuts down the ones
/// that were started.
///
/// # Thread Safety
///
/// `AllSmi` is `Send + Sync` and can be safely shared across threads.
///
/// # Example
///
/// ```rust,no_run
/// use all_smi::AllSmi;
///
/// let smi = AllSmi::new().expect("Failed to initialize");
///
/// // Query GPU information
/// for gpu in smi.get_gpu_info() {
///     println!("{}: {}% utilization", gpu.name, gpu.utilization);
/// }
/// ```
pub struct AllSmi {
    /// Accelerator readers; queried by `get_gpu_info` and `get_process_info`.
    gpu_readers: Vec<Box<dyn GpuReader>>,
    /// CPU readers; queried by `get_cpu_info`.
    cpu_readers: Vec<Box<dyn CpuReader>>,
    /// Memory readers; queried by `get_memory_info`.
    memory_readers: Vec<Box<dyn MemoryReader>>,
    /// Single chassis reader; queried by `get_chassis_info`.
    chassis_reader: Box<dyn ChassisReader>,
    /// Single storage reader; queried by `get_storage_info`.
    storage_reader: Box<dyn StorageReader>,
    /// `true` when the constructor successfully started the macOS native
    /// metrics manager; read in `Drop` to decide whether to shut it down.
    #[cfg(target_os = "macos")]
    _macos_initialized: bool,
    /// `true` when the constructor successfully started the Intel Gaudi
    /// hl-smi manager; read in `Drop` to decide whether to shut it down.
    #[cfg(target_os = "linux")]
    _gaudi_initialized: bool,
}
143
144impl AllSmi {
145 /// Create a new `AllSmi` instance with auto-detected hardware.
146 ///
147 /// This constructor initializes all platform-specific managers and
148 /// creates readers for available hardware. It does not fail if no
149 /// hardware is detected; instead, the corresponding `get_*_info()`
150 /// methods will return empty collections.
151 ///
152 /// # Errors
153 ///
154 /// Returns an error if platform initialization fails critically
155 /// (e.g., macOS IOReport API unavailable, or system-level errors).
156 ///
157 /// # Example
158 ///
159 /// ```rust,no_run
160 /// use all_smi::AllSmi;
161 ///
162 /// let smi = AllSmi::new()?;
163 /// println!("Found {} GPU(s)", smi.get_gpu_info().len());
164 /// # Ok::<(), all_smi::Error>(())
165 /// ```
166 #[must_use = "AllSmi instance must be stored to access hardware information"]
167 pub fn new() -> Result<Self> {
168 Self::with_config(AllSmiConfig::default())
169 }
170
171 /// Create a new `AllSmi` instance with custom configuration.
172 ///
173 /// # Arguments
174 ///
175 /// * `config` - Configuration options for the client
176 ///
177 /// # Errors
178 ///
179 /// Returns an error if platform initialization fails.
180 #[must_use = "AllSmi instance must be stored to access hardware information"]
181 pub fn with_config(config: AllSmiConfig) -> Result<Self> {
182 // Initialize platform-specific managers
183 #[cfg(target_os = "macos")]
184 let macos_initialized = {
185 match initialize_native_metrics_manager(config.sample_interval_ms) {
186 Ok(()) => true,
187 Err(e) => {
188 // Log but don't fail - some metrics may still work
189 if config.verbose {
190 eprintln!("Warning: macOS native metrics init failed: {e}");
191 }
192 false
193 }
194 }
195 };
196
197 #[cfg(target_os = "linux")]
198 let gaudi_initialized = {
199 if has_gaudi() {
200 match initialize_hlsmi_manager(config.sample_interval_ms / 1000) {
201 Ok(()) => true,
202 Err(e) => {
203 if config.verbose {
204 eprintln!("Warning: Intel Gaudi hl-smi init failed: {e}");
205 }
206 false
207 }
208 }
209 } else {
210 false
211 }
212 };
213
214 // Get readers
215 let gpu_readers = get_gpu_readers();
216 let cpu_readers = get_cpu_readers();
217 let memory_readers = get_memory_readers();
218 let chassis_reader = create_chassis_reader();
219 let storage_reader = create_storage_reader();
220
221 Ok(AllSmi {
222 gpu_readers,
223 cpu_readers,
224 memory_readers,
225 chassis_reader,
226 storage_reader,
227 #[cfg(target_os = "macos")]
228 _macos_initialized: macos_initialized,
229 #[cfg(target_os = "linux")]
230 _gaudi_initialized: gaudi_initialized,
231 })
232 }
233
234 /// Get information about all detected GPUs and NPUs.
235 ///
236 /// Returns a vector of [`GpuInfo`] structs containing metrics for each
237 /// detected accelerator. The list includes NVIDIA GPUs, AMD GPUs,
238 /// Apple Silicon GPUs, Intel Gaudi NPUs, and other supported devices.
239 ///
240 /// Returns an empty vector if no devices are detected.
241 ///
242 /// # Example
243 ///
244 /// ```rust,no_run
245 /// use all_smi::AllSmi;
246 ///
247 /// let smi = AllSmi::new()?;
248 /// for gpu in smi.get_gpu_info() {
249 /// println!("{}: {}% util, {:.1}W power, {}MB/{} MB memory",
250 /// gpu.name,
251 /// gpu.utilization,
252 /// gpu.power_consumption,
253 /// gpu.used_memory / 1024 / 1024,
254 /// gpu.total_memory / 1024 / 1024);
255 /// }
256 /// # Ok::<(), all_smi::Error>(())
257 /// ```
258 pub fn get_gpu_info(&self) -> Vec<GpuInfo> {
259 let mut all_gpus = Vec::new();
260 for reader in &self.gpu_readers {
261 all_gpus.extend(reader.get_gpu_info());
262 }
263 all_gpus
264 }
265
266 /// Get information about GPU/NPU processes.
267 ///
268 /// Returns a vector of [`ProcessInfo`] structs containing information
269 /// about processes using GPU resources. This includes process ID, name,
270 /// GPU memory usage, and other metrics.
271 ///
272 /// Returns an empty vector if no GPU processes are found.
273 ///
274 /// # Example
275 ///
276 /// ```rust,no_run
277 /// use all_smi::AllSmi;
278 ///
279 /// let smi = AllSmi::new()?;
280 /// for proc in smi.get_process_info() {
281 /// println!("PID {}: {} using {} MB GPU memory",
282 /// proc.pid,
283 /// proc.process_name,
284 /// proc.used_memory / 1024 / 1024);
285 /// }
286 /// # Ok::<(), all_smi::Error>(())
287 /// ```
288 pub fn get_process_info(&self) -> Vec<ProcessInfo> {
289 let mut all_processes = Vec::new();
290 for reader in &self.gpu_readers {
291 all_processes.extend(reader.get_process_info());
292 }
293 all_processes
294 }
295
296 /// Get information about system CPUs.
297 ///
298 /// Returns a vector of [`CpuInfo`] structs containing metrics for each
299 /// CPU socket or processor. This includes model name, utilization,
300 /// frequency, temperature, and platform-specific details.
301 ///
302 /// Returns an empty vector if CPU information is not available.
303 ///
304 /// # Example
305 ///
306 /// ```rust,no_run
307 /// use all_smi::AllSmi;
308 ///
309 /// let smi = AllSmi::new()?;
310 /// for cpu in smi.get_cpu_info() {
311 /// println!("{}: {:.1}% utilization, {} MHz",
312 /// cpu.cpu_model,
313 /// cpu.utilization,
314 /// cpu.base_frequency_mhz);
315 /// if let Some(temp) = cpu.temperature {
316 /// println!(" Temperature: {}C", temp);
317 /// }
318 /// }
319 /// # Ok::<(), all_smi::Error>(())
320 /// ```
321 pub fn get_cpu_info(&self) -> Vec<CpuInfo> {
322 let mut all_cpus = Vec::new();
323 for reader in &self.cpu_readers {
324 all_cpus.extend(reader.get_cpu_info());
325 }
326 all_cpus
327 }
328
329 /// Get information about system memory.
330 ///
331 /// Returns a vector of [`MemoryInfo`] structs containing memory
332 /// utilization metrics including total, used, available, and swap memory.
333 ///
334 /// Returns an empty vector if memory information is not available.
335 ///
336 /// # Example
337 ///
338 /// ```rust,no_run
339 /// use all_smi::AllSmi;
340 ///
341 /// let smi = AllSmi::new()?;
342 /// for mem in smi.get_memory_info() {
343 /// let total_gb = mem.total_bytes as f64 / 1024.0 / 1024.0 / 1024.0;
344 /// let used_gb = mem.used_bytes as f64 / 1024.0 / 1024.0 / 1024.0;
345 /// println!("Memory: {:.1} GB / {:.1} GB ({:.1}% used)",
346 /// used_gb, total_gb, mem.utilization);
347 /// }
348 /// # Ok::<(), all_smi::Error>(())
349 /// ```
350 pub fn get_memory_info(&self) -> Vec<MemoryInfo> {
351 let mut all_memory = Vec::new();
352 for reader in &self.memory_readers {
353 all_memory.extend(reader.get_memory_info());
354 }
355 all_memory
356 }
357
358 /// Get chassis/node-level information.
359 ///
360 /// Returns [`ChassisInfo`] if available, containing system-wide metrics
361 /// such as total power consumption (CPU + GPU + ANE), thermal pressure,
362 /// fan speeds, and PSU status.
363 ///
364 /// Returns `None` if chassis information is not available on this platform.
365 ///
366 /// # Example
367 ///
368 /// ```rust,no_run
369 /// use all_smi::AllSmi;
370 ///
371 /// let smi = AllSmi::new()?;
372 /// if let Some(chassis) = smi.get_chassis_info() {
373 /// if let Some(power) = chassis.total_power_watts {
374 /// println!("Total system power: {:.1}W", power);
375 /// }
376 /// if let Some(ref pressure) = chassis.thermal_pressure {
377 /// println!("Thermal pressure: {}", pressure);
378 /// }
379 /// }
380 /// # Ok::<(), all_smi::Error>(())
381 /// ```
382 pub fn get_chassis_info(&self) -> Option<ChassisInfo> {
383 self.chassis_reader.get_chassis_info()
384 }
385
386 /// Get information about storage devices.
387 ///
388 /// Returns a vector of [`StorageInfo`] structs containing metrics for each
389 /// detected storage device. The information includes mount point, total space,
390 /// available space, and host identification.
391 ///
392 /// Returns an empty vector if storage information is not available.
393 ///
394 /// # Example
395 ///
396 /// ```rust,no_run
397 /// use all_smi::AllSmi;
398 ///
399 /// let smi = AllSmi::new()?;
400 /// for storage in smi.get_storage_info() {
401 /// let used_bytes = storage.total_bytes - storage.available_bytes;
402 /// let usage_percent = if storage.total_bytes > 0 {
403 /// (used_bytes as f64 / storage.total_bytes as f64) * 100.0
404 /// } else {
405 /// 0.0
406 /// };
407 /// let total_gb = storage.total_bytes as f64 / 1024.0 / 1024.0 / 1024.0;
408 /// let available_gb = storage.available_bytes as f64 / 1024.0 / 1024.0 / 1024.0;
409 /// println!("{}: {:.1} GB / {:.1} GB ({:.1}% used)",
410 /// storage.mount_point, available_gb, total_gb, usage_percent);
411 /// }
412 /// # Ok::<(), all_smi::Error>(())
413 /// ```
414 pub fn get_storage_info(&self) -> Vec<StorageInfo> {
415 self.storage_reader.get_storage_info()
416 }
417
418 /// Get the number of detected GPU readers.
419 ///
420 /// This returns the number of reader types, not the number of GPUs.
421 /// Use `get_gpu_info().len()` to get the actual GPU count.
422 pub fn gpu_reader_count(&self) -> usize {
423 self.gpu_readers.len()
424 }
425
426 /// Check if any GPUs/NPUs are available.
427 ///
428 /// # Example
429 ///
430 /// ```rust,no_run
431 /// use all_smi::AllSmi;
432 ///
433 /// let smi = AllSmi::new()?;
434 /// if smi.has_gpus() {
435 /// println!("Found {} GPU(s)", smi.get_gpu_info().len());
436 /// } else {
437 /// println!("No GPUs detected");
438 /// }
439 /// # Ok::<(), all_smi::Error>(())
440 /// ```
441 pub fn has_gpus(&self) -> bool {
442 !self.gpu_readers.is_empty()
443 }
444
445 /// Check if CPU monitoring is available.
446 pub fn has_cpu_monitoring(&self) -> bool {
447 !self.cpu_readers.is_empty()
448 }
449
450 /// Check if memory monitoring is available.
451 pub fn has_memory_monitoring(&self) -> bool {
452 !self.memory_readers.is_empty()
453 }
454
455 /// Check if storage monitoring is available.
456 ///
457 /// This always returns `true` as storage monitoring is available on all
458 /// supported platforms through the `sysinfo` crate.
459 pub fn has_storage_monitoring(&self) -> bool {
460 // Storage monitoring is always available via sysinfo
461 true
462 }
463}
464
465impl Drop for AllSmi {
466 fn drop(&mut self) {
467 // Cleanup platform-specific managers
468 #[cfg(target_os = "macos")]
469 if self._macos_initialized {
470 shutdown_native_metrics_manager();
471 }
472
473 #[cfg(target_os = "linux")]
474 if self._gaudi_initialized {
475 shutdown_hlsmi_manager();
476 }
477 }
478}
479
// SAFETY: AllSmi is safe to send and share across threads because:
// 1. All reader traits (GpuReader, CpuReader, MemoryReader, ChassisReader) require
//    Send + Sync bounds, ensuring all stored readers are thread-safe
// 2. The platform-specific managers (NativeMetricsManager on macOS, HlsmiManager on Linux)
//    are designed to be accessed from any thread
// 3. The initialization flags are only written during construction and only read during drop,
//    with no concurrent access possible due to ownership semantics
//
// NOTE(review): `AllSmi` also stores a `Box<dyn StorageReader>`, which point 1 does not
// mention - confirm that `StorageReader` carries the same `Send + Sync` bounds.
// NOTE(review): if every reader trait really has `Send + Sync` as supertraits, the compiler
// would derive both auto traits for `AllSmi` and these manual impls would be redundant;
// as written they would also hide a future field that is not thread-safe - TODO confirm.
unsafe impl Send for AllSmi {}
unsafe impl Sync for AllSmi {}
489
/// Configuration options for [`AllSmi`].
///
/// Build one with [`AllSmiConfig::new`] (or `Default`) and chain the builder
/// methods. Each builder method consumes the configuration and returns the
/// updated value, so the result must be kept.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AllSmiConfig {
    /// Sample interval in milliseconds for platform managers.
    /// Default: 1000ms (1 second)
    pub sample_interval_ms: u64,
    /// Whether to print verbose warnings during initialization.
    /// Default: false
    pub verbose: bool,
}

impl Default for AllSmiConfig {
    /// Returns the default configuration: a 1000 ms sample interval with
    /// verbose warnings disabled.
    fn default() -> Self {
        Self {
            sample_interval_ms: 1000,
            verbose: false,
        }
    }
}

impl AllSmiConfig {
    /// Create a new configuration with default values.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the sample interval in milliseconds.
    ///
    /// The value is stored as given; no lower bound is enforced here.
    ///
    /// # Arguments
    ///
    /// * `interval_ms` - Sample interval (minimum 100ms recommended)
    #[must_use = "builder methods return the updated configuration instead of mutating in place"]
    pub fn sample_interval(mut self, interval_ms: u64) -> Self {
        self.sample_interval_ms = interval_ms;
        self
    }

    /// Enable or disable verbose output during initialization.
    ///
    /// # Arguments
    ///
    /// * `verbose` - `true` to print initialization warnings
    #[must_use = "builder methods return the updated configuration instead of mutating in place"]
    pub fn verbose(mut self, verbose: bool) -> Self {
        self.verbose = verbose;
        self
    }
}
532
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time check that `AllSmi` satisfies `Send + Sync`.
    #[test]
    fn test_allsmi_is_send_sync() {
        fn require_send_sync<T: Send + Sync>() {}
        require_send_sync::<AllSmi>();
    }

    /// Spot-checks the human-readable names of a few device types.
    #[test]
    fn test_device_type_display() {
        let cases = [
            (DeviceType::NvidiaGpu, "NVIDIA GPU"),
            (DeviceType::AppleSiliconGpu, "Apple Silicon GPU"),
            (DeviceType::IntelGaudi, "Intel Gaudi"),
        ];
        for (device, expected) in cases {
            assert_eq!(device.to_string(), expected);
        }
    }

    /// The default configuration uses a 1000 ms interval and is not verbose.
    #[test]
    fn test_config_default() {
        let AllSmiConfig {
            sample_interval_ms,
            verbose,
        } = AllSmiConfig::default();
        assert_eq!(sample_interval_ms, 1000);
        assert!(!verbose);
    }

    /// Builder methods override the defaults.
    #[test]
    fn test_config_builder() {
        let built = AllSmiConfig::new().sample_interval(500).verbose(true);
        assert_eq!(built.sample_interval_ms, 500);
        assert!(built.verbose);
    }

    /// Construction and every query must work without panicking, even on
    /// machines (such as CI runners) that have no accelerator hardware.
    #[test]
    fn test_allsmi_new() {
        let smi = AllSmi::new().expect("AllSmi::new() should succeed without hardware");

        // Results are discarded; only the absence of a panic matters here.
        let _ = smi.get_gpu_info();
        let _ = smi.get_cpu_info();
        let _ = smi.get_memory_info();
        let _ = smi.get_process_info();
        let _ = smi.get_chassis_info();
        let _ = smi.get_storage_info();
    }

    /// Every reported storage entry must be internally consistent.
    #[test]
    fn test_storage_info() {
        let smi = AllSmi::new().unwrap();

        // Storage monitoring should always be available
        assert!(smi.has_storage_monitoring());

        // The list may be empty in some CI environments; each entry that is
        // present is validated.
        for entry in smi.get_storage_info().iter() {
            // Mount point should not be empty
            assert!(!entry.mount_point.is_empty());

            // Available bytes should not exceed total bytes
            assert!(entry.available_bytes <= entry.total_bytes);

            // Hostname should not be empty
            assert!(!entry.hostname.is_empty());
        }
    }

    /// A custom (sub-second) configuration is accepted by the constructor.
    #[test]
    fn test_allsmi_with_config() {
        let custom = AllSmiConfig::new().sample_interval(500);
        assert!(AllSmi::with_config(custom).is_ok());
    }
}