Skip to main content

all_smi/
client.rs

1// Copyright 2025 Lablup Inc. and Jeongkyu Shin
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! High-level client API for all-smi library.
16//!
17//! This module provides the main [`AllSmi`] struct, which offers a simple,
18//! ergonomic interface for querying GPU, CPU, memory, and process information
19//! across all supported platforms.
20//!
21//! # Example
22//!
23//! ```rust,no_run
24//! use all_smi::{AllSmi, Result};
25//!
26//! fn main() -> Result<()> {
27//!     // Initialize with auto-detection
28//!     let smi = AllSmi::new()?;
29//!
30//!     // Get all GPU/NPU information
31//!     let gpus = smi.get_gpu_info();
32//!     for gpu in &gpus {
33//!         println!("{}: {}% utilization, {:.1}W",
34//!             gpu.name, gpu.utilization, gpu.power_consumption);
35//!     }
36//!
37//!     // Get CPU information
38//!     let cpus = smi.get_cpu_info();
39//!     for cpu in &cpus {
40//!         println!("{}: {:.1}% utilization", cpu.cpu_model, cpu.utilization);
41//!     }
42//!
43//!     // Get memory information
44//!     let memory = smi.get_memory_info();
45//!     for mem in &memory {
46//!         println!("Memory: {:.1}% used", mem.utilization);
47//!     }
48//!
49//!     Ok(())
50//! }
51//! ```
52
53use crate::device::{
54    create_chassis_reader, get_cpu_readers, get_gpu_readers, get_memory_readers, ChassisInfo,
55    ChassisReader, CpuInfo, CpuReader, GpuInfo, GpuReader, MemoryInfo, MemoryReader, ProcessInfo,
56};
57use crate::error::Result;
58use crate::storage::{create_storage_reader, StorageInfo, StorageReader};
59
60#[cfg(target_os = "macos")]
61use crate::device::macos_native::{
62    initialize_native_metrics_manager, shutdown_native_metrics_manager,
63};
64
65#[cfg(target_os = "linux")]
66use crate::device::hlsmi::{initialize_hlsmi_manager, shutdown_hlsmi_manager};
67
68#[cfg(target_os = "linux")]
69use crate::device::platform_detection::has_gaudi;
70
/// Kind of accelerator hardware that can be monitored.
///
/// The [`Display`](std::fmt::Display) implementation yields a short,
/// human-readable label for each variant (for example `"NVIDIA GPU"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DeviceType {
    /// An NVIDIA GPU.
    NvidiaGpu,
    /// An AMD GPU.
    AmdGpu,
    /// An Apple Silicon GPU.
    AppleSiliconGpu,
    /// An NVIDIA Jetson device.
    NvidiaJetson,
    /// An Intel Gaudi NPU.
    IntelGaudi,
    /// A Furiosa NPU.
    FuriosaNpu,
    /// A Rebellions NPU.
    RebellionsNpu,
    /// A Tenstorrent NPU.
    TenstorrentNpu,
    /// A Google TPU.
    GoogleTpu,
}

impl std::fmt::Display for DeviceType {
    /// Writes the display label of the variant to `f`.
    ///
    /// The label is written verbatim; width/alignment flags of the caller's
    /// format spec are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Resolve the label first so there is a single write call below.
        let label = match self {
            Self::NvidiaGpu => "NVIDIA GPU",
            Self::AmdGpu => "AMD GPU",
            Self::AppleSiliconGpu => "Apple Silicon GPU",
            Self::NvidiaJetson => "NVIDIA Jetson",
            Self::IntelGaudi => "Intel Gaudi",
            Self::FuriosaNpu => "Furiosa NPU",
            Self::RebellionsNpu => "Rebellions NPU",
            Self::TenstorrentNpu => "Tenstorrent NPU",
            Self::GoogleTpu => "Google TPU",
        };
        f.write_str(label)
    }
}
109
110/// Main client for accessing hardware monitoring information.
111///
112/// `AllSmi` provides a high-level API for querying GPU, NPU, CPU, and memory
113/// information across all supported platforms. It handles platform-specific
114/// initialization and cleanup automatically.
115///
116/// # Thread Safety
117///
118/// `AllSmi` is `Send + Sync` and can be safely shared across threads.
119///
120/// # Example
121///
122/// ```rust,no_run
123/// use all_smi::AllSmi;
124///
125/// let smi = AllSmi::new().expect("Failed to initialize");
126///
127/// // Query GPU information
128/// for gpu in smi.get_gpu_info() {
129///     println!("{}: {}% utilization", gpu.name, gpu.utilization);
130/// }
131/// ```
132pub struct AllSmi {
133    gpu_readers: Vec<Box<dyn GpuReader>>,
134    cpu_readers: Vec<Box<dyn CpuReader>>,
135    memory_readers: Vec<Box<dyn MemoryReader>>,
136    chassis_reader: Box<dyn ChassisReader>,
137    storage_reader: Box<dyn StorageReader>,
138    #[cfg(target_os = "macos")]
139    _macos_initialized: bool,
140    #[cfg(target_os = "linux")]
141    _gaudi_initialized: bool,
142}
143
144impl AllSmi {
145    /// Create a new `AllSmi` instance with auto-detected hardware.
146    ///
147    /// This constructor initializes all platform-specific managers and
148    /// creates readers for available hardware. It does not fail if no
149    /// hardware is detected; instead, the corresponding `get_*_info()`
150    /// methods will return empty collections.
151    ///
152    /// # Errors
153    ///
154    /// Returns an error if platform initialization fails critically
155    /// (e.g., macOS IOReport API unavailable, or system-level errors).
156    ///
157    /// # Example
158    ///
159    /// ```rust,no_run
160    /// use all_smi::AllSmi;
161    ///
162    /// let smi = AllSmi::new()?;
163    /// println!("Found {} GPU(s)", smi.get_gpu_info().len());
164    /// # Ok::<(), all_smi::Error>(())
165    /// ```
166    #[must_use = "AllSmi instance must be stored to access hardware information"]
167    pub fn new() -> Result<Self> {
168        Self::with_config(AllSmiConfig::default())
169    }
170
171    /// Create a new `AllSmi` instance with custom configuration.
172    ///
173    /// # Arguments
174    ///
175    /// * `config` - Configuration options for the client
176    ///
177    /// # Errors
178    ///
179    /// Returns an error if platform initialization fails.
180    #[must_use = "AllSmi instance must be stored to access hardware information"]
181    pub fn with_config(config: AllSmiConfig) -> Result<Self> {
182        // Initialize platform-specific managers
183        #[cfg(target_os = "macos")]
184        let macos_initialized = {
185            match initialize_native_metrics_manager(config.sample_interval_ms) {
186                Ok(()) => true,
187                Err(e) => {
188                    // Log but don't fail - some metrics may still work
189                    if config.verbose {
190                        eprintln!("Warning: macOS native metrics init failed: {e}");
191                    }
192                    false
193                }
194            }
195        };
196
197        #[cfg(target_os = "linux")]
198        let gaudi_initialized = {
199            if has_gaudi() {
200                match initialize_hlsmi_manager(config.sample_interval_ms / 1000) {
201                    Ok(()) => true,
202                    Err(e) => {
203                        if config.verbose {
204                            eprintln!("Warning: Intel Gaudi hl-smi init failed: {e}");
205                        }
206                        false
207                    }
208                }
209            } else {
210                false
211            }
212        };
213
214        // Get readers
215        let gpu_readers = get_gpu_readers();
216        let cpu_readers = get_cpu_readers();
217        let memory_readers = get_memory_readers();
218        let chassis_reader = create_chassis_reader();
219        let storage_reader = create_storage_reader();
220
221        Ok(AllSmi {
222            gpu_readers,
223            cpu_readers,
224            memory_readers,
225            chassis_reader,
226            storage_reader,
227            #[cfg(target_os = "macos")]
228            _macos_initialized: macos_initialized,
229            #[cfg(target_os = "linux")]
230            _gaudi_initialized: gaudi_initialized,
231        })
232    }
233
234    /// Get information about all detected GPUs and NPUs.
235    ///
236    /// Returns a vector of [`GpuInfo`] structs containing metrics for each
237    /// detected accelerator. The list includes NVIDIA GPUs, AMD GPUs,
238    /// Apple Silicon GPUs, Intel Gaudi NPUs, and other supported devices.
239    ///
240    /// Returns an empty vector if no devices are detected.
241    ///
242    /// # Example
243    ///
244    /// ```rust,no_run
245    /// use all_smi::AllSmi;
246    ///
247    /// let smi = AllSmi::new()?;
248    /// for gpu in smi.get_gpu_info() {
249    ///     println!("{}: {}% util, {:.1}W power, {}MB/{} MB memory",
250    ///         gpu.name,
251    ///         gpu.utilization,
252    ///         gpu.power_consumption,
253    ///         gpu.used_memory / 1024 / 1024,
254    ///         gpu.total_memory / 1024 / 1024);
255    /// }
256    /// # Ok::<(), all_smi::Error>(())
257    /// ```
258    pub fn get_gpu_info(&self) -> Vec<GpuInfo> {
259        let mut all_gpus = Vec::new();
260        for reader in &self.gpu_readers {
261            all_gpus.extend(reader.get_gpu_info());
262        }
263        all_gpus
264    }
265
266    /// Get information about GPU/NPU processes.
267    ///
268    /// Returns a vector of [`ProcessInfo`] structs containing information
269    /// about processes using GPU resources. This includes process ID, name,
270    /// GPU memory usage, and other metrics.
271    ///
272    /// Returns an empty vector if no GPU processes are found.
273    ///
274    /// # Example
275    ///
276    /// ```rust,no_run
277    /// use all_smi::AllSmi;
278    ///
279    /// let smi = AllSmi::new()?;
280    /// for proc in smi.get_process_info() {
281    ///     println!("PID {}: {} using {} MB GPU memory",
282    ///         proc.pid,
283    ///         proc.process_name,
284    ///         proc.used_memory / 1024 / 1024);
285    /// }
286    /// # Ok::<(), all_smi::Error>(())
287    /// ```
288    pub fn get_process_info(&self) -> Vec<ProcessInfo> {
289        let mut all_processes = Vec::new();
290        for reader in &self.gpu_readers {
291            all_processes.extend(reader.get_process_info());
292        }
293        all_processes
294    }
295
296    /// Get information about system CPUs.
297    ///
298    /// Returns a vector of [`CpuInfo`] structs containing metrics for each
299    /// CPU socket or processor. This includes model name, utilization,
300    /// frequency, temperature, and platform-specific details.
301    ///
302    /// Returns an empty vector if CPU information is not available.
303    ///
304    /// # Example
305    ///
306    /// ```rust,no_run
307    /// use all_smi::AllSmi;
308    ///
309    /// let smi = AllSmi::new()?;
310    /// for cpu in smi.get_cpu_info() {
311    ///     println!("{}: {:.1}% utilization, {} MHz",
312    ///         cpu.cpu_model,
313    ///         cpu.utilization,
314    ///         cpu.base_frequency_mhz);
315    ///     if let Some(temp) = cpu.temperature {
316    ///         println!("  Temperature: {}C", temp);
317    ///     }
318    /// }
319    /// # Ok::<(), all_smi::Error>(())
320    /// ```
321    pub fn get_cpu_info(&self) -> Vec<CpuInfo> {
322        let mut all_cpus = Vec::new();
323        for reader in &self.cpu_readers {
324            all_cpus.extend(reader.get_cpu_info());
325        }
326        all_cpus
327    }
328
329    /// Get information about system memory.
330    ///
331    /// Returns a vector of [`MemoryInfo`] structs containing memory
332    /// utilization metrics including total, used, available, and swap memory.
333    ///
334    /// Returns an empty vector if memory information is not available.
335    ///
336    /// # Example
337    ///
338    /// ```rust,no_run
339    /// use all_smi::AllSmi;
340    ///
341    /// let smi = AllSmi::new()?;
342    /// for mem in smi.get_memory_info() {
343    ///     let total_gb = mem.total_bytes as f64 / 1024.0 / 1024.0 / 1024.0;
344    ///     let used_gb = mem.used_bytes as f64 / 1024.0 / 1024.0 / 1024.0;
345    ///     println!("Memory: {:.1} GB / {:.1} GB ({:.1}% used)",
346    ///         used_gb, total_gb, mem.utilization);
347    /// }
348    /// # Ok::<(), all_smi::Error>(())
349    /// ```
350    pub fn get_memory_info(&self) -> Vec<MemoryInfo> {
351        let mut all_memory = Vec::new();
352        for reader in &self.memory_readers {
353            all_memory.extend(reader.get_memory_info());
354        }
355        all_memory
356    }
357
358    /// Get chassis/node-level information.
359    ///
360    /// Returns [`ChassisInfo`] if available, containing system-wide metrics
361    /// such as total power consumption (CPU + GPU + ANE), thermal pressure,
362    /// fan speeds, and PSU status.
363    ///
364    /// Returns `None` if chassis information is not available on this platform.
365    ///
366    /// # Example
367    ///
368    /// ```rust,no_run
369    /// use all_smi::AllSmi;
370    ///
371    /// let smi = AllSmi::new()?;
372    /// if let Some(chassis) = smi.get_chassis_info() {
373    ///     if let Some(power) = chassis.total_power_watts {
374    ///         println!("Total system power: {:.1}W", power);
375    ///     }
376    ///     if let Some(ref pressure) = chassis.thermal_pressure {
377    ///         println!("Thermal pressure: {}", pressure);
378    ///     }
379    /// }
380    /// # Ok::<(), all_smi::Error>(())
381    /// ```
382    pub fn get_chassis_info(&self) -> Option<ChassisInfo> {
383        self.chassis_reader.get_chassis_info()
384    }
385
386    /// Get information about storage devices.
387    ///
388    /// Returns a vector of [`StorageInfo`] structs containing metrics for each
389    /// detected storage device. The information includes mount point, total space,
390    /// available space, and host identification.
391    ///
392    /// Returns an empty vector if storage information is not available.
393    ///
394    /// # Example
395    ///
396    /// ```rust,no_run
397    /// use all_smi::AllSmi;
398    ///
399    /// let smi = AllSmi::new()?;
400    /// for storage in smi.get_storage_info() {
401    ///     let used_bytes = storage.total_bytes - storage.available_bytes;
402    ///     let usage_percent = if storage.total_bytes > 0 {
403    ///         (used_bytes as f64 / storage.total_bytes as f64) * 100.0
404    ///     } else {
405    ///         0.0
406    ///     };
407    ///     let total_gb = storage.total_bytes as f64 / 1024.0 / 1024.0 / 1024.0;
408    ///     let available_gb = storage.available_bytes as f64 / 1024.0 / 1024.0 / 1024.0;
409    ///     println!("{}: {:.1} GB / {:.1} GB ({:.1}% used)",
410    ///         storage.mount_point, available_gb, total_gb, usage_percent);
411    /// }
412    /// # Ok::<(), all_smi::Error>(())
413    /// ```
414    pub fn get_storage_info(&self) -> Vec<StorageInfo> {
415        self.storage_reader.get_storage_info()
416    }
417
418    /// Get the number of detected GPU readers.
419    ///
420    /// This returns the number of reader types, not the number of GPUs.
421    /// Use `get_gpu_info().len()` to get the actual GPU count.
422    pub fn gpu_reader_count(&self) -> usize {
423        self.gpu_readers.len()
424    }
425
426    /// Check if any GPUs/NPUs are available.
427    ///
428    /// # Example
429    ///
430    /// ```rust,no_run
431    /// use all_smi::AllSmi;
432    ///
433    /// let smi = AllSmi::new()?;
434    /// if smi.has_gpus() {
435    ///     println!("Found {} GPU(s)", smi.get_gpu_info().len());
436    /// } else {
437    ///     println!("No GPUs detected");
438    /// }
439    /// # Ok::<(), all_smi::Error>(())
440    /// ```
441    pub fn has_gpus(&self) -> bool {
442        !self.gpu_readers.is_empty()
443    }
444
445    /// Check if CPU monitoring is available.
446    pub fn has_cpu_monitoring(&self) -> bool {
447        !self.cpu_readers.is_empty()
448    }
449
450    /// Check if memory monitoring is available.
451    pub fn has_memory_monitoring(&self) -> bool {
452        !self.memory_readers.is_empty()
453    }
454
455    /// Check if storage monitoring is available.
456    ///
457    /// This always returns `true` as storage monitoring is available on all
458    /// supported platforms through the `sysinfo` crate.
459    pub fn has_storage_monitoring(&self) -> bool {
460        // Storage monitoring is always available via sysinfo
461        true
462    }
463}
464
465impl Drop for AllSmi {
466    fn drop(&mut self) {
467        // Cleanup platform-specific managers
468        #[cfg(target_os = "macos")]
469        if self._macos_initialized {
470            shutdown_native_metrics_manager();
471        }
472
473        #[cfg(target_os = "linux")]
474        if self._gaudi_initialized {
475            shutdown_hlsmi_manager();
476        }
477    }
478}
479
480// SAFETY: AllSmi is safe to send and share across threads because:
481// 1. All reader traits (GpuReader, CpuReader, MemoryReader, ChassisReader) require
482//    Send + Sync bounds, ensuring all stored readers are thread-safe
483// 2. The platform-specific managers (NativeMetricsManager on macOS, HlsmiManager on Linux)
484//    are designed to be accessed from any thread
485// 3. The initialization flags are only written during construction and only read during drop,
486//    with no concurrent access possible due to ownership semantics
487unsafe impl Send for AllSmi {}
488unsafe impl Sync for AllSmi {}
489
/// Settings that control how an [`AllSmi`] client is initialized.
///
/// Build one with [`AllSmiConfig::new`] (or `Default`) and adjust it with the
/// chainable `sample_interval` / `verbose` methods.
#[derive(Debug, Clone)]
pub struct AllSmiConfig {
    /// Sampling period for the platform managers, in milliseconds.
    /// Defaults to 1000 ms (one second).
    pub sample_interval_ms: u64,
    /// When `true`, warnings raised during initialization are printed.
    /// Defaults to `false`.
    pub verbose: bool,
}

impl Default for AllSmiConfig {
    /// Same as [`AllSmiConfig::new`]: 1000 ms interval, verbose off.
    fn default() -> Self {
        Self::new()
    }
}

impl AllSmiConfig {
    /// Create a configuration holding the default values
    /// (1000 ms sample interval, verbose output disabled).
    pub fn new() -> Self {
        Self {
            sample_interval_ms: 1000,
            verbose: false,
        }
    }

    /// Return the configuration with its sample interval replaced.
    ///
    /// # Arguments
    ///
    /// * `interval_ms` - Sample interval in milliseconds (minimum 100ms
    ///   recommended; the value is stored as given, without validation)
    pub fn sample_interval(self, interval_ms: u64) -> Self {
        Self {
            sample_interval_ms: interval_ms,
            ..self
        }
    }

    /// Return the configuration with verbose initialization output switched
    /// on or off.
    pub fn verbose(self, verbose: bool) -> Self {
        Self { verbose, ..self }
    }
}
532
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time proof that `AllSmi` is `Send + Sync`.
    #[test]
    fn test_allsmi_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<AllSmi>();
    }

    /// Every `DeviceType` variant must render its documented label.
    #[test]
    fn test_device_type_display() {
        let expected = [
            (DeviceType::NvidiaGpu, "NVIDIA GPU"),
            (DeviceType::AmdGpu, "AMD GPU"),
            (DeviceType::AppleSiliconGpu, "Apple Silicon GPU"),
            (DeviceType::NvidiaJetson, "NVIDIA Jetson"),
            (DeviceType::IntelGaudi, "Intel Gaudi"),
            (DeviceType::FuriosaNpu, "Furiosa NPU"),
            (DeviceType::RebellionsNpu, "Rebellions NPU"),
            (DeviceType::TenstorrentNpu, "Tenstorrent NPU"),
            (DeviceType::GoogleTpu, "Google TPU"),
        ];
        for (device, label) in expected {
            assert_eq!(device.to_string(), label, "label for {device:?}");
        }
    }

    #[test]
    fn test_config_default() {
        let config = AllSmiConfig::default();
        assert_eq!(config.sample_interval_ms, 1000);
        assert!(!config.verbose);
    }

    #[test]
    fn test_config_builder() {
        let config = AllSmiConfig::new().sample_interval(500).verbose(true);
        assert_eq!(config.sample_interval_ms, 500);
        assert!(config.verbose);
    }

    /// Construction and every query must work even when no hardware is
    /// present (e.g. in CI environments).
    #[test]
    fn test_allsmi_new() {
        // `expect` keeps the underlying error in the failure output, which a
        // bare `assert!(result.is_ok())` would throw away.
        let smi = AllSmi::new().expect("AllSmi::new() should succeed without hardware");

        // These should not panic even without hardware.
        let _ = smi.get_gpu_info();
        let _ = smi.get_cpu_info();
        let _ = smi.get_memory_info();
        let _ = smi.get_process_info();
        let _ = smi.get_chassis_info();
        let _ = smi.get_storage_info();
    }

    /// `has_gpus` is defined purely in terms of the reader list.
    #[test]
    fn test_has_gpus_matches_reader_count() {
        let smi = AllSmi::new().expect("AllSmi::new() should succeed without hardware");
        assert_eq!(smi.has_gpus(), smi.gpu_reader_count() > 0);
    }

    #[test]
    fn test_storage_info() {
        let smi = AllSmi::new().expect("AllSmi::new() should succeed without hardware");

        // Storage monitoring should always be available.
        assert!(smi.has_storage_monitoring());

        // The list may be empty in some CI environments; every entry that is
        // reported must satisfy the basic invariants below.
        for storage in &smi.get_storage_info() {
            // Mount point should not be empty.
            assert!(!storage.mount_point.is_empty());

            // Available bytes should not exceed total bytes.
            assert!(storage.available_bytes <= storage.total_bytes);

            // Hostname should not be empty.
            assert!(!storage.hostname.is_empty());
        }
    }

    #[test]
    fn test_allsmi_with_config() {
        let config = AllSmiConfig::new().sample_interval(500);
        AllSmi::with_config(config).expect("AllSmi::with_config() should succeed");
    }
}