ipfrs_storage/
arm_profiler.rs

1//! ARM Performance Profiler
2//!
3//! Provides profiling utilities for ARM devices (Raspberry Pi, Jetson, etc.)
4//! with NEON SIMD detection and performance monitoring.
5
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::sync::Arc;
8use std::time::{Duration, Instant};
9
/// ARM architecture and SIMD capabilities of the current build/host.
///
/// The architecture flags are fixed by the compilation target. Only the NEON
/// flag on 32-bit ARM involves a runtime probe (see [`ArmFeatures::detect`]).
#[derive(Debug, Clone)]
pub struct ArmFeatures {
    /// NEON SIMD support detected
    pub has_neon: bool,
    /// Compiled for the `aarch64` target architecture
    pub is_aarch64: bool,
    /// Compiled for the 32-bit `arm` target architecture
    pub is_armv7: bool,
}

impl ArmFeatures {
    /// Detect ARM features.
    ///
    /// `is_aarch64` and `is_armv7` are compile-time facts about the target.
    /// `has_neon` is always `true` on AArch64, always `false` on non-ARM
    /// targets, and probed on 32-bit ARM where NEON is optional.
    ///
    /// The 32-bit probe deliberately avoids
    /// `std::arch::is_arm_feature_detected!`, which is only usable on nightly
    /// toolchains and would break stable builds for ARMv7 targets.
    pub fn detect() -> Self {
        let is_aarch64 = cfg!(target_arch = "aarch64");
        let is_armv7 = cfg!(target_arch = "arm");

        // NEON is standard on AArch64, optional on ARMv7.
        let has_neon = is_aarch64 || Self::armv7_has_neon();

        Self {
            has_neon,
            is_aarch64,
            is_armv7,
        }
    }

    /// Probe NEON availability on a 32-bit ARM build.
    ///
    /// Returns `true` when the binary was compiled with NEON enabled, or when
    /// the `Features` line of `/proc/cpuinfo` lists `neon` (or `asimd`, the
    /// name 64-bit kernels use). If the file cannot be read the answer is
    /// `false`.
    #[cfg(target_arch = "arm")]
    fn armv7_has_neon() -> bool {
        if cfg!(target_feature = "neon") {
            return true;
        }

        std::fs::read_to_string("/proc/cpuinfo")
            .map(|cpuinfo| {
                cpuinfo
                    .lines()
                    .filter(|line| line.starts_with("Features"))
                    .flat_map(str::split_whitespace)
                    .any(|flag| flag == "neon" || flag == "asimd")
            })
            .unwrap_or(false)
    }

    /// Non-32-bit-ARM builds never take the ARMv7 NEON path.
    #[cfg(not(target_arch = "arm"))]
    fn armv7_has_neon() -> bool {
        false
    }

    /// Check if running on any ARM architecture
    pub fn is_arm(&self) -> bool {
        self.is_aarch64 || self.is_armv7
    }
}
58
/// Performance counter for ARM profiling.
///
/// Tracks how many timed operations completed and their accumulated duration.
/// Clones share the same underlying atomics, so a clone observes (and
/// contributes to) the same totals as the counter it was cloned from.
///
/// The operation count and the accumulated time are two independent atomics
/// updated with relaxed ordering; a reader racing with a finishing timer may
/// briefly see one updated without the other.
#[derive(Debug, Clone)]
pub struct ArmPerfCounter {
    // Reference-counted so that `start()` (which clones the counter into the
    // timer) does not allocate a fresh string for every measurement.
    name: Arc<str>,
    count: Arc<AtomicU64>,
    total_time_ns: Arc<AtomicU64>,
}

impl ArmPerfCounter {
    /// Create a new performance counter with zeroed totals.
    pub fn new(name: impl Into<String>) -> Self {
        let name: String = name.into();
        Self {
            name: Arc::from(name),
            count: Arc::new(AtomicU64::new(0)),
            total_time_ns: Arc::new(AtomicU64::new(0)),
        }
    }

    /// Start timing an operation.
    ///
    /// The measurement is recorded when the returned [`ArmPerfTimer`] is
    /// dropped, so it must be bound to a variable for the duration of the
    /// operation being measured.
    pub fn start(&self) -> ArmPerfTimer {
        ArmPerfTimer {
            counter: self.clone(),
            start: Instant::now(),
        }
    }

    /// Get total operation count
    pub fn count(&self) -> u64 {
        self.count.load(Ordering::Relaxed)
    }

    /// Get total time spent
    pub fn total_time(&self) -> Duration {
        Duration::from_nanos(self.total_time_ns.load(Ordering::Relaxed))
    }

    /// Get average time per operation (zero when nothing has been recorded)
    pub fn avg_time(&self) -> Duration {
        let total_ns = self.total_time_ns.load(Ordering::Relaxed);
        // `checked_div` yields `None` for a zero count, avoiding a division by zero.
        total_ns
            .checked_div(self.count())
            .map_or(Duration::ZERO, Duration::from_nanos)
    }

    /// Reset both the operation count and the accumulated time to zero
    pub fn reset(&self) {
        self.count.store(0, Ordering::Relaxed);
        self.total_time_ns.store(0, Ordering::Relaxed);
    }

    /// Get counter name
    pub fn name(&self) -> &str {
        &self.name
    }
}

/// Timer guard for performance measurement.
///
/// Created by [`ArmPerfCounter::start`]. When dropped it adds one operation
/// and the elapsed time since creation to the originating counter.
#[must_use = "the measurement is recorded when the timer is dropped; bind it to a variable"]
#[derive(Debug)]
pub struct ArmPerfTimer {
    counter: ArmPerfCounter,
    start: Instant,
}

impl Drop for ArmPerfTimer {
    fn drop(&mut self) {
        // `as_nanos` returns u128; saturate instead of silently truncating.
        let elapsed_ns = u64::try_from(self.start.elapsed().as_nanos()).unwrap_or(u64::MAX);
        self.counter.count.fetch_add(1, Ordering::Relaxed);
        self.counter
            .total_time_ns
            .fetch_add(elapsed_ns, Ordering::Relaxed);
    }
}
132
133/// ARM profiling report
134#[derive(Debug, Clone)]
135pub struct ArmPerfReport {
136    /// ARM features detected
137    pub features: ArmFeatures,
138    /// Performance counters
139    pub counters: Vec<(String, u64, Duration, Duration)>, // (name, count, total, avg)
140}
141
142impl ArmPerfReport {
143    /// Create a profiling report from counters
144    pub fn from_counters(counters: &[ArmPerfCounter]) -> Self {
145        let features = ArmFeatures::detect();
146        let counters = counters
147            .iter()
148            .map(|c| {
149                (
150                    c.name().to_string(),
151                    c.count(),
152                    c.total_time(),
153                    c.avg_time(),
154                )
155            })
156            .collect();
157
158        Self { features, counters }
159    }
160
161    /// Print report to stdout
162    pub fn print(&self) {
163        println!("=== ARM Performance Report ===");
164        println!(
165            "Architecture: {}",
166            if self.features.is_aarch64 {
167                "AArch64"
168            } else if self.features.is_armv7 {
169                "ARMv7"
170            } else {
171                "x86_64 (not ARM)"
172            }
173        );
174        println!("NEON support: {}", self.features.has_neon);
175        println!("\nPerformance Counters:");
176
177        for (name, count, total, avg) in &self.counters {
178            println!("  {name}: {count} ops, total: {total:?}, avg: {avg:?}");
179        }
180    }
181}
182
/// ARM-optimized hash computation using NEON when available
#[cfg(target_arch = "aarch64")]
pub mod neon_hash {
    use std::arch::aarch64::{vld1q_u8, vst1q_u8};

    /// Compute a 64-bit FNV-1a style hash, loading input through NEON
    /// registers 16 bytes at a time (AArch64).
    ///
    /// The bytes are mixed one at a time in input order (XOR, then wrapping
    /// multiply by the FNV prime), so the result equals a plain byte-wise
    /// pass over `data`. This is a simplified example - real implementations
    /// would use more sophisticated hash algorithms optimized for NEON.
    ///
    /// # Safety
    ///
    /// The CPU executing this function must support the `neon` target
    /// feature, because the function is compiled with
    /// `#[target_feature(enable = "neon")]`.
    #[target_feature(enable = "neon")]
    pub unsafe fn hash_block_neon(data: &[u8]) -> u64 {
        const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
        const FNV_PRIME: u64 = 0x100000001b3;

        let mut hash = FNV_OFFSET_BASIS;

        // Process 16 bytes at a time with NEON
        let chunks = data.chunks_exact(16);
        let remainder = chunks.remainder();

        for chunk in chunks {
            let mut lanes = [0u8; 16];
            // SAFETY: `chunks_exact(16)` guarantees `chunk` has exactly 16
            // readable bytes and `lanes` provides 16 writable bytes; NEON
            // availability is guaranteed by this function's safety contract.
            unsafe {
                let v = vld1q_u8(chunk.as_ptr());
                vst1q_u8(lanes.as_mut_ptr(), v);
            }

            // Note: This is a simple implementation - production code
            // would use more efficient NEON operations
            for byte in lanes {
                hash ^= u64::from(byte);
                hash = hash.wrapping_mul(FNV_PRIME);
            }
        }

        // Process remaining bytes
        for &byte in remainder {
            hash ^= u64::from(byte);
            hash = hash.wrapping_mul(FNV_PRIME);
        }

        hash
    }
}
224
/// Portable scalar hash used when the NEON path is not compiled in.
///
/// Starts from the FNV offset basis and, for every input byte in order,
/// XORs the byte into the state and multiplies by the FNV prime with
/// wrapping arithmetic. Empty input returns the offset basis unchanged.
pub fn hash_block_fallback(data: &[u8]) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    data.iter().fold(FNV_OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    })
}
237
238/// Hash a block using the best available method (NEON or fallback)
239pub fn hash_block(data: &[u8]) -> u64 {
240    #[cfg(target_arch = "aarch64")]
241    {
242        // Use NEON on AArch64
243        unsafe { neon_hash::hash_block_neon(data) }
244    }
245
246    #[cfg(not(target_arch = "aarch64"))]
247    {
248        // Fallback for non-ARM
249        hash_block_fallback(data)
250    }
251}
252
/// Batching presets that trade latency for fewer CPU wake-ups.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PowerProfile {
    /// No batching: batch size 1 and no delay
    Performance,
    /// Moderate batching: batch size 10 with a 10 ms delay
    Balanced,
    /// Aggressive batching: batch size 50 with a 100 ms delay
    LowPower,
    /// Caller-supplied batch size and delay
    Custom {
        batch_size: usize,
        batch_delay_ms: u64,
    },
}

impl PowerProfile {
    /// Single lookup table for both tunables: `(batch size, delay in ms)`.
    fn tuning(self) -> (usize, u64) {
        match self {
            Self::Performance => (1, 0),
            Self::Balanced => (10, 10),
            Self::LowPower => (50, 100),
            Self::Custom {
                batch_size,
                batch_delay_ms,
            } => (batch_size, batch_delay_ms),
        }
    }

    /// Number of items collected before a batch is considered ready
    pub fn batch_size(&self) -> usize {
        let (size, _) = self.tuning();
        size
    }

    /// Delay between batches, in milliseconds
    pub fn batch_delay_ms(&self) -> u64 {
        let (_, delay_ms) = self.tuning();
        delay_ms
    }

    /// Delay between batches, as a [`Duration`]
    pub fn batch_delay(&self) -> Duration {
        let (_, delay_ms) = self.tuning();
        Duration::from_millis(delay_ms)
    }
}
295
296/// Low-power operation batcher
297///
298/// Batches operations to reduce CPU wake-ups and save power.
299/// Particularly useful on battery-powered ARM devices.
300pub struct LowPowerBatcher<T> {
301    profile: PowerProfile,
302    buffer: Arc<std::sync::Mutex<Vec<T>>>,
303}
304
305impl<T> LowPowerBatcher<T> {
306    /// Create a new batcher with the given power profile
307    pub fn new(profile: PowerProfile) -> Self {
308        Self {
309            profile,
310            buffer: Arc::new(std::sync::Mutex::new(Vec::new())),
311        }
312    }
313
314    /// Add an item to the batch
315    ///
316    /// Returns the current batch if it's ready to be processed
317    pub fn push(&self, item: T) -> Option<Vec<T>> {
318        let mut buffer = self.buffer.lock().unwrap();
319        buffer.push(item);
320
321        if buffer.len() >= self.profile.batch_size() {
322            Some(std::mem::take(&mut *buffer))
323        } else {
324            None
325        }
326    }
327
328    /// Flush the current batch (returns all pending items)
329    pub fn flush(&self) -> Vec<T> {
330        let mut buffer = self.buffer.lock().unwrap();
331        std::mem::take(&mut *buffer)
332    }
333
334    /// Get the current power profile
335    pub fn profile(&self) -> PowerProfile {
336        self.profile
337    }
338
339    /// Get the number of pending items
340    pub fn pending(&self) -> usize {
341        self.buffer.lock().unwrap().len()
342    }
343}
344
/// Running totals describing how effectively operations were batched.
#[derive(Debug, Clone, Default)]
pub struct PowerStats {
    /// Batches recorded so far; each recorded batch counts as one wake-up
    pub wakeups: u64,
    /// Operations summed across all recorded batches
    pub operations: u64,
    /// Delays summed across all recorded batches
    pub delay_time: Duration,
}

impl PowerStats {
    /// Build a tracker with every total at zero
    pub fn new() -> Self {
        Self {
            wakeups: 0,
            operations: 0,
            delay_time: Duration::default(),
        }
    }

    /// Account for one batch of `ops` operations that waited for `delay`
    pub fn record_batch(&mut self, ops: usize, delay: Duration) {
        let batch_ops = ops as u64;
        self.wakeups += 1;
        self.operations += batch_ops;
        self.delay_time += delay;
    }

    /// Mean number of operations handled per wake-up (0.0 before any batch)
    pub fn avg_ops_per_wakeup(&self) -> f64 {
        match self.wakeups {
            0 => 0.0,
            wakeups => self.operations as f64 / wakeups as f64,
        }
    }

    /// Wake-ups per operation, as a measure of batching effectiveness.
    ///
    /// Computed as `wakeups / operations`: 1.0 corresponds to one wake-up
    /// per operation (no batching), and smaller values mean fewer wake-ups
    /// for the same work. Returns 1.0 when no operations were recorded.
    pub fn power_saving_ratio(&self) -> f64 {
        match self.operations {
            0 => 1.0,
            operations => self.wakeups as f64 / operations as f64,
        }
    }
}
390
#[cfg(test)]
mod tests {
    use super::*;

    /// Architecture flags must mirror the compilation target.
    #[test]
    fn test_arm_features() {
        let features = ArmFeatures::detect();

        assert_eq!(features.is_aarch64, cfg!(target_arch = "aarch64"));
        assert_eq!(features.is_armv7, cfg!(target_arch = "arm"));
        assert_eq!(
            features.is_arm(),
            cfg!(any(target_arch = "aarch64", target_arch = "arm"))
        );

        // NEON is mandatory on AArch64 and meaningless off ARM.
        if features.is_aarch64 {
            assert!(features.has_neon);
        }
        if !features.is_arm() {
            assert!(!features.has_neon);
        }
    }

    /// A dropped timer adds one operation and at least the slept time.
    #[test]
    fn test_perf_counter() {
        let counter = ArmPerfCounter::new("test_op");

        {
            let _timer = counter.start();
            std::thread::sleep(Duration::from_millis(10));
        }

        assert_eq!(counter.count(), 1);
        assert!(counter.total_time() >= Duration::from_millis(10));
        assert!(counter.avg_time() >= Duration::from_millis(10));
    }

    /// The dispatching hash must agree with the portable implementation on
    /// every target, including inputs that span several 16-byte chunks plus
    /// a tail (the path the NEON implementation handles specially).
    #[test]
    fn test_hash_block() {
        let multi_chunk: Vec<u8> = (0u8..=200).collect();
        let inputs: [&[u8]; 4] = [b"", b"hello world", &multi_chunk[..16], &multi_chunk];

        for data in inputs {
            let hash = hash_block(data);

            // Both implementations should produce consistent results
            assert_eq!(hash, hash_block_fallback(data));

            // Hash should be deterministic
            assert_eq!(hash, hash_block(data));
        }
    }

    /// A report contains one entry per counter, in input order.
    #[test]
    fn test_perf_report() {
        let counter1 = ArmPerfCounter::new("op1");
        let counter2 = ArmPerfCounter::new("op2");

        {
            let _t = counter1.start();
            std::thread::sleep(Duration::from_millis(1));
        }

        {
            let _t = counter2.start();
            std::thread::sleep(Duration::from_millis(1));
        }

        let report = ArmPerfReport::from_counters(&[counter1, counter2]);
        assert_eq!(report.counters.len(), 2);
        assert_eq!(report.counters[0].0, "op1");
        assert_eq!(report.counters[1].0, "op2");
        assert_eq!(report.counters[0].1, 1);
        assert_eq!(report.counters[1].1, 1);
    }

    /// Built-in profiles expose their fixed tuning; custom ones echo theirs.
    #[test]
    fn test_power_profile() {
        let perf = PowerProfile::Performance;
        assert_eq!(perf.batch_size(), 1);
        assert_eq!(perf.batch_delay_ms(), 0);

        let balanced = PowerProfile::Balanced;
        assert_eq!(balanced.batch_size(), 10);
        assert_eq!(balanced.batch_delay_ms(), 10);

        let low = PowerProfile::LowPower;
        assert_eq!(low.batch_size(), 50);
        assert_eq!(low.batch_delay_ms(), 100);
        assert_eq!(low.batch_delay(), Duration::from_millis(100));

        let custom = PowerProfile::Custom {
            batch_size: 20,
            batch_delay_ms: 30,
        };
        assert_eq!(custom.batch_size(), 20);
        assert_eq!(custom.batch_delay_ms(), 30);
    }

    /// Pushing up to the batch size yields the batch; flush drains the rest.
    #[test]
    fn test_low_power_batcher() {
        let batcher: LowPowerBatcher<i32> = LowPowerBatcher::new(PowerProfile::Custom {
            batch_size: 3,
            batch_delay_ms: 0,
        });

        assert_eq!(batcher.pending(), 0);

        // First two pushes shouldn't trigger batch
        assert!(batcher.push(1).is_none());
        assert_eq!(batcher.pending(), 1);

        assert!(batcher.push(2).is_none());
        assert_eq!(batcher.pending(), 2);

        // Third push should trigger batch
        assert_eq!(batcher.push(3), Some(vec![1, 2, 3]));
        assert_eq!(batcher.pending(), 0);

        // Test flush
        assert!(batcher.push(4).is_none());
        assert!(batcher.push(5).is_none());
        let flushed = batcher.flush();
        assert_eq!(flushed, vec![4, 5]);
        assert_eq!(batcher.pending(), 0);
    }

    /// Totals accumulate per batch and the derived ratios follow from them.
    #[test]
    fn test_power_stats() {
        let mut stats = PowerStats::new();
        assert_eq!(stats.wakeups, 0);
        assert_eq!(stats.operations, 0);

        stats.record_batch(10, Duration::from_millis(5));
        assert_eq!(stats.wakeups, 1);
        assert_eq!(stats.operations, 10);
        assert_eq!(stats.avg_ops_per_wakeup(), 10.0);

        stats.record_batch(5, Duration::from_millis(5));
        assert_eq!(stats.wakeups, 2);
        assert_eq!(stats.operations, 15);
        assert_eq!(stats.avg_ops_per_wakeup(), 7.5);
        assert_eq!(stats.delay_time, Duration::from_millis(10));

        // Power saving ratio: 2 wakeups / 15 operations ≈ 0.133
        let ratio = stats.power_saving_ratio();
        assert!(ratio > 0.0 && ratio < 1.0);
    }
}