optirs_core/streaming/low_latency.rs

// Low-latency optimization for real-time streaming applications
//
// This module provides specialized optimizers and techniques for applications
// that require extremely low latency updates, such as high-frequency trading,
// real-time control systems, and interactive machine learning.

use scirs2_core::ndarray::Array1;
use scirs2_core::numeric::Float;
use std::collections::VecDeque;
use std::sync::{
    atomic::{AtomicUsize, Ordering},
    Arc, Mutex,
};
use std::time::{Duration, Instant};

use crate::error::{OptimError, Result};
use crate::optimizers::Optimizer;

/// Low-latency optimization configuration
#[derive(Debug, Clone)]
pub struct LowLatencyConfig {
    /// Target latency budget (microseconds)
    pub target_latency_us: u64,

    /// Maximum acceptable latency (microseconds)
    pub max_latency_us: u64,

    /// Enable pre-computation of updates
    pub enable_precomputation: bool,

    /// Buffer size for pre-computed updates
    pub precomputation_buffer_size: usize,

    /// Enable lock-free algorithms
    pub enable_lock_free: bool,

    /// Use approximate algorithms for speed
    pub use_approximations: bool,

    /// Approximation tolerance
    pub approximation_tolerance: f64,

    /// Enable SIMD optimizations
    pub enable_simd: bool,

    /// Batch processing threshold
    pub batch_threshold: usize,

    /// Enable zero-copy operations
    pub enable_zero_copy: bool,

    /// Memory pool size for allocations
    pub memory_pool_size: usize,

    /// Enable gradient quantization
    pub enable_quantization: bool,

    /// Quantization bits
    pub quantization_bits: u8,
}

impl Default for LowLatencyConfig {
    fn default() -> Self {
        Self {
            target_latency_us: 100, // 100 microseconds
            max_latency_us: 1000,   // 1 millisecond
            enable_precomputation: true,
            precomputation_buffer_size: 64,
            enable_lock_free: true,
            use_approximations: true,
            approximation_tolerance: 0.01,
            enable_simd: true,
            batch_threshold: 8,
            enable_zero_copy: true,
            memory_pool_size: 1024 * 1024, // 1 MB
            enable_quantization: false,
            quantization_bits: 8,
        }
    }
}
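
// Callers can tighten the budget with plain struct update syntax; the values
// below are illustrative only, not tuned recommendations:
//
// let config = LowLatencyConfig {
//     target_latency_us: 50,
//     enable_quantization: true,
//     ..LowLatencyConfig::default()
// };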

/// Low-latency streaming optimizer
pub struct LowLatencyOptimizer<O, A>
where
    A: Float + Send + Sync + scirs2_core::ndarray::ScalarOperand + std::fmt::Debug,
    O: Optimizer<A, scirs2_core::ndarray::Ix1> + Send + Sync,
{
    /// Base optimizer
    base_optimizer: Arc<Mutex<O>>,

    /// Configuration
    config: LowLatencyConfig,

    /// Pre-computation engine
    precomputation_engine: Option<PrecomputationEngine<A>>,

    /// Lock-free update buffer
    update_buffer: LockFreeBuffer<A>,

    /// Memory pool for fast allocations
    memory_pool: FastMemoryPool,

    /// SIMD processor
    simd_processor: SIMDProcessor<A>,

    /// Quantization engine
    quantizer: Option<GradientQuantizer<A>>,

    /// Performance monitor
    perf_monitor: LatencyMonitor,

    /// Approximation controller
    approximation_controller: ApproximationController<A>,

    /// Step counter (atomic for thread safety)
    step_counter: AtomicUsize,
}

/// Pre-computation engine for preparing updates in advance
struct PrecomputationEngine<A: Float + Send + Sync> {
    /// Buffer of pre-computed updates
    precomputed_updates: VecDeque<PrecomputedUpdate<A>>,

    /// Background computation thread
    computation_thread: Option<std::thread::JoinHandle<()>>,

    /// Prediction model for future gradients
    gradient_predictor: GradientPredictor<A>,

    /// Maximum buffer size
    max_buffer_size: usize,
}

/// Pre-computed update entry
#[derive(Debug, Clone)]
struct PrecomputedUpdate<A: Float + Send + Sync> {
    /// Predicted gradient
    gradient: Array1<A>,

    /// Pre-computed parameter update
    update: Array1<A>,

    /// Validity timestamp
    valid_until: Instant,

    /// Confidence score
    confidence: A,
}

/// Lock-free circular buffer for updates
struct LockFreeBuffer<A: Float + Send + Sync> {
    /// Buffer storage
    buffer: Vec<Option<Array1<A>>>,

    /// Write index (atomic)
    write_index: AtomicUsize,

    /// Read index (atomic)
    read_index: AtomicUsize,

    /// Buffer capacity
    capacity: usize,
}

/// Fast memory pool for low-latency allocations
struct FastMemoryPool {
    /// Pre-allocated memory blocks
    blocks: Vec<*mut u8>,

    /// Available blocks queue
    available_blocks: Arc<Mutex<VecDeque<usize>>>,

    /// Block size
    block_size: usize,

    /// Total blocks
    total_blocks: usize,
}

/// SIMD processor for vectorized operations
struct SIMDProcessor<A: Float + Send + Sync> {
    /// Enable SIMD flag
    enabled: bool,

    /// Vector width
    vector_width: usize,

    /// Temporary buffers for SIMD operations
    temp_buffers: Vec<Array1<A>>,
}

/// Gradient quantization for reduced precision
struct GradientQuantizer<A: Float + Send + Sync> {
    /// Quantization bits
    bits: u8,

    /// Quantization scale
    scale: A,

    /// Zero point
    zero_point: A,

    /// Quantization error accumulator
    error_accumulator: Option<Array1<A>>,
}

/// Latency monitoring and profiling
#[derive(Debug)]
struct LatencyMonitor {
    /// Recent latency samples
    latency_samples: VecDeque<Duration>,

    /// Maximum samples to keep
    max_samples: usize,

    /// Current percentiles
    p50_latency: Duration,
    p95_latency: Duration,
    p99_latency: Duration,

    /// Violation count
    violations: usize,

    /// Total operations
    total_operations: usize,
}

/// Approximation controller for trading accuracy for speed
struct ApproximationController<A: Float + Send + Sync> {
    /// Current approximation level (0.0 = exact, 1.0 = maximum approximation)
    approximation_level: A,

    /// Performance history
    performance_history: VecDeque<PerformancePoint<A>>,

    /// Adaptation rate
    adaptation_rate: A,

    /// Target latency
    target_latency: Duration,
}

/// Performance measurement point
#[derive(Debug, Clone)]
struct PerformancePoint<A: Float + Send + Sync> {
    /// Latency measurement
    latency: Duration,

    /// Approximation level used
    approximation_level: A,

    /// Accuracy achieved
    accuracy: A,

    /// Timestamp
    timestamp: Instant,
}

/// Gradient predictor for pre-computation
struct GradientPredictor<A: Float + Send + Sync> {
    /// Recent gradient history
    gradient_history: VecDeque<Array1<A>>,

    /// Prediction model (simple linear extrapolation)
    trend_weights: Option<Array1<A>>,

    /// History window size
    window_size: usize,

    /// Prediction confidence
    confidence: A,
}

impl<O, A> LowLatencyOptimizer<O, A>
where
    A: Float
        + Send
        + Sync
        + Default
        + Clone
        + std::fmt::Debug
        + scirs2_core::ndarray::ScalarOperand
        + 'static
        + std::iter::Sum,
    O: Optimizer<A, scirs2_core::ndarray::Ix1> + Send + Sync + 'static,
{
    /// Create a new low-latency optimizer
    pub fn new(base_optimizer: O, config: LowLatencyConfig) -> Result<Self> {
        let base_optimizer = Arc::new(Mutex::new(base_optimizer));

        let precomputation_engine = if config.enable_precomputation {
            Some(PrecomputationEngine::new(config.precomputation_buffer_size))
        } else {
            None
        };

        let update_buffer = LockFreeBuffer::new(config.precomputation_buffer_size);
        let memory_pool = FastMemoryPool::new(config.memory_pool_size, 4096)?; // 4 KB blocks
        let simd_processor = SIMDProcessor::new(config.enable_simd);

        let quantizer = if config.enable_quantization {
            Some(GradientQuantizer::new(config.quantization_bits))
        } else {
            None
        };

        let perf_monitor = LatencyMonitor::new(1000); // Keep 1000 samples
        let approximation_controller =
            ApproximationController::new(Duration::from_micros(config.target_latency_us));

        Ok(Self {
            base_optimizer,
            config,
            precomputation_engine,
            update_buffer,
            memory_pool,
            simd_processor,
            quantizer,
            perf_monitor,
            approximation_controller,
            step_counter: AtomicUsize::new(0),
        })
    }

    /// Perform a low-latency update
    pub fn low_latency_step(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        let start_time = Instant::now();

        // Try to use a pre-computed update first
        if let Some(ref mut precomp) = self.precomputation_engine {
            if let Some(precomputed) = precomp.try_get_precomputed() {
                let latency = start_time.elapsed();
                self.perf_monitor.record_latency(latency);
                return Ok(precomputed.update);
            }
        }

        // Quantize the gradient if enabled
        let processed_gradient = if let Some(ref mut quantizer) = self.quantizer {
            quantizer.quantize(gradient)?
        } else {
            gradient.clone()
        };

        // Use approximation if necessary to meet the latency budget
        let approximation_level = self.approximation_controller.get_approximation_level();
        let update = if approximation_level > A::zero() {
            self.approximate_update(&processed_gradient, approximation_level)?
        } else {
            self.exact_update(&processed_gradient)?
        };

        let latency = start_time.elapsed();

        // Record performance and adapt the approximation level
        let accuracy = self.estimate_accuracy(&update, gradient);
        self.approximation_controller
            .record_performance(latency, approximation_level, accuracy);
        self.perf_monitor.record_latency(latency);

        // Check for latency violations
        if latency.as_micros() as u64 > self.config.max_latency_us {
            self.handle_latency_violation(latency)?;
        }

        // Start pre-computation for the next step
        if let Some(ref mut precomp) = self.precomputation_engine {
            precomp.start_precomputation(gradient);
        }

        self.step_counter.fetch_add(1, Ordering::Relaxed);
        Ok(update)
    }

    /// Perform an exact update using the base optimizer
    fn exact_update(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        // Simplified: a full implementation would fetch the current parameters
        // rather than starting from zeros.
        let current_params = Array1::zeros(gradient.len());

        let mut optimizer = self.base_optimizer.lock().unwrap();
        optimizer.step(&current_params, gradient)
    }

    /// Perform an approximate update for speed
    fn approximate_update(
        &mut self,
        gradient: &Array1<A>,
        approximation_level: A,
    ) -> Result<Array1<A>> {
        // Simplified approximation: reduce precision or use fewer operations
        let simplified_gradient = if approximation_level > A::from(0.5).unwrap() {
            self.simplify_gradient(gradient, approximation_level)?
        } else {
            gradient.clone()
        };

        // Use SIMD for fast computation
        if self.simd_processor.enabled {
            self.simd_processor.process(&simplified_gradient)
        } else {
            self.exact_update(&simplified_gradient)
        }
    }

    /// Simplify the gradient for approximation
    fn simplify_gradient(&self, gradient: &Array1<A>, level: A) -> Result<Array1<A>> {
        let mut simplified = gradient.clone();

        // Sparsify the gradient based on the approximation level
        let sparsity_ratio = level.to_f64().unwrap_or(0.0);
        let keep_ratio = 1.0 - sparsity_ratio * 0.8; // Keep 20% to 100% of entries
        let keep_count = ((gradient.len() as f64) * keep_ratio) as usize;

        // Keep only the largest-magnitude gradient entries
        let mut indexed_grads: Vec<(usize, A)> = gradient
            .iter()
            .enumerate()
            .map(|(i, &g)| (i, g.abs()))
            .collect();

        indexed_grads.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        // Zero out the smaller entries
        for (i, _) in indexed_grads.iter().skip(keep_count) {
            simplified[*i] = A::zero();
        }

        Ok(simplified)
    }

    /// Estimate the accuracy of an approximate update
    fn estimate_accuracy(&self, approximate: &Array1<A>, exact_gradient: &Array1<A>) -> A {
        if approximate.len() != exact_gradient.len() {
            return A::zero();
        }

        // Cosine similarity as the accuracy measure
        let dot_product = approximate
            .iter()
            .zip(exact_gradient.iter())
            .map(|(&a, &b)| a * b)
            .sum::<A>();

        let norm_a = approximate.iter().map(|&x| x * x).sum::<A>().sqrt();
        let norm_b = exact_gradient.iter().map(|&x| x * x).sum::<A>().sqrt();

        if norm_a == A::zero() || norm_b == A::zero() {
            A::zero()
        } else {
            dot_product / (norm_a * norm_b)
        }
    }

    /// Handle latency violations
    fn handle_latency_violation(&mut self, latency: Duration) -> Result<()> {
        self.perf_monitor.violations += 1;

        // Increase the approximation level to reduce future latency
        self.approximation_controller.increase_approximation();

        // Enable more aggressive optimizations
        if !self.config.enable_quantization
            && latency.as_micros() as u64 > self.config.max_latency_us * 2
        {
            // Placeholder: quantization could be enabled dynamically here
        }

        Ok(())
    }

    /// Get current performance metrics
    pub fn get_performance_metrics(&self) -> LowLatencyMetrics {
        LowLatencyMetrics {
            avg_latency_us: self.perf_monitor.get_average_latency().as_micros() as u64,
            p95_latency_us: self.perf_monitor.p95_latency.as_micros() as u64,
            p99_latency_us: self.perf_monitor.p99_latency.as_micros() as u64,
            latency_violations: self.perf_monitor.violations,
            total_operations: self.perf_monitor.total_operations,
            current_approximation_level: self
                .approximation_controller
                .approximation_level
                .to_f64()
                .unwrap_or(0.0),
            precomputation_hit_rate: self
                .precomputation_engine
                .as_ref()
                .map(|pe| pe.get_hit_rate())
                .unwrap_or(0.0),
            memory_efficiency: self.memory_pool.get_efficiency(),
        }
    }

    /// Check whether the optimizer is meeting its latency requirements
    pub fn is_meeting_latency_requirements(&self) -> bool {
        let avg_latency = self.perf_monitor.get_average_latency().as_micros() as u64;
        avg_latency <= self.config.target_latency_us
    }
}

// Implementation of helper structs
impl<A: Float + Send + Sync> PrecomputationEngine<A> {
    fn new(buffer_size: usize) -> Self {
        Self {
            precomputed_updates: VecDeque::with_capacity(buffer_size),
            computation_thread: None,
            gradient_predictor: GradientPredictor::new(10), // 10-step history
            max_buffer_size: buffer_size,
        }
    }

    fn try_get_precomputed(&mut self) -> Option<PrecomputedUpdate<A>> {
        // Remove expired updates
        let now = Instant::now();
        while let Some(update) = self.precomputed_updates.front() {
            if update.valid_until <= now {
                self.precomputed_updates.pop_front();
            } else {
                break;
            }
        }

        self.precomputed_updates.pop_front()
    }

    fn start_precomputation(&mut self, _gradient: &Array1<A>) {
        // Placeholder: a real implementation would kick off background
        // computation of the next update here.
    }

    fn get_hit_rate(&self) -> f64 {
        // Placeholder: returns a fixed 80% hit rate instead of tracking
        // actual hits and misses.
        0.8
    }
}

impl<A: Float + Send + Sync> LockFreeBuffer<A> {
    fn new(capacity: usize) -> Self {
        Self {
            buffer: vec![None; capacity],
            write_index: AtomicUsize::new(0),
            read_index: AtomicUsize::new(0),
            capacity,
        }
    }
}
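
// A minimal sketch of the ring-buffer operations implied by the atomic
// indices above; these helpers are illustrative additions, not part of the
// original API. With `&mut self` this is effectively single-threaded; a
// genuinely lock-free multi-thread version would need per-slot `UnsafeCell`s
// or atomics instead of exclusive access.
impl<A: Float + Send + Sync> LockFreeBuffer<A> {
    fn try_push(&mut self, value: Array1<A>) -> bool {
        let write = self.write_index.load(Ordering::Relaxed);
        let next = (write + 1) % self.capacity;
        if next == self.read_index.load(Ordering::Acquire) {
            return false; // Buffer full (one slot is sacrificed to detect this)
        }
        self.buffer[write] = Some(value);
        self.write_index.store(next, Ordering::Release);
        true
    }

    fn try_pop(&mut self) -> Option<Array1<A>> {
        let read = self.read_index.load(Ordering::Relaxed);
        if read == self.write_index.load(Ordering::Acquire) {
            return None; // Buffer empty
        }
        let value = self.buffer[read].take();
        self.read_index.store((read + 1) % self.capacity, Ordering::Release);
        value
    }
}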

impl FastMemoryPool {
    fn new(total_size: usize, block_size: usize) -> Result<Self> {
        let total_blocks = total_size / block_size;
        let mut blocks = Vec::with_capacity(total_blocks);

        // Pre-allocate all blocks
        for _ in 0..total_blocks {
            let layout = std::alloc::Layout::from_size_align(block_size, 8)
                .map_err(|_| OptimError::InvalidConfig("Invalid memory layout".to_string()))?;

            let ptr = unsafe { std::alloc::alloc(layout) };
            if ptr.is_null() {
                return Err(OptimError::InvalidConfig(
                    "Memory allocation failed".to_string(),
                ));
            }
            blocks.push(ptr);
        }

        let available_blocks = Arc::new(Mutex::new((0..total_blocks).collect()));

        Ok(Self {
            blocks,
            available_blocks,
            block_size,
            total_blocks,
        })
    }

    fn get_efficiency(&self) -> f64 {
        // Fraction of blocks currently in use
        let available = self.available_blocks.lock().unwrap().len();
        1.0 - (available as f64 / self.total_blocks as f64)
    }
}
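
// The blocks allocated in `FastMemoryPool::new` are never freed above, so the
// pool leaks its memory on drop. A Drop impl along these lines closes that
// leak, assuming the same 8-byte-aligned layout used at allocation time:
impl Drop for FastMemoryPool {
    fn drop(&mut self) {
        if let Ok(layout) = std::alloc::Layout::from_size_align(self.block_size, 8) {
            for &ptr in &self.blocks {
                // Safety: each pointer came from `std::alloc::alloc` with this
                // exact layout in `new` and is freed exactly once here.
                unsafe { std::alloc::dealloc(ptr, layout) };
            }
        }
    }
}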

impl<A: Float + Send + Sync> SIMDProcessor<A> {
    fn new(enabled: bool) -> Self {
        Self {
            enabled,
            vector_width: 8, // AVX2 width for f32
            temp_buffers: Vec::new(),
        }
    }

    fn process(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        // Simplified pass-through; a real implementation would use actual
        // SIMD instructions here.
        Ok(gradient.clone())
    }
}
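
// A sketch of what chunked, auto-vectorizable processing could look like in
// `process`. This helper is hypothetical: the clamp is an arbitrary example
// operation, and a production version would dispatch to explicit SIMD
// intrinsics rather than rely on auto-vectorization.
impl<A: Float + Send + Sync> SIMDProcessor<A> {
    fn process_chunked(&mut self, gradient: &Array1<A>, bound: A) -> Array1<A> {
        let mut out = gradient.clone();
        if let Some(slice) = out.as_slice_mut() {
            // Fixed-width chunks give the compiler straight-line inner loops
            // that it can auto-vectorize.
            for chunk in slice.chunks_mut(self.vector_width) {
                for x in chunk.iter_mut() {
                    *x = x.min(bound).max(-bound); // Clamp to [-bound, bound]
                }
            }
        }
        out
    }
}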

impl<A: Float + Send + Sync> GradientQuantizer<A> {
    fn new(bits: u8) -> Self {
        Self {
            bits,
            scale: A::one(),
            zero_point: A::zero(),
            error_accumulator: None,
        }
    }

    fn quantize(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        // Simplified uniform quantization
        let max_val = gradient
            .iter()
            .cloned()
            .fold(A::zero(), |acc, x| acc.max(x.abs()));
        let levels = A::from(2_u32.pow(self.bits as u32) - 1).unwrap();
        self.scale = max_val / levels;

        // Guard: an all-zero gradient would make the scale zero and turn the
        // division below into NaNs.
        if self.scale == A::zero() {
            return Ok(gradient.clone());
        }

        let quantized = gradient.mapv(|x| (x / self.scale).round() * self.scale);

        Ok(quantized)
    }
}
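
// The `error_accumulator` field above is declared but never used. A sketch of
// the error-feedback scheme it suggests: carry the quantization residual into
// the next step so the error is compensated over time rather than lost. This
// is an assumed design, and `quantize_with_error_feedback` is a hypothetical
// method, not part of the original implementation.
impl<A: Float + Send + Sync> GradientQuantizer<A> {
    fn quantize_with_error_feedback(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        // Add the residual carried over from the previous step, if any
        let corrected = match &self.error_accumulator {
            Some(err) if err.len() == gradient.len() => gradient + err,
            _ => gradient.clone(),
        };
        let quantized = self.quantize(&corrected)?;
        // Store this step's quantization error for the next call
        self.error_accumulator = Some(&corrected - &quantized);
        Ok(quantized)
    }
}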

impl LatencyMonitor {
    fn new(max_samples: usize) -> Self {
        Self {
            latency_samples: VecDeque::with_capacity(max_samples),
            max_samples,
            p50_latency: Duration::from_micros(0),
            p95_latency: Duration::from_micros(0),
            p99_latency: Duration::from_micros(0),
            violations: 0,
            total_operations: 0,
        }
    }

    fn record_latency(&mut self, latency: Duration) {
        self.latency_samples.push_back(latency);
        if self.latency_samples.len() > self.max_samples {
            self.latency_samples.pop_front();
        }

        self.total_operations += 1;
        self.update_percentiles();
    }

    fn update_percentiles(&mut self) {
        if self.latency_samples.is_empty() {
            return;
        }

        let mut sorted: Vec<_> = self.latency_samples.iter().cloned().collect();
        sorted.sort();

        let len = sorted.len();
        self.p50_latency = sorted[len / 2];
        self.p95_latency = sorted[(len as f64 * 0.95) as usize];
        self.p99_latency = sorted[(len as f64 * 0.99) as usize];
    }

    fn get_average_latency(&self) -> Duration {
        if self.latency_samples.is_empty() {
            Duration::from_micros(0)
        } else {
            let total: Duration = self.latency_samples.iter().sum();
            total / self.latency_samples.len() as u32
        }
    }
}

impl<A: Float + Send + Sync> ApproximationController<A> {
    fn new(target_latency: Duration) -> Self {
        Self {
            approximation_level: A::zero(),
            performance_history: VecDeque::with_capacity(100),
            adaptation_rate: A::from(0.1).unwrap(),
            target_latency,
        }
    }

    fn get_approximation_level(&self) -> A {
        self.approximation_level
    }

    fn record_performance(&mut self, latency: Duration, approximation_level: A, accuracy: A) {
        let point = PerformancePoint {
            latency,
            approximation_level,
            accuracy,
            timestamp: Instant::now(),
        };

        self.performance_history.push_back(point);
        if self.performance_history.len() > 100 {
            self.performance_history.pop_front();
        }

        self.adapt_approximation_level(latency);
    }

    fn adapt_approximation_level(&mut self, latency: Duration) {
        let latency_ratio = latency.as_micros() as f64 / self.target_latency.as_micros() as f64;

        if latency_ratio > 1.1 {
            // Latency too high: increase approximation
            self.approximation_level =
                (self.approximation_level + self.adaptation_rate).min(A::one());
        } else if latency_ratio < 0.8 {
            // Latency comfortably low: reduce approximation
            self.approximation_level =
                (self.approximation_level - self.adaptation_rate).max(A::zero());
        }
    }

    fn increase_approximation(&mut self) {
        self.approximation_level =
            (self.approximation_level + self.adaptation_rate * A::from(2.0).unwrap()).min(A::one());
    }
}

impl<A: Float + Send + Sync> GradientPredictor<A> {
    fn new(window_size: usize) -> Self {
        Self {
            gradient_history: VecDeque::with_capacity(window_size),
            trend_weights: None,
            window_size,
            confidence: A::from(0.5).unwrap(),
        }
    }
}
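
// `GradientPredictor` only has a constructor above. A minimal observe/predict
// pair consistent with its "simple linear extrapolation" doc comment might
// look like this; both methods are hypothetical additions, not original API:
impl<A: Float + Send + Sync> GradientPredictor<A> {
    fn observe(&mut self, gradient: Array1<A>) {
        if self.gradient_history.len() == self.window_size {
            self.gradient_history.pop_front();
        }
        self.gradient_history.push_back(gradient);
    }

    fn predict(&self) -> Option<Array1<A>> {
        let n = self.gradient_history.len();
        if n < 2 {
            return None;
        }
        // One-step linear extrapolation: g_next ≈ g_last + (g_last - g_prev)
        let last = &self.gradient_history[n - 1];
        let prev = &self.gradient_history[n - 2];
        Some(last + &(last - prev))
    }
}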

/// Performance metrics for low-latency optimization
#[derive(Debug, Clone)]
pub struct LowLatencyMetrics {
    /// Average latency (microseconds)
    pub avg_latency_us: u64,
    /// 95th percentile latency (microseconds)
    pub p95_latency_us: u64,
    /// 99th percentile latency (microseconds)
    pub p99_latency_us: u64,
    /// Number of latency violations
    pub latency_violations: usize,
    /// Total operations performed
    pub total_operations: usize,
    /// Current approximation level (0.0 to 1.0)
    pub current_approximation_level: f64,
    /// Pre-computation hit rate
    pub precomputation_hit_rate: f64,
    /// Memory pool efficiency
    pub memory_efficiency: f64,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::optimizers::SGD;

    #[test]
    fn test_low_latency_config() {
        let config = LowLatencyConfig::default();
        assert_eq!(config.target_latency_us, 100);
        assert!(config.enable_precomputation);
        assert!(config.enable_lock_free);
    }

    #[test]
    fn test_low_latency_optimizer_creation() {
        let sgd = SGD::new(0.01f64);
        let config = LowLatencyConfig::default();
        let result = LowLatencyOptimizer::new(sgd, config);
        assert!(result.is_ok());
    }

    #[test]
    fn test_latency_monitor() {
        let mut monitor = LatencyMonitor::new(10);

        for i in 1..=5 {
            monitor.record_latency(Duration::from_micros(i * 100));
        }

        assert_eq!(monitor.total_operations, 5);
        assert!(monitor.get_average_latency().as_micros() > 0);
    }

    #[test]
    fn test_gradient_quantizer() {
        let mut quantizer = GradientQuantizer::new(8);
        let gradient = Array1::from_vec(vec![0.1f64, 0.5, -0.3, 0.8]);

        let result = quantizer.quantize(&gradient);
        assert!(result.is_ok());

        let quantized = result.unwrap();
        assert_eq!(quantized.len(), gradient.len());
    }

    #[test]
    fn test_approximation_controller() {
        let mut controller = ApproximationController::new(Duration::from_micros(100));

        // Record a high latency; this should increase the approximation level
        controller.record_performance(Duration::from_micros(200), 0.0f64, 0.9f64);

        assert!(controller.get_approximation_level() > 0.0);
    }

    #[test]
    fn test_lock_free_buffer() {
        let buffer = LockFreeBuffer::<f64>::new(4);
        assert_eq!(buffer.capacity, 4);
        assert_eq!(buffer.write_index.load(Ordering::Relaxed), 0);
        assert_eq!(buffer.read_index.load(Ordering::Relaxed), 0);
    }
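
    // A hedged end-to-end check (assumes `SGD::new(0.01f64)` and the
    // `Optimizer::step` signature used by `exact_update` above): one
    // low-latency step should produce an update of matching length and be
    // recorded by the latency monitor.
    #[test]
    fn test_low_latency_step_records_latency() {
        let sgd = SGD::new(0.01f64);
        let mut optimizer = LowLatencyOptimizer::new(sgd, LowLatencyConfig::default()).unwrap();
        let gradient = Array1::from_vec(vec![0.1f64, -0.2, 0.3]);

        let update = optimizer.low_latency_step(&gradient).unwrap();
        assert_eq!(update.len(), gradient.len());
        assert!(optimizer.get_performance_metrics().total_operations >= 1);
    }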
}