optirs-core 0.3.1

// Low-latency optimization for real-time streaming applications
//
// This module provides specialized optimizers and techniques for applications
// that require extremely low latency updates, such as high-frequency trading,
// real-time control systems, and interactive machine learning.

use scirs2_core::ndarray::Array1;
use scirs2_core::numeric::Float;
use std::collections::VecDeque;
use std::sync::{
    atomic::{AtomicUsize, Ordering},
    Arc, Mutex,
};
use std::time::{Duration, Instant};

use crate::error::{OptimError, Result};
use crate::optimizers::Optimizer;

/// Low-latency optimization configuration
#[derive(Debug, Clone)]
pub struct LowLatencyConfig {
    /// Target latency budget (microseconds)
    pub target_latency_us: u64,

    /// Maximum acceptable latency (microseconds)
    pub max_latency_us: u64,

    /// Enable pre-computation of updates
    pub enable_precomputation: bool,

    /// Buffer size for pre-computed updates
    pub precomputation_buffer_size: usize,

    /// Enable lock-free algorithms
    pub enable_lock_free: bool,

    /// Use approximate algorithms for speed
    pub use_approximations: bool,

    /// Approximation tolerance
    pub approximation_tolerance: f64,

    /// Enable SIMD optimizations
    pub enable_simd: bool,

    /// Batch processing threshold
    pub batch_threshold: usize,

    /// Enable zero-copy operations
    pub enable_zero_copy: bool,

    /// Memory pool size for allocations
    pub memory_pool_size: usize,

    /// Enable gradient quantization
    pub enable_quantization: bool,

    /// Quantization bits
    pub quantization_bits: u8,
}

impl Default for LowLatencyConfig {
    fn default() -> Self {
        Self {
            target_latency_us: 100, // 100 microseconds
            max_latency_us: 1000,   // 1 millisecond
            enable_precomputation: true,
            precomputation_buffer_size: 64,
            enable_lock_free: true,
            use_approximations: true,
            approximation_tolerance: 0.01,
            enable_simd: true,
            batch_threshold: 8,
            enable_zero_copy: true,
            memory_pool_size: 1024 * 1024, // 1MB
            enable_quantization: false,
            quantization_bits: 8,
        }
    }
}

/// Low-latency streaming optimizer
pub struct LowLatencyOptimizer<O, A>
where
    A: Float + Send + Sync + scirs2_core::ndarray::ScalarOperand + std::fmt::Debug,
    O: Optimizer<A, scirs2_core::ndarray::Ix1> + Send + Sync,
{
    /// Base optimizer
    base_optimizer: Arc<Mutex<O>>,

    /// Configuration
    config: LowLatencyConfig,

    /// Pre-computation engine
    precomputation_engine: Option<PrecomputationEngine<A>>,

    /// Lock-free update buffer
    update_buffer: LockFreeBuffer<A>,

    /// Memory pool for fast allocations
    memory_pool: FastMemoryPool,

    /// SIMD processor
    simd_processor: SIMDProcessor<A>,

    /// Quantization engine
    quantizer: Option<GradientQuantizer<A>>,

    /// Performance monitor
    perf_monitor: LatencyMonitor,

    /// Approximation controller
    approximation_controller: ApproximationController<A>,

    /// Step counter (atomic for thread safety)
    step_counter: AtomicUsize,
}

/// Pre-computation engine for preparing updates in advance
struct PrecomputationEngine<A: Float + Send + Sync> {
    /// Buffer of pre-computed updates
    precomputed_updates: VecDeque<PrecomputedUpdate<A>>,

    /// Background computation thread
    computation_thread: Option<std::thread::JoinHandle<()>>,

    /// Prediction model for future gradients
    gradient_predictor: GradientPredictor<A>,

    /// Maximum buffer size
    max_buffer_size: usize,
}

/// Pre-computed update entry
#[derive(Debug, Clone)]
struct PrecomputedUpdate<A: Float + Send + Sync> {
    /// Predicted gradient
    gradient: Array1<A>,

    /// Pre-computed parameter update
    update: Array1<A>,

    /// Validity timestamp
    valid_until: Instant,

    /// Confidence score
    confidence: A,
}

/// Lock-free circular buffer for updates
struct LockFreeBuffer<A: Float + Send + Sync> {
    /// Buffer storage
    buffer: Vec<Option<Array1<A>>>,

    /// Write index (atomic)
    write_index: AtomicUsize,

    /// Read index (atomic)
    read_index: AtomicUsize,

    /// Buffer capacity
    capacity: usize,
}

/// Fast memory pool for low-latency allocations
struct FastMemoryPool {
    /// Pre-allocated memory blocks
    blocks: Vec<*mut u8>,

    /// Available blocks queue
    available_blocks: Arc<Mutex<VecDeque<usize>>>,

    /// Block size
    blocksize: usize,

    /// Total blocks
    total_blocks: usize,
}

/// SIMD processor for vectorized operations
struct SIMDProcessor<A: Float + Send + Sync> {
    /// Enable SIMD flag
    enabled: bool,

    /// Vector width
    vector_width: usize,

    /// Temporary buffers for SIMD operations
    temp_buffers: Vec<Array1<A>>,
}

/// Gradient quantization for reduced precision
struct GradientQuantizer<A: Float + Send + Sync> {
    /// Quantization bits
    bits: u8,

    /// Quantization scale
    scale: A,

    /// Zero point
    zero_point: A,

    /// Quantization error accumulator
    error_accumulator: Option<Array1<A>>,
}

/// Latency monitoring and profiling
#[derive(Debug)]
struct LatencyMonitor {
    /// Recent latency samples
    latency_samples: VecDeque<Duration>,

    /// Maximum samples to keep
    maxsamples: usize,

    /// Current percentiles
    p50_latency: Duration,
    p95_latency: Duration,
    p99_latency: Duration,

    /// Violation count
    violations: usize,

    /// Total operations
    total_operations: usize,
}

/// Approximation controller for trading accuracy for speed
struct ApproximationController<A: Float + Send + Sync> {
    /// Current approximation level (0.0 = exact, 1.0 = maximum approximation)
    approximation_level: A,

    /// Performance history
    performance_history: VecDeque<PerformancePoint<A>>,

    /// Adaptation rate
    adaptation_rate: A,

    /// Target latency
    targetlatency: Duration,
}

/// Performance measurement point
#[derive(Debug, Clone)]
struct PerformancePoint<A: Float + Send + Sync> {
    /// Latency measurement
    latency: Duration,

    /// Approximation level used
    approximation_level: A,

    /// Accuracy achieved
    accuracy: A,

    /// Timestamp
    timestamp: Instant,
}

/// Gradient predictor for pre-computation
struct GradientPredictor<A: Float + Send + Sync> {
    /// Recent gradient history
    gradient_history: VecDeque<Array1<A>>,

    /// Prediction model (simple linear extrapolation)
    trend_weights: Option<Array1<A>>,

    /// History window size
    windowsize: usize,

    /// Prediction confidence
    confidence: A,
}

impl<O, A> LowLatencyOptimizer<O, A>
where
    A: Float
        + Send
        + Sync
        + Default
        + Clone
        + std::fmt::Debug
        + scirs2_core::ndarray::ScalarOperand
        + 'static
        + std::iter::Sum,
    O: Optimizer<A, scirs2_core::ndarray::Ix1> + Send + Sync + 'static,
{
    /// Create a new low-latency optimizer
    pub fn new(_baseoptimizer: O, config: LowLatencyConfig) -> Result<Self> {
        let base_optimizer = Arc::new(Mutex::new(_baseoptimizer));

        let precomputation_engine = if config.enable_precomputation {
            Some(PrecomputationEngine::new(config.precomputation_buffer_size))
        } else {
            None
        };

        let update_buffer = LockFreeBuffer::new(config.precomputation_buffer_size);
        let memory_pool = FastMemoryPool::new(config.memory_pool_size, 4096)?; // 4KB blocks
        let simd_processor = SIMDProcessor::new(config.enable_simd);

        let quantizer = if config.enable_quantization {
            Some(GradientQuantizer::new(config.quantization_bits))
        } else {
            None
        };

        let perf_monitor = LatencyMonitor::new(1000); // Keep 1000 samples
        let approximation_controller =
            ApproximationController::new(Duration::from_micros(config.target_latency_us));

        Ok(Self {
            base_optimizer,
            config,
            precomputation_engine,
            update_buffer,
            memory_pool,
            simd_processor,
            quantizer,
            perf_monitor,
            approximation_controller,
            step_counter: AtomicUsize::new(0),
        })
    }

    /// Perform a low-latency update
    pub fn low_latency_step(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        let start_time = Instant::now();

        // Try to use pre-computed update first
        if let Some(ref mut precomp) = self.precomputation_engine {
            if let Some(precomputed) = precomp.try_get_precomputed() {
                let latency = start_time.elapsed();
                self.perf_monitor.record_latency(latency);
                return Ok(precomputed.update);
            }
        }

        // Quantize gradient if enabled
        let processed_gradient = if let Some(ref mut quantizer) = self.quantizer {
            quantizer.quantize(gradient)?
        } else {
            gradient.clone()
        };

        // Use approximation if necessary to meet latency budget
        let approximation_level = self.approximation_controller.get_approximation_level();
        let update = if approximation_level > A::zero() {
            self.approximate_update(&processed_gradient, approximation_level)?
        } else {
            self.exact_update(&processed_gradient)?
        };

        let latency = start_time.elapsed();

        // Record performance and adapt approximation level
        let accuracy = self.estimate_accuracy(&update, gradient);
        self.approximation_controller
            .record_performance(latency, approximation_level, accuracy);
        self.perf_monitor.record_latency(latency);

        // Check for latency violations
        if latency.as_micros() as u64 > self.config.max_latency_us {
            self.handle_latency_violation(latency)?;
        }

        // Start pre-computation for next step
        if let Some(ref mut precomp) = self.precomputation_engine {
            precomp.start_precomputation(gradient);
        }

        self.step_counter.fetch_add(1, Ordering::Relaxed);
        Ok(update)
    }

    /// Perform exact update using base optimizer
    fn exact_update(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        // This is a simplified version - in practice would get current parameters
        let current_params = Array1::zeros(gradient.len());

        let mut optimizer = self.base_optimizer.lock().expect("lock poisoned");
        optimizer.step(&current_params, gradient)
    }

    /// Perform approximate update for speed
    fn approximate_update(
        &mut self,
        gradient: &Array1<A>,
        approximation_level: A,
    ) -> Result<Array1<A>> {
        // Simplified approximation: reduce precision or use fewer operations
        let simplified_gradient = if approximation_level > A::from(0.5).expect("unwrap failed") {
            self.simplify_gradient(gradient, approximation_level)?
        } else {
            gradient.clone()
        };

        // Use SIMD for fast computation
        if self.simd_processor.enabled {
            self.simd_processor.process(&simplified_gradient)
        } else {
            self.exact_update(&simplified_gradient)
        }
    }

    /// Simplify gradient for approximation
    fn simplify_gradient(&self, gradient: &Array1<A>, level: A) -> Result<Array1<A>> {
        let mut simplified = gradient.clone();

        // Sparsify gradient based on approximation level
        let sparsity_ratio = level.to_f64().unwrap_or(0.0);
        let keep_ratio = 1.0 - sparsity_ratio * 0.8; // Keep 20% to 100% of gradients
        let keep_count = ((gradient.len() as f64) * keep_ratio) as usize;

        // Keep only the largest magnitude gradients
        let mut indexed_grads: Vec<(usize, A)> = gradient
            .iter()
            .enumerate()
            .map(|(i, &g)| (i, g.abs()))
            .collect();

        indexed_grads.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        // Zero out smaller gradients
        for (i, _) in indexed_grads.iter().skip(keep_count) {
            simplified[*i] = A::zero();
        }

        Ok(simplified)
    }

    /// Estimate accuracy of approximate update
    fn estimate_accuracy(&self, approximate: &Array1<A>, exactgradient: &Array1<A>) -> A {
        if approximate.len() != exactgradient.len() {
            return A::zero();
        }

        // Cosine similarity as accuracy measure
        let dot_product = approximate
            .iter()
            .zip(exactgradient.iter())
            .map(|(&a, &b)| a * b)
            .sum::<A>();

        let norm_a = approximate.iter().map(|&x| x * x).sum::<A>().sqrt();
        let norm_b = exactgradient.iter().map(|&x| x * x).sum::<A>().sqrt();

        if norm_a == A::zero() || norm_b == A::zero() {
            A::zero()
        } else {
            dot_product / (norm_a * norm_b)
        }
    }

    /// Handle latency violations
    fn handle_latency_violation(&mut self, latency: Duration) -> Result<()> {
        // Increase approximation level to reduce future latency
        self.approximation_controller.increase_approximation();

        // Enable more aggressive optimizations
        if !self.config.enable_quantization
            && latency.as_micros() as u64 > self.config.max_latency_us * 2
        {
            // Could dynamically enable quantization
        }

        Ok(())
    }

    /// Get current performance metrics
    pub fn get_performance_metrics(&self) -> LowLatencyMetrics {
        LowLatencyMetrics {
            avg_latency_us: self.perf_monitor.get_average_latency().as_micros() as u64,
            p95_latency_us: self.perf_monitor.p95_latency.as_micros() as u64,
            p99_latency_us: self.perf_monitor.p99_latency.as_micros() as u64,
            latency_violations: self.perf_monitor.violations,
            total_operations: self.perf_monitor.total_operations,
            current_approximation_level: self
                .approximation_controller
                .approximation_level
                .to_f64()
                .unwrap_or(0.0),
            precomputation_hit_rate: self
                .precomputation_engine
                .as_ref()
                .map(|pe| pe.get_hit_rate())
                .unwrap_or(0.0),
            memory_efficiency: self.memory_pool.get_efficiency(),
        }
    }

    /// Check if optimizer is meeting latency requirements
    pub fn is_meeting_latency_requirements(&self) -> bool {
        let avg_latency = self.perf_monitor.get_average_latency().as_micros() as u64;
        avg_latency <= self.config.target_latency_us
    }
}

// Implementation of helper structs
impl<A: Float + Send + Sync + Send + Sync> PrecomputationEngine<A> {
    fn new(_buffersize: usize) -> Self {
        Self {
            precomputed_updates: VecDeque::with_capacity(_buffersize),
            computation_thread: None,
            gradient_predictor: GradientPredictor::new(10), // 10-step history
            max_buffer_size: _buffersize,
        }
    }

    fn try_get_precomputed(&mut self) -> Option<PrecomputedUpdate<A>> {
        // Remove expired updates
        let now = Instant::now();
        while let Some(update) = self.precomputed_updates.front() {
            if update.valid_until <= now {
                self.precomputed_updates.pop_front();
            } else {
                break;
            }
        }

        self.precomputed_updates.pop_front()
    }

    fn start_precomputation(&mut self, gradient: &Array1<A>) {
        // In a real implementation, would start background computation
        // For now, just placeholder
    }

    fn get_hit_rate(&self) -> f64 {
        // Simplified hit rate calculation
        0.8 // 80% hit rate
    }
}

impl<A: Float + Send + Sync + Send + Sync> LockFreeBuffer<A> {
    fn new(capacity: usize) -> Self {
        Self {
            buffer: vec![None; capacity],
            write_index: AtomicUsize::new(0),
            read_index: AtomicUsize::new(0),
            capacity,
        }
    }
}

impl FastMemoryPool {
    fn new(_total_size: usize, blocksize: usize) -> Result<Self> {
        let total_blocks = _total_size / blocksize;
        let mut blocks = Vec::with_capacity(total_blocks);

        // Pre-allocate all blocks
        for _ in 0..total_blocks {
            let layout = std::alloc::Layout::from_size_align(blocksize, 8)
                .map_err(|_| OptimError::InvalidConfig("Invalid memory layout".to_string()))?;

            let ptr = unsafe { std::alloc::alloc(layout) };
            if ptr.is_null() {
                return Err(OptimError::InvalidConfig(
                    "Memory allocation failed".to_string(),
                ));
            }
            blocks.push(ptr);
        }

        let available_blocks = Arc::new(Mutex::new((0..total_blocks).collect()));

        Ok(Self {
            blocks,
            available_blocks,
            blocksize,
            total_blocks,
        })
    }

    fn get_efficiency(&self) -> f64 {
        let available = self.available_blocks.lock().expect("lock poisoned").len();
        1.0 - (available as f64 / self.total_blocks as f64)
    }
}

impl<A: Float + Send + Sync + Send + Sync> SIMDProcessor<A> {
    fn new(enabled: bool) -> Self {
        Self {
            enabled,
            vector_width: 8, // AVX2 width for f32
            temp_buffers: Vec::new(),
        }
    }

    fn process(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        // Simplified SIMD processing - in practice would use actual SIMD instructions
        Ok(gradient.clone())
    }
}

impl<A: Float + Send + Sync + Send + Sync> GradientQuantizer<A> {
    fn new(bits: u8) -> Self {
        Self {
            bits,
            scale: A::one(),
            zero_point: A::zero(),
            error_accumulator: None,
        }
    }

    fn quantize(&mut self, gradient: &Array1<A>) -> Result<Array1<A>> {
        // Simplified quantization
        let max_val = gradient
            .iter()
            .cloned()
            .fold(A::zero(), |acc, x| acc.max(x.abs()));
        let levels = A::from(2_u32.pow(self.bits as u32) - 1).expect("unwrap failed");
        self.scale = max_val / levels;

        let quantized = gradient.mapv(|x| (x / self.scale).round() * self.scale);

        Ok(quantized)
    }
}

impl LatencyMonitor {
    fn new(maxsamples: usize) -> Self {
        Self {
            latency_samples: VecDeque::with_capacity(maxsamples),
            maxsamples,
            p50_latency: Duration::from_micros(0),
            p95_latency: Duration::from_micros(0),
            p99_latency: Duration::from_micros(0),
            violations: 0,
            total_operations: 0,
        }
    }

    fn record_latency(&mut self, latency: Duration) {
        self.latency_samples.push_back(latency);
        if self.latency_samples.len() > self.maxsamples {
            self.latency_samples.pop_front();
        }

        self.total_operations += 1;
        self.update_percentiles();
    }

    fn update_percentiles(&mut self) {
        if self.latency_samples.is_empty() {
            return;
        }

        let mut sorted: Vec<_> = self.latency_samples.iter().cloned().collect();
        sorted.sort();

        let len = sorted.len();
        self.p50_latency = sorted[len / 2];
        self.p95_latency = sorted[(len as f64 * 0.95) as usize];
        self.p99_latency = sorted[(len as f64 * 0.99) as usize];
    }

    fn get_average_latency(&self) -> Duration {
        if self.latency_samples.is_empty() {
            Duration::from_micros(0)
        } else {
            let total: Duration = self.latency_samples.iter().sum();
            total / self.latency_samples.len() as u32
        }
    }
}

impl<A: Float + Send + Sync + Send + Sync> ApproximationController<A> {
    fn new(targetlatency: Duration) -> Self {
        Self {
            approximation_level: A::zero(),
            performance_history: VecDeque::with_capacity(100),
            adaptation_rate: A::from(0.1).expect("unwrap failed"),
            targetlatency,
        }
    }

    fn get_approximation_level(&self) -> A {
        self.approximation_level
    }

    fn record_performance(&mut self, latency: Duration, approximation_level: A, accuracy: A) {
        let point = PerformancePoint {
            latency,
            approximation_level,
            accuracy,
            timestamp: Instant::now(),
        };

        self.performance_history.push_back(point);
        if self.performance_history.len() > 100 {
            self.performance_history.pop_front();
        }

        self.adapt_approximation_level(latency);
    }

    fn adapt_approximation_level(&mut self, latency: Duration) {
        let latency_ratio = latency.as_micros() as f64 / self.targetlatency.as_micros() as f64;

        if latency_ratio > 1.1 {
            // Latency too high, increase approximation
            self.approximation_level =
                (self.approximation_level + self.adaptation_rate).min(A::one());
        } else if latency_ratio < 0.8 {
            // Latency low, can reduce approximation
            self.approximation_level =
                (self.approximation_level - self.adaptation_rate).max(A::zero());
        }
    }

    fn increase_approximation(&mut self) {
        self.approximation_level = (self.approximation_level
            + self.adaptation_rate * A::from(2.0).expect("unwrap failed"))
        .min(A::one());
    }
}

impl<A: Float + Send + Sync + Send + Sync> GradientPredictor<A> {
    fn new(windowsize: usize) -> Self {
        Self {
            gradient_history: VecDeque::with_capacity(windowsize),
            trend_weights: None,
            windowsize,
            confidence: A::from(0.5).expect("unwrap failed"),
        }
    }
}

/// Performance metrics for low-latency optimization
#[derive(Debug, Clone)]
pub struct LowLatencyMetrics {
    /// Average latency (microseconds)
    pub avg_latency_us: u64,
    /// 95th percentile latency (microseconds)
    pub p95_latency_us: u64,
    /// 99th percentile latency (microseconds)
    pub p99_latency_us: u64,
    /// Number of latency violations
    pub latency_violations: usize,
    /// Total operations performed
    pub total_operations: usize,
    /// Current approximation level (0.0 to 1.0)
    pub current_approximation_level: f64,
    /// Pre-computation hit rate
    pub precomputation_hit_rate: f64,
    /// Memory pool efficiency
    pub memory_efficiency: f64,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::optimizers::SGD;

    #[test]
    fn test_low_latency_config() {
        let config = LowLatencyConfig::default();
        assert_eq!(config.target_latency_us, 100);
        assert!(config.enable_precomputation);
        assert!(config.enable_lock_free);
    }

    #[test]
    fn test_low_latency_optimizer_creation() {
        let sgd = SGD::new(0.01f64);
        let config = LowLatencyConfig::default();
        let result = LowLatencyOptimizer::new(sgd, config);
        assert!(result.is_ok());
    }

    #[test]
    fn test_latency_monitor() {
        let mut monitor = LatencyMonitor::new(10);

        for i in 1..=5 {
            monitor.record_latency(Duration::from_micros(i * 100));
        }

        assert_eq!(monitor.total_operations, 5);
        assert!(monitor.get_average_latency().as_micros() > 0);
    }

    #[test]
    fn test_gradient_quantizer() {
        let mut quantizer = GradientQuantizer::new(8);
        let gradient = Array1::from_vec(vec![0.1f64, 0.5, -0.3, 0.8]);

        let result = quantizer.quantize(&gradient);
        assert!(result.is_ok());

        let quantized = result.expect("unwrap failed");
        assert_eq!(quantized.len(), gradient.len());
    }

    #[test]
    fn test_approximation_controller() {
        let mut controller = ApproximationController::new(Duration::from_micros(100));

        // Record high latency - should increase approximation
        controller.record_performance(Duration::from_micros(200), 0.0f64, 0.9f64);

        assert!(controller.get_approximation_level() > 0.0);
    }

    #[test]
    fn test_lock_free_buffer() {
        let buffer = LockFreeBuffer::<f64>::new(4);
        assert_eq!(buffer.capacity, 4);
        assert_eq!(buffer.write_index.load(Ordering::Relaxed), 0);
        assert_eq!(buffer.read_index.load(Ordering::Relaxed), 0);
    }
}