tenflowers-core 0.1.1

use crate::{Result, Tensor};
use scirs2_core::metrics::{Histogram, Timer};
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex, OnceLock};

/// Ultra-performance activation operations registry and analytics
pub struct ActivationRegistry {
    /// Function execution counters
    function_counters: Arc<Mutex<std::collections::HashMap<String, AtomicU64>>>,
    /// SIMD acceleration usage
    simd_usage: AtomicU64,
    /// Parallel processing usage
    parallel_usage: AtomicU64,
    /// GPU acceleration usage
    gpu_usage: AtomicU64,
    /// Approximation algorithm usage
    approximation_usage: AtomicU64,
    /// Execution timer
    #[allow(dead_code)]
    execution_timer: Timer,
    /// Throughput metrics
    throughput_histogram: Histogram,
}

impl ActivationRegistry {
    pub fn new() -> Self {
        Self {
            function_counters: Arc::new(Mutex::new(std::collections::HashMap::new())),
            simd_usage: AtomicU64::new(0),
            parallel_usage: AtomicU64::new(0),
            gpu_usage: AtomicU64::new(0),
            approximation_usage: AtomicU64::new(0),
            execution_timer: Timer::new("activation.execution_time".to_string()),
            throughput_histogram: Histogram::new("activation.throughput".to_string()),
        }
    }

    pub fn record_function(&self, name: &str, elements: usize, duration_ns: u64) {
        // Update function counter
        {
            let mut counters = self
                .function_counters
                .lock()
                .expect("lock should not be poisoned");
            counters
                .entry(name.to_string())
                .or_insert_with(|| AtomicU64::new(0))
                .fetch_add(1, Ordering::Relaxed);
        }

        // Calculate throughput (millions of elements per second)
        let throughput_meps = (elements as f64 * 1e9) / (duration_ns as f64 * 1e6);
        self.throughput_histogram.observe(throughput_meps);
    }

    pub fn record_simd(&self) {
        self.simd_usage.fetch_add(1, Ordering::Relaxed);
    }
    pub fn record_parallel(&self) {
        self.parallel_usage.fetch_add(1, Ordering::Relaxed);
    }
    pub fn record_gpu(&self) {
        self.gpu_usage.fetch_add(1, Ordering::Relaxed);
    }
    pub fn record_approximation(&self) {
        self.approximation_usage.fetch_add(1, Ordering::Relaxed);
    }

    pub fn get_analytics(&self) -> ActivationAnalytics {
        let counters = self
            .function_counters
            .lock()
            .expect("lock should not be poisoned");
        let function_counts: std::collections::HashMap<String, u64> = counters
            .iter()
            .map(|(k, v)| (k.clone(), v.load(Ordering::Relaxed)))
            .collect();

        ActivationAnalytics {
            function_counts,
            simd_accelerations: self.simd_usage.load(Ordering::Relaxed),
            parallel_executions: self.parallel_usage.load(Ordering::Relaxed),
            gpu_executions: self.gpu_usage.load(Ordering::Relaxed),
            approximation_usages: self.approximation_usage.load(Ordering::Relaxed),
            avg_throughput_meps: 150.0, // Placeholder - real implementation would calculate from histogram
        }
    }
}

impl Default for ActivationRegistry {
    fn default() -> Self {
        Self::new()
    }
}

#[derive(Debug, Clone)]
pub struct ActivationAnalytics {
    pub function_counts: std::collections::HashMap<String, u64>,
    pub simd_accelerations: u64,
    pub parallel_executions: u64,
    pub gpu_executions: u64,
    pub approximation_usages: u64,
    pub avg_throughput_meps: f64, // Millions of elements per second
}

/// Global activation registry
static ACTIVATION_REGISTRY: OnceLock<ActivationRegistry> = OnceLock::new();

/// Get global activation registry
pub fn get_activation_registry() -> &'static ActivationRegistry {
    ACTIVATION_REGISTRY.get_or_init(ActivationRegistry::new)
}

/// Algorithm selection strategy for activation functions
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)]
pub enum ActivationStrategy {
    Sequential, // Basic sequential processing
    Simd,       // SIMD vectorization
    #[allow(dead_code)]
    Parallel, // Parallel processing
    SimdParallel, // SIMD + Parallel
    #[allow(dead_code)]
    Gpu, // GPU acceleration
    Approximation, // Fast approximation algorithms
    #[allow(dead_code)]
    LookupTable, // Pre-computed lookup tables
}

/// Core trait for activation functions
pub trait ActivationFunction<T> {
    fn apply(&self, x: &Tensor<T>) -> Result<Tensor<T>>;
}

/// Ultra-performance convenience functions with comprehensive analytics
pub fn get_activation_performance_report() -> ActivationAnalytics {
    get_activation_registry().get_analytics()
}

/// Reset activation performance counters
pub fn reset_activation_counters() {
    // Implementation would reset all counters - simplified for now
}