ronn-api 0.1.0 - Docs.rs

//! Batch processing for high-throughput inference
//!
//! Provides static and dynamic batching to maximize GPU/CPU utilization
//! and achieve 3-10x throughput improvements.

use crate::InferenceSession;
use crate::error::{Error, Result};
use ronn_core::tensor::Tensor;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::{RwLock, mpsc};
use tokio::time::timeout;

/// Batch processing strategy
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BatchStrategy {
    /// Fixed batch size - waits until batch is full
    Static {
        /// Target batch size
        batch_size: usize,
    },
    /// Dynamic batching - fills batch up to max size or timeout
    Dynamic {
        /// Maximum batch size
        max_batch_size: usize,
        /// Maximum wait time before processing partial batch
        timeout_ms: u64,
    },
}

impl Default for BatchStrategy {
    fn default() -> Self {
        Self::Dynamic {
            max_batch_size: 32,
            timeout_ms: 10,
        }
    }
}

/// Configuration for batch processor
#[derive(Debug, Clone)]
pub struct BatchConfig {
    /// Batching strategy
    pub strategy: BatchStrategy,
    /// Queue capacity for incoming requests
    pub queue_capacity: usize,
    /// Number of worker threads
    pub num_workers: usize,
}

impl Default for BatchConfig {
    fn default() -> Self {
        Self {
            strategy: BatchStrategy::default(),
            queue_capacity: 1024,
            num_workers: 1,
        }
    }
}

/// A single inference request
pub struct BatchRequest {
    /// Input tensors for this request
    pub inputs: HashMap<String, Tensor>,
    /// Channel to send result back
    response_tx: tokio::sync::oneshot::Sender<Result<HashMap<String, Tensor>>>,
}

impl BatchRequest {
    /// Create a new batch request
    pub fn new(
        inputs: HashMap<String, Tensor>,
        response_tx: tokio::sync::oneshot::Sender<Result<HashMap<String, Tensor>>>,
    ) -> Self {
        Self {
            inputs,
            response_tx,
        }
    }

    /// Send the response back to the caller
    fn send_response(self, result: Result<HashMap<String, Tensor>>) {
        let _ = self.response_tx.send(result);
    }
}

/// Batch processor for high-throughput inference
///
/// Automatically batches incoming requests according to the configured strategy,
/// executes them in a single forward pass, and returns individual results.
///
/// # Performance
///
/// - Static batching: 3-5x throughput for stable workloads
/// - Dynamic batching: 5-10x throughput with variable request rates
/// - Optimal for GPU inference where batch processing is highly efficient
///
/// # Example
///
/// ```ignore
/// use ronn_api::{Model, SessionOptions, BatchProcessor, BatchConfig, BatchStrategy};
/// use std::collections::HashMap;
///
/// async fn example() -> Result<(), Box<dyn std::error::Error>> {
///     let model = Model::load("model.onnx")?;
///     let session = model.create_session(SessionOptions::default())?;
///
///     let config = BatchConfig {
///         strategy: BatchStrategy::Dynamic {
///             max_batch_size: 32,
///             timeout_ms: 10,
///         },
///         ..Default::default()
///     };
///
///     let processor = BatchProcessor::new(session, config);
///
///     // Submit requests - they will be automatically batched
///     let inputs = HashMap::new(); // Add your inputs here
///     let output = processor.process(inputs).await?;
///     Ok(())
/// }
/// ```
pub struct BatchProcessor {
    /// Request queue
    request_tx: mpsc::Sender<BatchRequest>,
    /// Worker handle
    _worker_handle: tokio::task::JoinHandle<()>,
    /// Configuration
    config: BatchConfig,
}

impl BatchProcessor {
    /// Create a new batch processor
    pub fn new(session: InferenceSession, config: BatchConfig) -> Self {
        let (request_tx, request_rx) = mpsc::channel(config.queue_capacity);

        let worker_config = config.clone();
        let worker_handle = tokio::spawn(async move {
            Self::worker_loop(session, request_rx, worker_config).await;
        });

        Self {
            request_tx,
            _worker_handle: worker_handle,
            config,
        }
    }

    /// Submit a request for batch processing
    ///
    /// # Arguments
    ///
    /// * `inputs` - Input tensors for inference
    ///
    /// # Returns
    ///
    /// Future that resolves to the inference outputs
    pub async fn process(
        &self,
        inputs: HashMap<String, Tensor>,
    ) -> Result<HashMap<String, Tensor>> {
        let (response_tx, response_rx) = tokio::sync::oneshot::channel();

        let request = BatchRequest::new(inputs, response_tx);

        self.request_tx
            .send(request)
            .await
            .map_err(|_| Error::InferenceError("Batch processor channel closed".to_string()))?;

        response_rx
            .await
            .map_err(|_| Error::InferenceError("Response channel closed".to_string()))?
    }

    /// Main worker loop - collects requests and processes batches
    async fn worker_loop(
        session: InferenceSession,
        mut request_rx: mpsc::Receiver<BatchRequest>,
        config: BatchConfig,
    ) {
        let session = Arc::new(RwLock::new(session));

        loop {
            match config.strategy {
                BatchStrategy::Static { batch_size } => {
                    let batch = Self::collect_static_batch(&mut request_rx, batch_size).await;
                    if batch.is_empty() {
                        break; // Channel closed
                    }
                    Self::process_batch(session.clone(), batch).await;
                }
                BatchStrategy::Dynamic {
                    max_batch_size,
                    timeout_ms,
                } => {
                    let batch =
                        Self::collect_dynamic_batch(&mut request_rx, max_batch_size, timeout_ms)
                            .await;
                    if batch.is_empty() {
                        break; // Channel closed
                    }
                    Self::process_batch(session.clone(), batch).await;
                }
            }
        }
    }

    /// Collect a static batch - waits until batch_size requests are available
    async fn collect_static_batch(
        request_rx: &mut mpsc::Receiver<BatchRequest>,
        batch_size: usize,
    ) -> Vec<BatchRequest> {
        let mut batch = Vec::with_capacity(batch_size);

        for _ in 0..batch_size {
            match request_rx.recv().await {
                Some(request) => batch.push(request),
                None => break, // Channel closed
            }
        }

        batch
    }

    /// Collect a dynamic batch - fills up to max_batch_size or until timeout
    async fn collect_dynamic_batch(
        request_rx: &mut mpsc::Receiver<BatchRequest>,
        max_batch_size: usize,
        timeout_ms: u64,
    ) -> Vec<BatchRequest> {
        let mut batch = Vec::with_capacity(max_batch_size);
        let deadline = Duration::from_millis(timeout_ms);

        // Get first request (blocking)
        match request_rx.recv().await {
            Some(request) => batch.push(request),
            None => return batch, // Channel closed
        }

        // Collect additional requests until timeout or batch full
        let start = Instant::now();
        while batch.len() < max_batch_size {
            let remaining = deadline.saturating_sub(start.elapsed());
            if remaining.is_zero() {
                break;
            }

            match timeout(remaining, request_rx.recv()).await {
                Ok(Some(request)) => batch.push(request),
                Ok(None) => break, // Channel closed
                Err(_) => break,   // Timeout
            }
        }

        batch
    }

    /// Process a batch of requests
    async fn process_batch(session: Arc<RwLock<InferenceSession>>, batch: Vec<BatchRequest>) {
        if batch.is_empty() {
            return;
        }

        // Combine inputs into batched tensors
        let batch_size = batch.len();
        let combined_inputs = match Self::combine_inputs(&batch) {
            Ok(inputs) => inputs,
            Err(e) => {
                // Send error to all requests
                let err_msg = format!("{}", e);
                for request in batch {
                    request.send_response(Err(Error::InferenceError(err_msg.clone())));
                }
                return;
            }
        };

        // Convert HashMap<String, Tensor> to HashMap<&str, Tensor>
        let inputs_ref: HashMap<&str, Tensor> = combined_inputs
            .iter()
            .map(|(k, v)| (k.as_str(), v.clone()))
            .collect();

        // Run inference on batched inputs
        let session = session.read().await;
        let combined_outputs = match session.run(inputs_ref) {
            Ok(outputs) => outputs,
            Err(e) => {
                // Send error to all requests
                let err_msg = format!("{}", e);
                for request in batch {
                    request.send_response(Err(Error::InferenceError(err_msg.clone())));
                }
                return;
            }
        };

        // Split outputs and send back to individual requests
        match Self::split_outputs(combined_outputs, batch_size) {
            Ok(individual_outputs) => {
                for (request, outputs) in batch.into_iter().zip(individual_outputs) {
                    request.send_response(Ok(outputs));
                }
            }
            Err(e) => {
                // Send error to all requests
                let err_msg = format!("{}", e);
                for request in batch {
                    request.send_response(Err(Error::InferenceError(err_msg.clone())));
                }
            }
        }
    }

    /// Combine multiple requests' inputs into batched tensors
    fn combine_inputs(batch: &[BatchRequest]) -> Result<HashMap<String, Tensor>> {
        if batch.is_empty() {
            return Ok(HashMap::new());
        }

        // Get all input names from first request
        let input_names: Vec<_> = batch[0].inputs.keys().cloned().collect();

        let mut combined = HashMap::new();

        for name in input_names {
            // Collect all tensors for this input
            let tensors: std::result::Result<Vec<_>, Error> = batch
                .iter()
                .map(|req| {
                    req.inputs.get(&name).ok_or_else(|| {
                        Error::InvalidInput(format!("Missing input tensor: {}", name))
                    })
                })
                .collect();
            let tensors = tensors?;

            // Stack tensors along batch dimension (dim 0)
            let batched = Tensor::stack(&tensors, 0)
                .map_err(|e| Error::InferenceError(format!("Failed to stack tensors: {}", e)))?;
            combined.insert(name, batched);
        }

        Ok(combined)
    }

    /// Split batched outputs into individual results
    fn split_outputs(
        combined: HashMap<String, Tensor>,
        batch_size: usize,
    ) -> Result<Vec<HashMap<String, Tensor>>> {
        let mut results = vec![HashMap::new(); batch_size];

        for (name, batched_tensor) in combined {
            // Split along batch dimension (dim 0)
            let individual_tensors = batched_tensor
                .split(batch_size, 0)
                .map_err(|e| Error::InferenceError(format!("Failed to split tensors: {}", e)))?;

            for (i, tensor) in individual_tensors.into_iter().enumerate() {
                results[i].insert(name.clone(), tensor);
            }
        }

        Ok(results)
    }

    /// Get the current configuration
    pub fn config(&self) -> &BatchConfig {
        &self.config
    }
}

/// Statistics about batch processing
#[derive(Debug, Clone, Default)]
pub struct BatchStats {
    /// Total number of batches processed
    pub total_batches: u64,
    /// Total number of individual requests processed
    pub total_requests: u64,
    /// Average batch size
    pub avg_batch_size: f64,
    /// Maximum batch size seen
    pub max_batch_size: usize,
    /// Minimum batch size seen
    pub min_batch_size: usize,
    /// Total processing time
    pub total_processing_time_ms: f64,
    /// Average processing time per batch
    pub avg_batch_time_ms: f64,
}

impl BatchStats {
    /// Calculate throughput (requests per second)
    pub fn throughput(&self) -> f64 {
        if self.total_processing_time_ms == 0.0 {
            0.0
        } else {
            (self.total_requests as f64 * 1000.0) / self.total_processing_time_ms
        }
    }

    /// Calculate batch utilization (actual vs max batch size)
    pub fn utilization(&self, max_batch_size: usize) -> f64 {
        if max_batch_size == 0 {
            0.0
        } else {
            self.avg_batch_size / max_batch_size as f64
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_batch_config_default() {
        let config = BatchConfig::default();
        assert_eq!(config.queue_capacity, 1024);
        assert_eq!(config.num_workers, 1);
        match config.strategy {
            BatchStrategy::Dynamic {
                max_batch_size,
                timeout_ms,
            } => {
                assert_eq!(max_batch_size, 32);
                assert_eq!(timeout_ms, 10);
            }
            _ => panic!("Expected dynamic strategy"),
        }
    }

    #[test]
    fn test_batch_strategy_static() {
        let strategy = BatchStrategy::Static { batch_size: 16 };
        match strategy {
            BatchStrategy::Static { batch_size } => {
                assert_eq!(batch_size, 16);
            }
            _ => panic!("Expected static strategy"),
        }
    }

    #[test]
    fn test_batch_stats_throughput() {
        let stats = BatchStats {
            total_requests: 1000,
            total_processing_time_ms: 1000.0,
            ..Default::default()
        };
        assert_eq!(stats.throughput(), 1000.0); // 1000 req/s
    }

    #[test]
    fn test_batch_stats_utilization() {
        let stats = BatchStats {
            avg_batch_size: 16.0,
            ..Default::default()
        };
        assert_eq!(stats.utilization(32), 0.5); // 50% utilization
    }
}