//! Model optimization layer: quantization, dynamic batching, memory
//! management, hardware tuning, and inference-level optimization,
//! coordinated by [`OptimizationManager`].

pub mod batching;
pub mod hardware;
pub mod inference;
pub mod memory;
pub mod quantization;

use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
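
/// Top-level configuration bundling the settings for every optimization
/// subsystem.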
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OptimizationConfig {
pub quantization: quantization::QuantizationConfig,
pub batching: batching::BatchingConfig,
pub memory: memory::MemoryConfig,
pub hardware: hardware::HardwareConfig,
pub inference: inference::InferenceConfig,
}
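
/// Point-in-time view of the gains measured by the optimization passes, as
/// reported by the individual optimizers.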
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OptimizationMetrics {
pub inference_speedup: f64,
pub memory_reduction: f64,
pub throughput_improvement: f64,
pub gpu_utilization: f64,
pub cache_hit_ratio: f64,
pub batch_efficiency: f64,
pub quantization_accuracy_loss: f64,
}
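
/// Coordinates the quantization, batching, memory, hardware, and inference
/// optimizers behind a single entry point.
///
/// A minimal usage sketch; the model path and target format below are
/// placeholder values, not assets shipped with this crate:
///
/// ```ignore
/// let mut manager = OptimizationManager::new(OptimizationConfig::default()).await?;
/// let optimized = manager.optimize_model("models/example.bin", "int8").await?;
/// println!("speedup: {:.2}x", manager.get_metrics().await.inference_speedup);
/// ```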
pub struct OptimizationManager {
config: OptimizationConfig,
metrics: Arc<RwLock<OptimizationMetrics>>,
quantizer: quantization::ModelQuantizer,
batcher: batching::DynamicBatcher,
memory_manager: memory::MemoryManager,
hardware_optimizer: hardware::HardwareOptimizer,
inference_optimizer: inference::InferenceOptimizer,
}

impl OptimizationManager {
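    /// Builds a manager, initializing each optimizer from its section of the
    /// supplied configuration. Fails if any optimizer fails to initialize.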
pub async fn new(config: OptimizationConfig) -> Result<Self> {
let metrics = Arc::new(RwLock::new(OptimizationMetrics::default()));
let quantizer = quantization::ModelQuantizer::new(config.quantization.clone()).await?;
let batcher = batching::DynamicBatcher::new(config.batching.clone()).await?;
let memory_manager = memory::MemoryManager::new(config.memory.clone()).await?;
let hardware_optimizer = hardware::HardwareOptimizer::new(config.hardware.clone()).await?;
let inference_optimizer =
inference::InferenceOptimizer::new(config.inference.clone()).await?;
Ok(Self {
config,
metrics,
quantizer,
batcher,
memory_manager,
hardware_optimizer,
inference_optimizer,
})
}
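
    /// Runs the full optimization pipeline on the model at `model_path`:
    /// quantize to `target_format`, optimize loading for memory, tune for the
    /// local hardware, then apply inference-level optimizations. Returns the
    /// path of the final artifact and refreshes the cached metrics.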
pub async fn optimize_model(
&mut self,
model_path: &str,
target_format: &str,
) -> Result<String> {
tracing::info!(
"Starting comprehensive model optimization for {}",
model_path
);
let quantized_path = self
.quantizer
.quantize_model(model_path, target_format)
.await?;
let memory_optimized = self
.memory_manager
.optimize_model_loading(&quantized_path)
.await?;
let hardware_optimized = self
.hardware_optimizer
.optimize_for_hardware(&memory_optimized)
.await?;
let final_optimized = self
.inference_optimizer
.optimize_model(&hardware_optimized)
.await?;
self.update_optimization_metrics().await?;
tracing::info!("Model optimization completed: {}", final_optimized);
Ok(final_optimized)
}
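
    /// Returns a snapshot of the most recently computed metrics.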
pub async fn get_metrics(&self) -> OptimizationMetrics {
self.metrics.read().await.clone()
}
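
    /// Pulls the latest metrics from each optimizer and folds them into the
    /// shared snapshot.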
async fn update_optimization_metrics(&self) -> Result<()> {
let mut metrics = self.metrics.write().await;
let quant_metrics = self.quantizer.get_metrics().await;
let batch_metrics = self.batcher.get_metrics().await;
let memory_metrics = self.memory_manager.get_metrics().await;
let hardware_metrics = self.hardware_optimizer.get_metrics().await;
let inference_metrics = self.inference_optimizer.get_metrics().await;
metrics.inference_speedup = inference_metrics.speedup_ratio;
metrics.memory_reduction = memory_metrics.memory_saved_ratio;
metrics.throughput_improvement = batch_metrics.throughput_improvement;
metrics.gpu_utilization = hardware_metrics.gpu_utilization;
metrics.cache_hit_ratio = inference_metrics.cache_hit_ratio;
metrics.batch_efficiency = batch_metrics.efficiency_ratio;
metrics.quantization_accuracy_loss = quant_metrics.accuracy_loss;
Ok(())
}
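
    /// Enables or disables a single optimization pass by name. Valid names
    /// are `"quantization"`, `"batching"`, `"memory"`, `"hardware"`, and
    /// `"inference"`; anything else is rejected with an error.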
pub async fn configure_optimization(
&mut self,
optimization_type: &str,
enabled: bool,
) -> Result<()> {
match optimization_type {
"quantization" => self.config.quantization.enabled = enabled,
"batching" => self.config.batching.enabled = enabled,
"memory" => self.config.memory.enabled = enabled,
"hardware" => self.config.hardware.enabled = enabled,
"inference" => self.config.inference.enabled = enabled,
_ => {
return Err(anyhow::anyhow!(
"Unknown optimization type: {}",
optimization_type
));
}
}
tracing::info!("Optimization '{}' set to: {}", optimization_type, enabled);
Ok(())
}
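
    /// Benchmarks each optimization pass independently against a baseline
    /// run of `num_requests` requests, returning one score per pass keyed by
    /// pass name.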
pub async fn benchmark_optimizations(
&self,
model_path: &str,
num_requests: usize,
) -> Result<HashMap<String, f64>> {
let mut results = HashMap::new();
tracing::info!(
"Running optimization benchmark with {} requests",
num_requests
);
let baseline = self.benchmark_baseline(model_path, num_requests).await?;
results.insert("baseline".to_string(), baseline);
let quantized = self.quantizer.benchmark(model_path, num_requests).await?;
results.insert("quantization".to_string(), quantized);
let batched = self.batcher.benchmark(model_path, num_requests).await?;
results.insert("batching".to_string(), batched);
let memory_opt = self
.memory_manager
.benchmark(model_path, num_requests)
.await?;
results.insert("memory".to_string(), memory_opt);
let hardware_opt = self
.hardware_optimizer
.benchmark(model_path, num_requests)
.await?;
results.insert("hardware".to_string(), hardware_opt);
let inference_opt = self
.inference_optimizer
.benchmark(model_path, num_requests)
.await?;
results.insert("inference".to_string(), inference_opt);
Ok(results)
}

    // Placeholder baseline: simulates a 100 ms measurement and returns a
    // neutral score of 1.0.
    async fn benchmark_baseline(&self, _model_path: &str, _num_requests: usize) -> Result<f64> {
        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
        Ok(1.0)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
async fn test_optimization_manager_creation() {
let config = OptimizationConfig::default();
let manager = OptimizationManager::new(config).await;
assert!(manager.is_ok());
}

    #[tokio::test]
async fn test_optimization_config_toggle() {
let config = OptimizationConfig::default();
let mut manager = OptimizationManager::new(config).await.unwrap();
manager
.configure_optimization("quantization", false)
.await
.unwrap();
assert!(!manager.config.quantization.enabled);
manager
.configure_optimization("quantization", true)
.await
.unwrap();
assert!(manager.config.quantization.enabled);
}
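
    // Added check: configure_optimization should reject names outside the
    // known set instead of silently ignoring them.
    #[tokio::test]
    async fn test_unknown_optimization_type_is_rejected() {
        let config = OptimizationConfig::default();
        let mut manager = OptimizationManager::new(config).await.unwrap();
        let result = manager.configure_optimization("nonexistent", true).await;
        assert!(result.is_err());
    }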
}