use crate::error::Result;
use crate::nas::{
architecture_encoding::ArchitectureEncoding,
search_space::{Architecture, LayerType},
};
use std::collections::HashMap;
use std::sync::Arc;
/// Hardware target that candidate architectures are evaluated against.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so a platform can be used
/// directly as a `HashMap`/`HashSet` key (for example to cache per-platform data).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum HardwarePlatform {
    /// CPU target.
    CPU,
    /// GPU target.
    GPU,
    /// Mobile ARM target.
    MobileARM,
    /// Edge TPU target.
    EdgeTPU,
    /// FPGA target; the predictors in this file treat it with their fallback values.
    FPGA,
    /// Any other platform, identified by a caller-supplied name; also handled by
    /// the predictors' fallback values.
    Custom(String),
}
#[derive(Debug, Clone)]
pub struct HardwareConstraints {
pub platform: HardwarePlatform,
pub max_latency_ms: Option<f64>,
pub max_memory_mb: Option<f64>,
pub max_energy_mj: Option<f64>,
pub max_model_size_mb: Option<f64>,
pub min_throughput: Option<f64>,
pub compute_units: usize,
pub memory_bandwidth_gbps: Option<f64>,
pub quantization_support: QuantizationSupport,
impl Default for HardwareConstraints {
fn default() -> Self {
Self {
platform: HardwarePlatform::CPU,
max_latency_ms: Some(100.0),
max_memory_mb: Some(512.0),
max_energy_mj: Some(100.0),
max_model_size_mb: Some(50.0),
min_throughput: Some(10.0),
compute_units: 4,
memory_bandwidth_gbps: Some(25.6),
quantization_support: QuantizationSupport::Int8,
}
}
/// Quantization modes a hardware target can execute.
///
/// `Debug` and `Clone` are required because `HardwareConstraints` derives both and
/// embeds this type; `PartialEq` allows direct comparison of configurations.
#[derive(Debug, Clone, PartialEq)]
pub enum QuantizationSupport {
    /// No quantization support.
    None,
    /// 8-bit integer quantization.
    Int8,
    /// 16-bit floating point.
    Float16,
    /// Mixed precision.
    Mixed,
    /// Caller-defined list of supported schemes, identified by name.
    Custom(Vec<String>),
}
pub struct LatencyPredictor {
operation_latencies: HashMap<String, f64>,
memory_costs: HashMap<String, f64>,
parallelization_factors: HashMap<String, f64>,
platform: HardwarePlatform,
impl LatencyPredictor {
pub fn new(platform: HardwarePlatform) -> Self {
let mut predictor = Self {
operation_latencies: HashMap::new(),
memory_costs: HashMap::new(),
parallelization_factors: HashMap::new(),
platform,
};
predictor.initialize_platform_characteristics();
predictor
fn initialize_platform_characteristics(&mut self) {
match self.platform {
HardwarePlatform::CPU => {
self.operation_latencies.insert("dense".to_string(), 0.1);
self.operation_latencies.insert("conv2d".to_string(), 0.5);
self.operation_latencies.insert("pooling".to_string(), 0.05);
self.operation_latencies
.insert("activation".to_string(), 0.01);
.insert("batchnorm".to_string(), 0.02);
self.memory_costs.insert("weight_load".to_string(), 0.001);
self.memory_costs
.insert("activation_store".to_string(), 0.0005);
self.parallelization_factors
.insert("dense".to_string(), 0.8);
.insert("conv2d".to_string(), 0.9);
}
HardwarePlatform::GPU => {
self.operation_latencies.insert("dense".to_string(), 0.02);
self.operation_latencies.insert("conv2d".to_string(), 0.1);
self.operation_latencies.insert("pooling".to_string(), 0.01);
.insert("activation".to_string(), 0.005);
.insert("batchnorm".to_string(), 0.01);
self.memory_costs.insert("weight_load".to_string(), 0.0001);
.insert("activation_store".to_string(), 0.00005);
.insert("dense".to_string(), 0.95);
.insert("conv2d".to_string(), 0.98);
HardwarePlatform::MobileARM => {
self.operation_latencies.insert("dense".to_string(), 0.3);
self.operation_latencies.insert("conv2d".to_string(), 1.0);
self.operation_latencies.insert("pooling".to_string(), 0.1);
.insert("activation".to_string(), 0.02);
.insert("batchnorm".to_string(), 0.05);
self.memory_costs.insert("weight_load".to_string(), 0.002);
.insert("activation_store".to_string(), 0.001);
.insert("dense".to_string(), 0.6);
.insert("conv2d".to_string(), 0.7);
HardwarePlatform::EdgeTPU => {
self.operation_latencies.insert("dense".to_string(), 0.05);
self.operation_latencies.insert("conv2d".to_string(), 0.2);
self.operation_latencies.insert("pooling".to_string(), 0.02);
.insert("batchnorm".to_string(), 0.015);
self.memory_costs.insert("weight_load".to_string(), 0.0002);
.insert("activation_store".to_string(), 0.0001);
.insert("dense".to_string(), 0.9);
.insert("conv2d".to_string(), 0.95);
_ => {
self.operation_latencies.insert("dense".to_string(), 0.2);
self.operation_latencies.insert("conv2d".to_string(), 0.8);
self.operation_latencies.insert("pooling".to_string(), 0.08);
.insert("batchnorm".to_string(), 0.04);
pub fn predict_latency(
&self,
architecture: &Architecture,
inputshape: &[usize],
) -> Result<f64> {
let mut total_latency = 0.0;
let mut currentshape = inputshape.to_vec();
for layer in &architecture.layers {
let layer_latency = self.predict_layer_latency(layer, ¤tshape)?;
total_latency += layer_latency;
currentshape = self.compute_outputshape(layer, ¤tshape)?;
total_latency += self.compute_initialization_overhead(architecture)?;
Ok(total_latency)
fn predict_layer_latency(&self, layer: &LayerType, inputshape: &[usize]) -> Result<f64> {
let base_latency = match layer {
LayerType::Dense(units) => {
let input_size: usize = inputshape.iter().product();
let ops = input_size * units;
let base = self
.operation_latencies
.get("dense")
.copied()
.unwrap_or(0.1);
base * (ops as f64 / 1e6) LayerType::Conv2D {
filters,
kernel_size,
stride,
} => {
if inputshape.len() < 3 {
return Err(crate::error::NeuralError::InvalidArgument(
"Conv2D requires 3D input".to_string(),
));
}
let h = inputshape[0];
let w = inputshape[1];
let c = inputshape[2];
let output_h = (h - kernel_size.0) / stride.0 + 1;
let output_w = (w - kernel_size.1) / stride.1 + 1;
let ops = output_h * output_w * filters * kernel_size.0 * kernel_size.1 * c;
.get("conv2d")
.unwrap_or(0.5);
base * (ops as f64 / 1e6)
LayerType::MaxPool2D { pool_size, stride }
| LayerType::AvgPool2D { pool_size, stride } => {
return Ok(0.0);
let output_h = (h - pool_size.0) / stride.0 + 1;
let output_w = (w - pool_size.1) / stride.1 + 1;
let ops = output_h * output_w * c * pool_size.0 * pool_size.1;
.get("pooling")
.unwrap_or(0.05);
LayerType::Activation(_) => {
let ops: usize = inputshape.iter().product();
.get("activation")
.unwrap_or(0.01);
LayerType::BatchNorm | LayerType::LayerNorm => {
.get("batchnorm")
.unwrap_or(0.02);
LayerType::Dropout(_) => 0.001, _ => 0.01, let parallelization = match layer {
LayerType::Dense(_) => self
.parallelization_factors
.get("dense")
.copied()
.unwrap_or(1.0),
LayerType::Conv2D { .. } => self
.get("conv2d")
_ => 1.0,
Ok(base_latency * parallelization)
fn compute_outputshape(&self, layer: &LayerType, inputshape: &[usize]) -> Result<Vec<usize>> {
match layer {
LayerType::Dense(units) => Ok(vec![*units]),
let h = (inputshape[0] - kernel_size.0) / stride.0 + 1;
let w = (inputshape[1] - kernel_size.1) / stride.1 + 1;
Ok(vec![h, w, *filters])
return Ok(inputshape.to_vec());
let h = (inputshape[0] - pool_size.0) / stride.0 + 1;
let w = (inputshape[1] - pool_size.1) / stride.1 + 1;
Ok(vec![h, w, inputshape[2]])
LayerType::Flatten => {
let total_size: usize = inputshape.iter().product();
Ok(vec![total_size])
_ => Ok(inputshape.to_vec()), fn compute_initialization_overhead(&self, architecture: &Architecture) -> Result<f64> {
let num_layers = architecture.layers.len() as f64;
let base_overhead = match self.platform {
HardwarePlatform::CPU => 5.0,
HardwarePlatform::GPU => 20.0,
HardwarePlatform::MobileARM => 10.0,
HardwarePlatform::EdgeTPU => 15.0_ => 8.0,
Ok(base_overhead + num_layers * 0.5)
pub struct MemoryPredictor {
impl MemoryPredictor {
Self { platform }
pub fn predict_memory_usage(
let mut total_memory = 0.0;
total_memory += self.compute_weights_memory(architecture)?;
let mut max_activation_memory = 0.0;
let activation_memory = self.compute_activation_memory(¤tshape)?;
max_activation_memory = max_activation_memory.max(activation_memory);
total_memory += max_activation_memory;
total_memory += self.compute_memory_overhead()?;
Ok(total_memory)
fn compute_weights_memory(&self, architecture: &Architecture) -> Result<f64> {
let mut weights_memory = 0.0;
let layer_params = match layer {
LayerType::Dense(units) => {
1024 * units + units LayerType::Conv2D {
filters,
kernel_size,
..
} => {
filters * kernel_size.0 * kernel_size.1 * 64 + filters
LayerType::BatchNorm => 128 * 4, LayerType::Embedding {
vocab_size,
embedding_dim,
} => vocab_size * embedding_dim_ => 0, };
weights_memory += layer_params as f64 * 4.0;
Ok(weights_memory / (1024.0 * 1024.0))
fn compute_activation_memory(&self, shape: &[usize]) -> Result<f64> {
let elements: usize = shape.iter().product();
let memory_bytes = elements as f64 * 4.0;
Ok(memory_bytes / (1024.0 * 1024.0))
fn compute_memory_overhead(&self) -> Result<f64> {
let overhead_mb = match self.platform {
HardwarePlatform::CPU => 50.0,
HardwarePlatform::GPU => 100.0,
HardwarePlatform::MobileARM => 25.0,
HardwarePlatform::EdgeTPU => 30.0_ => 40.0,
Ok(overhead_mb)
_ => Ok(inputshape.to_vec()),
pub struct EnergyPredictor {
power_characteristics: HashMap<String, f64>, impl EnergyPredictor {
power_characteristics: HashMap::new(),
predictor.initialize_power_characteristics();
fn initialize_power_characteristics(&mut self) {
self.power_characteristics
.insert("dense".to_string(), 100.0);
.insert("conv2d".to_string(), 150.0);
.insert("memory".to_string(), 50.0);
.insert("dense".to_string(), 200.0);
.insert("conv2d".to_string(), 300.0);
.insert("memory".to_string(), 80.0);
self.power_characteristics.insert("dense".to_string(), 30.0);
.insert("conv2d".to_string(), 50.0);
.insert("memory".to_string(), 15.0);
self.power_characteristics.insert("dense".to_string(), 75.0);
.insert("conv2d".to_string(), 100.0);
.insert("memory".to_string(), 30.0);
pub fn predict_energy(&self, architecture: &Architecture, latencyms: f64) -> Result<f64> {
let mut total_energy = 0.0;
let layer_energy = self.compute_layer_energy(layer)?;
total_energy += layer_energy;
let static_power = self.get_static_power();
total_energy += static_power * latency_ms;
Ok(total_energy)
fn compute_layer_energy(&self, layer: &LayerType) -> Result<f64> {
let power = match layer {
.power_characteristics
.unwrap_or(100.0),
.unwrap_or(150.0, _ => 10.0, Ok(power * 1.0)
fn get_static_power(&self) -> f64 {
HardwarePlatform::CPU => 1000.0, HardwarePlatform::GPU => 5000.0, HardwarePlatform::MobileARM => 500.0, HardwarePlatform::EdgeTPU => 2000.0, _ => 1500.0, pub struct HardwareAwareSearch {
constraints: HardwareConstraints,
latency_predictor: LatencyPredictor,
memory_predictor: MemoryPredictor,
energy_predictor: EnergyPredictor,
constraint_weights: HashMap<String, f64>,
violation_history: Vec<HashMap<String, f64>>,
impl HardwareAwareSearch {
pub fn new(constraints: HardwareConstraints) -> Self {
let latency_predictor = LatencyPredictor::new(_constraints.platform.clone());
let memory_predictor = MemoryPredictor::new(_constraints.platform.clone());
let energy_predictor = EnergyPredictor::new(_constraints.platform.clone());
let mut constraint_weights = HashMap::new();
constraintweights.insert("latency".to_string(), 0.3);
constraintweights.insert("memory".to_string(), 0.25);
constraintweights.insert("energy".to_string(), 0.2);
constraintweights.insert("model_size".to_string(), 0.15);
constraintweights.insert("throughput".to_string(), 0.1);
constraints,
latency_predictor,
memory_predictor,
energy_predictor,
constraint_weights,
violation_history: Vec::new(),
pub fn evaluate_constraints(
&mut self,
architecture: &Arc<dyn ArchitectureEncoding>,
) -> Result<HashMap<String, f64>> {
let arch = architecture.to_architecture()?;
let mut violations = HashMap::new();
let predicted_latency = self.latency_predictor.predict_latency(&arch, inputshape)?;
if let Some(max_latency) = self._constraints.max_latency_ms {
let violation = (predicted_latency - max_latency).max(0.0) / max_latency;
violations.insert("latency".to_string(), violation);
let predicted_memory = self
.memory_predictor
.predict_memory_usage(&arch, inputshape)?;
if let Some(max_memory) = self._constraints.max_memory_mb {
let violation = (predicted_memory - max_memory).max(0.0) / max_memory;
violations.insert("memory".to_string(), violation);
let predicted_energy = self
.energy_predictor
.predict_energy(&arch, predicted_latency)?;
if let Some(max_energy) = self._constraints.max_energy_mj {
let violation = (predicted_energy - max_energy).max(0.0) / max_energy;
violations.insert("energy".to_string(), violation);
let model_size = self.estimate_model_size(&arch)?;
if let Some(max_size) = self._constraints.max_model_size_mb {
let violation = (model_size - max_size).max(0.0) / max_size;
violations.insert("model_size".to_string(), violation);
let throughput = 1000.0 / predicted_latency; if let Some(min_throughput) = self._constraints.min_throughput {
let violation = (min_throughput - throughput).max(0.0) / min_throughput;
violations.insert("throughput".to_string(), violation);
self.violation_history.push(violations.clone());
Ok(violations)
pub fn compute_constraint_score(&self, violations: &HashMap<String, f64>) -> f64 {
let mut weighted_score = 0.0;
let mut total_weight = 0.0;
for (constraint, &violation) in violations {
if let Some(&weight) = self.constraintweights.get(constraint) {
weighted_score += violation * weight;
total_weight += weight;
if total_weight > 0.0 {
weighted_score / total_weight
} else {
0.0
pub fn satisfies_constraints(&self, violations: &HashMap<String, f64>) -> bool {
violations.values().all(|&violation| violation <= 0.0)
pub fn get_satisfaction_rate(&self) -> f64 {
if self.violation_history.is_empty() {
return 1.0;
let satisfied_count = self
.violation_history
.iter()
.filter(|violations| self.satisfies_constraints(violations))
.count();
satisfied_count as f64 / self.violation_history.len() as f64
pub fn generate_optimization_suggestions(
violations: &HashMap<String, f64>,
) -> Vec<String> {
let mut suggestions = Vec::new();
if violation > 0.1 {
match constraint.as_str() {
"latency" => {
suggestions.push("Consider reducing model depth or width".to_string());
suggestions.push("Use depthwise separable convolutions".to_string());
suggestions.push("Apply model pruning or quantization".to_string());
}
"memory" => {
suggestions.push("Reduce batch size or model parameters".to_string());
suggestions.push("Use gradient checkpointing".to_string());
suggestions.push("Apply weight sharing techniques".to_string());
"energy" => {
suggestions.push("Reduce computational complexity".to_string());
suggestions.push("Use low-power layer types".to_string());
suggestions.push("Optimize data movement patterns".to_string());
"model_size" => {
suggestions.push("Apply model compression techniques".to_string());
suggestions.push("Use knowledge distillation".to_string());
suggestions.push("Reduce parameter precision".to_string());
"throughput" => {
suggestions.push("Optimize for batch processing".to_string());
suggestions.push("Use pipeline parallelism".to_string());
suggestions.push("Consider model ensemble reduction".to_string());
_ => {}
suggestions
fn estimate_model_size(&self, architecture: &Architecture) -> Result<f64> {
let mut total_params = 0;
1024 * units + units
_ => 0,
total_params += layer_params;
Ok(total_params as f64 * 4.0 / (1024.0 * 1024.0))
pub fn adapt_constraint_weights(&mut self) {
if self.violation_history.len() < 10 {
return; let recent_violations = &self.violation_history[self.violation_history.len() - 10..];
let mut avg_violations = HashMap::new();
for violations in recent_violations {
for (constraint, &violation) in violations {
*avg_violations.entry(constraint.clone()).or_insert(0.0) += violation / 10.0;
for (constraint, avg_violation) in avg_violations {
if avg_violation > 0.1 {
if let Some(weight) = self.constraintweights.get_mut(&constraint) {
*weight = (*weight * 1.1).min(0.5); let total_weight: f64 = self.constraintweights.values().sum();
for weight in self.constraintweights.values_mut() {
*weight /= total_weight;
pub fn get_platform_summary(&self) -> String {
format!(
"Platform: {:?}\nMax Latency: {:?} ms\nMax Memory: {:?} MB\nMax Energy: {:?} mJ\nCompute Units: {}",
self.constraints.platform,
self.constraints.max_latency_ms,
self.constraints.max_memory_mb,
self.constraints.max_energy_mj,
self.constraints.compute_units
)
#[cfg(test)]
mod tests {
    use super::*;
    use crate::nas::search_space::Architecture;

    /// The default budget targets the CPU and sets a latency limit.
    #[test]
    fn test_hardware_constraints_default() {
        let constraints = HardwareConstraints::default();
        assert_eq!(constraints.platform, HardwarePlatform::CPU);
        assert!(constraints.max_latency_ms.is_some());
    }

    /// A small dense network yields a strictly positive latency on the CPU model.
    #[test]
    fn test_latency_predictor() {
        let predictor = LatencyPredictor::new(HardwarePlatform::CPU);
        let architecture = Architecture {
            layers: vec![
                LayerType::Dense(128),
                LayerType::Activation("relu".to_string()),
                LayerType::Dense(10),
            ],
            connections: vec![],
            width_multiplier: 1.0,
            depth_multiplier: 1.0,
        };
        let latency = predictor
            .predict_latency(&architecture, &[32, 32, 3])
            .expect("Operation failed");
        assert!(latency > 0.0);
    }

    /// A small dense network yields a strictly positive memory estimate on the GPU model.
    #[test]
    fn test_memory_predictor() {
        let predictor = MemoryPredictor::new(HardwarePlatform::GPU);
        let architecture = Architecture {
            layers: vec![LayerType::Dense(256), LayerType::Dense(10)],
            connections: vec![],
            width_multiplier: 1.0,
            depth_multiplier: 1.0,
        };
        let memory = predictor
            .predict_memory_usage(&architecture, &[784])
            .expect("Operation failed");
        assert!(memory > 0.0);
    }

    /// Constraint evaluation reports every configured limit and a non-negative score.
    #[test]
    fn test_hardware_aware_search() {
        let constraints = HardwareConstraints::default();
        let mut search = HardwareAwareSearch::new(constraints);
        // Bind as a trait object: `&Arc<SequentialEncoding>` does not coerce to
        // `&Arc<dyn ArchitectureEncoding>` at the call site.
        let arch: Arc<dyn ArchitectureEncoding> =
            Arc::new(crate::nas::architecture_encoding::SequentialEncoding::new(
                vec![LayerType::Dense(128), LayerType::Dense(10)],
            ));
        let violations = search
            .evaluate_constraints(&arch, &[784])
            .expect("Operation failed");
        assert!(!violations.is_empty());
        let score = search.compute_constraint_score(&violations);
        assert!(score >= 0.0);
    }
}