symbi-runtime 1.11.0

Agent Runtime System for the Symbi platform
//! Agent Resource Manager
//!
//! Manages resource allocation, monitoring, and enforcement for agents

use async_trait::async_trait;
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tokio::sync::{mpsc, Notify};
use tokio::time::interval;

use crate::integrations::policy_engine::{
    PolicyEnforcementFactory, PolicyEnforcementPoint, ResourceAccessConfig,
    ResourceAllocationRequest,
};
use crate::types::*;

/// Resource manager trait
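///
/// A minimal usage sketch, assuming a Tokio runtime and the crate's default
/// configuration; the concrete requirement and usage values below are
/// illustrative only:
///
/// ```ignore
/// let manager = DefaultResourceManager::new(ResourceManagerConfig::default()).await?;
/// let agent_id = AgentId::new();
///
/// // Reserve a small budget for the agent.
/// let requirements = ResourceRequirements {
///     min_memory_mb: 64,
///     max_memory_mb: 64,
///     min_cpu_cores: 1.0,
///     max_cpu_cores: 1.0,
///     disk_space_mb: 10,
///     network_bandwidth_mbps: 10,
/// };
/// let _allocation = manager.allocate_resources(agent_id, requirements).await?;
///
/// // Report a usage sample and check whether the agent stayed within limits.
/// let sample = ResourceUsage {
///     memory_used: 32 * 1024 * 1024,
///     cpu_utilization: 0.5,
///     disk_io_rate: 1024 * 1024,
///     network_io_rate: 1024 * 1024,
///     uptime: Duration::from_secs(30),
/// };
/// manager.update_usage(agent_id, sample).await?;
/// let _within_limits = manager.check_limits(agent_id).await?;
///
/// // Release everything once the agent is done.
/// manager.deallocate_resources(agent_id).await?;
/// ```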
#[async_trait]
pub trait ResourceManager {
    /// Allocate resources for an agent
    async fn allocate_resources(
        &self,
        agent_id: AgentId,
        requirements: ResourceRequirements,
    ) -> Result<ResourceAllocation, ResourceError>;

    /// Deallocate resources for an agent
    async fn deallocate_resources(&self, agent_id: AgentId) -> Result<(), ResourceError>;

    /// Update resource usage for an agent
    async fn update_usage(
        &self,
        agent_id: AgentId,
        usage: ResourceUsage,
    ) -> Result<(), ResourceError>;

    /// Get current resource usage for an agent
    async fn get_usage(&self, agent_id: AgentId) -> Result<ResourceUsage, ResourceError>;

    /// Get system resource status
    async fn get_system_status(&self) -> ResourceSystemStatus;

    /// Set resource limits for an agent
    async fn set_limits(
        &self,
        agent_id: AgentId,
        limits: ResourceLimits,
    ) -> Result<(), ResourceError>;

    /// Check if agent is within resource limits
    async fn check_limits(&self, agent_id: AgentId) -> Result<bool, ResourceError>;

    /// Check resource access violations for an agent
    async fn check_resource_violations(
        &self,
        agent_id: AgentId,
    ) -> Result<Vec<ResourceViolation>, ResourceError>;

    /// Shutdown the resource manager
    async fn shutdown(&self) -> Result<(), ResourceError>;

    /// Check the health of the resource manager
    async fn check_health(&self) -> Result<ComponentHealth, ResourceError>;
}

/// Resource manager configuration
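///
/// A sketch of a fail-fast override (illustrative values; every other field
/// keeps its default):
///
/// ```ignore
/// let config = ResourceManagerConfig {
///     violation_action: ViolationAction::Kill,
///     monitoring_interval: Duration::from_secs(1),
///     ..ResourceManagerConfig::default()
/// };
/// let manager = DefaultResourceManager::new(config).await?;
/// ```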
#[derive(Debug, Clone)]
pub struct ResourceManagerConfig {
    pub total_memory: usize,
    pub total_cpu_cores: u32,
    pub total_disk_space: usize,
    pub total_network_bandwidth: usize,
    pub monitoring_interval: Duration,
    pub enforcement_enabled: bool,
    pub auto_scaling_enabled: bool,
    pub resource_reservation_percentage: f32,
    pub policy_enforcement_config: ResourceAccessConfig,
    /// Action taken when an agent breaches its allocated resource limits.
    ///
    /// Defaults to [`ViolationAction::Throttle`] so a single noisy agent is
    /// backed off rather than killed outright; operators who prefer fail-fast
    /// semantics can swap in [`ViolationAction::Kill`]. `LogOnly` preserves
    /// the pre-enforcement behaviour (emit a monitoring event, do nothing)
    /// for debugging.
    pub violation_action: ViolationAction,
    /// Number of consecutive sampling intervals an agent must breach limits
    /// before the manager escalates to [`ViolationAction::Kill`]. Only
    /// consulted when `violation_action == Throttle`.
    pub kill_after_sustained_violations: u32,
}

/// Action the resource manager takes when an agent exceeds its allocation.
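///
/// With the default `Throttle` action and `kill_after_sustained_violations`
/// left at 5, five consecutive over-limit samples produce four
/// `ThrottleRequested` events followed by one `KillRequested`; any in-limit
/// sample in between resets the counter.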
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ViolationAction {
    /// Emit a `LimitViolation` monitoring event and do nothing else. Useful
    /// for early rollouts where the operator wants visibility without the
    /// risk of killing misbehaving-but-important agents.
    LogOnly,
    /// Throttle the agent (emit `ThrottleRequested`) and escalate to `Kill`
    /// after `kill_after_sustained_violations` consecutive breaches.
    Throttle,
    /// Immediately emit `KillRequested` on any breach.
    Kill,
}

impl Default for ResourceManagerConfig {
    fn default() -> Self {
        Self {
            total_memory: 16 * 1024 * 1024 * 1024, // 16GB
            total_cpu_cores: 8,
            total_disk_space: 1024 * 1024 * 1024 * 1024, // 1TB
            total_network_bandwidth: 1000 * 1024 * 1024, // 1Gbps
            monitoring_interval: Duration::from_secs(5),
            enforcement_enabled: true,
            auto_scaling_enabled: false,
            resource_reservation_percentage: 0.1, // 10% reserved
            policy_enforcement_config: ResourceAccessConfig::default(),
            violation_action: ViolationAction::Throttle,
            kill_after_sustained_violations: 5,
        }
    }
}

/// Default implementation of the resource manager
pub struct DefaultResourceManager {
    config: ResourceManagerConfig,
    allocations: Arc<RwLock<HashMap<AgentId, ResourceAllocation>>>,
    usage_tracker: Arc<RwLock<HashMap<AgentId, ResourceUsage>>>,
    system_resources: Arc<RwLock<SystemResources>>,
    monitoring_sender: mpsc::UnboundedSender<MonitoringEvent>,
    shutdown_notify: Arc<Notify>,
    is_running: Arc<RwLock<bool>>,
    policy_enforcement: Arc<dyn PolicyEnforcementPoint>,
    /// Per-agent counter of consecutive sampling intervals during which
    /// resource usage exceeded the agent's allocation. Cleared on the first
    /// sample that comes back within limits.
    consecutive_violations: Arc<RwLock<HashMap<AgentId, u32>>>,
}

impl DefaultResourceManager {
    /// Create a new resource manager
    pub async fn new(config: ResourceManagerConfig) -> Result<Self, ResourceError> {
        let allocations = Arc::new(RwLock::new(HashMap::new()));
        let usage_tracker = Arc::new(RwLock::new(HashMap::new()));
        let system_resources = Arc::new(RwLock::new(SystemResources::new(&config)));
        let (monitoring_sender, monitoring_receiver) = mpsc::unbounded_channel();
        let shutdown_notify = Arc::new(Notify::new());
        let is_running = Arc::new(RwLock::new(true));

        // Create policy enforcement point
        let policy_enforcement = PolicyEnforcementFactory::create_enforcement_point(
            config.policy_enforcement_config.clone(),
        )
        .await
        .map_err(|e| {
            ResourceError::PolicyError(format!("Failed to create policy enforcement: {}", e))
        })?;

        let manager = Self {
            config,
            allocations,
            usage_tracker,
            system_resources,
            monitoring_sender,
            shutdown_notify,
            is_running,
            policy_enforcement,
            consecutive_violations: Arc::new(RwLock::new(HashMap::new())),
        };

        // Start background tasks
        manager.start_monitoring_loop(monitoring_receiver).await;
        manager.start_enforcement_loop().await;

        Ok(manager)
    }

    /// Start the resource monitoring loop
    async fn start_monitoring_loop(
        &self,
        mut monitoring_receiver: mpsc::UnboundedReceiver<MonitoringEvent>,
    ) {
        let usage_tracker = self.usage_tracker.clone();
        let allocations = self.allocations.clone();
        let system_resources = self.system_resources.clone();
        let shutdown_notify = self.shutdown_notify.clone();

        tokio::spawn(async move {
            loop {
                tokio::select! {
                    event = monitoring_receiver.recv() => {
                        if let Some(event) = event {
                            Self::process_monitoring_event(event, &usage_tracker, &allocations, &system_resources).await;
                        } else {
                            break;
                        }
                    }
                    _ = shutdown_notify.notified() => {
                        break;
                    }
                }
            }
        });
    }

    /// Start the resource enforcement loop
    async fn start_enforcement_loop(&self) {
        let usage_tracker = self.usage_tracker.clone();
        let allocations = self.allocations.clone();
        let monitoring_sender = self.monitoring_sender.clone();
        let shutdown_notify = self.shutdown_notify.clone();
        let is_running = self.is_running.clone();
        let monitoring_interval = self.config.monitoring_interval;
        let enforcement_enabled = self.config.enforcement_enabled;
        let violation_action = self.config.violation_action;
        let kill_after_sustained = self.config.kill_after_sustained_violations;
        let consecutive_violations = self.consecutive_violations.clone();

        tokio::spawn(async move {
            let mut interval = interval(monitoring_interval);

            loop {
                tokio::select! {
                    _ = interval.tick() => {
                        if !*is_running.read() {
                            break;
                        }

                        if enforcement_enabled {
                            Self::enforce_resource_limits(
                                &usage_tracker,
                                &allocations,
                                &monitoring_sender,
                                violation_action,
                                kill_after_sustained,
                                &consecutive_violations,
                            )
                            .await;
                        }
                    }
                    _ = shutdown_notify.notified() => {
                        break;
                    }
                }
            }
        });
    }

    /// Process a monitoring event
    async fn process_monitoring_event(
        event: MonitoringEvent,
        usage_tracker: &Arc<RwLock<HashMap<AgentId, ResourceUsage>>>,
        allocations: &Arc<RwLock<HashMap<AgentId, ResourceAllocation>>>,
        system_resources: &Arc<RwLock<SystemResources>>,
    ) {
        match event {
            MonitoringEvent::UsageUpdate { agent_id, usage } => {
                usage_tracker.write().insert(agent_id, usage.clone());

                // Update system resource usage
                system_resources.write().update_usage(&usage);

                tracing::debug!("Updated resource usage for agent {}: {:?}", agent_id, usage);
            }
            MonitoringEvent::AllocationRequest {
                agent_id,
                requirements,
            } => {
                let mut system = system_resources.write();
                if system.can_allocate(&requirements) {
                    // `SystemResources::allocate` fills in a placeholder agent id,
                    // so stamp the real one before recording the allocation.
                    let mut allocation = system.allocate(&requirements);
                    allocation.agent_id = agent_id;
                    allocations.write().insert(agent_id, allocation.clone());

                    tracing::info!(
                        "Allocated resources for agent {}: {:?}",
                        agent_id,
                        allocation
                    );
                } else {
                    tracing::warn!(
                        "Cannot allocate resources for agent {}: insufficient resources",
                        agent_id
                    );
                }
            }
            MonitoringEvent::DeallocationRequest { agent_id } => {
                if let Some(allocation) = allocations.write().remove(&agent_id) {
                    system_resources.write().deallocate(&allocation);
                    usage_tracker.write().remove(&agent_id);

                    tracing::info!("Deallocated resources for agent {}", agent_id);
                }
            }
            MonitoringEvent::LimitViolation {
                agent_id,
                violations,
            } => {
                // Handle limit violation event - this is typically sent by the enforcement loop
                // and processed by external systems, so we just log it here
                tracing::warn!(
                    "Resource limit violation detected for agent {}: {:?}",
                    agent_id,
                    violations
                );
            }
            MonitoringEvent::ThrottleRequested {
                agent_id,
                consecutive_violations,
                violations,
            } => {
                // Throttle enforcement belongs in the sandbox/orchestrator
                // layer (e.g. tightening the Docker CPU quota). At the
                // resource-manager layer we surface the signal for observers.
                tracing::warn!(
                    %agent_id,
                    consecutive_violations,
                    ?violations,
                    "ThrottleRequested event emitted — orchestrator should slow this agent"
                );
            }
            MonitoringEvent::KillRequested {
                agent_id,
                violations,
                reason,
            } => {
                // Same story — actual termination is the orchestrator's
                // responsibility. Log loudly so operators see the escalation
                // regardless of whether a consumer is listening.
                tracing::error!(
                    %agent_id,
                    ?violations,
                    reason,
                    "KillRequested event emitted — orchestrator should terminate this agent"
                );
            }
        }
    }

    /// Enforce resource limits.
    ///
    /// For each agent with a known allocation, compares the most recent usage
    /// sample against the allocation's ceiling and dispatches the configured
    /// [`ViolationAction`]:
    ///
    /// - `LogOnly`: always emits a `LimitViolation` event for visibility.
    /// - `Throttle`: emits `ThrottleRequested` on breach and escalates to
    ///   `KillRequested` after `kill_after_sustained_violations` consecutive
    ///   breaches. Sampling intervals with no breach reset the counter.
    /// - `Kill`: emits `KillRequested` on any breach.
    ///
    /// The resource manager does not own the agent process handle, so actual
    /// enforcement (sending SIGTERM, updating cgroup quotas, etc.) is the
    /// responsibility of the orchestrator that consumes these monitoring
    /// events. That separation keeps the manager pluggable across Docker,
    /// E2B, and native-host backends.
    async fn enforce_resource_limits(
        usage_tracker: &Arc<RwLock<HashMap<AgentId, ResourceUsage>>>,
        allocations: &Arc<RwLock<HashMap<AgentId, ResourceAllocation>>>,
        monitoring_sender: &mpsc::UnboundedSender<MonitoringEvent>,
        violation_action: ViolationAction,
        kill_after_sustained: u32,
        consecutive_violations: &Arc<RwLock<HashMap<AgentId, u32>>>,
    ) {
        // Snapshot the read-locked maps; we release the locks before sending
        // events so a slow receiver can't stall the enforcement loop.
        let samples: Vec<(AgentId, ResourceUsage, ResourceAllocation)> = {
            let usage_map = usage_tracker.read();
            let allocations_map = allocations.read();
            usage_map
                .iter()
                .filter_map(|(agent_id, usage)| {
                    allocations_map
                        .get(agent_id)
                        .map(|alloc| (*agent_id, usage.clone(), alloc.clone()))
                })
                .collect()
        };

        for (agent_id, usage, allocation) in samples {
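            // Rebuild per-agent limits from the allocation. The timeout fields
            // are placeholders here: check_resource_violations only inspects
            // memory, CPU, and I/O rates.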
            let limits = ResourceLimits {
                memory_mb: allocation.allocated_memory / (1024 * 1024),
                cpu_cores: allocation.allocated_cpu_cores,
                disk_io_mbps: allocation.allocated_disk_io / (1024 * 1024),
                network_io_mbps: allocation.allocated_network_io / (1024 * 1024),
                execution_timeout: Duration::from_secs(3600),
                idle_timeout: Duration::from_secs(300),
            };
            let violations = Self::check_resource_violations(&usage, &limits);

            if violations.is_empty() {
                // Agent came back within limits; reset its consecutive-violation
                // counter so an old streak doesn't leave it one breach away from
                // a kill long after it has recovered.
                consecutive_violations.write().remove(&agent_id);
                continue;
            }

            // Bump the consecutive-violation counter under a write lock, then
            // emit the appropriate monitoring event after releasing the lock.
            let new_count = {
                let mut counters = consecutive_violations.write();
                let entry = counters.entry(agent_id).or_insert(0);
                *entry += 1;
                *entry
            };

            tracing::warn!(
                %agent_id,
                violations = ?violations,
                consecutive = new_count,
                action = ?violation_action,
                "Agent violated resource limits"
            );

            // Always emit the raw violation event so dashboards keep working.
            let _ = monitoring_sender.send(MonitoringEvent::LimitViolation {
                agent_id,
                violations: violations.clone(),
            });

            match violation_action {
                ViolationAction::LogOnly => { /* nothing further */ }
                ViolationAction::Kill => {
                    let _ = monitoring_sender.send(MonitoringEvent::KillRequested {
                        agent_id,
                        violations: violations.clone(),
                        reason: "ViolationAction::Kill configured",
                    });
                }
                ViolationAction::Throttle => {
                    if new_count >= kill_after_sustained {
                        let _ = monitoring_sender.send(MonitoringEvent::KillRequested {
                            agent_id,
                            violations: violations.clone(),
                            reason: "sustained limit violations exceeded threshold",
                        });
                        // Reset the counter so we don't emit repeated kills for
                        // the same escalation; the orchestrator will either
                        // act on this event or the usage will recover.
                        consecutive_violations.write().remove(&agent_id);
                    } else {
                        let _ = monitoring_sender.send(MonitoringEvent::ThrottleRequested {
                            agent_id,
                            consecutive_violations: new_count,
                            violations,
                        });
                    }
                }
            }
        }
    }

    /// Check for resource limit violations
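    ///
    /// Limits carry MB-scale units (`memory_mb`, `*_mbps`) and are multiplied
    /// by `1024 * 1024` before being compared with the raw byte counters in
    /// `ResourceUsage`; CPU is compared directly in cores.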
    fn check_resource_violations(
        usage: &ResourceUsage,
        limits: &ResourceLimits,
    ) -> Vec<ResourceViolation> {
        let mut violations = Vec::new();

        if usage.memory_used > limits.memory_mb * 1024 * 1024 {
            violations.push(ResourceViolation::Memory {
                used: usage.memory_used,
                limit: limits.memory_mb * 1024 * 1024,
            });
        }

        if usage.cpu_utilization > limits.cpu_cores {
            violations.push(ResourceViolation::Cpu {
                used: usage.cpu_utilization,
                limit: limits.cpu_cores,
            });
        }

        if usage.disk_io_rate > limits.disk_io_mbps * 1024 * 1024 {
            violations.push(ResourceViolation::DiskIo {
                used: usage.disk_io_rate,
                limit: limits.disk_io_mbps * 1024 * 1024,
            });
        }

        if usage.network_io_rate > limits.network_io_mbps * 1024 * 1024 {
            violations.push(ResourceViolation::NetworkIo {
                used: usage.network_io_rate,
                limit: limits.network_io_mbps * 1024 * 1024,
            });
        }

        violations
    }

    /// Send a monitoring event
    fn send_monitoring_event(&self, event: MonitoringEvent) -> Result<(), ResourceError> {
        self.monitoring_sender.send(event).map_err(|_| {
            ResourceError::MonitoringFailed("Failed to send monitoring event".to_string())
        })
    }
}

#[async_trait]
impl ResourceManager for DefaultResourceManager {
    async fn allocate_resources(
        &self,
        agent_id: AgentId,
        requirements: ResourceRequirements,
    ) -> Result<ResourceAllocation, ResourceError> {
        if !*self.is_running.read() {
            return Err(ResourceError::ShuttingDown);
        }

        // Check if agent already has allocation
        if self.allocations.read().contains_key(&agent_id) {
            return Err(ResourceError::AllocationExists { agent_id });
        }

        // Check policy for resource allocation
        let allocation_request = ResourceAllocationRequest {
            agent_id,
            requirements: requirements.clone(),
            priority: Priority::Normal,
            justification: None,
            max_duration: None,
            timestamp: SystemTime::now(),
        };

        let policy_decision = self
            .policy_enforcement
            .validate_resource_allocation(agent_id, &allocation_request)
            .await
            .map_err(|e| ResourceError::PolicyError(format!("Policy validation failed: {}", e)))?;

        let final_requirements = match policy_decision.decision {
            crate::integrations::policy_engine::AllocationResult::Approve => requirements,
            crate::integrations::policy_engine::AllocationResult::Deny => {
                return Err(ResourceError::PolicyViolation {
                    reason: policy_decision.reason.into(),
                });
            }
            crate::integrations::policy_engine::AllocationResult::Modified => {
                // Use modified requirements if provided
                policy_decision
                    .modified_requirements
                    .unwrap_or(requirements)
            }
            crate::integrations::policy_engine::AllocationResult::Queued => {
                return Err(ResourceError::AllocationQueued {
                    reason: policy_decision.reason.into(),
                });
            }
            crate::integrations::policy_engine::AllocationResult::Escalate => {
                return Err(ResourceError::EscalationRequired {
                    reason: policy_decision.reason.into(),
                });
            }
        };

        // Send allocation request
        self.send_monitoring_event(MonitoringEvent::AllocationRequest {
            agent_id,
            requirements: final_requirements.clone(),
        })?;

        // Give the monitoring loop time to process
        tokio::time::sleep(Duration::from_millis(10)).await;

        // Check if allocation was successful
        self.allocations.read().get(&agent_id).cloned().ok_or(
            ResourceError::InsufficientResources {
                requirements: "Insufficient system resources".into(),
            },
        )
    }

    async fn deallocate_resources(&self, agent_id: AgentId) -> Result<(), ResourceError> {
        self.send_monitoring_event(MonitoringEvent::DeallocationRequest { agent_id })?;

        // Give the monitoring loop time to process
        tokio::time::sleep(Duration::from_millis(10)).await;

        Ok(())
    }

    async fn update_usage(
        &self,
        agent_id: AgentId,
        usage: ResourceUsage,
    ) -> Result<(), ResourceError> {
        self.send_monitoring_event(MonitoringEvent::UsageUpdate { agent_id, usage })?;
        Ok(())
    }

    async fn get_usage(&self, agent_id: AgentId) -> Result<ResourceUsage, ResourceError> {
        self.usage_tracker
            .read()
            .get(&agent_id)
            .cloned()
            .ok_or(ResourceError::AgentNotFound { agent_id })
    }

    async fn get_system_status(&self) -> ResourceSystemStatus {
        let system = self.system_resources.read();
        let allocations_count = self.allocations.read().len();

        // Read availability through ResourceInfo (which also exposes the reserved figures)
        let resource_info = system.get_resource_info();

        ResourceSystemStatus {
            total_memory: self.config.total_memory,
            available_memory: resource_info.available_memory,
            total_cpu_cores: self.config.total_cpu_cores,
            available_cpu_cores: resource_info.available_cpu_cores,
            total_disk_space: self.config.total_disk_space,
            available_disk_space: resource_info.available_disk_space,
            total_network_bandwidth: self.config.total_network_bandwidth,
            available_network_bandwidth: resource_info.available_network_bandwidth,
            active_allocations: allocations_count,
            last_updated: SystemTime::now(),
        }
    }

    async fn set_limits(
        &self,
        agent_id: AgentId,
        limits: ResourceLimits,
    ) -> Result<(), ResourceError> {
        let mut allocations = self.allocations.write();
        if let Some(allocation) = allocations.get_mut(&agent_id) {
            // Update allocation fields based on limits
            allocation.allocated_memory = limits.memory_mb * 1024 * 1024;
            allocation.allocated_cpu_cores = limits.cpu_cores;
            allocation.allocated_disk_io = limits.disk_io_mbps * 1024 * 1024;
            allocation.allocated_network_io = limits.network_io_mbps * 1024 * 1024;
            Ok(())
        } else {
            Err(ResourceError::AgentNotFound { agent_id })
        }
    }

    async fn check_limits(&self, agent_id: AgentId) -> Result<bool, ResourceError> {
        let usage_map = self.usage_tracker.read();
        let allocations_map = self.allocations.read();

        // Agent must have allocation to check limits
        if let Some(allocation) = allocations_map.get(&agent_id) {
            // If there's usage data, check for violations
            if let Some(usage) = usage_map.get(&agent_id) {
                let limits = ResourceLimits {
                    memory_mb: allocation.allocated_memory / (1024 * 1024),
                    cpu_cores: allocation.allocated_cpu_cores,
                    disk_io_mbps: allocation.allocated_disk_io / (1024 * 1024),
                    network_io_mbps: allocation.allocated_network_io / (1024 * 1024),
                    execution_timeout: Duration::from_secs(3600),
                    idle_timeout: Duration::from_secs(300),
                };
                let violations = Self::check_resource_violations(usage, &limits);
                Ok(violations.is_empty())
            } else {
                // No usage data yet - assume within limits since no actual usage recorded
                Ok(true)
            }
        } else {
            Err(ResourceError::AgentNotFound { agent_id })
        }
    }

    async fn check_resource_violations(
        &self,
        agent_id: AgentId,
    ) -> Result<Vec<ResourceViolation>, ResourceError> {
        let usage_map = self.usage_tracker.read();
        let allocations_map = self.allocations.read();

        if let (Some(usage), Some(allocation)) =
            (usage_map.get(&agent_id), allocations_map.get(&agent_id))
        {
            // Create limits from allocation for violation checking
            let limits = ResourceLimits {
                memory_mb: allocation.allocated_memory / (1024 * 1024),
                cpu_cores: allocation.allocated_cpu_cores,
                disk_io_mbps: allocation.allocated_disk_io / (1024 * 1024),
                network_io_mbps: allocation.allocated_network_io / (1024 * 1024),
                execution_timeout: Duration::from_secs(3600),
                idle_timeout: Duration::from_secs(300),
            };
            Ok(Self::check_resource_violations(usage, &limits))
        } else {
            Err(ResourceError::AgentNotFound { agent_id })
        }
    }

    async fn shutdown(&self) -> Result<(), ResourceError> {
        tracing::info!("Shutting down resource manager");

        *self.is_running.write() = false;
        self.shutdown_notify.notify_waiters();

        // Deallocate all resources
        let agent_ids: Vec<AgentId> = self.allocations.read().keys().copied().collect();

        for agent_id in agent_ids {
            if let Err(e) = self.deallocate_resources(agent_id).await {
                tracing::error!(
                    "Failed to deallocate resources for agent {} during shutdown: {}",
                    agent_id,
                    e
                );
            }
        }

        Ok(())
    }

    async fn check_health(&self) -> Result<ComponentHealth, ResourceError> {
        let is_running = *self.is_running.read();
        if !is_running {
            return Ok(ComponentHealth::unhealthy(
                "Resource manager is shut down".to_string(),
            ));
        }

        let system_status = self.get_system_status().await;
        let allocations_count = self.allocations.read().len();

        // Calculate resource utilization percentages
        let memory_usage = if system_status.total_memory > 0 {
            (system_status.total_memory - system_status.available_memory) as f64
                / system_status.total_memory as f64
        } else {
            0.0
        };

        let cpu_usage = if system_status.total_cpu_cores > 0 {
            (system_status.total_cpu_cores - system_status.available_cpu_cores) as f64
                / system_status.total_cpu_cores as f64
        } else {
            0.0
        };

        let status = if memory_usage > 0.9 || cpu_usage > 0.9 {
            ComponentHealth::unhealthy(format!(
                "Critical resource usage - Memory: {:.1}%, CPU: {:.1}%",
                memory_usage * 100.0,
                cpu_usage * 100.0
            ))
        } else if memory_usage > 0.8 || cpu_usage > 0.8 {
            ComponentHealth::degraded(format!(
                "High resource usage - Memory: {:.1}%, CPU: {:.1}%",
                memory_usage * 100.0,
                cpu_usage * 100.0
            ))
        } else {
            ComponentHealth::healthy(Some(format!(
                "Resources available - Memory: {:.1}%, CPU: {:.1}%, {} active allocations",
                memory_usage * 100.0,
                cpu_usage * 100.0,
                allocations_count
            )))
        };

        Ok(status
            .with_metric(
                "memory_usage_percent".to_string(),
                format!("{:.2}", memory_usage * 100.0),
            )
            .with_metric(
                "cpu_usage_percent".to_string(),
                format!("{:.2}", cpu_usage * 100.0),
            )
            .with_metric(
                "active_allocations".to_string(),
                allocations_count.to_string(),
            )
            .with_metric(
                "available_memory_mb".to_string(),
                (system_status.available_memory / (1024 * 1024)).to_string(),
            )
            .with_metric(
                "available_cpu_cores".to_string(),
                system_status.available_cpu_cores.to_string(),
            ))
    }
}

/// System resources tracking
#[derive(Debug, Clone)]
struct SystemResources {
    available_memory: usize,
    available_cpu_cores: u32,
    available_disk_space: usize,
    available_network_bandwidth: usize,
    reserved_memory: usize,
    reserved_cpu_cores: u32,
    reserved_disk_space: usize,
    reserved_network_bandwidth: usize,
}

impl SystemResources {
    fn new(config: &ResourceManagerConfig) -> Self {
        let reservation_factor = config.resource_reservation_percentage;

        // Use ceiling operation to ensure we always reserve at least 1 CPU when factor > 0
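        // With the default config (8 cores, 16 GB, 10% reservation) this works
        // out to ceil(8 * 0.1) = 1 reserved core, leaving 7 cores and roughly
        // 14.4 GB available for allocation.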
        let reserved_cpu = if reservation_factor > 0.0 {
            ((config.total_cpu_cores as f32 * reservation_factor).ceil() as u32).max(1)
        } else {
            0
        };
        let available_cpu = config.total_cpu_cores.saturating_sub(reserved_cpu);

        Self {
            available_memory: config.total_memory
                - (config.total_memory as f32 * reservation_factor) as usize,
            available_cpu_cores: available_cpu,
            available_disk_space: config.total_disk_space
                - (config.total_disk_space as f32 * reservation_factor) as usize,
            available_network_bandwidth: config.total_network_bandwidth
                - (config.total_network_bandwidth as f32 * reservation_factor) as usize,
            reserved_memory: (config.total_memory as f32 * reservation_factor) as usize,
            reserved_cpu_cores: reserved_cpu,
            reserved_disk_space: (config.total_disk_space as f32 * reservation_factor) as usize,
            reserved_network_bandwidth: (config.total_network_bandwidth as f32 * reservation_factor)
                as usize,
        }
    }

    fn can_allocate(&self, requirements: &ResourceRequirements) -> bool {
        self.available_memory >= requirements.max_memory_mb * 1024 * 1024
            && self.available_cpu_cores >= requirements.max_cpu_cores as u32
            && self.available_disk_space >= requirements.disk_space_mb * 1024 * 1024
            && self.available_network_bandwidth >= requirements.network_bandwidth_mbps * 1024 * 1024
    }

    fn allocate(&mut self, requirements: &ResourceRequirements) -> ResourceAllocation {
        let memory_bytes = requirements.max_memory_mb * 1024 * 1024;
        let disk_bytes = requirements.disk_space_mb * 1024 * 1024;
        let network_bytes = requirements.network_bandwidth_mbps * 1024 * 1024;

        self.available_memory -= memory_bytes;
        self.available_cpu_cores -= requirements.max_cpu_cores as u32;
        self.available_disk_space -= disk_bytes;
        self.available_network_bandwidth -= network_bytes;

        ResourceAllocation {
            agent_id: AgentId::new(), // Placeholder; the caller overwrites this with the real agent id
            allocated_memory: memory_bytes,
            allocated_cpu_cores: requirements.max_cpu_cores,
            allocated_disk_io: disk_bytes,
            allocated_network_io: network_bytes,
            allocation_time: SystemTime::now(),
        }
    }

    fn deallocate(&mut self, allocation: &ResourceAllocation) {
        self.available_memory += allocation.allocated_memory;
        self.available_cpu_cores += allocation.allocated_cpu_cores as u32;
        self.available_disk_space += allocation.allocated_disk_io;
        self.available_network_bandwidth += allocation.allocated_network_io;
    }

    fn update_usage(&mut self, _usage: &ResourceUsage) {
        // In a real implementation, this would update current usage metrics
        // For now, we just track allocations vs available resources
    }

    /// Get system resource information including reservations
    fn get_resource_info(&self) -> ResourceInfo {
        ResourceInfo {
            available_memory: self.available_memory,
            available_cpu_cores: self.available_cpu_cores,
            available_disk_space: self.available_disk_space,
            available_network_bandwidth: self.available_network_bandwidth,
            reserved_memory: self.reserved_memory,
            reserved_cpu_cores: self.reserved_cpu_cores,
            reserved_disk_space: self.reserved_disk_space,
            reserved_network_bandwidth: self.reserved_network_bandwidth,
        }
    }
}

/// Resource system status
#[derive(Debug, Clone)]
pub struct ResourceSystemStatus {
    pub total_memory: usize,
    pub available_memory: usize,
    pub total_cpu_cores: u32,
    pub available_cpu_cores: u32,
    pub total_disk_space: usize,
    pub available_disk_space: usize,
    pub total_network_bandwidth: usize,
    pub available_network_bandwidth: usize,
    pub active_allocations: usize,
    pub last_updated: SystemTime,
}

/// Resource information including reservations
#[derive(Debug, Clone)]
pub struct ResourceInfo {
    pub available_memory: usize,
    pub available_cpu_cores: u32,
    pub available_disk_space: usize,
    pub available_network_bandwidth: usize,
    pub reserved_memory: usize,
    pub reserved_cpu_cores: u32,
    pub reserved_disk_space: usize,
    pub reserved_network_bandwidth: usize,
}

/// Resource violations
#[derive(Debug, Clone)]
pub enum ResourceViolation {
    Memory { used: usize, limit: usize },
    Cpu { used: f32, limit: f32 },
    DiskIo { used: usize, limit: usize },
    NetworkIo { used: usize, limit: usize },
}

/// Monitoring events for internal processing
#[derive(Debug, Clone)]
enum MonitoringEvent {
    UsageUpdate {
        agent_id: AgentId,
        usage: ResourceUsage,
    },
    AllocationRequest {
        agent_id: AgentId,
        requirements: ResourceRequirements,
    },
    DeallocationRequest {
        agent_id: AgentId,
    },
    LimitViolation {
        agent_id: AgentId,
        violations: Vec<ResourceViolation>,
    },
    /// Request the orchestrator (or sandbox backend) to throttle this agent:
    /// reduce its CPU quota, pause it briefly, or otherwise slow it down.
    /// The resource manager itself does not have the process handle, so
    /// consumers of this event own the enforcement mechanism.
    ThrottleRequested {
        agent_id: AgentId,
        consecutive_violations: u32,
        violations: Vec<ResourceViolation>,
    },
    /// Request the orchestrator to kill this agent outright. Emitted either
    /// immediately (`ViolationAction::Kill`) or after
    /// `kill_after_sustained_violations` consecutive breaches under
    /// `ViolationAction::Throttle`.
    KillRequested {
        agent_id: AgentId,
        violations: Vec<ResourceViolation>,
        reason: &'static str,
    },
}

#[cfg(test)]
mod tests {
    use super::*;

    fn create_test_requirements() -> ResourceRequirements {
        ResourceRequirements {
            min_memory_mb: 1,
            max_memory_mb: 1,
            min_cpu_cores: 1.0,
            max_cpu_cores: 1.0,
            disk_space_mb: 1,
            network_bandwidth_mbps: 1,
        }
    }

    #[tokio::test]
    async fn test_resource_allocation() {
        let manager = DefaultResourceManager::new(ResourceManagerConfig::default())
            .await
            .unwrap();
        let agent_id = AgentId::new();
        let requirements = create_test_requirements();

        let allocation = manager
            .allocate_resources(agent_id, requirements)
            .await
            .unwrap();
        assert_eq!(allocation.allocated_memory, 1024 * 1024);
        assert_eq!(allocation.allocated_cpu_cores, 1.0);
    }

    #[tokio::test]
    async fn test_resource_deallocation() {
        let manager = DefaultResourceManager::new(ResourceManagerConfig::default())
            .await
            .unwrap();
        let agent_id = AgentId::new();
        let requirements = create_test_requirements();

        manager
            .allocate_resources(agent_id, requirements)
            .await
            .unwrap();
        let result = manager.deallocate_resources(agent_id).await;
        assert!(result.is_ok());
    }

    #[tokio::test]
    async fn test_usage_tracking() {
        let manager = DefaultResourceManager::new(ResourceManagerConfig::default())
            .await
            .unwrap();
        let agent_id = AgentId::new();
        let requirements = create_test_requirements();

        manager
            .allocate_resources(agent_id, requirements)
            .await
            .unwrap();

        let usage = ResourceUsage {
            memory_used: 512 * 1024, // 512KB
            cpu_utilization: 0.5,
            disk_io_rate: 512 * 1024,
            network_io_rate: 512,
            uptime: Duration::from_secs(60),
        };

        manager.update_usage(agent_id, usage.clone()).await.unwrap();

        tokio::time::sleep(Duration::from_millis(20)).await;

        let retrieved_usage = manager.get_usage(agent_id).await.unwrap();
        assert_eq!(retrieved_usage.memory_used, usage.memory_used);
        assert_eq!(retrieved_usage.cpu_utilization, usage.cpu_utilization);
    }

    #[tokio::test]
    async fn test_system_status() {
        let manager = DefaultResourceManager::new(ResourceManagerConfig::default())
            .await
            .unwrap();
        let status = manager.get_system_status().await;

        assert!(status.total_memory > 0);
        assert!(status.available_memory <= status.total_memory);
        assert!(status.total_cpu_cores > 0);
        assert!(status.available_cpu_cores <= status.total_cpu_cores);
    }

    #[test]
    fn test_resource_violations() {
        let usage = ResourceUsage {
            memory_used: 2 * 1024 * 1024, // 2MB
            cpu_utilization: 2.0,
            disk_io_rate: 2 * 1024 * 1024,
            network_io_rate: 2 * 1024 * 1024, // 2MB to exceed 1MB limit
            uptime: Duration::from_secs(60),
        };

        let limits = ResourceLimits {
            memory_mb: 1,
            cpu_cores: 1.0,
            disk_io_mbps: 1,
            network_io_mbps: 1,
            execution_timeout: Duration::from_secs(3600),
            idle_timeout: Duration::from_secs(300),
        };

        let violations = DefaultResourceManager::check_resource_violations(&usage, &limits);
        assert_eq!(violations.len(), 4); // All resources exceeded
    }
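
    // A sketch of an end-to-end limit check: allocate with the small
    // create_test_requirements() budget, report a usage sample well above it,
    // and confirm that check_limits and check_resource_violations both flag
    // the breach (the same comparison the enforcement loop runs on its
    // monitoring interval).
    #[tokio::test]
    async fn test_limit_violation_detection() {
        let manager = DefaultResourceManager::new(ResourceManagerConfig::default())
            .await
            .unwrap();
        let agent_id = AgentId::new();

        manager
            .allocate_resources(agent_id, create_test_requirements())
            .await
            .unwrap();

        // 4MB / 2 cores against a 1MB / 1-core allocation: every check trips.
        let usage = ResourceUsage {
            memory_used: 4 * 1024 * 1024,
            cpu_utilization: 2.0,
            disk_io_rate: 4 * 1024 * 1024,
            network_io_rate: 4 * 1024 * 1024,
            uptime: Duration::from_secs(60),
        };
        manager.update_usage(agent_id, usage).await.unwrap();

        // Give the monitoring loop time to record the sample.
        tokio::time::sleep(Duration::from_millis(20)).await;

        assert!(!manager.check_limits(agent_id).await.unwrap());
        let violations = manager.check_resource_violations(agent_id).await.unwrap();
        assert_eq!(violations.len(), 4);
    }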
}