use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;
use tokio::sync::RwLock;
use metrics::{counter, gauge, histogram};
use crate::service_types::{SystemMetrics, NetworkMetrics, ServiceHealth};
/// Collects task, tool, error and system metrics for the service.
///
/// State is kept in a `ServiceMetricsData` behind an `Arc<RwLock<..>>`
/// (tokio's async `RwLock`), and the recording methods additionally emit
/// values through the `metrics` crate macros.
#[derive(Debug)]
pub struct MetricsCollector {
/// Instant captured in `new()`; `uptime_seconds()` reports the time elapsed since it.
start_time: Instant,
/// Lock-protected counters and samples updated by the `record_*` / `update_*` methods.
metrics: Arc<RwLock<ServiceMetricsData>>,
}
/// Mutable counters and samples guarded by the collector's lock.
#[derive(Debug, Default)]
struct ServiceMetricsData {
    /// Number of tasks started via `record_task_start`.
    total_tasks: u64,
    /// Number of tasks reported as finished successfully.
    completed_tasks: u64,
    /// Number of tasks reported as finished unsuccessfully.
    failed_tasks: u64,
    /// Tasks started but not yet reported as finished.
    active_tasks: u32,
    /// Sliding window of the most recent execution times in seconds, oldest first.
    ///
    /// A `VecDeque` is used so that evicting the oldest sample is O(1).
    task_execution_times: std::collections::VecDeque<f64>,
    /// Invocation count per tool name.
    tool_usage: HashMap<String, u64>,
    /// Occurrence count per error type.
    error_counts: HashMap<String, u64>,
    /// Most recent sample passed to `update_system_metrics`, if any.
    system_metrics: Option<SystemMetrics>,
}

impl MetricsCollector {
    /// Maximum number of execution-time samples kept for the rolling average.
    const MAX_EXECUTION_TIME_SAMPLES: usize = 1000;

    /// CPU usage (percent) above which the service is reported unhealthy.
    const CPU_UNHEALTHY_PERCENT: f64 = 90.0;
    /// CPU usage (percent) above which the service is reported degraded.
    const CPU_DEGRADED_PERCENT: f64 = 70.0;
    /// Memory usage (MB) above which the service is reported unhealthy.
    const MEMORY_UNHEALTHY_MB: f64 = 8000.0;
    /// Memory usage (MB) above which the service is reported degraded.
    const MEMORY_DEGRADED_MB: f64 = 4000.0;
    /// Failure rate above which the service is reported unhealthy.
    const FAILURE_RATE_UNHEALTHY: f64 = 0.5;
    /// Failure rate above which the service is reported degraded.
    const FAILURE_RATE_DEGRADED: f64 = 0.1;

    /// Creates a collector with empty counters; uptime is measured from this call.
    pub fn new() -> Self {
        Self {
            start_time: Instant::now(),
            metrics: Arc::new(RwLock::new(ServiceMetricsData::default())),
        }
    }

    /// Returns the whole seconds elapsed since the collector was created.
    pub fn uptime_seconds(&self) -> u64 {
        self.start_time.elapsed().as_secs()
    }

    /// Records that a task has started.
    ///
    /// Increments the total and active task counts and publishes the new
    /// active count on the `ai_agent_active_tasks` gauge, so the gauge moves
    /// on task start as well as on task completion.
    pub async fn record_task_start(&self) {
        let mut metrics = self.metrics.write().await;
        metrics.total_tasks += 1;
        metrics.active_tasks += 1;
        gauge!("ai_agent_active_tasks").set(f64::from(metrics.active_tasks));
    }

    /// Records that a task has finished.
    ///
    /// * `execution_time_seconds` - how long the task ran; added to the rolling
    ///   window and to the `ai_agent_task_duration_seconds` histogram.
    /// * `success` - selects whether the completed or the failed counter is bumped.
    pub async fn record_task_completion(&self, execution_time_seconds: f64, success: bool) {
        let mut metrics = self.metrics.write().await;

        // Saturate at zero: a completion may arrive without a matching start
        // (for example after `reset`).
        metrics.active_tasks = metrics.active_tasks.saturating_sub(1);

        if success {
            metrics.completed_tasks += 1;
            counter!("ai_agent_tasks_completed_total").increment(1);
        } else {
            metrics.failed_tasks += 1;
            counter!("ai_agent_tasks_failed_total").increment(1);
        }

        // Keep only the newest samples; the oldest one is dropped once the window is full.
        metrics.task_execution_times.push_back(execution_time_seconds);
        if metrics.task_execution_times.len() > Self::MAX_EXECUTION_TIME_SAMPLES {
            metrics.task_execution_times.pop_front();
        }

        histogram!("ai_agent_task_duration_seconds").record(execution_time_seconds);
        gauge!("ai_agent_active_tasks").set(f64::from(metrics.active_tasks));
    }

    /// Records one invocation of the tool named `tool_name`.
    pub async fn record_tool_usage(&self, tool_name: &str) {
        let mut metrics = self.metrics.write().await;
        *metrics.tool_usage.entry(tool_name.to_owned()).or_insert(0) += 1;
        // The label value is passed as an owned `String`: the `metrics` macros
        // take `'static` or owned label values, which a borrowed parameter is not.
        counter!("ai_agent_tool_usage_total", "tool" => tool_name.to_owned()).increment(1);
    }

    /// Records one occurrence of an error of kind `error_type`.
    pub async fn record_error(&self, error_type: &str) {
        let mut metrics = self.metrics.write().await;
        *metrics.error_counts.entry(error_type.to_owned()).or_insert(0) += 1;
        // Owned label value for the same reason as in `record_tool_usage`.
        counter!("ai_agent_errors_total", "error_type" => error_type.to_owned()).increment(1);
    }

    /// Stores `system_metrics` as the latest system sample and publishes its
    /// values on the corresponding gauges.
    pub async fn update_system_metrics(&self, system_metrics: SystemMetrics) {
        let mut metrics = self.metrics.write().await;

        // Publish the gauges first so the sample can then be moved into the
        // shared state without cloning it.
        gauge!("ai_agent_cpu_usage_percent").set(system_metrics.cpu_usage_percent);
        gauge!("ai_agent_memory_usage_mb").set(system_metrics.memory_usage_mb);
        gauge!("ai_agent_disk_usage_mb").set(system_metrics.disk_usage_mb);
        gauge!("ai_agent_network_bytes_received")
            .set(system_metrics.network_io.bytes_received as f64);
        gauge!("ai_agent_network_bytes_sent").set(system_metrics.network_io.bytes_sent as f64);
        gauge!("ai_agent_network_active_connections")
            .set(system_metrics.network_io.active_connections as f64);

        metrics.system_metrics = Some(system_metrics);
    }

    /// Returns a copy of the current counters.
    ///
    /// The average execution time is the mean of the samples currently in the
    /// rolling window, or `0.0` when no sample has been recorded.
    pub async fn get_metrics_snapshot(&self) -> MetricsSnapshot {
        let metrics = self.metrics.read().await;

        let samples = &metrics.task_execution_times;
        let avg_execution_time = if samples.is_empty() {
            0.0
        } else {
            samples.iter().sum::<f64>() / samples.len() as f64
        };

        MetricsSnapshot {
            uptime_seconds: self.uptime_seconds(),
            total_tasks: metrics.total_tasks,
            completed_tasks: metrics.completed_tasks,
            failed_tasks: metrics.failed_tasks,
            active_tasks: metrics.active_tasks,
            average_execution_time_seconds: avg_execution_time,
            tool_usage: metrics.tool_usage.clone(),
            error_counts: metrics.error_counts.clone(),
            system_metrics: metrics.system_metrics.clone(),
        }
    }

    /// Clears all in-memory counters, samples and the stored system sample.
    ///
    /// Uptime is not affected. The `ai_agent_active_tasks` gauge is re-published
    /// so it matches the cleared active-task count.
    pub async fn reset(&self) {
        let mut metrics = self.metrics.write().await;
        *metrics = ServiceMetricsData::default();
        gauge!("ai_agent_active_tasks").set(f64::from(metrics.active_tasks));
    }

    /// Derives a coarse health status from the failure rate and, when
    /// available, the latest system sample.
    ///
    /// The failure rate is `failed_tasks / total_tasks`; `total_tasks` counts
    /// started tasks, so tasks still in flight are part of the denominator.
    /// Exceeding any "unhealthy" threshold yields `Unhealthy`, otherwise
    /// exceeding any "degraded" threshold yields `Degraded`, otherwise `Healthy`.
    pub async fn get_health_status(&self) -> ServiceHealth {
        let metrics = self.metrics.read().await;

        let failure_rate = if metrics.total_tasks > 0 {
            metrics.failed_tasks as f64 / metrics.total_tasks as f64
        } else {
            0.0
        };

        // With no system sample, zeros keep the CPU/memory checks from firing,
        // so only the failure rate decides the status.
        let (cpu_percent, memory_mb) = metrics
            .system_metrics
            .as_ref()
            .map_or((0.0, 0.0), |sample| {
                (sample.cpu_usage_percent, sample.memory_usage_mb)
            });

        if cpu_percent > Self::CPU_UNHEALTHY_PERCENT
            || memory_mb > Self::MEMORY_UNHEALTHY_MB
            || failure_rate > Self::FAILURE_RATE_UNHEALTHY
        {
            ServiceHealth::Unhealthy
        } else if cpu_percent > Self::CPU_DEGRADED_PERCENT
            || memory_mb > Self::MEMORY_DEGRADED_MB
            || failure_rate > Self::FAILURE_RATE_DEGRADED
        {
            ServiceHealth::Degraded
        } else {
            ServiceHealth::Healthy
        }
    }
}
impl Default for MetricsCollector {
fn default() -> Self {
Self::new()
}
}
/// Serializable point-in-time copy of the collector's counters, as produced by
/// `MetricsCollector::get_metrics_snapshot`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsSnapshot {
/// Whole seconds elapsed since the collector was created.
pub uptime_seconds: u64,
/// Number of tasks recorded as started.
pub total_tasks: u64,
/// Number of tasks recorded as finished with `success == true`.
pub completed_tasks: u64,
/// Number of tasks recorded as finished with `success == false`.
pub failed_tasks: u64,
/// Number of tasks recorded as started but not yet finished.
pub active_tasks: u32,
/// Mean of the retained execution-time samples, or `0.0` when there are none.
pub average_execution_time_seconds: f64,
/// Recorded usage count per tool name.
pub tool_usage: HashMap<String, u64>,
/// Recorded occurrence count per error type.
pub error_counts: HashMap<String, u64>,
/// Latest system sample stored in the collector, or `None` if none is stored.
pub system_metrics: Option<SystemMetrics>,
}
/// Stateless entry point for gathering a [`SystemMetrics`] sample.
///
/// NOTE(review): every probe below currently returns a fixed placeholder value
/// instead of measuring the host, so samples are constant.
pub struct SystemMetricsCollector;

impl SystemMetricsCollector {
    /// Runs the CPU, memory, disk and network probes in that order and bundles
    /// their results into one [`SystemMetrics`].
    ///
    /// # Errors
    ///
    /// Returns the error of the first probe that fails.
    pub async fn collect() -> Result<SystemMetrics, Box<dyn std::error::Error + Send + Sync>> {
        // Struct-literal fields are evaluated in the order written, which keeps
        // the probe order CPU -> memory -> disk -> network.
        Ok(SystemMetrics {
            cpu_usage_percent: Self::get_cpu_usage().await?,
            memory_usage_mb: Self::get_memory_usage().await?,
            disk_usage_mb: Self::get_disk_usage().await?,
            network_io: Self::get_network_io().await?,
        })
    }

    /// CPU usage in percent (placeholder constant).
    async fn get_cpu_usage() -> Result<f64, Box<dyn std::error::Error + Send + Sync>> {
        let cpu_percent = 25.0;
        Ok(cpu_percent)
    }

    /// Memory usage in MB (placeholder constant).
    async fn get_memory_usage() -> Result<f64, Box<dyn std::error::Error + Send + Sync>> {
        let memory_mb = 512.0;
        Ok(memory_mb)
    }

    /// Disk usage in MB (placeholder constant).
    async fn get_disk_usage() -> Result<f64, Box<dyn std::error::Error + Send + Sync>> {
        let disk_mb = 1024.0;
        Ok(disk_mb)
    }

    /// Network counters (placeholder constants).
    async fn get_network_io() -> Result<NetworkMetrics, Box<dyn std::error::Error + Send + Sync>> {
        let network = NetworkMetrics {
            bytes_received: 1024 * 1024,
            bytes_sent: 512 * 1024,
            active_connections: 5,
        };
        Ok(network)
    }
}
/// Serializable settings for metrics collection.
///
/// NOTE(review): none of these fields is read in the code visible here; the
/// field descriptions below are inferred from the names and should be
/// confirmed against the consumers of this config.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsConfig {
/// Presumably the master switch for metrics collection (default: `true`).
pub enabled: bool,
/// Presumably the interval between collections, in seconds (default: `30`).
pub collection_interval_seconds: u64,
/// Presumably toggles Prometheus exposition (default: `true`).
pub enable_prometheus: bool,
/// Presumably the path the Prometheus metrics are served on (default: `"/metrics"`).
pub prometheus_endpoint: String,
/// Presumably how long collected data is retained, in seconds (default: `3600`).
pub retention_period_seconds: u64,
}
impl Default for MetricsConfig {
    /// Returns the default configuration: collection and Prometheus enabled,
    /// a 30-second collection interval, the `/metrics` endpoint and a
    /// 3600-second retention period.
    fn default() -> Self {
        let prometheus_endpoint = String::from("/metrics");
        Self {
            enabled: true,
            enable_prometheus: true,
            prometheus_endpoint,
            collection_interval_seconds: 30,
            retention_period_seconds: 3600,
        }
    }
}