use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{Duration, Instant};
/// Production-side monitor for a single rules-engine instance: accumulates
/// per-rule execution timings, error counts, periodic resource snapshots,
/// and a bounded audit trail.
#[derive(Debug)]
pub struct ProductionMonitor {
/// Identifier of the monitored engine, set at construction.
engine_id: String,
/// Construction time; used to report uptime in health checks.
start_time: Instant,
/// Recorded execution durations, keyed by rule name.
execution_times: HashMap<String, Vec<Duration>>,
/// Number of recorded errors, keyed by rule name.
error_counts: HashMap<String, usize>,
/// Total recorded rule executions (errors are tracked separately and do
/// not increment this counter).
total_executions: usize,
/// Point-in-time resource samples taken by `snapshot_resources`.
resource_snapshots: Vec<ResourceSnapshot>,
/// Audit trail, capped at `config.max_audit_entries` (oldest evicted first).
audit_log: Vec<AuditEntry>,
/// Tunables controlling auditing, resource monitoring, and thresholds.
config: MonitorConfig,
}
/// Tunable knobs for [`ProductionMonitor`] behavior.
#[derive(Debug, Clone)]
pub struct MonitorConfig {
/// When false, `log_audit` becomes a no-op and no entries are recorded.
pub enable_audit: bool,
/// When false, `snapshot_resources` becomes a no-op.
pub enable_resource_monitoring: bool,
/// Oldest audit entries are evicted once the log reaches this size.
pub max_audit_entries: usize,
/// Executions longer than this (milliseconds) are flagged as slow; also
/// the threshold for the average-latency health check.
pub performance_threshold_ms: u64,
/// Estimated memory usage above this (MB) raises a health issue.
pub memory_threshold_mb: usize,
}
impl Default for MonitorConfig {
/// Defaults: auditing and resource monitoring enabled, 10k audit entries,
/// 1000 ms slow-execution threshold, 1024 MB memory threshold.
fn default() -> Self {
Self {
enable_audit: true,
enable_resource_monitoring: true,
max_audit_entries: 10000,
performance_threshold_ms: 1000,
memory_threshold_mb: 1024,
}
}
}
/// Coarse engine health classification produced by `check_health`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum HealthStatus {
/// No configured threshold exceeded.
Healthy,
/// Error rate above 5%, slow average execution, or high memory usage.
Degraded,
/// Error rate above 10%.
Unhealthy,
}
/// Snapshot of engine health returned by `check_health`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheck {
/// Overall classification derived from the thresholds below.
pub status: HealthStatus,
/// Seconds since the UNIX epoch, stringified.
pub timestamp: String,
/// Seconds since the monitor was constructed.
pub uptime_secs: u64,
/// Total recorded rule executions.
pub total_executions: usize,
/// Errors as a percentage of executions (0.0 when nothing has run).
pub error_rate: f64,
/// Mean execution time across all recorded samples, in milliseconds.
pub avg_execution_ms: f64,
/// Estimated memory (MB) from the most recent snapshot, or 0 if none.
pub memory_mb: usize,
/// Human-readable description of each threshold that was exceeded.
pub issues: Vec<String>,
}
/// Point-in-time resource usage sample recorded by `snapshot_resources`.
#[derive(Debug, Clone)]
pub struct ResourceSnapshot {
/// When the sample was taken.
pub timestamp: Instant,
/// Estimated memory footprint in bytes (see `estimate_memory_usage`).
pub memory_bytes: usize,
/// Number of active rules reported by the caller at sample time.
pub active_rules: usize,
}
/// Single event in the monitor's audit trail.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuditEntry {
/// Seconds since the UNIX epoch, stringified.
pub timestamp: String,
/// Category of the recorded event.
pub event_type: AuditEventType,
/// Rule involved, when the event concerns a specific rule.
pub rule_name: Option<String>,
/// Free-form event description.
pub details: String,
/// Execution duration in milliseconds, for execution events.
pub duration_ms: Option<u64>,
}
/// Categories of events recorded in the audit log.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuditEventType {
ExecutionStarted,
/// Also used for slow-execution notices in `record_rule_execution`.
ExecutionCompleted,
ExecutionFailed,
EngineStarted,
EngineStopped,
// NOTE(review): `snapshot_resources` reuses this variant for high-memory
// alerts; a dedicated variant would be clearer but adding one could break
// exhaustive matches in callers.
ConfigChanged,
}
/// Aggregate execution statistics produced by `get_statistics`. Latency
/// figures are in fractional milliseconds; rule rankings are truncated to
/// the top 10 entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProductionStatistics {
pub total_executions: usize,
pub total_errors: usize,
pub avg_execution_ms: f64,
/// Median execution time (nearest-rank percentile over all samples).
pub p50_execution_ms: f64,
pub p95_execution_ms: f64,
pub p99_execution_ms: f64,
pub max_execution_ms: f64,
/// (rule name, invocation count), most-invoked first.
pub rules_by_count: Vec<(String, usize)>,
/// (rule name, total time in ms), largest first.
pub rules_by_time: Vec<(String, f64)>,
/// (rule name, average time per invocation in ms), slowest first.
pub slowest_rules: Vec<(String, f64)>,
}
impl ProductionMonitor {
/// Creates a monitor with [`MonitorConfig::default`] and records an
/// `EngineStarted` audit entry.
pub fn new(engine_id: impl Into<String>) -> Self {
    let engine_id = engine_id.into();
    // Build the startup message before `engine_id` is moved into the struct,
    // so no clone is needed.
    let startup_message = format!("Engine {} started", engine_id);
    let mut monitor = Self {
        engine_id,
        start_time: Instant::now(),
        execution_times: HashMap::new(),
        error_counts: HashMap::new(),
        total_executions: 0,
        resource_snapshots: Vec::new(),
        audit_log: Vec::new(),
        config: MonitorConfig::default(),
    };
    monitor.log_audit(AuditEventType::EngineStarted, None, startup_message, None);
    monitor
}
/// Returns the identifier this monitor was created with.
pub fn engine_id(&self) -> &str {
    self.engine_id.as_str()
}
pub fn with_config(engine_id: impl Into<String>, config: MonitorConfig) -> Self {
let mut monitor = Self::new(engine_id);
monitor.config = config;
monitor
}
/// Records one successful execution of `rule_name` with its `duration`.
///
/// Exactly one `ExecutionCompleted` audit entry is written per call (when
/// auditing is enabled): a slow-execution notice if the duration exceeds
/// `performance_threshold_ms`, otherwise a success message. The previous
/// implementation logged *both* entries for a slow execution, double-filling
/// the bounded audit log with contradictory records.
pub fn record_rule_execution(&mut self, rule_name: impl Into<String>, duration: Duration) {
    let rule_name = rule_name.into();
    self.total_executions += 1;
    self.execution_times
        .entry(rule_name.clone())
        .or_default()
        .push(duration);
    let duration_ms = duration.as_millis() as u64;
    let details = if duration_ms > self.config.performance_threshold_ms {
        format!("Slow execution: {}ms", duration.as_millis())
    } else {
        "Rule executed successfully".to_string()
    };
    // `log_audit` itself is a no-op when auditing is disabled.
    self.log_audit(
        AuditEventType::ExecutionCompleted,
        Some(rule_name),
        details,
        Some(duration_ms),
    );
}
/// Records a failed execution of `rule_name`, bumping its error count and
/// (when auditing is enabled) writing an `ExecutionFailed` audit entry.
/// Note that errors do not increment `total_executions`.
pub fn record_error(&mut self, rule_name: impl Into<String>, error: &str) {
    let rule_name = rule_name.into();
    let count = self.error_counts.entry(rule_name.clone()).or_default();
    *count += 1;
    if self.config.enable_audit {
        self.log_audit(
            AuditEventType::ExecutionFailed,
            Some(rule_name),
            error.to_string(),
            None,
        );
    }
}
/// Takes a resource usage sample (no-op when resource monitoring is
/// disabled) and raises an audit alert if the memory estimate exceeds the
/// configured threshold.
pub fn snapshot_resources(&mut self, active_rules: usize) {
    if !self.config.enable_resource_monitoring {
        return;
    }
    let memory_bytes = self.estimate_memory_usage();
    let snapshot = ResourceSnapshot {
        timestamp: Instant::now(),
        memory_bytes,
        active_rules,
    };
    self.resource_snapshots.push(snapshot);
    let memory_mb = memory_bytes / (1024 * 1024);
    if memory_mb > self.config.memory_threshold_mb {
        // NOTE(review): `ConfigChanged` is reused here as a generic alert
        // event type — presumably for lack of a dedicated variant; confirm
        // before consumers filter on it.
        self.log_audit(
            AuditEventType::ConfigChanged,
            None,
            format!("High memory usage: {}MB", memory_mb),
            None,
        );
    }
}
/// Evaluates current engine health against the configured thresholds.
///
/// Status rules: error rate > 10% → `Unhealthy`; error rate > 5%, average
/// latency above `performance_threshold_ms`, or memory above
/// `memory_threshold_mb` → `Degraded` (never upgrading an `Unhealthy`
/// verdict). Each exceeded threshold appends a message to `issues`.
pub fn check_health(&self) -> HealthCheck {
    let uptime = self.start_time.elapsed();
    let total_errors: usize = self.error_counts.values().sum();
    let error_rate = if self.total_executions == 0 {
        0.0
    } else {
        (total_errors as f64 / self.total_executions as f64) * 100.0
    };
    // Single pass over every recorded sample: count and total duration.
    let (samples, total_time) = self
        .execution_times
        .values()
        .flatten()
        .fold((0usize, Duration::ZERO), |(n, sum), d| (n + 1, sum + *d));
    let avg_execution_ms = if samples == 0 {
        0.0
    } else {
        total_time.as_micros() as f64 / samples as f64 / 1000.0
    };
    let mut status = HealthStatus::Healthy;
    let mut issues = Vec::new();
    if error_rate > 10.0 {
        status = HealthStatus::Unhealthy;
        issues.push(format!("High error rate: {:.2}%", error_rate));
    } else if error_rate > 5.0 {
        status = HealthStatus::Degraded;
        issues.push(format!("Elevated error rate: {:.2}%", error_rate));
    }
    if avg_execution_ms > self.config.performance_threshold_ms as f64 {
        if status == HealthStatus::Healthy {
            status = HealthStatus::Degraded;
        }
        issues.push(format!(
            "High average execution time: {:.2}ms",
            avg_execution_ms
        ));
    }
    // Memory comes from the newest snapshot; 0 when none was taken yet.
    let memory_mb = self
        .resource_snapshots
        .last()
        .map_or(0, |snapshot| snapshot.memory_bytes / (1024 * 1024));
    if memory_mb > self.config.memory_threshold_mb {
        if status == HealthStatus::Healthy {
            status = HealthStatus::Degraded;
        }
        issues.push(format!("High memory usage: {}MB", memory_mb));
    }
    let timestamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("SystemTime should be after UNIX_EPOCH")
        .as_secs()
        .to_string();
    HealthCheck {
        status,
        timestamp,
        uptime_secs: uptime.as_secs(),
        total_executions: self.total_executions,
        error_rate,
        avg_execution_ms,
        memory_mb,
        issues,
    }
}
/// Builds an aggregate statistics report: global latency percentiles plus
/// top-10 rule rankings by invocation count, total time, and average time.
///
/// Improvements over the previous version: the collected sample vector is
/// sorted in place (it was collected, then redundantly cloned, with the
/// unsorted copy never used again) and `sort_unstable` replaces the stable
/// sort, since stability is irrelevant when ranking plain durations.
pub fn get_statistics(&self) -> ProductionStatistics {
    let mut sorted_times: Vec<Duration> =
        self.execution_times.values().flatten().copied().collect();
    sorted_times.sort_unstable();
    // Converts a duration to fractional milliseconds.
    let to_ms = |d: Duration| d.as_micros() as f64 / 1000.0;
    let (avg_ms, p50_ms, p95_ms, p99_ms, max_ms) = if sorted_times.is_empty() {
        (0.0, 0.0, 0.0, 0.0, 0.0)
    } else {
        let sum: Duration = sorted_times.iter().sum();
        let avg = sum.as_micros() as f64 / sorted_times.len() as f64 / 1000.0;
        // Nearest-rank percentile: floor(q * len); `get` guards the
        // (theoretical) out-of-range index with 0.0.
        let percentile = |q: f64| {
            let idx = (sorted_times.len() as f64 * q) as usize;
            sorted_times.get(idx).copied().map_or(0.0, to_ms)
        };
        let max = sorted_times.last().copied().map_or(0.0, to_ms);
        (avg, percentile(0.50), percentile(0.95), percentile(0.99), max)
    };
    // Rules ranked by invocation count, most-invoked first.
    let mut rules_by_count: Vec<_> = self
        .execution_times
        .iter()
        .map(|(name, times)| (name.clone(), times.len()))
        .collect();
    rules_by_count.sort_by_key(|entry| std::cmp::Reverse(entry.1));
    // Rules ranked by total time spent (ms), largest first.
    let mut rules_by_time: Vec<_> = self
        .execution_times
        .iter()
        .map(|(name, times)| (name.clone(), to_ms(times.iter().sum::<Duration>())))
        .collect();
    rules_by_time.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    // Rules ranked by average time per invocation (ms), slowest first.
    // `times` is never empty: a map entry only exists after a push.
    let mut slowest_rules: Vec<_> = self
        .execution_times
        .iter()
        .map(|(name, times)| {
            let avg = times.iter().sum::<Duration>() / times.len() as u32;
            (name.clone(), to_ms(avg))
        })
        .collect();
    slowest_rules.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    let total_errors = self.error_counts.values().sum();
    ProductionStatistics {
        total_executions: self.total_executions,
        total_errors,
        avg_execution_ms: avg_ms,
        p50_execution_ms: p50_ms,
        p95_execution_ms: p95_ms,
        p99_execution_ms: p99_ms,
        max_execution_ms: max_ms,
        rules_by_count: rules_by_count.into_iter().take(10).collect(),
        rules_by_time: rules_by_time.into_iter().take(10).collect(),
        slowest_rules: slowest_rules.into_iter().take(10).collect(),
    }
}
/// Read-only view of the audit trail, oldest entry first.
pub fn get_audit_log(&self) -> &[AuditEntry] {
    self.audit_log.as_slice()
}
/// Serializes the full audit log as pretty-printed JSON.
pub fn export_audit_log_json(&self) -> Result<String, serde_json::Error> {
    serde_json::to_string_pretty(self.audit_log.as_slice())
}
/// Computes the current statistics and serializes them as pretty-printed JSON.
pub fn export_statistics_json(&self) -> Result<String, serde_json::Error> {
    let stats = self.get_statistics();
    serde_json::to_string_pretty(&stats)
}
/// Discards every audit entry; counters and timings are left untouched.
pub fn clear_audit_log(&mut self) {
    self.audit_log.clear();
}
/// Resets all execution counters, timings, and resource snapshots while
/// preserving the audit log, engine id, and start time.
pub fn reset_statistics(&mut self) {
    self.total_executions = 0;
    self.execution_times.clear();
    self.error_counts.clear();
    self.resource_snapshots.clear();
}
/// Appends an entry to the audit log, evicting the oldest entries to stay
/// within `max_audit_entries`. No-op when auditing is disabled or the cap
/// is zero.
///
/// Fixes a panic: with `max_audit_entries == 0` the old code evaluated
/// `len() >= 0` (always true) and called `remove(0)` on an empty vec.
fn log_audit(
    &mut self,
    event_type: AuditEventType,
    rule_name: Option<String>,
    details: String,
    duration_ms: Option<u64>,
) {
    if !self.config.enable_audit {
        return;
    }
    // A cap of zero means "keep nothing": record nothing instead of panicking.
    if self.config.max_audit_entries == 0 {
        return;
    }
    // Evict until there is room for the entry about to be pushed; the loop
    // also handles a cap that was lowered after entries accumulated.
    // NOTE(review): `Vec::remove(0)` shifts the whole log — O(n) per push
    // once full. A `VecDeque`-backed log would evict in O(1), but that
    // changes the field type and `get_audit_log`'s return type.
    while self.audit_log.len() >= self.config.max_audit_entries {
        self.audit_log.remove(0);
    }
    self.audit_log.push(AuditEntry {
        // Seconds since the UNIX epoch, stringified (same format as
        // `HealthCheck.timestamp`).
        timestamp: std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .expect("SystemTime should be after UNIX_EPOCH")
            .as_secs()
            .to_string(),
        event_type,
        rule_name,
        details,
        duration_ms,
    });
}
/// Rough, heap-only approximation of the monitor's own memory footprint,
/// in bytes. Audit entries are costed at a flat per-entry estimate and a
/// fixed baseline covers the struct itself; this is an estimate, not a
/// measurement.
fn estimate_memory_usage(&self) -> usize {
    const AUDIT_ENTRY_ESTIMATE_BYTES: usize = 200;
    const BASELINE_BYTES: usize = 1024;
    let timing_bytes: usize = self
        .execution_times
        .values()
        .map(|samples| samples.len() * std::mem::size_of::<Duration>())
        .sum();
    let audit_bytes = self.audit_log.len() * AUDIT_ENTRY_ESTIMATE_BYTES;
    let snapshot_bytes =
        self.resource_snapshots.len() * std::mem::size_of::<ResourceSnapshot>();
    timing_bytes + audit_bytes + snapshot_bytes + BASELINE_BYTES
}
}
#[cfg(test)]
mod tests {
use super::*;
// Fresh monitor stores its id and starts with zero executions.
#[test]
fn test_production_monitor_creation() {
let monitor = ProductionMonitor::new("test-engine");
assert_eq!(monitor.engine_id, "test-engine");
assert_eq!(monitor.total_executions, 0);
}
// Recording an execution bumps the counter and creates a timing entry.
#[test]
fn test_record_rule_execution() {
let mut monitor = ProductionMonitor::new("test");
monitor.record_rule_execution("rule1", Duration::from_millis(10));
assert_eq!(monitor.total_executions, 1);
assert!(monitor.execution_times.contains_key("rule1"));
}
// Errors are tallied per rule name.
#[test]
fn test_record_error() {
let mut monitor = ProductionMonitor::new("test");
monitor.record_error("rule1", "Test error");
assert_eq!(monitor.error_counts.get("rule1"), Some(&1));
}
// An idle monitor reports Healthy with no issues.
#[test]
fn test_health_check_healthy() {
let monitor = ProductionMonitor::new("test");
let health = monitor.check_health();
assert_eq!(health.status, HealthStatus::Healthy);
assert_eq!(health.total_executions, 0);
assert!(health.issues.is_empty());
}
// 7 errors over 100 executions = 7% error rate → Degraded (5% < rate <= 10%).
#[test]
fn test_health_check_degraded() {
let mut monitor = ProductionMonitor::new("test");
for _ in 0..100 {
monitor.record_rule_execution("rule1", Duration::from_millis(5));
}
for _ in 0..7 {
monitor.record_error("rule1", "Error");
}
let health = monitor.check_health();
assert_eq!(health.status, HealthStatus::Degraded);
assert!(health.error_rate > 5.0);
}
// Statistics aggregate across rules and report a positive average.
#[test]
fn test_statistics() {
let mut monitor = ProductionMonitor::new("test");
monitor.record_rule_execution("rule1", Duration::from_millis(10));
monitor.record_rule_execution("rule1", Duration::from_millis(20));
monitor.record_rule_execution("rule2", Duration::from_millis(30));
let stats = monitor.get_statistics();
assert_eq!(stats.total_executions, 3);
assert!(stats.avg_execution_ms > 0.0);
}
// The audit log records the EngineStarted event emitted at construction.
#[test]
fn test_audit_log() {
let mut monitor = ProductionMonitor::new("test");
monitor.record_rule_execution("rule1", Duration::from_millis(10));
let audit = monitor.get_audit_log();
assert!(!audit.is_empty());
assert!(audit
.iter()
.any(|e| e.event_type == AuditEventType::EngineStarted));
}
// Both JSON exporters succeed on a populated monitor.
#[test]
fn test_export_json() {
let mut monitor = ProductionMonitor::new("test");
monitor.record_rule_execution("rule1", Duration::from_millis(10));
let stats_json = monitor.export_statistics_json();
assert!(stats_json.is_ok());
let audit_json = monitor.export_audit_log_json();
assert!(audit_json.is_ok());
}
// snapshot_resources records one sample carrying the caller's rule count.
#[test]
fn test_resource_snapshot() {
let mut monitor = ProductionMonitor::new("test");
monitor.snapshot_resources(5);
assert_eq!(monitor.resource_snapshots.len(), 1);
assert_eq!(monitor.resource_snapshots[0].active_rules, 5);
}
// reset_statistics clears counters, timings, and error tallies.
#[test]
fn test_reset_statistics() {
let mut monitor = ProductionMonitor::new("test");
monitor.record_rule_execution("rule1", Duration::from_millis(10));
monitor.record_error("rule1", "error");
monitor.reset_statistics();
assert_eq!(monitor.total_executions, 0);
assert!(monitor.execution_times.is_empty());
assert!(monitor.error_counts.is_empty());
}
}