rustant_tools/
system_monitor.rs

1//! System monitor tool — production service topology, health checks, and incident tracking.
2
3use async_trait::async_trait;
4use chrono::{DateTime, Utc};
5use rustant_core::error::ToolError;
6use rustant_core::types::{RiskLevel, ToolOutput};
7use serde::{Deserialize, Serialize};
8use serde_json::{Value, json};
9use std::collections::{HashSet, VecDeque};
10use std::path::PathBuf;
11use std::time::Duration;
12
13use crate::registry::Tool;
14
/// Category of a registered service.
///
/// Serialized by serde as the lowercase variant name (for example `"api"`),
/// which is also the spelling accepted by `ServiceType::from_str` and produced
/// by the `Display` impl.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
enum ServiceType {
    Api,
    Database,
    Cache,
    Queue,
    Frontend,
    Worker,
    Gateway,
}
26
27impl ServiceType {
28    fn from_str(s: &str) -> Option<Self> {
29        match s {
30            "api" => Some(Self::Api),
31            "database" => Some(Self::Database),
32            "cache" => Some(Self::Cache),
33            "queue" => Some(Self::Queue),
34            "frontend" => Some(Self::Frontend),
35            "worker" => Some(Self::Worker),
36            "gateway" => Some(Self::Gateway),
37            _ => None,
38        }
39    }
40}
41
42impl std::fmt::Display for ServiceType {
43    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44        match self {
45            Self::Api => write!(f, "api"),
46            Self::Database => write!(f, "database"),
47            Self::Cache => write!(f, "cache"),
48            Self::Queue => write!(f, "queue"),
49            Self::Frontend => write!(f, "frontend"),
50            Self::Worker => write!(f, "worker"),
51            Self::Gateway => write!(f, "gateway"),
52        }
53    }
54}
55
/// Health state recorded for a service.
///
/// Serialized by serde as the lowercase variant name. Services are created
/// with `Unknown`; `action_health_check` later sets `Healthy` for a 2xx
/// response, `Degraded` for any other HTTP response, and `Down` when the
/// request itself fails.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
enum ServiceStatus {
    Unknown,
    Healthy,
    Degraded,
    Down,
}
64
65impl ServiceStatus {
66    fn marker(&self) -> &'static str {
67        match self {
68            Self::Healthy => "\u{2713}",  // checkmark
69            Self::Degraded => "\u{26a0}", // warning
70            Self::Down => "\u{2717}",     // X mark
71            Self::Unknown => "?",
72        }
73    }
74}
75
76impl std::fmt::Display for ServiceStatus {
77    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
78        match self {
79            Self::Unknown => write!(f, "Unknown"),
80            Self::Healthy => write!(f, "Healthy"),
81            Self::Degraded => write!(f, "Degraded"),
82            Self::Down => write!(f, "Down"),
83        }
84    }
85}
86
/// A registered service together with the result of its latest health check.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Service {
    /// Identifier taken from `MonitorState::next_service_id` at registration.
    id: usize,
    /// Display name supplied by the caller (stored trimmed).
    name: String,
    /// Base URL; the health endpoint path is appended to it when probing.
    url: String,
    service_type: ServiceType,
    /// IDs of the services this service depends on. They are stored as given
    /// and are not checked against the registered services.
    dependencies: Vec<usize>,
    /// Path probed by the health check; `/health` is used when `None`.
    health_endpoint: Option<String>,
    /// Outcome of the latest health check (`Unknown` until one has run).
    status: ServiceStatus,
    /// Time of the latest health check, if any.
    last_checked: Option<DateTime<Utc>>,
    /// Wall-clock duration of the latest probe in milliseconds; it is recorded
    /// for failed probes as well as successful ones.
    response_time_ms: Option<u64>,
}
99
/// Severity assigned to an incident when it is logged.
///
/// Serialized by serde as the lowercase variant name, which is also the
/// spelling accepted by `IncidentSeverity::from_str`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
enum IncidentSeverity {
    Low,
    Medium,
    High,
    Critical,
}
108
109impl IncidentSeverity {
110    fn from_str(s: &str) -> Option<Self> {
111        match s {
112            "low" => Some(Self::Low),
113            "medium" => Some(Self::Medium),
114            "high" => Some(Self::High),
115            "critical" => Some(Self::Critical),
116            _ => None,
117        }
118    }
119}
120
121impl std::fmt::Display for IncidentSeverity {
122    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
123        match self {
124            Self::Low => write!(f, "Low"),
125            Self::Medium => write!(f, "Medium"),
126            Self::High => write!(f, "High"),
127            Self::Critical => write!(f, "Critical"),
128        }
129    }
130}
131
/// Lifecycle stage of an incident.
///
/// Serialized by serde as the lowercase variant name. Incidents logged through
/// `action_log_incident` start in `Investigating`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
enum IncidentStatus {
    Investigating,
    Identified,
    Monitoring,
    Resolved,
}
140
141impl std::fmt::Display for IncidentStatus {
142    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
143        match self {
144            Self::Investigating => write!(f, "Investigating"),
145            Self::Identified => write!(f, "Identified"),
146            Self::Monitoring => write!(f, "Monitoring"),
147            Self::Resolved => write!(f, "Resolved"),
148        }
149    }
150}
151
/// One timestamped note in an incident's timeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct TimelineEntry {
    /// When the note was recorded (UTC).
    timestamp: DateTime<Utc>,
    /// Free-form text of the note.
    message: String,
}
157
/// A logged incident and its history.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Incident {
    /// Identifier taken from `MonitorState::next_incident_id` when logged.
    id: usize,
    /// Short title supplied by the caller (stored trimmed).
    title: String,
    severity: IncidentSeverity,
    /// IDs of the services named as affected. They are stored as given and are
    /// not checked against the registered services.
    affected_services: Vec<usize>,
    /// Timestamped notes; logging an incident creates the first entry.
    timeline: Vec<TimelineEntry>,
    status: IncidentStatus,
    /// `None` when the incident is logged.
    root_cause: Option<String>,
    /// `None` when the incident is logged.
    resolution: Option<String>,
    /// Creation time (UTC); equal to the first timeline entry's timestamp.
    created_at: DateTime<Utc>,
}
170
/// Everything persisted to the topology state file: the registered services,
/// the logged incidents, and the counters used to hand out new IDs.
///
/// NOTE(review): the derived `Default` sets both counters to 0, whereas the
/// fresh state built in `load_state` starts them at 1.
#[derive(Debug, Default, Serialize, Deserialize)]
struct MonitorState {
    services: Vec<Service>,
    incidents: Vec<Incident>,
    /// ID that will be given to the next registered service.
    next_service_id: usize,
    /// ID that will be given to the next logged incident.
    next_incident_id: usize,
}
178
/// Tool that tracks service topology, health-check results and incidents,
/// persisting its state as JSON under the workspace directory.
pub struct SystemMonitorTool {
    /// Root directory under which `.rustant/monitoring/topology.json` is stored.
    workspace: PathBuf,
}
182
183impl SystemMonitorTool {
    /// Creates a tool whose state file lives under `workspace`.
    ///
    /// Nothing is read from or written to disk until an action runs.
    pub fn new(workspace: PathBuf) -> Self {
        Self { workspace }
    }
187
188    fn state_path(&self) -> PathBuf {
189        self.workspace
190            .join(".rustant")
191            .join("monitoring")
192            .join("topology.json")
193    }
194
195    fn load_state(&self) -> MonitorState {
196        let path = self.state_path();
197        if path.exists() {
198            std::fs::read_to_string(&path)
199                .ok()
200                .and_then(|s| serde_json::from_str(&s).ok())
201                .unwrap_or_default()
202        } else {
203            MonitorState {
204                services: Vec::new(),
205                incidents: Vec::new(),
206                next_service_id: 1,
207                next_incident_id: 1,
208            }
209        }
210    }
211
212    fn save_state(&self, state: &MonitorState) -> Result<(), ToolError> {
213        let path = self.state_path();
214        if let Some(parent) = path.parent() {
215            std::fs::create_dir_all(parent).map_err(|e| ToolError::ExecutionFailed {
216                name: "system_monitor".to_string(),
217                message: format!("Failed to create state dir: {}", e),
218            })?;
219        }
220        let json = serde_json::to_string_pretty(state).map_err(|e| ToolError::ExecutionFailed {
221            name: "system_monitor".to_string(),
222            message: format!("Failed to serialize state: {}", e),
223        })?;
224        let tmp = path.with_extension("json.tmp");
225        std::fs::write(&tmp, &json).map_err(|e| ToolError::ExecutionFailed {
226            name: "system_monitor".to_string(),
227            message: format!("Failed to write state: {}", e),
228        })?;
229        std::fs::rename(&tmp, &path).map_err(|e| ToolError::ExecutionFailed {
230            name: "system_monitor".to_string(),
231            message: format!("Failed to rename state file: {}", e),
232        })?;
233        Ok(())
234    }
235
236    fn action_add_service(&self, args: &Value) -> Result<ToolOutput, ToolError> {
237        let name = args
238            .get("name")
239            .and_then(|v| v.as_str())
240            .unwrap_or("")
241            .trim();
242        if name.is_empty() {
243            return Ok(ToolOutput::text("Please provide a service name."));
244        }
245
246        let url = args
247            .get("url")
248            .and_then(|v| v.as_str())
249            .unwrap_or("")
250            .trim();
251        if url.is_empty() {
252            return Ok(ToolOutput::text("Please provide a service URL."));
253        }
254
255        let type_str = args
256            .get("service_type")
257            .and_then(|v| v.as_str())
258            .unwrap_or("");
259        let service_type = match ServiceType::from_str(type_str) {
260            Some(t) => t,
261            None => {
262                return Ok(ToolOutput::text(format!(
263                    "Invalid service_type: '{}'. Use: api, database, cache, queue, frontend, worker, gateway",
264                    type_str
265                )));
266            }
267        };
268
269        let dependencies: Vec<usize> = args
270            .get("dependencies")
271            .and_then(|v| v.as_array())
272            .map(|arr| {
273                arr.iter()
274                    .filter_map(|v| v.as_u64().map(|n| n as usize))
275                    .collect()
276            })
277            .unwrap_or_default();
278
279        let health_endpoint = args
280            .get("health_endpoint")
281            .and_then(|v| v.as_str())
282            .map(|s| s.to_string());
283
284        let mut state = self.load_state();
285        let id = state.next_service_id;
286        state.next_service_id += 1;
287
288        state.services.push(Service {
289            id,
290            name: name.to_string(),
291            url: url.to_string(),
292            service_type,
293            dependencies,
294            health_endpoint,
295            status: ServiceStatus::Unknown,
296            last_checked: None,
297            response_time_ms: None,
298        });
299
300        self.save_state(&state)?;
301        Ok(ToolOutput::text(format!(
302            "Added service '{}' (#{}) [{}] at {}",
303            name, id, type_str, url
304        )))
305    }
306
307    fn action_topology(&self) -> Result<ToolOutput, ToolError> {
308        let state = self.load_state();
309        if state.services.is_empty() {
310            return Ok(ToolOutput::text(
311                "No services registered. Use add_service to register services.",
312            ));
313        }
314
315        let mut output = String::from("Service Topology\n================\n\n");
316
317        for service in &state.services {
318            let marker = service.status.marker();
319            output.push_str(&format!(
320                "[{}] {} #{} ({}) - {}\n",
321                marker, service.name, service.id, service.service_type, service.url
322            ));
323            if !service.dependencies.is_empty() {
324                for dep_id in &service.dependencies {
325                    if let Some(dep) = state.services.iter().find(|s| s.id == *dep_id) {
326                        let dep_marker = dep.status.marker();
327                        output
328                            .push_str(&format!("  -> [{}] {} #{}\n", dep_marker, dep.name, dep.id));
329                    } else {
330                        output.push_str(&format!("  -> [?] unknown #{}\n", dep_id));
331                    }
332                }
333            }
334        }
335
336        Ok(ToolOutput::text(output))
337    }
338
339    async fn action_health_check(&self, args: &Value) -> Result<ToolOutput, ToolError> {
340        let mut state = self.load_state();
341        if state.services.is_empty() {
342            return Ok(ToolOutput::text("No services to check."));
343        }
344
345        let target_id = args
346            .get("service_id")
347            .and_then(|v| v.as_u64())
348            .map(|n| n as usize);
349
350        let client = reqwest::Client::builder()
351            .timeout(Duration::from_secs(10))
352            .build()
353            .map_err(|e| ToolError::ExecutionFailed {
354                name: "system_monitor".to_string(),
355                message: format!("Failed to create HTTP client: {}", e),
356            })?;
357
358        let indices: Vec<usize> = if let Some(sid) = target_id {
359            match state.services.iter().position(|s| s.id == sid) {
360                Some(idx) => vec![idx],
361                None => {
362                    return Ok(ToolOutput::text(format!("Service #{} not found.", sid)));
363                }
364            }
365        } else {
366            (0..state.services.len()).collect()
367        };
368
369        let mut results = Vec::new();
370
371        for idx in indices {
372            let service = &state.services[idx];
373            let endpoint = service.health_endpoint.as_deref().unwrap_or("/health");
374            let check_url = format!("{}{}", service.url.trim_end_matches('/'), endpoint);
375
376            let start = std::time::Instant::now();
377            let result = client.get(&check_url).send().await;
378            let elapsed_ms = start.elapsed().as_millis() as u64;
379
380            let (new_status, detail) = match result {
381                Ok(resp) => {
382                    let status_code = resp.status().as_u16();
383                    if (200..300).contains(&status_code) {
384                        (
385                            ServiceStatus::Healthy,
386                            format!("HTTP {} ({}ms)", status_code, elapsed_ms),
387                        )
388                    } else {
389                        (
390                            ServiceStatus::Degraded,
391                            format!("HTTP {} ({}ms)", status_code, elapsed_ms),
392                        )
393                    }
394                }
395                Err(e) => {
396                    let msg = if e.is_timeout() {
397                        "timeout".to_string()
398                    } else if e.is_connect() {
399                        "connection refused".to_string()
400                    } else {
401                        format!("{}", e)
402                    };
403                    (ServiceStatus::Down, msg)
404                }
405            };
406
407            let svc = &mut state.services[idx];
408            svc.status = new_status.clone();
409            svc.last_checked = Some(Utc::now());
410            svc.response_time_ms = Some(elapsed_ms);
411
412            results.push(format!(
413                "  {} #{} ({}): {} - {}",
414                svc.name,
415                svc.id,
416                new_status.marker(),
417                new_status,
418                detail
419            ));
420        }
421
422        self.save_state(&state)?;
423
424        let mut output = String::from("Health Check Results\n====================\n");
425        for r in &results {
426            output.push_str(r);
427            output.push('\n');
428        }
429        Ok(ToolOutput::text(output))
430    }
431
432    fn action_log_incident(&self, args: &Value) -> Result<ToolOutput, ToolError> {
433        let title = args
434            .get("title")
435            .and_then(|v| v.as_str())
436            .unwrap_or("")
437            .trim();
438        if title.is_empty() {
439            return Ok(ToolOutput::text("Please provide an incident title."));
440        }
441
442        let severity_str = args.get("severity").and_then(|v| v.as_str()).unwrap_or("");
443        let severity = match IncidentSeverity::from_str(severity_str) {
444            Some(s) => s,
445            None => {
446                return Ok(ToolOutput::text(format!(
447                    "Invalid severity: '{}'. Use: low, medium, high, critical",
448                    severity_str
449                )));
450            }
451        };
452
453        let affected_services: Vec<usize> = args
454            .get("affected_services")
455            .and_then(|v| v.as_array())
456            .map(|arr| {
457                arr.iter()
458                    .filter_map(|v| v.as_u64().map(|n| n as usize))
459                    .collect()
460            })
461            .unwrap_or_default();
462
463        let message = args
464            .get("message")
465            .and_then(|v| v.as_str())
466            .unwrap_or("Incident created")
467            .to_string();
468
469        let now = Utc::now();
470        let mut state = self.load_state();
471        let id = state.next_incident_id;
472        state.next_incident_id += 1;
473
474        state.incidents.push(Incident {
475            id,
476            title: title.to_string(),
477            severity,
478            affected_services,
479            timeline: vec![TimelineEntry {
480                timestamp: now,
481                message,
482            }],
483            status: IncidentStatus::Investigating,
484            root_cause: None,
485            resolution: None,
486            created_at: now,
487        });
488
489        self.save_state(&state)?;
490        Ok(ToolOutput::text(format!(
491            "Incident #{} logged: '{}' [{}] - Status: Investigating",
492            id, title, severity_str
493        )))
494    }
495
496    fn action_correlate(&self) -> Result<ToolOutput, ToolError> {
497        let state = self.load_state();
498
499        let mut service_incident_count: std::collections::HashMap<usize, usize> =
500            std::collections::HashMap::new();
501        let mut severity_count: std::collections::HashMap<String, usize> =
502            std::collections::HashMap::new();
503
504        for incident in &state.incidents {
505            for sid in &incident.affected_services {
506                *service_incident_count.entry(*sid).or_insert(0) += 1;
507            }
508            *severity_count
509                .entry(format!("{}", incident.severity))
510                .or_insert(0) += 1;
511        }
512
513        let mut output =
514            String::from("Incident Correlation Analysis\n=============================\n\n");
515
516        output.push_str(&format!("Total incidents: {}\n\n", state.incidents.len()));
517
518        if !severity_count.is_empty() {
519            output.push_str("Severity breakdown:\n");
520            for (sev, count) in &severity_count {
521                output.push_str(&format!("  {}: {}\n", sev, count));
522            }
523            output.push('\n');
524        }
525
526        if !service_incident_count.is_empty() {
527            output.push_str("Services by incident frequency:\n");
528            let mut sorted: Vec<_> = service_incident_count.iter().collect();
529            sorted.sort_by(|a, b| b.1.cmp(a.1));
530            for (sid, count) in sorted {
531                let name = state
532                    .services
533                    .iter()
534                    .find(|s| s.id == *sid)
535                    .map(|s| s.name.as_str())
536                    .unwrap_or("unknown");
537                output.push_str(&format!("  {} (#{}) - {} incidents\n", name, sid, count));
538            }
539            output.push('\n');
540        }
541
542        output.push_str("--- LLM Analysis Prompt ---\n");
543        output.push_str("Based on the above incident data, identify:\n");
544        output.push_str("1. Common failure patterns and root causes\n");
545        output.push_str("2. Services that are single points of failure\n");
546        output.push_str("3. Recommendations for improving reliability\n");
547
548        Ok(ToolOutput::text(output))
549    }
550
551    fn action_generate_runbook(&self, args: &Value) -> Result<ToolOutput, ToolError> {
552        let service_id = match args.get("service_id").and_then(|v| v.as_u64()) {
553            Some(id) => id as usize,
554            None => {
555                return Ok(ToolOutput::text("Please provide a service_id."));
556            }
557        };
558
559        let state = self.load_state();
560        let service = match state.services.iter().find(|s| s.id == service_id) {
561            Some(s) => s,
562            None => {
563                return Ok(ToolOutput::text(format!(
564                    "Service #{} not found.",
565                    service_id
566                )));
567            }
568        };
569
570        let dep_names: Vec<String> = service
571            .dependencies
572            .iter()
573            .filter_map(|did| {
574                state
575                    .services
576                    .iter()
577                    .find(|s| s.id == *did)
578                    .map(|s| format!("{} (#{})", s.name, s.id))
579            })
580            .collect();
581
582        let related_incidents: Vec<&Incident> = state
583            .incidents
584            .iter()
585            .filter(|i| i.affected_services.contains(&service_id))
586            .collect();
587
588        let mut output = String::from("--- Runbook Generation Prompt ---\n\n");
589        output.push_str(&format!("Service: {} (#{}))\n", service.name, service.id));
590        output.push_str(&format!("Type: {}\n", service.service_type));
591        output.push_str(&format!("URL: {}\n", service.url));
592        output.push_str(&format!("Status: {}\n", service.status));
593        if let Some(ref ep) = service.health_endpoint {
594            output.push_str(&format!("Health endpoint: {}\n", ep));
595        }
596
597        if !dep_names.is_empty() {
598            output.push_str(&format!("Dependencies: {}\n", dep_names.join(", ")));
599        }
600
601        if !related_incidents.is_empty() {
602            output.push_str(&format!(
603                "\nRecent incidents ({}): \n",
604                related_incidents.len()
605            ));
606            for inc in related_incidents.iter().rev().take(5) {
607                output.push_str(&format!(
608                    "  - #{} [{}] {} ({})\n",
609                    inc.id, inc.severity, inc.title, inc.status
610                ));
611            }
612        }
613
614        output.push_str("\nPlease generate a runbook covering:\n");
615        output.push_str("1. Service overview and architecture\n");
616        output.push_str("2. Health check procedures\n");
617        output.push_str("3. Common failure modes and troubleshooting steps\n");
618        output.push_str("4. Escalation procedures\n");
619        output.push_str("5. Recovery procedures\n");
620
621        Ok(ToolOutput::text(output))
622    }
623
624    fn action_impact_analysis(&self, args: &Value) -> Result<ToolOutput, ToolError> {
625        let service_id = match args.get("service_id").and_then(|v| v.as_u64()) {
626            Some(id) => id as usize,
627            None => {
628                return Ok(ToolOutput::text("Please provide a service_id."));
629            }
630        };
631
632        let state = self.load_state();
633
634        if !state.services.iter().any(|s| s.id == service_id) {
635            return Ok(ToolOutput::text(format!(
636                "Service #{} not found.",
637                service_id
638            )));
639        }
640
641        // Reverse BFS: find all services that transitively depend on the given service
642        let mut impacted: Vec<usize> = Vec::new();
643        let mut visited: HashSet<usize> = HashSet::new();
644        let mut queue: VecDeque<usize> = VecDeque::new();
645
646        queue.push_back(service_id);
647        visited.insert(service_id);
648
649        while let Some(current_id) = queue.pop_front() {
650            // Find all services that list current_id in their dependencies
651            for svc in &state.services {
652                if svc.dependencies.contains(&current_id) && !visited.contains(&svc.id) {
653                    visited.insert(svc.id);
654                    impacted.push(svc.id);
655                    queue.push_back(svc.id);
656                }
657            }
658        }
659
660        let source_name = state
661            .services
662            .iter()
663            .find(|s| s.id == service_id)
664            .map(|s| s.name.as_str())
665            .unwrap_or("unknown");
666
667        let mut output = String::from("Impact Analysis\n===============\n\n");
668        output.push_str(&format!(
669            "If '{}' (#{}) goes down:\n\n",
670            source_name, service_id
671        ));
672
673        if impacted.is_empty() {
674            output.push_str("No other services depend on this service.\n");
675        } else {
676            output.push_str(&format!(
677                "{} service(s) would be affected:\n",
678                impacted.len()
679            ));
680            for sid in &impacted {
681                if let Some(svc) = state.services.iter().find(|s| s.id == *sid) {
682                    output.push_str(&format!(
683                        "  - {} (#{}) [{}]\n",
684                        svc.name, svc.id, svc.service_type
685                    ));
686                }
687            }
688        }
689
690        Ok(ToolOutput::text(output))
691    }
692
693    fn action_list_services(&self, args: &Value) -> Result<ToolOutput, ToolError> {
694        let state = self.load_state();
695        if state.services.is_empty() {
696            return Ok(ToolOutput::text(
697                "No services registered. Use add_service to register services.",
698            ));
699        }
700
701        let status_filter = args.get("status").and_then(|v| v.as_str());
702
703        let filtered: Vec<&Service> = state
704            .services
705            .iter()
706            .filter(|s| {
707                if let Some(filter) = status_filter {
708                    match filter.to_lowercase().as_str() {
709                        "healthy" => s.status == ServiceStatus::Healthy,
710                        "degraded" => s.status == ServiceStatus::Degraded,
711                        "down" => s.status == ServiceStatus::Down,
712                        "unknown" => s.status == ServiceStatus::Unknown,
713                        _ => true,
714                    }
715                } else {
716                    true
717                }
718            })
719            .collect();
720
721        if filtered.is_empty() {
722            return Ok(ToolOutput::text(format!(
723                "No services match the filter '{}'.",
724                status_filter.unwrap_or("all")
725            )));
726        }
727
728        let mut output = String::from("Services\n========\n");
729        for svc in &filtered {
730            let checked = svc
731                .last_checked
732                .map(|t| format!(" (last checked: {})", t.format("%Y-%m-%d %H:%M:%S UTC")))
733                .unwrap_or_default();
734            let rt = svc
735                .response_time_ms
736                .map(|ms| format!(" [{}ms]", ms))
737                .unwrap_or_default();
738            output.push_str(&format!(
739                "  #{} {} [{}] ({}) - {}{}{}\n",
740                svc.id,
741                svc.name,
742                svc.status.marker(),
743                svc.service_type,
744                svc.status,
745                rt,
746                checked,
747            ));
748        }
749        output.push_str(&format!("\nTotal: {} service(s)\n", filtered.len()));
750
751        Ok(ToolOutput::text(output))
752    }
753}
754
// `Tool` trait implementation: exposes the monitor's metadata and argument
// schema, and dispatches the `action` argument to the matching `action_*`
// method.
#[async_trait]
impl Tool for SystemMonitorTool {
    /// Name of the tool; also used as the `name` in this file's `ToolError`s.
    fn name(&self) -> &str {
        "system_monitor"
    }

    /// One-line summary of the tool, including the list of supported actions.
    fn description(&self) -> &str {
        "Production system monitoring: service topology, health checks, incident tracking. Actions: add_service, topology, health_check, log_incident, correlate, generate_runbook, impact_analysis, list_services."
    }

    /// JSON Schema for the arguments accepted by `execute`.
    ///
    /// Only `action` is required; each other property notes in its
    /// description which action reads it.
    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    "enum": ["add_service", "topology", "health_check", "log_incident", "correlate", "generate_runbook", "impact_analysis", "list_services"],
                    "description": "Action to perform"
                },
                "name": { "type": "string", "description": "Service name (for add_service)" },
                "url": { "type": "string", "description": "Service URL (for add_service)" },
                "service_type": {
                    "type": "string",
                    "enum": ["api", "database", "cache", "queue", "frontend", "worker", "gateway"],
                    "description": "Type of service (for add_service)"
                },
                "dependencies": {
                    "type": "array",
                    "items": { "type": "integer" },
                    "description": "Array of service IDs this service depends on (for add_service)"
                },
                "health_endpoint": { "type": "string", "description": "Health check endpoint path (default: /health)" },
                "service_id": { "type": "integer", "description": "Target service ID (for health_check, generate_runbook, impact_analysis)" },
                "title": { "type": "string", "description": "Incident title (for log_incident)" },
                "severity": {
                    "type": "string",
                    "enum": ["low", "medium", "high", "critical"],
                    "description": "Incident severity (for log_incident)"
                },
                "affected_services": {
                    "type": "array",
                    "items": { "type": "integer" },
                    "description": "Array of affected service IDs (for log_incident)"
                },
                "message": { "type": "string", "description": "Incident timeline message (for log_incident)" },
                "status": {
                    "type": "string",
                    "enum": ["healthy", "degraded", "down", "unknown"],
                    "description": "Filter by status (for list_services)"
                }
            },
            "required": ["action"]
        })
    }

    /// Reported as `Network`; the `health_check` action issues HTTP requests.
    fn risk_level(&self) -> RiskLevel {
        RiskLevel::Network
    }

    /// Overall time budget for one invocation: 60 seconds.
    fn timeout(&self) -> Duration {
        Duration::from_secs(60)
    }

    /// Runs the action named by the `action` argument.
    ///
    /// A missing or unrecognized action is not an error: it yields a text
    /// output listing the valid actions. Errors come only from the action
    /// handlers themselves.
    async fn execute(&self, args: Value) -> Result<ToolOutput, ToolError> {
        // A missing or non-string `action` falls through to the unknown-action arm.
        let action = args.get("action").and_then(|v| v.as_str()).unwrap_or("");

        match action {
            "add_service" => self.action_add_service(&args),
            "topology" => self.action_topology(),
            "health_check" => self.action_health_check(&args).await,
            "log_incident" => self.action_log_incident(&args),
            "correlate" => self.action_correlate(),
            "generate_runbook" => self.action_generate_runbook(&args),
            "impact_analysis" => self.action_impact_analysis(&args),
            "list_services" => self.action_list_services(&args),
            _ => Ok(ToolOutput::text(format!(
                "Unknown action: '{}'. Use: add_service, topology, health_check, log_incident, correlate, generate_runbook, impact_analysis, list_services",
                action
            ))),
        }
    }
}
837
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds a tool rooted in a fresh temporary workspace.
    ///
    /// Returns the tool, the canonicalized workspace path, and the `TempDir`
    /// guard that owns the directory. Bind the guard to a named variable
    /// (`_dir`, not `_`) so the directory outlives the test body and is removed
    /// when the guard is dropped, instead of being leaked on disk.
    fn make_tool() -> (SystemMonitorTool, PathBuf, TempDir) {
        let dir = TempDir::new().expect("failed to create temporary workspace");
        let workspace = dir
            .path()
            .canonicalize()
            .expect("temporary workspace path should canonicalize");
        let tool = SystemMonitorTool::new(workspace.clone());
        (tool, workspace, dir)
    }

    /// Executes `args` against `tool`, panicking if the tool returns an error.
    async fn run(tool: &SystemMonitorTool, args: Value) -> ToolOutput {
        tool.execute(args)
            .await
            .expect("tool execution should succeed")
    }

    #[test]
    fn test_tool_properties() {
        let (tool, _ws, _dir) = make_tool();
        assert_eq!(tool.name(), "system_monitor");
        assert!(tool.description().contains("service topology"));
        assert_eq!(tool.risk_level(), RiskLevel::Network);
        assert_eq!(tool.timeout(), Duration::from_secs(60));
    }

    #[test]
    fn test_schema_validation() {
        let (tool, _ws, _dir) = make_tool();
        let schema = tool.parameters_schema();
        assert!(schema.get("properties").is_some());
        assert!(schema["properties"]["action"].get("enum").is_some());
        let actions = schema["properties"]["action"]["enum"].as_array().unwrap();
        assert_eq!(actions.len(), 8);
        assert!(
            schema["required"]
                .as_array()
                .unwrap()
                .contains(&json!("action"))
        );
    }

    #[tokio::test]
    async fn test_add_service() {
        let (tool, ws, _dir) = make_tool();

        let result = run(
            &tool,
            json!({
                "action": "add_service",
                "name": "user-api",
                "url": "http://localhost:8080",
                "service_type": "api",
                "health_endpoint": "/healthz"
            }),
        )
        .await;

        assert!(result.content.contains("user-api"));
        assert!(result.content.contains("#1"));

        // Verify state was persisted
        let state_path = ws.join(".rustant").join("monitoring").join("topology.json");
        assert!(state_path.exists());

        let state: MonitorState =
            serde_json::from_str(&std::fs::read_to_string(&state_path).unwrap()).unwrap();
        assert_eq!(state.services.len(), 1);
        assert_eq!(state.services[0].name, "user-api");
        assert_eq!(state.services[0].status, ServiceStatus::Unknown);
        assert_eq!(
            state.services[0].health_endpoint,
            Some("/healthz".to_string())
        );
    }

    #[tokio::test]
    async fn test_topology_output() {
        let (tool, _ws, _dir) = make_tool();

        // Add a database
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "postgres",
                "url": "http://db:5432",
                "service_type": "database"
            }),
        )
        .await;

        // Add a cache
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "redis",
                "url": "http://cache:6379",
                "service_type": "cache"
            }),
        )
        .await;

        // Add an API that depends on both
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "user-api",
                "url": "http://api:3000",
                "service_type": "api",
                "dependencies": [1, 2]
            }),
        )
        .await;

        let result = run(&tool, json!({"action": "topology"})).await;

        assert!(result.content.contains("Service Topology"));
        assert!(result.content.contains("postgres"));
        assert!(result.content.contains("redis"));
        assert!(result.content.contains("user-api"));
        // user-api should show its dependencies
        assert!(result.content.contains("-> [?] postgres #1"));
        assert!(result.content.contains("-> [?] redis #2"));
    }

    #[tokio::test]
    async fn test_health_check_invalid_url() {
        let (tool, _ws, _dir) = make_tool();

        // Add a service with a URL that will fail to connect
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "bad-service",
                "url": "http://127.0.0.1:1",
                "service_type": "api"
            }),
        )
        .await;

        let result = run(
            &tool,
            json!({
                "action": "health_check",
                "service_id": 1
            }),
        )
        .await;

        assert!(result.content.contains("Health Check Results"));
        assert!(result.content.contains("bad-service"));
        assert!(result.content.contains("Down"));

        // Verify the state was updated
        let state = tool.load_state();
        assert_eq!(state.services[0].status, ServiceStatus::Down);
        assert!(state.services[0].last_checked.is_some());
    }

    #[tokio::test]
    async fn test_log_incident() {
        let (tool, _ws, _dir) = make_tool();

        // Add a service first
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "api",
                "url": "http://localhost:8080",
                "service_type": "api"
            }),
        )
        .await;

        let result = run(
            &tool,
            json!({
                "action": "log_incident",
                "title": "API latency spike",
                "severity": "high",
                "affected_services": [1],
                "message": "Response times exceeded 5s"
            }),
        )
        .await;

        assert!(result.content.contains("Incident #1"));
        assert!(result.content.contains("API latency spike"));
        assert!(result.content.contains("Investigating"));

        // Verify incident was stored
        let state = tool.load_state();
        assert_eq!(state.incidents.len(), 1);
        assert_eq!(state.incidents[0].title, "API latency spike");
        assert_eq!(state.incidents[0].severity, IncidentSeverity::High);
        assert_eq!(state.incidents[0].status, IncidentStatus::Investigating);
        assert_eq!(state.incidents[0].timeline.len(), 1);
        assert_eq!(
            state.incidents[0].timeline[0].message,
            "Response times exceeded 5s"
        );
        assert_eq!(state.incidents[0].affected_services, vec![1]);
    }

    #[tokio::test]
    async fn test_impact_analysis() {
        let (tool, _ws, _dir) = make_tool();

        // C (#1) - database, no deps
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "database-C",
                "url": "http://db:5432",
                "service_type": "database"
            }),
        )
        .await;

        // B (#2) depends on C
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "cache-B",
                "url": "http://cache:6379",
                "service_type": "cache",
                "dependencies": [1]
            }),
        )
        .await;

        // A (#3) depends on B
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "api-A",
                "url": "http://api:3000",
                "service_type": "api",
                "dependencies": [2]
            }),
        )
        .await;

        // Impact analysis on C: should find B and A
        let result = run(
            &tool,
            json!({
                "action": "impact_analysis",
                "service_id": 1
            }),
        )
        .await;

        assert!(result.content.contains("Impact Analysis"));
        assert!(result.content.contains("database-C"));
        assert!(result.content.contains("cache-B"));
        assert!(result.content.contains("api-A"));
        assert!(result.content.contains("2 service(s) would be affected"));
    }

    #[tokio::test]
    async fn test_list_services_filter() {
        let (tool, _ws, _dir) = make_tool();

        // Add two services
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "svc-a",
                "url": "http://a:80",
                "service_type": "api"
            }),
        )
        .await;

        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "svc-b",
                "url": "http://b:80",
                "service_type": "worker"
            }),
        )
        .await;

        // Both should be Unknown status
        let result = run(&tool, json!({"action": "list_services"})).await;
        assert!(result.content.contains("svc-a"));
        assert!(result.content.contains("svc-b"));
        assert!(result.content.contains("Total: 2"));

        // Filter by unknown should show both
        let result = run(
            &tool,
            json!({"action": "list_services", "status": "unknown"}),
        )
        .await;
        assert!(result.content.contains("svc-a"));
        assert!(result.content.contains("svc-b"));

        // Filter by healthy should show none
        let result = run(
            &tool,
            json!({"action": "list_services", "status": "healthy"}),
        )
        .await;
        assert!(result.content.contains("No services match"));
    }

    #[tokio::test]
    async fn test_correlate_empty() {
        let (tool, _ws, _dir) = make_tool();

        let result = run(&tool, json!({"action": "correlate"})).await;

        assert!(result.content.contains("Incident Correlation Analysis"));
        assert!(result.content.contains("Total incidents: 0"));
        assert!(result.content.contains("LLM Analysis Prompt"));
    }

    #[tokio::test]
    async fn test_generate_runbook_returns_prompt() {
        let (tool, _ws, _dir) = make_tool();

        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "payment-api",
                "url": "http://payments:8080",
                "service_type": "api",
                "health_endpoint": "/status"
            }),
        )
        .await;

        let result = run(
            &tool,
            json!({
                "action": "generate_runbook",
                "service_id": 1
            }),
        )
        .await;

        assert!(result.content.contains("Runbook Generation Prompt"));
        assert!(result.content.contains("payment-api"));
        assert!(result.content.contains("http://payments:8080"));
        assert!(result.content.contains("/status"));
        assert!(result.content.contains("Health check procedures"));
        assert!(result.content.contains("Recovery procedures"));
    }

    #[tokio::test]
    async fn test_state_roundtrip() {
        let (tool, _ws, _dir) = make_tool();

        // Add services and an incident
        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "db",
                "url": "http://db:5432",
                "service_type": "database"
            }),
        )
        .await;

        run(
            &tool,
            json!({
                "action": "add_service",
                "name": "api",
                "url": "http://api:3000",
                "service_type": "api",
                "dependencies": [1]
            }),
        )
        .await;

        run(
            &tool,
            json!({
                "action": "log_incident",
                "title": "DB down",
                "severity": "critical",
                "affected_services": [1]
            }),
        )
        .await;

        // Load and verify full state
        let state = tool.load_state();
        assert_eq!(state.services.len(), 2);
        assert_eq!(state.incidents.len(), 1);
        assert_eq!(state.next_service_id, 3);
        assert_eq!(state.next_incident_id, 2);
        assert_eq!(state.services[1].dependencies, vec![1]);
        assert_eq!(state.incidents[0].severity, IncidentSeverity::Critical);

        // Serialize and deserialize to verify roundtrip
        let json = serde_json::to_string_pretty(&state).unwrap();
        let restored: MonitorState = serde_json::from_str(&json).unwrap();
        assert_eq!(restored.services.len(), 2);
        assert_eq!(restored.incidents.len(), 1);
        assert_eq!(restored.next_service_id, 3);
        assert_eq!(restored.next_incident_id, 2);
    }

    #[tokio::test]
    async fn test_unknown_action() {
        let (tool, _ws, _dir) = make_tool();

        let result = run(&tool, json!({"action": "nonexistent"})).await;

        assert!(result.content.contains("Unknown action"));
        assert!(result.content.contains("nonexistent"));
    }
}
rustant_tools/system_monitor.rs

rustant_tools/
system_monitor.rs