syncable_cli/agent/tools/
prometheus_discover.rs

1//! Prometheus Discovery Tool
2//!
3//! Discovers Prometheus services running in a Kubernetes cluster.
4//! Used to find Prometheus for live K8s optimization analysis.
5//!
6//! # Usage Flow
7//!
8//! 1. Use `prometheus_discover` to find Prometheus in cluster
9//! 2. Use `prometheus_connect` to establish connection
10//! 3. Use `k8s_optimize` with the connection for live analysis
11
12use crate::agent::ui::prometheus_display::{DiscoveredService, PrometheusDiscoveryDisplay};
13use rig::completion::ToolDefinition;
14use rig::tool::Tool;
15use serde::{Deserialize, Serialize};
16use serde_json::json;
17use std::process::Stdio;
18use tokio::process::Command;
19
/// Arguments for the prometheus_discover tool
///
/// Used as `Tool::Args` for `PrometheusDiscoverTool`, so it is deserialized
/// from the JSON arguments of a tool invocation. Every field is optional; a
/// key that is absent from the input deserializes to `None`.
#[derive(Debug, Deserialize)]
pub struct PrometheusDiscoverArgs {
    /// Kubernetes context (optional, uses current context if not specified)
    ///
    /// When present it is forwarded to `kubectl` via `--context`.
    #[serde(default)]
    pub cluster: Option<String>,

    /// Namespace to search in (optional, searches all namespaces if not specified)
    ///
    /// When a namespace is given but nothing is found there, the tool retries
    /// the search across all namespaces.
    #[serde(default)]
    pub namespace: Option<String>,

    /// Service name pattern to match (default: "prometheus")
    ///
    /// The default is substituted by the tool's `call` implementation when
    /// this field is `None`.
    #[serde(default)]
    pub service_pattern: Option<String>,
}
35
/// A discovered Prometheus service
///
/// Built from one entry of the `kubectl get svc -o json` output that passed
/// the Prometheus-server filter.
#[derive(Debug, Clone, Serialize)]
pub struct DiscoveredPrometheus {
    /// Service name, taken from `metadata.name`.
    pub name: String,
    /// Namespace of the service, taken from `metadata.namespace`.
    pub namespace: String,
    /// Service port on which the Prometheus API is reachable.
    pub port: u16,
    /// Kubernetes service type from `spec.type` (e.g. "ClusterIP").
    pub service_type: String,
    /// Value of `spec.clusterIP`, when the service entry carries one.
    pub cluster_ip: Option<String>,
}
45
/// Error type for prometheus discovery
///
/// Wraps a human-readable message. Its `Display` output is the message
/// prefixed with `Prometheus discovery error: `.
#[derive(Debug, thiserror::Error)]
#[error("Prometheus discovery error: {0}")]
pub struct PrometheusDiscoverError(String);
50
/// Tool for discovering Prometheus in Kubernetes clusters
///
/// A unit struct: it carries no fields, so every instance is interchangeable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrometheusDiscoverTool;
54
55impl Default for PrometheusDiscoverTool {
56    fn default() -> Self {
57        Self::new()
58    }
59}
60
61impl PrometheusDiscoverTool {
62    /// Create a new PrometheusDiscoverTool
63    pub fn new() -> Self {
64        Self
65    }
66
67    /// Run kubectl to get services
68    async fn get_services(
69        &self,
70        namespace: Option<&str>,
71        context: Option<&str>,
72    ) -> Result<String, PrometheusDiscoverError> {
73        let mut cmd = Command::new("kubectl");
74        cmd.arg("get").arg("svc");
75
76        if let Some(ns) = namespace {
77            cmd.arg("-n").arg(ns);
78        } else {
79            cmd.arg("-A"); // All namespaces
80        }
81
82        cmd.arg("-o").arg("json");
83
84        if let Some(ctx) = context {
85            cmd.arg("--context").arg(ctx);
86        }
87
88        cmd.stdout(Stdio::piped()).stderr(Stdio::piped());
89
90        let output = cmd
91            .output()
92            .await
93            .map_err(|e| PrometheusDiscoverError(format!("Failed to run kubectl: {}", e)))?;
94
95        if !output.status.success() {
96            let stderr = String::from_utf8_lossy(&output.stderr);
97            return Err(PrometheusDiscoverError(format!(
98                "kubectl failed: {}",
99                stderr.trim()
100            )));
101        }
102
103        Ok(String::from_utf8_lossy(&output.stdout).to_string())
104    }
105
106    /// Parse services JSON and find Prometheus SERVER services specifically
107    /// We need to be precise - only find the actual Prometheus server, not every monitoring component
108    fn find_prometheus_services(
109        &self,
110        json_str: &str,
111        _pattern: &str,
112    ) -> Vec<DiscoveredPrometheus> {
113        let mut discovered = Vec::new();
114
115        // Parse JSON
116        let json: serde_json::Value = match serde_json::from_str(json_str) {
117            Ok(v) => v,
118            Err(_) => return discovered,
119        };
120
121        // Get items array
122        let items = match json.get("items").and_then(|v| v.as_array()) {
123            Some(items) => items,
124            None => return discovered,
125        };
126
127        for item in items {
128            let metadata = match item.get("metadata") {
129                Some(m) => m,
130                None => continue,
131            };
132
133            let name = metadata.get("name").and_then(|v| v.as_str()).unwrap_or("");
134            let namespace = metadata
135                .get("namespace")
136                .and_then(|v| v.as_str())
137                .unwrap_or("default");
138
139            // Get spec and check for port 9090 (Prometheus API port)
140            let spec = match item.get("spec") {
141                Some(s) => s,
142                None => continue,
143            };
144
145            let ports = spec.get("ports").and_then(|v| v.as_array());
146            let has_prometheus_port = ports
147                .map(|p| {
148                    p.iter()
149                        .any(|port| port.get("port").and_then(|v| v.as_u64()) == Some(9090))
150                })
151                .unwrap_or(false);
152
153            // STRICT FILTERING: Must be the actual Prometheus server
154            // Method 1: Service name is specifically prometheus-like AND has port 9090
155            let name_lower = name.to_lowercase();
156            let is_prometheus_by_name = has_prometheus_port
157                && (
158                    // Exact patterns for Prometheus server services
159                    name_lower == "prometheus" ||
160                name_lower == "prometheus-server" ||
161                name_lower == "prometheus-operated" ||
162                name_lower.ends_with("-prometheus") ||        // e.g., monitoring-prometheus
163                name_lower.ends_with("-prometheus-server") ||
164                // But NOT node-exporter, alertmanager, etc.
165                (name_lower.contains("prometheus") &&
166                 !name_lower.contains("node-exporter") &&
167                 !name_lower.contains("alertmanager") &&
168                 !name_lower.contains("pushgateway") &&
169                 !name_lower.contains("blackbox") &&
170                 !name_lower.contains("adapter"))
171                );
172
173            // Method 2: Check for app.kubernetes.io/name=prometheus label
174            let labels = metadata.get("labels").and_then(|l| l.as_object());
175            let is_prometheus_by_label = has_prometheus_port
176                && labels
177                    .map(|obj| {
178                        // Check for specific Prometheus server labels
179                        obj.get("app.kubernetes.io/name")
180                            .and_then(|v| v.as_str())
181                            .map(|s| s == "prometheus")
182                            .unwrap_or(false)
183                            || obj
184                                .get("app")
185                                .and_then(|v| v.as_str())
186                                .map(|s| {
187                                    s == "prometheus" || s.contains("prometheus-stack-prometheus")
188                                })
189                                .unwrap_or(false)
190                    })
191                    .unwrap_or(false);
192
193            if !is_prometheus_by_name && !is_prometheus_by_label {
194                continue;
195            }
196
197            let service_type = spec
198                .get("type")
199                .and_then(|v| v.as_str())
200                .unwrap_or("ClusterIP");
201            let cluster_ip = spec.get("clusterIP").and_then(|v| v.as_str());
202
203            discovered.push(DiscoveredPrometheus {
204                name: name.to_string(),
205                namespace: namespace.to_string(),
206                port: 9090, // Always 9090 for Prometheus server
207                service_type: service_type.to_string(),
208                cluster_ip: cluster_ip.map(|s| s.to_string()),
209            });
210        }
211
212        // Deduplicate - prefer the main service over -operated
213        if discovered.len() > 1 {
214            // Sort so main service comes first (not -operated)
215            discovered.sort_by(|a, b| {
216                let a_is_operated = a.name.contains("operated");
217                let b_is_operated = b.name.contains("operated");
218                a_is_operated.cmp(&b_is_operated)
219            });
220        }
221
222        discovered
223    }
224}
225
226impl Tool for PrometheusDiscoverTool {
227    const NAME: &'static str = "prometheus_discover";
228
229    type Args = PrometheusDiscoverArgs;
230    type Output = String;
231    type Error = PrometheusDiscoverError;
232
233    async fn definition(&self, _prompt: String) -> ToolDefinition {
234        ToolDefinition {
235            name: Self::NAME.to_string(),
236            description: r#"Discover Prometheus services in a Kubernetes cluster.
237
238**Use this tool when:**
239- User asks for K8s optimization with live/historical metrics
240- Need to find Prometheus for data-driven recommendations
241
242**What it does:**
243- Searches for services with "prometheus" in the name or labels
244- Returns discovered services with namespace, port, and type
245- Suggests using prometheus_connect to establish connection
246
247**Returns:**
248- List of discovered Prometheus services
249- Connection suggestions
250
251**Next steps after discovery:**
2521. Use `prometheus_connect` with the discovered service
2532. Then use `k8s_optimize` with the established connection"#
254                .to_string(),
255            parameters: json!({
256                "type": "object",
257                "properties": {
258                    "cluster": {
259                        "type": "string",
260                        "description": "Kubernetes context name (optional, uses current context)"
261                    },
262                    "namespace": {
263                        "type": "string",
264                        "description": "Namespace to search (optional, searches all namespaces)"
265                    },
266                    "service_pattern": {
267                        "type": "string",
268                        "description": "Pattern to match service names (default: 'prometheus')"
269                    }
270                }
271            }),
272        }
273    }
274
275    async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
276        let pattern = args.service_pattern.as_deref().unwrap_or("prometheus");
277
278        // Start display
279        let mut display = PrometheusDiscoveryDisplay::new();
280        display.start(args.namespace.as_deref());
281
282        // Get services from cluster
283        let services_json = match self
284            .get_services(args.namespace.as_deref(), args.cluster.as_deref())
285            .await
286        {
287            Ok(json) => json,
288            Err(e) => {
289                display.error(&e.to_string());
290                return Err(e);
291            }
292        };
293
294        // Find Prometheus services
295        let mut discovered = self.find_prometheus_services(&services_json, pattern);
296        let mut used_fallback = false;
297        let original_namespace = args.namespace.clone();
298
299        // FALLBACK: If specific namespace was provided but no results found, try ALL namespaces
300        // This handles the common case where agent assumes "prometheus" namespace but services
301        // are actually in "monitoring" or other namespace
302        if discovered.is_empty() && args.namespace.is_some() {
303            log::info!(
304                "No Prometheus found in '{}' namespace, searching all namespaces...",
305                args.namespace.as_deref().unwrap_or("")
306            );
307            display.searching_all_namespaces();
308
309            if let Ok(all_json) = self.get_services(None, args.cluster.as_deref()).await {
310                discovered = self.find_prometheus_services(&all_json, pattern);
311                if !discovered.is_empty() {
312                    used_fallback = true;
313                }
314            }
315        }
316
317        // Convert to display format
318        let display_services: Vec<DiscoveredService> = discovered
319            .iter()
320            .map(|d| DiscoveredService {
321                name: d.name.clone(),
322                namespace: d.namespace.clone(),
323                port: d.port,
324                service_type: d.service_type.clone(),
325            })
326            .collect();
327
328        // Show results in terminal UI
329        display.found_services(&display_services);
330
331        // Show suggestion if services found
332        if let Some(first) = display_services.first() {
333            display.show_suggestion(first);
334        }
335
336        // Build JSON response for agent
337        let response = if discovered.is_empty() {
338            json!({
339                "found": false,
340                "discovered": [],
341                "message": "No Prometheus services found in cluster",
342                "suggestions": [
343                    "Check if Prometheus is installed in a different namespace",
344                    "Provide an external Prometheus URL using prometheus_connect with url parameter",
345                    "Install Prometheus using Helm: helm install prometheus prometheus-community/prometheus"
346                ]
347            })
348        } else {
349            let message = if used_fallback {
350                format!(
351                    "Found {} Prometheus service(s) (note: not in '{}' namespace as specified, but found in other namespaces)",
352                    discovered.len(),
353                    original_namespace.as_deref().unwrap_or("")
354                )
355            } else {
356                format!("Found {} Prometheus service(s)", discovered.len())
357            };
358
359            json!({
360                "found": true,
361                "used_fallback_search": used_fallback,
362                "discovered": discovered.iter().map(|d| json!({
363                    "name": d.name,
364                    "namespace": d.namespace,
365                    "port": d.port,
366                    "type": d.service_type,
367                    "cluster_ip": d.cluster_ip,
368                    "resource": format!("svc/{}", d.name)
369                })).collect::<Vec<_>>(),
370                "message": message,
371                "next_step": "Use prometheus_connect to establish connection",
372                "example": {
373                    "tool": "prometheus_connect",
374                    "args": {
375                        "service": discovered.first().map(|d| d.name.clone()),
376                        "namespace": discovered.first().map(|d| d.namespace.clone()),
377                        "port": discovered.first().map(|d| d.port)
378                    }
379                }
380            })
381        };
382
383        Ok(serde_json::to_string_pretty(&response).unwrap_or_else(|_| "{}".to_string()))
384    }
385}
386
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the service filter over `payload` using the default name pattern.
    fn discover(payload: &str) -> Vec<DiscoveredPrometheus> {
        PrometheusDiscoverTool::new().find_prometheus_services(payload, "prometheus")
    }

    /// The tool must register under the name the agent expects.
    #[test]
    fn test_tool_name() {
        assert_eq!(PrometheusDiscoverTool::NAME, "prometheus_discover");
    }

    /// A `prometheus-server` service on 9090 is found; grafana is ignored.
    #[test]
    fn test_find_prometheus_services() {
        let found = discover(
            r#"{
            "items": [
                {
                    "metadata": {
                        "name": "prometheus-server",
                        "namespace": "monitoring"
                    },
                    "spec": {
                        "type": "ClusterIP",
                        "clusterIP": "10.0.0.100",
                        "ports": [{"port": 9090, "name": "web"}]
                    }
                },
                {
                    "metadata": {
                        "name": "grafana",
                        "namespace": "monitoring"
                    },
                    "spec": {
                        "type": "ClusterIP",
                        "ports": [{"port": 3000}]
                    }
                }
            ]
        }"#,
        );

        assert_eq!(found.len(), 1);
        let service = &found[0];
        assert_eq!(service.name, "prometheus-server");
        assert_eq!(service.namespace, "monitoring");
        assert_eq!(service.port, 9090);
    }

    /// A kube-prometheus-stack style service carrying `app=prometheus` is found.
    #[test]
    fn test_find_prometheus_by_label() {
        let found = discover(
            r#"{
            "items": [
                {
                    "metadata": {
                        "name": "kube-prometheus-stack-prometheus",
                        "namespace": "monitoring",
                        "labels": {
                            "app": "prometheus"
                        }
                    },
                    "spec": {
                        "type": "ClusterIP",
                        "ports": [{"port": 9090}]
                    }
                }
            ]
        }"#,
        );

        assert_eq!(found.len(), 1);
    }

    /// An empty service list produces no results.
    #[test]
    fn test_no_prometheus_found() {
        let found = discover(r#"{"items": []}"#);

        assert!(found.is_empty());
    }

    /// Companion components must not be mistaken for the Prometheus server.
    #[test]
    fn test_filters_out_non_prometheus_services() {
        // Besides the real server, the fixture holds services that must be rejected:
        // - node-exporter (different service)
        // - alertmanager (different service)
        // - monitoring-coredns (unrelated, but carries a prometheus.io label)
        let found = discover(
            r#"{
            "items": [
                {
                    "metadata": {
                        "name": "monitoring-prometheus",
                        "namespace": "monitoring",
                        "labels": {"app": "prometheus"}
                    },
                    "spec": {
                        "type": "ClusterIP",
                        "ports": [{"port": 9090}]
                    }
                },
                {
                    "metadata": {
                        "name": "monitoring-prometheus-node-exporter",
                        "namespace": "monitoring",
                        "labels": {"app": "prometheus-node-exporter"}
                    },
                    "spec": {
                        "type": "ClusterIP",
                        "ports": [{"port": 9100}]
                    }
                },
                {
                    "metadata": {
                        "name": "alertmanager-operated",
                        "namespace": "monitoring",
                        "labels": {"app": "alertmanager"}
                    },
                    "spec": {
                        "type": "ClusterIP",
                        "ports": [{"port": 9093}]
                    }
                },
                {
                    "metadata": {
                        "name": "monitoring-coredns",
                        "namespace": "kube-system",
                        "labels": {"prometheus.io/scrape": "true"}
                    },
                    "spec": {
                        "type": "ClusterIP",
                        "ports": [{"port": 9153}]
                    }
                }
            ]
        }"#,
        );

        // Only monitoring-prometheus should survive the filter.
        assert_eq!(
            found.len(),
            1,
            "Should only find 1 service, found: {:?}",
            found
        );
        assert_eq!(found[0].name, "monitoring-prometheus");
    }
}