Skip to main content

construct/observability/
prometheus.rs

1use super::traits::{Observer, ObserverEvent, ObserverMetric};
2use prometheus::{
3    Encoder, GaugeVec, Histogram, HistogramOpts, HistogramVec, IntCounterVec, Registry, TextEncoder,
4};
5
6/// Prometheus-backed observer — exposes metrics for scraping via `/metrics`.
7pub struct PrometheusObserver {
8    registry: Registry,
9
10    // Counters
11    agent_starts: IntCounterVec,
12    llm_requests: IntCounterVec,
13    tokens_input_total: IntCounterVec,
14    tokens_output_total: IntCounterVec,
15    tool_calls: IntCounterVec,
16    channel_messages: IntCounterVec,
17    heartbeat_ticks: prometheus::IntCounter,
18    errors: IntCounterVec,
19    cache_hits: IntCounterVec,
20    cache_misses: IntCounterVec,
21    cache_tokens_saved: IntCounterVec,
22
23    // Histograms
24    agent_duration: HistogramVec,
25    tool_duration: HistogramVec,
26    request_latency: Histogram,
27
28    // Gauges
29    tokens_used: prometheus::IntGauge,
30    active_sessions: GaugeVec,
31    queue_depth: GaugeVec,
32
33    // Hands
34    hand_runs: IntCounterVec,
35    hand_duration: HistogramVec,
36    hand_findings: IntCounterVec,
37
38    // DORA
39    deployments_total: IntCounterVec,
40    deployment_lead_time: Histogram,
41    deployment_failure_rate: prometheus::Gauge,
42    recovery_time: Histogram,
43    mttr: prometheus::Gauge,
44    deploy_success_count: std::sync::atomic::AtomicU64,
45    deploy_failure_count: std::sync::atomic::AtomicU64,
46}
47
48impl PrometheusObserver {
49    pub fn new() -> Self {
50        let registry = Registry::new();
51
52        let agent_starts = IntCounterVec::new(
53            prometheus::Opts::new("construct_agent_starts_total", "Total agent invocations"),
54            &["provider", "model"],
55        )
56        .expect("valid metric");
57
58        let llm_requests = IntCounterVec::new(
59            prometheus::Opts::new(
60                "construct_llm_requests_total",
61                "Total LLM provider requests",
62            ),
63            &["provider", "model", "success"],
64        )
65        .expect("valid metric");
66
67        let tokens_input_total = IntCounterVec::new(
68            prometheus::Opts::new(
69                "construct_tokens_input_total",
70                "Total input tokens consumed",
71            ),
72            &["provider", "model"],
73        )
74        .expect("valid metric");
75
76        let tokens_output_total = IntCounterVec::new(
77            prometheus::Opts::new(
78                "construct_tokens_output_total",
79                "Total output tokens consumed",
80            ),
81            &["provider", "model"],
82        )
83        .expect("valid metric");
84
85        let tool_calls = IntCounterVec::new(
86            prometheus::Opts::new("construct_tool_calls_total", "Total tool calls"),
87            &["tool", "success"],
88        )
89        .expect("valid metric");
90
91        let channel_messages = IntCounterVec::new(
92            prometheus::Opts::new("construct_channel_messages_total", "Total channel messages"),
93            &["channel", "direction"],
94        )
95        .expect("valid metric");
96
97        let heartbeat_ticks =
98            prometheus::IntCounter::new("construct_heartbeat_ticks_total", "Total heartbeat ticks")
99                .expect("valid metric");
100
101        let errors = IntCounterVec::new(
102            prometheus::Opts::new("construct_errors_total", "Total errors by component"),
103            &["component"],
104        )
105        .expect("valid metric");
106
107        let cache_hits = IntCounterVec::new(
108            prometheus::Opts::new("construct_cache_hits_total", "Total response cache hits"),
109            &["cache_type"],
110        )
111        .expect("valid metric");
112
113        let cache_misses = IntCounterVec::new(
114            prometheus::Opts::new(
115                "construct_cache_misses_total",
116                "Total response cache misses",
117            ),
118            &["cache_type"],
119        )
120        .expect("valid metric");
121
122        let cache_tokens_saved = IntCounterVec::new(
123            prometheus::Opts::new(
124                "construct_cache_tokens_saved_total",
125                "Total tokens saved by response cache",
126            ),
127            &["cache_type"],
128        )
129        .expect("valid metric");
130
131        let agent_duration = HistogramVec::new(
132            HistogramOpts::new(
133                "construct_agent_duration_seconds",
134                "Agent invocation duration in seconds",
135            )
136            .buckets(vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]),
137            &["provider", "model"],
138        )
139        .expect("valid metric");
140
141        let tool_duration = HistogramVec::new(
142            HistogramOpts::new(
143                "construct_tool_duration_seconds",
144                "Tool execution duration in seconds",
145            )
146            .buckets(vec![0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]),
147            &["tool"],
148        )
149        .expect("valid metric");
150
151        let request_latency = Histogram::with_opts(
152            HistogramOpts::new(
153                "construct_request_latency_seconds",
154                "Request latency in seconds",
155            )
156            .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]),
157        )
158        .expect("valid metric");
159
160        let tokens_used = prometheus::IntGauge::new(
161            "construct_tokens_used_last",
162            "Tokens used in the last request",
163        )
164        .expect("valid metric");
165
166        let active_sessions = GaugeVec::new(
167            prometheus::Opts::new("construct_active_sessions", "Number of active sessions"),
168            &[],
169        )
170        .expect("valid metric");
171
172        let queue_depth = GaugeVec::new(
173            prometheus::Opts::new("construct_queue_depth", "Message queue depth"),
174            &[],
175        )
176        .expect("valid metric");
177
178        let hand_runs = IntCounterVec::new(
179            prometheus::Opts::new("construct_hand_runs_total", "Total hand runs by outcome"),
180            &["hand", "success"],
181        )
182        .expect("valid metric");
183
184        let hand_duration = HistogramVec::new(
185            HistogramOpts::new(
186                "construct_hand_duration_seconds",
187                "Hand run duration in seconds",
188            )
189            .buckets(vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]),
190            &["hand"],
191        )
192        .expect("valid metric");
193
194        let hand_findings = IntCounterVec::new(
195            prometheus::Opts::new(
196                "construct_hand_findings_total",
197                "Total findings produced by hand runs",
198            ),
199            &["hand"],
200        )
201        .expect("valid metric");
202
203        let deployments_total = IntCounterVec::new(
204            prometheus::Opts::new("construct_deployments_total", "Total deployments by status"),
205            &["status"],
206        )
207        .expect("valid metric");
208
209        let deployment_lead_time = Histogram::with_opts(
210            HistogramOpts::new(
211                "construct_deployment_lead_time_seconds",
212                "Deployment lead time from commit to deploy in seconds",
213            )
214            .buckets(vec![
215                60.0, 300.0, 600.0, 1800.0, 3600.0, 7200.0, 14400.0, 43200.0, 86400.0,
216            ]),
217        )
218        .expect("valid metric");
219
220        let deployment_failure_rate = prometheus::Gauge::new(
221            "construct_deployment_failure_rate",
222            "Ratio of failed deployments to total deployments",
223        )
224        .expect("valid metric");
225
226        let recovery_time = Histogram::with_opts(
227            HistogramOpts::new(
228                "construct_recovery_time_seconds",
229                "Time to recover from a failed deployment in seconds",
230            )
231            .buckets(vec![
232                60.0, 300.0, 600.0, 1800.0, 3600.0, 7200.0, 14400.0, 43200.0, 86400.0,
233            ]),
234        )
235        .expect("valid metric");
236
237        let mttr =
238            prometheus::Gauge::new("construct_mttr_seconds", "Mean time to recovery in seconds")
239                .expect("valid metric");
240
241        // Register all metrics
242        registry.register(Box::new(agent_starts.clone())).ok();
243        registry.register(Box::new(llm_requests.clone())).ok();
244        registry.register(Box::new(tokens_input_total.clone())).ok();
245        registry
246            .register(Box::new(tokens_output_total.clone()))
247            .ok();
248        registry.register(Box::new(tool_calls.clone())).ok();
249        registry.register(Box::new(channel_messages.clone())).ok();
250        registry.register(Box::new(heartbeat_ticks.clone())).ok();
251        registry.register(Box::new(errors.clone())).ok();
252        registry.register(Box::new(cache_hits.clone())).ok();
253        registry.register(Box::new(cache_misses.clone())).ok();
254        registry.register(Box::new(cache_tokens_saved.clone())).ok();
255        registry.register(Box::new(agent_duration.clone())).ok();
256        registry.register(Box::new(tool_duration.clone())).ok();
257        registry.register(Box::new(request_latency.clone())).ok();
258        registry.register(Box::new(tokens_used.clone())).ok();
259        registry.register(Box::new(active_sessions.clone())).ok();
260        registry.register(Box::new(queue_depth.clone())).ok();
261        registry.register(Box::new(hand_runs.clone())).ok();
262        registry.register(Box::new(hand_duration.clone())).ok();
263        registry.register(Box::new(hand_findings.clone())).ok();
264        registry.register(Box::new(deployments_total.clone())).ok();
265        registry
266            .register(Box::new(deployment_lead_time.clone()))
267            .ok();
268        registry
269            .register(Box::new(deployment_failure_rate.clone()))
270            .ok();
271        registry.register(Box::new(recovery_time.clone())).ok();
272        registry.register(Box::new(mttr.clone())).ok();
273
274        Self {
275            registry,
276            agent_starts,
277            llm_requests,
278            tokens_input_total,
279            tokens_output_total,
280            tool_calls,
281            channel_messages,
282            heartbeat_ticks,
283            errors,
284            cache_hits,
285            cache_misses,
286            cache_tokens_saved,
287            agent_duration,
288            tool_duration,
289            request_latency,
290            tokens_used,
291            active_sessions,
292            queue_depth,
293            hand_runs,
294            hand_duration,
295            hand_findings,
296            deployments_total,
297            deployment_lead_time,
298            deployment_failure_rate,
299            recovery_time,
300            mttr,
301            deploy_success_count: std::sync::atomic::AtomicU64::new(0),
302            deploy_failure_count: std::sync::atomic::AtomicU64::new(0),
303        }
304    }
305
306    /// Encode all registered metrics into Prometheus text exposition format.
307    pub fn encode(&self) -> String {
308        let encoder = TextEncoder::new();
309        let families = self.registry.gather();
310        let mut buf = Vec::new();
311        encoder.encode(&families, &mut buf).unwrap_or_default();
312        String::from_utf8(buf).unwrap_or_default()
313    }
314}
315
316impl Observer for PrometheusObserver {
317    fn record_event(&self, event: &ObserverEvent) {
318        match event {
319            ObserverEvent::AgentStart { provider, model } => {
320                self.agent_starts
321                    .with_label_values(&[provider, model])
322                    .inc();
323            }
324            ObserverEvent::AgentEnd {
325                provider,
326                model,
327                duration,
328                tokens_used,
329                cost_usd: _,
330            } => {
331                // Agent duration is recorded via the histogram with provider/model labels
332                self.agent_duration
333                    .with_label_values(&[provider, model])
334                    .observe(duration.as_secs_f64());
335                if let Some(t) = tokens_used {
336                    self.tokens_used.set(i64::try_from(*t).unwrap_or(i64::MAX));
337                }
338            }
339            ObserverEvent::LlmResponse {
340                provider,
341                model,
342                success,
343                input_tokens,
344                output_tokens,
345                ..
346            } => {
347                let success_str = if *success { "true" } else { "false" };
348                self.llm_requests
349                    .with_label_values(&[provider.as_str(), model.as_str(), success_str])
350                    .inc();
351                if let Some(input) = input_tokens {
352                    self.tokens_input_total
353                        .with_label_values(&[provider.as_str(), model.as_str()])
354                        .inc_by(*input);
355                }
356                if let Some(output) = output_tokens {
357                    self.tokens_output_total
358                        .with_label_values(&[provider.as_str(), model.as_str()])
359                        .inc_by(*output);
360                }
361            }
362            ObserverEvent::ToolCallStart { .. }
363            | ObserverEvent::TurnComplete
364            | ObserverEvent::LlmRequest { .. }
365            | ObserverEvent::DeploymentStarted { .. }
366            | ObserverEvent::RecoveryCompleted { .. } => {}
367            ObserverEvent::ToolCall {
368                tool,
369                duration,
370                success,
371            } => {
372                let success_str = if *success { "true" } else { "false" };
373                self.tool_calls
374                    .with_label_values(&[tool.as_str(), success_str])
375                    .inc();
376                self.tool_duration
377                    .with_label_values(&[tool.as_str()])
378                    .observe(duration.as_secs_f64());
379            }
380            ObserverEvent::ChannelMessage { channel, direction } => {
381                self.channel_messages
382                    .with_label_values(&[channel, direction])
383                    .inc();
384            }
385            ObserverEvent::HeartbeatTick => {
386                self.heartbeat_ticks.inc();
387            }
388            ObserverEvent::CacheHit {
389                cache_type,
390                tokens_saved,
391            } => {
392                self.cache_hits.with_label_values(&[cache_type]).inc();
393                self.cache_tokens_saved
394                    .with_label_values(&[cache_type])
395                    .inc_by(*tokens_saved);
396            }
397            ObserverEvent::CacheMiss { cache_type } => {
398                self.cache_misses.with_label_values(&[cache_type]).inc();
399            }
400            ObserverEvent::Error {
401                component,
402                message: _,
403            } => {
404                self.errors.with_label_values(&[component]).inc();
405            }
406            ObserverEvent::HandStarted { hand_name } => {
407                self.hand_runs
408                    .with_label_values(&[hand_name.as_str(), "true"])
409                    .inc_by(0); // touch the series so it appears in output
410            }
411            ObserverEvent::HandCompleted {
412                hand_name,
413                duration_ms,
414                findings_count,
415            } => {
416                self.hand_runs
417                    .with_label_values(&[hand_name.as_str(), "true"])
418                    .inc();
419                self.hand_duration
420                    .with_label_values(&[hand_name.as_str()])
421                    .observe(*duration_ms as f64 / 1000.0);
422                self.hand_findings
423                    .with_label_values(&[hand_name.as_str()])
424                    .inc_by(*findings_count as u64);
425            }
426            ObserverEvent::HandFailed {
427                hand_name,
428                duration_ms,
429                ..
430            } => {
431                self.hand_runs
432                    .with_label_values(&[hand_name.as_str(), "false"])
433                    .inc();
434                self.hand_duration
435                    .with_label_values(&[hand_name.as_str()])
436                    .observe(*duration_ms as f64 / 1000.0);
437            }
438            ObserverEvent::DeploymentCompleted { .. } => {
439                self.deployments_total.with_label_values(&["success"]).inc();
440                let s = self
441                    .deploy_success_count
442                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed)
443                    + 1;
444                let f = self
445                    .deploy_failure_count
446                    .load(std::sync::atomic::Ordering::Relaxed);
447                let total = s + f;
448                if total > 0 {
449                    self.deployment_failure_rate.set(f as f64 / total as f64);
450                }
451            }
452            ObserverEvent::DeploymentFailed { .. } => {
453                self.deployments_total.with_label_values(&["failure"]).inc();
454                let f = self
455                    .deploy_failure_count
456                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed)
457                    + 1;
458                let s = self
459                    .deploy_success_count
460                    .load(std::sync::atomic::Ordering::Relaxed);
461                let total = s + f;
462                if total > 0 {
463                    self.deployment_failure_rate.set(f as f64 / total as f64);
464                }
465            }
466        }
467    }
468
469    fn record_metric(&self, metric: &ObserverMetric) {
470        match metric {
471            ObserverMetric::RequestLatency(d) => {
472                self.request_latency.observe(d.as_secs_f64());
473            }
474            ObserverMetric::TokensUsed(t) => {
475                self.tokens_used.set(i64::try_from(*t).unwrap_or(i64::MAX));
476            }
477            ObserverMetric::ActiveSessions(s) => {
478                self.active_sessions
479                    .with_label_values(&[] as &[&str])
480                    .set(*s as f64);
481            }
482            ObserverMetric::QueueDepth(d) => {
483                self.queue_depth
484                    .with_label_values(&[] as &[&str])
485                    .set(*d as f64);
486            }
487            ObserverMetric::HandRunDuration {
488                hand_name,
489                duration,
490            } => {
491                self.hand_duration
492                    .with_label_values(&[hand_name.as_str()])
493                    .observe(duration.as_secs_f64());
494            }
495            ObserverMetric::HandFindingsCount { hand_name, count } => {
496                self.hand_findings
497                    .with_label_values(&[hand_name.as_str()])
498                    .inc_by(*count);
499            }
500            ObserverMetric::HandSuccessRate { hand_name, success } => {
501                let success_str = if *success { "true" } else { "false" };
502                self.hand_runs
503                    .with_label_values(&[hand_name.as_str(), success_str])
504                    .inc();
505            }
506            ObserverMetric::DeploymentLeadTime(d) => {
507                self.deployment_lead_time.observe(d.as_secs_f64());
508            }
509            ObserverMetric::RecoveryTime(d) => {
510                self.recovery_time.observe(d.as_secs_f64());
511                self.mttr.set(d.as_secs_f64());
512            }
513        }
514    }
515
516    fn name(&self) -> &str {
517        "prometheus"
518    }
519
520    fn as_any(&self) -> &dyn std::any::Any {
521        self
522    }
523}
524
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Builds a finished tool-call event.
    fn tool_call(tool: &str, millis: u64, success: bool) -> ObserverEvent {
        ObserverEvent::ToolCall {
            tool: tool.into(),
            duration: Duration::from_millis(millis),
            success,
        }
    }

    /// Builds an error event for `component`.
    fn error_event(component: &str, message: &str) -> ObserverEvent {
        ObserverEvent::Error {
            component: component.into(),
            message: message.into(),
        }
    }

    /// Builds an agent-start event.
    fn agent_start(provider: &str, model: &str) -> ObserverEvent {
        ObserverEvent::AgentStart {
            provider: provider.into(),
            model: model.into(),
        }
    }

    /// Asserts that every entry of `expected` occurs somewhere in `output`.
    fn assert_all_present(output: &str, expected: &[&str]) {
        for needle in expected {
            assert!(
                output.contains(*needle),
                "expected `{}` in encoded output:\n{}",
                needle,
                output
            );
        }
    }

    #[test]
    fn prometheus_observer_name() {
        let observer = PrometheusObserver::new();
        assert_eq!(observer.name(), "prometheus");
    }

    #[test]
    fn records_all_events_without_panic() {
        let observer = PrometheusObserver::new();
        let events = [
            agent_start("openrouter", "claude-sonnet"),
            ObserverEvent::AgentEnd {
                provider: "openrouter".into(),
                model: "claude-sonnet".into(),
                duration: Duration::from_millis(500),
                tokens_used: Some(100),
                cost_usd: None,
            },
            ObserverEvent::AgentEnd {
                provider: "openrouter".into(),
                model: "claude-sonnet".into(),
                duration: Duration::ZERO,
                tokens_used: None,
                cost_usd: None,
            },
            tool_call("shell", 10, true),
            tool_call("file_read", 5, false),
            ObserverEvent::ChannelMessage {
                channel: "telegram".into(),
                direction: "inbound".into(),
            },
            ObserverEvent::HeartbeatTick,
            error_event("provider", "timeout"),
        ];

        for event in &events {
            observer.record_event(event);
        }
    }

    #[test]
    fn records_all_metrics_without_panic() {
        let observer = PrometheusObserver::new();
        let metrics = [
            ObserverMetric::RequestLatency(Duration::from_secs(2)),
            ObserverMetric::TokensUsed(500),
            ObserverMetric::TokensUsed(0),
            ObserverMetric::ActiveSessions(3),
            ObserverMetric::QueueDepth(42),
        ];

        for metric in &metrics {
            observer.record_metric(metric);
        }
    }

    #[test]
    fn encode_produces_prometheus_text_format() {
        let observer = PrometheusObserver::new();
        observer.record_event(&agent_start("openrouter", "claude-sonnet"));
        observer.record_event(&tool_call("shell", 100, true));
        observer.record_event(&ObserverEvent::HeartbeatTick);
        observer.record_metric(&ObserverMetric::RequestLatency(Duration::from_millis(250)));

        assert_all_present(
            &observer.encode(),
            &[
                "construct_agent_starts_total",
                "construct_tool_calls_total",
                "construct_heartbeat_ticks_total",
                "construct_request_latency_seconds",
            ],
        );
    }

    #[test]
    fn counters_increment_correctly() {
        let observer = PrometheusObserver::new();
        (0..3).for_each(|_| observer.record_event(&ObserverEvent::HeartbeatTick));

        assert_all_present(&observer.encode(), &["construct_heartbeat_ticks_total 3"]);
    }

    #[test]
    fn tool_calls_track_success_and_failure_separately() {
        let observer = PrometheusObserver::new();
        for succeeded in [true, true, false] {
            observer.record_event(&tool_call("shell", 10, succeeded));
        }

        assert_all_present(
            &observer.encode(),
            &[
                r#"construct_tool_calls_total{success="true",tool="shell"} 2"#,
                r#"construct_tool_calls_total{success="false",tool="shell"} 1"#,
            ],
        );
    }

    #[test]
    fn errors_track_by_component() {
        let observer = PrometheusObserver::new();
        let failures = [
            ("provider", "timeout"),
            ("provider", "rate limit"),
            ("channels", "disconnected"),
        ];
        for (component, message) in failures {
            observer.record_event(&error_event(component, message));
        }

        assert_all_present(
            &observer.encode(),
            &[
                r#"construct_errors_total{component="provider"} 2"#,
                r#"construct_errors_total{component="channels"} 1"#,
            ],
        );
    }

    #[test]
    fn gauge_reflects_latest_value() {
        let observer = PrometheusObserver::new();
        for metric in [
            ObserverMetric::TokensUsed(100),
            ObserverMetric::TokensUsed(200),
        ] {
            observer.record_metric(&metric);
        }

        assert_all_present(&observer.encode(), &["construct_tokens_used_last 200"]);
    }

    #[test]
    fn llm_response_tracks_request_count_and_tokens() {
        let observer = PrometheusObserver::new();
        let first = ObserverEvent::LlmResponse {
            provider: "openrouter".into(),
            model: "claude-sonnet".into(),
            duration: Duration::from_millis(200),
            success: true,
            error_message: None,
            input_tokens: Some(100),
            output_tokens: Some(50),
        };
        let second = ObserverEvent::LlmResponse {
            provider: "openrouter".into(),
            model: "claude-sonnet".into(),
            duration: Duration::from_millis(300),
            success: true,
            error_message: None,
            input_tokens: Some(200),
            output_tokens: Some(80),
        };
        observer.record_event(&first);
        observer.record_event(&second);

        assert_all_present(
            &observer.encode(),
            &[
                r#"construct_llm_requests_total{model="claude-sonnet",provider="openrouter",success="true"} 2"#,
                r#"construct_tokens_input_total{model="claude-sonnet",provider="openrouter"} 300"#,
                r#"construct_tokens_output_total{model="claude-sonnet",provider="openrouter"} 130"#,
            ],
        );
    }

    #[test]
    fn hand_events_track_runs_and_duration() {
        let observer = PrometheusObserver::new();
        let events = [
            ObserverEvent::HandCompleted {
                hand_name: "review".into(),
                duration_ms: 1500,
                findings_count: 3,
            },
            ObserverEvent::HandCompleted {
                hand_name: "review".into(),
                duration_ms: 2000,
                findings_count: 1,
            },
            ObserverEvent::HandFailed {
                hand_name: "review".into(),
                error: "timeout".into(),
                duration_ms: 5000,
            },
        ];
        for event in &events {
            observer.record_event(event);
        }

        assert_all_present(
            &observer.encode(),
            &[
                r#"construct_hand_runs_total{hand="review",success="true"} 2"#,
                r#"construct_hand_runs_total{hand="review",success="false"} 1"#,
                r#"construct_hand_findings_total{hand="review"} 4"#,
                "construct_hand_duration_seconds",
            ],
        );
    }

    #[test]
    fn hand_metrics_record_duration_and_findings() {
        let observer = PrometheusObserver::new();
        let metrics = [
            ObserverMetric::HandRunDuration {
                hand_name: "scan".into(),
                duration: Duration::from_millis(800),
            },
            ObserverMetric::HandFindingsCount {
                hand_name: "scan".into(),
                count: 5,
            },
            ObserverMetric::HandSuccessRate {
                hand_name: "scan".into(),
                success: true,
            },
            ObserverMetric::HandSuccessRate {
                hand_name: "scan".into(),
                success: false,
            },
        ];
        for metric in &metrics {
            observer.record_metric(metric);
        }

        assert_all_present(
            &observer.encode(),
            &[
                "construct_hand_duration_seconds",
                r#"construct_hand_findings_total{hand="scan"} 5"#,
                r#"construct_hand_runs_total{hand="scan",success="true"} 1"#,
                r#"construct_hand_runs_total{hand="scan",success="false"} 1"#,
            ],
        );
    }

    #[test]
    fn llm_response_without_tokens_increments_request_only() {
        let observer = PrometheusObserver::new();
        observer.record_event(&ObserverEvent::LlmResponse {
            provider: "ollama".into(),
            model: "llama3".into(),
            duration: Duration::from_millis(100),
            success: false,
            error_message: Some("timeout".into()),
            input_tokens: None,
            output_tokens: None,
        });

        let output = observer.encode();
        assert_all_present(
            &output,
            &[r#"construct_llm_requests_total{model="llama3",provider="ollama",success="false"} 1"#],
        );
        // No token counts were reported, so no labelled token series may exist yet.
        for absent in [
            "construct_tokens_input_total{",
            "construct_tokens_output_total{",
        ] {
            assert!(!output.contains(absent));
        }
    }

    #[test]
    fn dora_deployment_events_track_counters() {
        let observer = PrometheusObserver::new();
        let events = [
            ObserverEvent::DeploymentCompleted {
                deploy_id: "d1".into(),
                commit_sha: "abc123".into(),
            },
            ObserverEvent::DeploymentCompleted {
                deploy_id: "d2".into(),
                commit_sha: "def456".into(),
            },
            ObserverEvent::DeploymentFailed {
                deploy_id: "d3".into(),
                reason: "timeout".into(),
            },
        ];
        for event in &events {
            observer.record_event(event);
        }

        assert_all_present(
            &observer.encode(),
            &[
                r#"construct_deployments_total{status="success"} 2"#,
                r#"construct_deployments_total{status="failure"} 1"#,
            ],
        );
    }

    #[test]
    fn dora_failure_rate_gauge_updates() {
        let observer = PrometheusObserver::new();
        observer.record_event(&ObserverEvent::DeploymentCompleted {
            deploy_id: "d1".into(),
            commit_sha: "abc".into(),
        });
        observer.record_event(&ObserverEvent::DeploymentFailed {
            deploy_id: "d2".into(),
            reason: "error".into(),
        });

        // One failure out of two finished deployments gives a ratio of 0.5.
        assert_all_present(&observer.encode(), &["construct_deployment_failure_rate 0.5"]);
    }

    #[test]
    fn dora_lead_time_and_recovery_metrics() {
        let observer = PrometheusObserver::new();
        let lead_time = Duration::from_secs(3600);
        let recovery = Duration::from_secs(600);
        observer.record_metric(&ObserverMetric::DeploymentLeadTime(lead_time));
        observer.record_metric(&ObserverMetric::RecoveryTime(recovery));

        assert_all_present(
            &observer.encode(),
            &[
                "construct_deployment_lead_time_seconds",
                "construct_recovery_time_seconds",
                "construct_mttr_seconds 600",
            ],
        );
    }

    #[test]
    fn dora_started_and_recovery_events_no_panic() {
        let observer = PrometheusObserver::new();
        let events = [
            ObserverEvent::DeploymentStarted {
                deploy_id: "d1".into(),
            },
            ObserverEvent::RecoveryCompleted {
                deploy_id: "d1".into(),
            },
        ];
        for event in &events {
            observer.record_event(event);
        }
    }
}