Skip to main content

mockforge_observability/prometheus/
metrics.rs

1//! Prometheus metrics definitions and registry
2
3use once_cell::sync::Lazy;
4use prometheus::{
5    Gauge, GaugeVec, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
6    Opts, Registry,
7};
8use std::sync::Arc;
9use tracing::debug;
10
/// A single drift evaluation sample to feed into
/// [`MetricsRegistry::record_drift_evaluation`]. Borrows the string labels
/// so callers don't have to allocate per-request.
///
/// `workspace_id` should be the empty string when the request isn't tied to a
/// specific tenant (the global "drift-by-endpoint" series remains useful and
/// the dashboards collapse the label).
///
/// The sample is a plain value: it is `Copy` and compares field-by-field via
/// `PartialEq`/`Eq`, so it can be used directly in assertions.
///
/// NOTE(review): how each field is mapped onto the `mockforge_drift_*` gauges
/// is decided by `record_drift_evaluation`, which is not visible from this
/// definition — confirm the exact mapping there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DriftEvaluationSample<'a> {
    /// Workspace (tenant) identifier; the empty string when the request is
    /// not tied to a specific tenant.
    pub workspace_id: &'a str,
    /// Endpoint the drift evaluation was run for.
    pub endpoint: &'a str,
    /// Request method the drift evaluation was run for.
    pub method: &'a str,
    /// Total number of changes (mismatches) counted by the evaluation.
    pub total: u32,
    /// Number of changes classified as breaking.
    pub breaking: u32,
    /// Number of changes classified as potentially breaking.
    pub potentially_breaking: u32,
    /// Whether the evaluation exceeded the configured drift budget.
    pub budget_exceeded: bool,
}
28
29/// Global metrics registry for MockForge
30#[derive(Clone)]
31pub struct MetricsRegistry {
32    registry: Arc<Registry>,
33
34    // Request metrics by protocol
35    pub requests_total: IntCounterVec,
36    pub requests_duration_seconds: HistogramVec,
37    pub requests_in_flight: IntGaugeVec,
38
39    // Request metrics by path (endpoint-specific)
40    pub requests_by_path_total: IntCounterVec,
41    pub request_duration_by_path_seconds: HistogramVec,
42    pub average_latency_by_path_seconds: GaugeVec,
43
44    // Workspace-specific metrics
45    pub workspace_requests_total: IntCounterVec,
46    pub workspace_requests_duration_seconds: HistogramVec,
47    pub workspace_active_routes: IntGaugeVec,
48    pub workspace_errors_total: IntCounterVec,
49
50    // Error metrics
51    pub errors_total: IntCounterVec,
52    pub error_rate: GaugeVec,
53
54    // Plugin metrics
55    pub plugin_executions_total: IntCounterVec,
56    pub plugin_execution_duration_seconds: HistogramVec,
57    pub plugin_errors_total: IntCounterVec,
58
59    // WebSocket specific metrics
60    pub ws_connections_active: IntGauge,
61    pub ws_connections_total: IntCounter,
62    pub ws_connection_duration_seconds: HistogramVec,
63    pub ws_messages_sent: IntCounter,
64    pub ws_messages_received: IntCounter,
65    pub ws_errors_total: IntCounter,
66
67    // SMTP specific metrics
68    pub smtp_connections_active: IntGauge,
69    pub smtp_connections_total: IntCounter,
70    pub smtp_messages_received_total: IntCounter,
71    pub smtp_messages_stored_total: IntCounter,
72    pub smtp_errors_total: IntCounterVec,
73
74    // MQTT specific metrics
75    pub mqtt_connections_active: IntGauge,
76    pub mqtt_connections_total: IntCounter,
77    pub mqtt_messages_published_total: IntCounter,
78    pub mqtt_messages_received_total: IntCounter,
79    pub mqtt_topics_active: IntGauge,
80    pub mqtt_subscriptions_active: IntGauge,
81    pub mqtt_retained_messages: IntGauge,
82    pub mqtt_errors_total: IntCounterVec,
83
84    // Kafka specific metrics. Driven by a periodic snapshot of the broker's
85    // internal `KafkaMetrics`, so these are IntGauges set to the absolute
86    // current value each tick (the `_total` ones are monotonic in practice).
87    pub kafka_connections_active: IntGauge,
88    pub kafka_messages_produced_total: IntGauge,
89    pub kafka_messages_consumed_total: IntGauge,
90    pub kafka_topics_total: IntGauge,
91    pub kafka_partitions_total: IntGauge,
92    pub kafka_consumer_groups_total: IntGauge,
93    pub kafka_errors_total: IntGauge,
94
95    // AMQP specific metrics (periodic snapshot of the broker's `AmqpMetrics`).
96    pub amqp_connections_active: IntGauge,
97    pub amqp_channels_active: IntGauge,
98    pub amqp_messages_published_total: IntGauge,
99    pub amqp_messages_consumed_total: IntGauge,
100    pub amqp_messages_acked_total: IntGauge,
101    pub amqp_queues_total: IntGauge,
102    pub amqp_exchanges_total: IntGauge,
103    pub amqp_bindings_total: IntGauge,
104    pub amqp_errors_total: IntGauge,
105
106    // System metrics
107    pub memory_usage_bytes: Gauge,
108    pub cpu_usage_percent: Gauge,
109    pub thread_count: Gauge,
110    pub uptime_seconds: Gauge,
111
112    // Scenario metrics (for Phase 4)
113    pub active_scenario_mode: IntGauge,
114    pub chaos_triggers_total: IntCounter,
115
116    // Business/SLO metrics
117    pub service_availability: GaugeVec,
118    pub slo_compliance: GaugeVec,
119    pub successful_request_rate: GaugeVec,
120    pub p95_latency_slo_compliance: GaugeVec,
121    pub error_budget_remaining: GaugeVec,
122
123    // Marketplace metrics
124    pub marketplace_publish_total: IntCounterVec,
125    pub marketplace_publish_duration_seconds: HistogramVec,
126    pub marketplace_download_total: IntCounterVec,
127    pub marketplace_download_duration_seconds: HistogramVec,
128    pub marketplace_search_total: IntCounterVec,
129    pub marketplace_search_duration_seconds: HistogramVec,
130    pub marketplace_errors_total: IntCounterVec,
131    pub marketplace_items_total: IntGaugeVec,
132
133    // Contract-drift metrics (issue #678) — emitted whenever
134    // `DriftBudgetEngine::evaluate*` runs against a request via the HTTP
135    // drift-tracking middleware. Labelled by workspace and endpoint so the
136    // MockOps UI's DriftPercentageDashboard can break down per-resource.
137    /// Drift severity as a 0.0–100.0 percentage (breaking + potentially-breaking
138    /// changes ÷ total observed mismatches × 100). 0 means no drift detected.
139    pub drift_percentage: GaugeVec,
140    /// Count of total mismatches observed in the last evaluation.
141    pub drift_total_changes: IntGaugeVec,
142    /// Count of breaking changes in the last evaluation. > 0 indicates a
143    /// contract-breaking drift event that should page someone.
144    pub drift_breaking_changes: IntGaugeVec,
145    /// Boolean (0 or 1) — was the configured drift budget exceeded?
146    pub drift_budget_exceeded: IntGaugeVec,
147}
148
149impl MetricsRegistry {
150    /// Create a new metrics registry with all metrics initialized
151    pub fn new() -> Self {
152        let registry = Registry::new();
153
154        // Request metrics (with pillar label)
155        let requests_total = IntCounterVec::new(
156            Opts::new(
157                "mockforge_requests_total",
158                "Total number of requests by protocol, method, status, and pillar",
159            ),
160            &["protocol", "method", "status", "pillar"],
161        )
162        .expect("Failed to create requests_total metric");
163
164        let requests_duration_seconds = HistogramVec::new(
165            HistogramOpts::new(
166                "mockforge_request_duration_seconds",
167                "Request duration in seconds by protocol, method, and pillar",
168            )
169            .buckets(vec![
170                0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
171            ]),
172            &["protocol", "method", "pillar"],
173        )
174        .expect("Failed to create requests_duration_seconds metric");
175
176        let requests_in_flight = IntGaugeVec::new(
177            Opts::new(
178                "mockforge_requests_in_flight",
179                "Number of requests currently being processed",
180            ),
181            &["protocol"],
182        )
183        .expect("Failed to create requests_in_flight metric");
184
185        // Error metrics (with pillar label)
186        let errors_total = IntCounterVec::new(
187            Opts::new(
188                "mockforge_errors_total",
189                "Total number of errors by protocol, error type, and pillar",
190            ),
191            &["protocol", "error_type", "pillar"],
192        )
193        .expect("Failed to create errors_total metric");
194
195        let error_rate = GaugeVec::new(
196            Opts::new("mockforge_error_rate", "Error rate by protocol (0.0 to 1.0)"),
197            &["protocol"],
198        )
199        .expect("Failed to create error_rate metric");
200
201        // Plugin metrics
202        let plugin_executions_total = IntCounterVec::new(
203            Opts::new("mockforge_plugin_executions_total", "Total number of plugin executions"),
204            &["plugin_name", "status"],
205        )
206        .expect("Failed to create plugin_executions_total metric");
207
208        let plugin_execution_duration_seconds = HistogramVec::new(
209            HistogramOpts::new(
210                "mockforge_plugin_execution_duration_seconds",
211                "Plugin execution duration in seconds",
212            )
213            .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]),
214            &["plugin_name"],
215        )
216        .expect("Failed to create plugin_execution_duration_seconds metric");
217
218        let plugin_errors_total = IntCounterVec::new(
219            Opts::new("mockforge_plugin_errors_total", "Total number of plugin errors"),
220            &["plugin_name", "error_type"],
221        )
222        .expect("Failed to create plugin_errors_total metric");
223
        // Path-based request metrics
226        let requests_by_path_total = IntCounterVec::new(
227            Opts::new(
228                "mockforge_requests_by_path_total",
229                "Total number of requests by path, method, and status",
230            ),
231            &["path", "method", "status"],
232        )
233        .expect("Failed to create requests_by_path_total metric");
234
235        let request_duration_by_path_seconds = HistogramVec::new(
236            HistogramOpts::new(
237                "mockforge_request_duration_by_path_seconds",
238                "Request duration by path in seconds",
239            )
240            .buckets(vec![
241                0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
242            ]),
243            &["path", "method"],
244        )
245        .expect("Failed to create request_duration_by_path_seconds metric");
246
247        let average_latency_by_path_seconds = GaugeVec::new(
248            Opts::new(
249                "mockforge_average_latency_by_path_seconds",
250                "Average request latency by path in seconds",
251            ),
252            &["path", "method"],
253        )
254        .expect("Failed to create average_latency_by_path_seconds metric");
255
256        // Workspace-specific metrics
257        let workspace_requests_total = IntCounterVec::new(
258            Opts::new(
259                "mockforge_workspace_requests_total",
260                "Total number of requests by workspace, method, and status",
261            ),
262            &["workspace_id", "method", "status"],
263        )
264        .expect("Failed to create workspace_requests_total metric");
265
266        let workspace_requests_duration_seconds = HistogramVec::new(
267            HistogramOpts::new(
268                "mockforge_workspace_request_duration_seconds",
269                "Request duration by workspace in seconds",
270            )
271            .buckets(vec![
272                0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
273            ]),
274            &["workspace_id", "method"],
275        )
276        .expect("Failed to create workspace_requests_duration_seconds metric");
277
278        let workspace_active_routes = IntGaugeVec::new(
279            Opts::new(
280                "mockforge_workspace_active_routes",
281                "Number of active routes in each workspace",
282            ),
283            &["workspace_id"],
284        )
285        .expect("Failed to create workspace_active_routes metric");
286
287        let workspace_errors_total = IntCounterVec::new(
288            Opts::new("mockforge_workspace_errors_total", "Total number of errors by workspace"),
289            &["workspace_id", "error_type"],
290        )
291        .expect("Failed to create workspace_errors_total metric");
292
293        // WebSocket metrics
294        let ws_connections_active = IntGauge::new(
295            "mockforge_ws_connections_active",
296            "Number of active WebSocket connections",
297        )
298        .expect("Failed to create ws_connections_active metric");
299
300        let ws_connections_total = IntCounter::new(
301            "mockforge_ws_connections_total",
302            "Total number of WebSocket connections established",
303        )
304        .expect("Failed to create ws_connections_total metric");
305
306        let ws_connection_duration_seconds = HistogramVec::new(
307            HistogramOpts::new(
308                "mockforge_ws_connection_duration_seconds",
309                "WebSocket connection duration in seconds",
310            )
311            .buckets(vec![1.0, 5.0, 10.0, 30.0, 60.0, 300.0, 600.0, 1800.0, 3600.0]),
312            &["status"],
313        )
314        .expect("Failed to create ws_connection_duration_seconds metric");
315
316        let ws_messages_sent = IntCounter::new(
317            "mockforge_ws_messages_sent_total",
318            "Total number of WebSocket messages sent",
319        )
320        .expect("Failed to create ws_messages_sent metric");
321
322        let ws_messages_received = IntCounter::new(
323            "mockforge_ws_messages_received_total",
324            "Total number of WebSocket messages received",
325        )
326        .expect("Failed to create ws_messages_received metric");
327
328        let ws_errors_total =
329            IntCounter::new("mockforge_ws_errors_total", "Total number of WebSocket errors")
330                .expect("Failed to create ws_errors_total metric");
331
332        // SMTP metrics
333        let smtp_connections_active =
334            IntGauge::new("mockforge_smtp_connections_active", "Number of active SMTP connections")
335                .expect("Failed to create smtp_connections_active metric");
336
337        let smtp_connections_total =
338            IntCounter::new("mockforge_smtp_connections_total", "Total number of SMTP connections")
339                .expect("Failed to create smtp_connections_total metric");
340
341        let smtp_messages_received_total = IntCounter::new(
342            "mockforge_smtp_messages_received_total",
343            "Total number of SMTP messages received",
344        )
345        .expect("Failed to create smtp_messages_received_total metric");
346
347        let smtp_messages_stored_total = IntCounter::new(
348            "mockforge_smtp_messages_stored_total",
349            "Total number of SMTP messages stored in mailbox",
350        )
351        .expect("Failed to create smtp_messages_stored_total metric");
352
353        let smtp_errors_total = IntCounterVec::new(
354            Opts::new("mockforge_smtp_errors_total", "Total number of SMTP errors by type"),
355            &["error_type"],
356        )
357        .expect("Failed to create smtp_errors_total metric");
358
359        // MQTT metrics
360        let mqtt_connections_active = IntGauge::new(
361            "mockforge_mqtt_connections_active",
362            "Number of active MQTT client connections",
363        )
364        .expect("Failed to create mqtt_connections_active metric");
365
366        let mqtt_connections_total = IntCounter::new(
367            "mockforge_mqtt_connections_total",
368            "Total number of MQTT client connections established",
369        )
370        .expect("Failed to create mqtt_connections_total metric");
371
372        let mqtt_messages_published_total = IntCounter::new(
373            "mockforge_mqtt_messages_published_total",
374            "Total number of MQTT messages published",
375        )
376        .expect("Failed to create mqtt_messages_published_total metric");
377
378        let mqtt_messages_received_total = IntCounter::new(
379            "mockforge_mqtt_messages_received_total",
380            "Total number of MQTT messages received",
381        )
382        .expect("Failed to create mqtt_messages_received_total metric");
383
384        let mqtt_topics_active =
385            IntGauge::new("mockforge_mqtt_topics_active", "Number of active MQTT topics")
386                .expect("Failed to create mqtt_topics_active metric");
387
388        let mqtt_subscriptions_active = IntGauge::new(
389            "mockforge_mqtt_subscriptions_active",
390            "Number of active MQTT subscriptions",
391        )
392        .expect("Failed to create mqtt_subscriptions_active metric");
393
394        let mqtt_retained_messages =
395            IntGauge::new("mockforge_mqtt_retained_messages", "Number of retained MQTT messages")
396                .expect("Failed to create mqtt_retained_messages metric");
397
398        let mqtt_errors_total = IntCounterVec::new(
399            Opts::new("mockforge_mqtt_errors_total", "Total number of MQTT errors by type"),
400            &["error_type"],
401        )
402        .expect("Failed to create mqtt_errors_total metric");
403
404        // Kafka metrics
405        let kafka_connections_active = IntGauge::new(
406            "mockforge_kafka_connections_active",
407            "Number of active Kafka client connections",
408        )
409        .expect("Failed to create kafka_connections_active metric");
410        let kafka_messages_produced_total = IntGauge::new(
411            "mockforge_kafka_messages_produced_total",
412            "Total number of Kafka messages produced",
413        )
414        .expect("Failed to create kafka_messages_produced_total metric");
415        let kafka_messages_consumed_total = IntGauge::new(
416            "mockforge_kafka_messages_consumed_total",
417            "Total number of Kafka messages consumed",
418        )
419        .expect("Failed to create kafka_messages_consumed_total metric");
420        let kafka_topics_total =
421            IntGauge::new("mockforge_kafka_topics_total", "Number of Kafka topics")
422                .expect("Failed to create kafka_topics_total metric");
423        let kafka_partitions_total =
424            IntGauge::new("mockforge_kafka_partitions_total", "Number of Kafka partitions")
425                .expect("Failed to create kafka_partitions_total metric");
426        let kafka_consumer_groups_total = IntGauge::new(
427            "mockforge_kafka_consumer_groups_total",
428            "Number of Kafka consumer groups",
429        )
430        .expect("Failed to create kafka_consumer_groups_total metric");
431        let kafka_errors_total =
432            IntGauge::new("mockforge_kafka_errors_total", "Total number of Kafka errors")
433                .expect("Failed to create kafka_errors_total metric");
434
435        // AMQP metrics
436        let amqp_connections_active =
437            IntGauge::new("mockforge_amqp_connections_active", "Number of active AMQP connections")
438                .expect("Failed to create amqp_connections_active metric");
439        let amqp_channels_active =
440            IntGauge::new("mockforge_amqp_channels_active", "Number of active AMQP channels")
441                .expect("Failed to create amqp_channels_active metric");
442        let amqp_messages_published_total = IntGauge::new(
443            "mockforge_amqp_messages_published_total",
444            "Total number of AMQP messages published",
445        )
446        .expect("Failed to create amqp_messages_published_total metric");
447        let amqp_messages_consumed_total = IntGauge::new(
448            "mockforge_amqp_messages_consumed_total",
449            "Total number of AMQP messages consumed",
450        )
451        .expect("Failed to create amqp_messages_consumed_total metric");
452        let amqp_messages_acked_total = IntGauge::new(
453            "mockforge_amqp_messages_acked_total",
454            "Total number of AMQP messages acknowledged",
455        )
456        .expect("Failed to create amqp_messages_acked_total metric");
457        let amqp_queues_total =
458            IntGauge::new("mockforge_amqp_queues_total", "Number of AMQP queues")
459                .expect("Failed to create amqp_queues_total metric");
460        let amqp_exchanges_total =
461            IntGauge::new("mockforge_amqp_exchanges_total", "Number of AMQP exchanges")
462                .expect("Failed to create amqp_exchanges_total metric");
463        let amqp_bindings_total =
464            IntGauge::new("mockforge_amqp_bindings_total", "Number of AMQP bindings")
465                .expect("Failed to create amqp_bindings_total metric");
466        let amqp_errors_total =
467            IntGauge::new("mockforge_amqp_errors_total", "Total number of AMQP errors")
468                .expect("Failed to create amqp_errors_total metric");
469
470        // System metrics
471        let memory_usage_bytes =
472            Gauge::new("mockforge_memory_usage_bytes", "Memory usage in bytes")
473                .expect("Failed to create memory_usage_bytes metric");
474
475        let cpu_usage_percent = Gauge::new("mockforge_cpu_usage_percent", "CPU usage percentage")
476            .expect("Failed to create cpu_usage_percent metric");
477
478        let thread_count = Gauge::new("mockforge_thread_count", "Number of active threads")
479            .expect("Failed to create thread_count metric");
480
481        let uptime_seconds = Gauge::new("mockforge_uptime_seconds", "Server uptime in seconds")
482            .expect("Failed to create uptime_seconds metric");
483
484        // Scenario metrics
485        let active_scenario_mode = IntGauge::new(
486            "mockforge_active_scenario_mode",
487            "Active scenario mode (0=healthy, 1=degraded, 2=error, 3=chaos)",
488        )
489        .expect("Failed to create active_scenario_mode metric");
490
491        let chaos_triggers_total = IntCounter::new(
492            "mockforge_chaos_triggers_total",
493            "Total number of chaos mode triggers",
494        )
495        .expect("Failed to create chaos_triggers_total metric");
496
497        // Business/SLO metrics
498        let service_availability = GaugeVec::new(
499            Opts::new(
500                "mockforge_service_availability",
501                "Service availability percentage (0.0 to 1.0) by protocol",
502            ),
503            &["protocol"],
504        )
505        .expect("Failed to create service_availability metric");
506
507        let slo_compliance = GaugeVec::new(
508            Opts::new(
509                "mockforge_slo_compliance",
510                "SLO compliance percentage (0.0 to 1.0) by protocol and slo_type",
511            ),
512            &["protocol", "slo_type"],
513        )
514        .expect("Failed to create slo_compliance metric");
515
516        let successful_request_rate = GaugeVec::new(
517            Opts::new(
518                "mockforge_successful_request_rate",
519                "Successful request rate (0.0 to 1.0) by protocol",
520            ),
521            &["protocol"],
522        )
523        .expect("Failed to create successful_request_rate metric");
524
525        let p95_latency_slo_compliance = GaugeVec::new(
526            Opts::new(
527                "mockforge_p95_latency_slo_compliance",
528                "P95 latency SLO compliance (1.0 = compliant, 0.0 = non-compliant) by protocol",
529            ),
530            &["protocol"],
531        )
532        .expect("Failed to create p95_latency_slo_compliance metric");
533
534        let error_budget_remaining = GaugeVec::new(
535            Opts::new(
536                "mockforge_error_budget_remaining",
537                "Remaining error budget percentage (0.0 to 1.0) by protocol",
538            ),
539            &["protocol"],
540        )
541        .expect("Failed to create error_budget_remaining metric");
542
543        // Marketplace metrics
544        let marketplace_publish_total = IntCounterVec::new(
545            Opts::new(
546                "mockforge_marketplace_publish_total",
547                "Total number of marketplace items published by type and status",
548            ),
549            &["type", "status"], // type: plugin, template, scenario; status: success, error
550        )
551        .expect("Failed to create marketplace_publish_total metric");
552
553        let marketplace_publish_duration_seconds = HistogramVec::new(
554            HistogramOpts::new(
555                "mockforge_marketplace_publish_duration_seconds",
556                "Marketplace publish operation duration in seconds",
557            )
558            .buckets(vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0]),
559            &["type"], // type: plugin, template, scenario
560        )
561        .expect("Failed to create marketplace_publish_duration_seconds metric");
562
563        let marketplace_download_total = IntCounterVec::new(
564            Opts::new(
565                "mockforge_marketplace_download_total",
566                "Total number of marketplace items downloaded by type and status",
567            ),
568            &["type", "status"], // type: plugin, template, scenario; status: success, error
569        )
570        .expect("Failed to create marketplace_download_total metric");
571
572        let marketplace_download_duration_seconds = HistogramVec::new(
573            HistogramOpts::new(
574                "mockforge_marketplace_download_duration_seconds",
575                "Marketplace download operation duration in seconds",
576            )
577            .buckets(vec![0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0]),
578            &["type"], // type: plugin, template, scenario
579        )
580        .expect("Failed to create marketplace_download_duration_seconds metric");
581
582        let marketplace_search_total = IntCounterVec::new(
583            Opts::new(
584                "mockforge_marketplace_search_total",
585                "Total number of marketplace searches by type and status",
586            ),
587            &["type", "status"], // type: plugin, template, scenario; status: success, error
588        )
589        .expect("Failed to create marketplace_search_total metric");
590
591        let marketplace_search_duration_seconds = HistogramVec::new(
592            HistogramOpts::new(
593                "mockforge_marketplace_search_duration_seconds",
594                "Marketplace search operation duration in seconds",
595            )
596            .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0]),
597            &["type"], // type: plugin, template, scenario
598        )
599        .expect("Failed to create marketplace_search_duration_seconds metric");
600
601        let marketplace_errors_total = IntCounterVec::new(
602            Opts::new(
603                "mockforge_marketplace_errors_total",
604                "Total number of marketplace errors by type and error_code",
605            ),
606            &["type", "error_code"], // type: plugin, template, scenario; error_code: validation_failed, not_found, etc.
607        )
608        .expect("Failed to create marketplace_errors_total metric");
609
610        let marketplace_items_total = IntGaugeVec::new(
611            Opts::new(
612                "mockforge_marketplace_items_total",
613                "Total number of marketplace items by type",
614            ),
615            &["type"], // type: plugin, template, scenario
616        )
617        .expect("Failed to create marketplace_items_total metric");
618
619        // Drift metrics (#678). Labelled by workspace + endpoint + method so
620        // both the MockOps drift dashboard and per-API alerting work.
621        let drift_percentage = GaugeVec::new(
622            Opts::new(
623                "mockforge_drift_percentage",
624                "Contract drift severity as a 0.0–100.0 percentage (breaking + potentially-breaking ÷ total observed × 100)",
625            ),
626            &["workspace_id", "endpoint", "method"],
627        )
628        .expect("Failed to create drift_percentage metric");
629
630        let drift_total_changes = IntGaugeVec::new(
631            Opts::new(
632                "mockforge_drift_total_changes",
633                "Total mismatches observed in the most recent drift evaluation",
634            ),
635            &["workspace_id", "endpoint", "method"],
636        )
637        .expect("Failed to create drift_total_changes metric");
638
639        let drift_breaking_changes = IntGaugeVec::new(
640            Opts::new(
641                "mockforge_drift_breaking_changes",
642                "Breaking changes observed in the most recent drift evaluation",
643            ),
644            &["workspace_id", "endpoint", "method"],
645        )
646        .expect("Failed to create drift_breaking_changes metric");
647
648        let drift_budget_exceeded = IntGaugeVec::new(
649            Opts::new(
650                "mockforge_drift_budget_exceeded",
651                "1 when the configured drift budget was exceeded by the most recent evaluation, 0 otherwise",
652            ),
653            &["workspace_id", "endpoint", "method"],
654        )
655        .expect("Failed to create drift_budget_exceeded metric");
656
657        // Register all metrics
658        registry
659            .register(Box::new(requests_total.clone()))
660            .expect("Failed to register requests_total");
661        registry
662            .register(Box::new(requests_duration_seconds.clone()))
663            .expect("Failed to register requests_duration_seconds");
664        registry
665            .register(Box::new(requests_in_flight.clone()))
666            .expect("Failed to register requests_in_flight");
667        registry
668            .register(Box::new(requests_by_path_total.clone()))
669            .expect("Failed to register requests_by_path_total");
670        registry
671            .register(Box::new(request_duration_by_path_seconds.clone()))
672            .expect("Failed to register request_duration_by_path_seconds");
673        registry
674            .register(Box::new(average_latency_by_path_seconds.clone()))
675            .expect("Failed to register average_latency_by_path_seconds");
676        registry
677            .register(Box::new(workspace_requests_total.clone()))
678            .expect("Failed to register workspace_requests_total");
679        registry
680            .register(Box::new(workspace_requests_duration_seconds.clone()))
681            .expect("Failed to register workspace_requests_duration_seconds");
682        registry
683            .register(Box::new(workspace_active_routes.clone()))
684            .expect("Failed to register workspace_active_routes");
685        registry
686            .register(Box::new(workspace_errors_total.clone()))
687            .expect("Failed to register workspace_errors_total");
688        registry
689            .register(Box::new(errors_total.clone()))
690            .expect("Failed to register errors_total");
691        registry
692            .register(Box::new(error_rate.clone()))
693            .expect("Failed to register error_rate");
694        registry
695            .register(Box::new(plugin_executions_total.clone()))
696            .expect("Failed to register plugin_executions_total");
697        registry
698            .register(Box::new(plugin_execution_duration_seconds.clone()))
699            .expect("Failed to register plugin_execution_duration_seconds");
700        registry
701            .register(Box::new(plugin_errors_total.clone()))
702            .expect("Failed to register plugin_errors_total");
703        registry
704            .register(Box::new(ws_connections_active.clone()))
705            .expect("Failed to register ws_connections_active");
706        registry
707            .register(Box::new(ws_connections_total.clone()))
708            .expect("Failed to register ws_connections_total");
709        registry
710            .register(Box::new(ws_connection_duration_seconds.clone()))
711            .expect("Failed to register ws_connection_duration_seconds");
712        registry
713            .register(Box::new(ws_messages_sent.clone()))
714            .expect("Failed to register ws_messages_sent");
715        registry
716            .register(Box::new(ws_messages_received.clone()))
717            .expect("Failed to register ws_messages_received");
718        registry
719            .register(Box::new(ws_errors_total.clone()))
720            .expect("Failed to register ws_errors_total");
721        registry
722            .register(Box::new(smtp_connections_active.clone()))
723            .expect("Failed to register smtp_connections_active");
724        registry
725            .register(Box::new(smtp_connections_total.clone()))
726            .expect("Failed to register smtp_connections_total");
727        registry
728            .register(Box::new(smtp_messages_received_total.clone()))
729            .expect("Failed to register smtp_messages_received_total");
730        registry
731            .register(Box::new(smtp_messages_stored_total.clone()))
732            .expect("Failed to register smtp_messages_stored_total");
733        registry
734            .register(Box::new(smtp_errors_total.clone()))
735            .expect("Failed to register smtp_errors_total");
736        registry
737            .register(Box::new(mqtt_connections_active.clone()))
738            .expect("Failed to register mqtt_connections_active");
739        registry
740            .register(Box::new(mqtt_connections_total.clone()))
741            .expect("Failed to register mqtt_connections_total");
742        registry
743            .register(Box::new(mqtt_messages_published_total.clone()))
744            .expect("Failed to register mqtt_messages_published_total");
745        registry
746            .register(Box::new(mqtt_messages_received_total.clone()))
747            .expect("Failed to register mqtt_messages_received_total");
748        registry
749            .register(Box::new(mqtt_topics_active.clone()))
750            .expect("Failed to register mqtt_topics_active");
751        registry
752            .register(Box::new(mqtt_subscriptions_active.clone()))
753            .expect("Failed to register mqtt_subscriptions_active");
754        registry
755            .register(Box::new(mqtt_retained_messages.clone()))
756            .expect("Failed to register mqtt_retained_messages");
757        registry
758            .register(Box::new(mqtt_errors_total.clone()))
759            .expect("Failed to register mqtt_errors_total");
760        for m in [
761            &kafka_connections_active,
762            &kafka_messages_produced_total,
763            &kafka_messages_consumed_total,
764            &kafka_topics_total,
765            &kafka_partitions_total,
766            &kafka_consumer_groups_total,
767            &kafka_errors_total,
768            &amqp_connections_active,
769            &amqp_channels_active,
770            &amqp_messages_published_total,
771            &amqp_messages_consumed_total,
772            &amqp_messages_acked_total,
773            &amqp_queues_total,
774            &amqp_exchanges_total,
775            &amqp_bindings_total,
776            &amqp_errors_total,
777        ] {
778            registry
779                .register(Box::new(m.clone()))
780                .expect("Failed to register kafka/amqp protocol metric");
781        }
782        registry
783            .register(Box::new(memory_usage_bytes.clone()))
784            .expect("Failed to register memory_usage_bytes");
785        registry
786            .register(Box::new(cpu_usage_percent.clone()))
787            .expect("Failed to register cpu_usage_percent");
788        registry
789            .register(Box::new(thread_count.clone()))
790            .expect("Failed to register thread_count");
791        registry
792            .register(Box::new(uptime_seconds.clone()))
793            .expect("Failed to register uptime_seconds");
794        registry
795            .register(Box::new(active_scenario_mode.clone()))
796            .expect("Failed to register active_scenario_mode");
797        registry
798            .register(Box::new(chaos_triggers_total.clone()))
799            .expect("Failed to register chaos_triggers_total");
800        registry
801            .register(Box::new(service_availability.clone()))
802            .expect("Failed to register service_availability");
803        registry
804            .register(Box::new(slo_compliance.clone()))
805            .expect("Failed to register slo_compliance");
806        registry
807            .register(Box::new(successful_request_rate.clone()))
808            .expect("Failed to register successful_request_rate");
809        registry
810            .register(Box::new(p95_latency_slo_compliance.clone()))
811            .expect("Failed to register p95_latency_slo_compliance");
812        registry
813            .register(Box::new(error_budget_remaining.clone()))
814            .expect("Failed to register error_budget_remaining");
815        registry
816            .register(Box::new(marketplace_publish_total.clone()))
817            .expect("Failed to register marketplace_publish_total");
818        registry
819            .register(Box::new(marketplace_publish_duration_seconds.clone()))
820            .expect("Failed to register marketplace_publish_duration_seconds");
821        registry
822            .register(Box::new(marketplace_download_total.clone()))
823            .expect("Failed to register marketplace_download_total");
824        registry
825            .register(Box::new(marketplace_download_duration_seconds.clone()))
826            .expect("Failed to register marketplace_download_duration_seconds");
827        registry
828            .register(Box::new(marketplace_search_total.clone()))
829            .expect("Failed to register marketplace_search_total");
830        registry
831            .register(Box::new(marketplace_search_duration_seconds.clone()))
832            .expect("Failed to register marketplace_search_duration_seconds");
833        registry
834            .register(Box::new(marketplace_errors_total.clone()))
835            .expect("Failed to register marketplace_errors_total");
836        registry
837            .register(Box::new(marketplace_items_total.clone()))
838            .expect("Failed to register marketplace_items_total");
839        registry
840            .register(Box::new(drift_percentage.clone()))
841            .expect("Failed to register drift_percentage");
842        registry
843            .register(Box::new(drift_total_changes.clone()))
844            .expect("Failed to register drift_total_changes");
845        registry
846            .register(Box::new(drift_breaking_changes.clone()))
847            .expect("Failed to register drift_breaking_changes");
848        registry
849            .register(Box::new(drift_budget_exceeded.clone()))
850            .expect("Failed to register drift_budget_exceeded");
851
852        debug!("Initialized Prometheus metrics registry");
853
854        Self {
855            registry: Arc::new(registry),
856            requests_total,
857            requests_duration_seconds,
858            requests_in_flight,
859            requests_by_path_total,
860            request_duration_by_path_seconds,
861            average_latency_by_path_seconds,
862            workspace_requests_total,
863            workspace_requests_duration_seconds,
864            workspace_active_routes,
865            workspace_errors_total,
866            errors_total,
867            error_rate,
868            plugin_executions_total,
869            plugin_execution_duration_seconds,
870            plugin_errors_total,
871            ws_connections_active,
872            ws_connections_total,
873            ws_connection_duration_seconds,
874            ws_messages_sent,
875            ws_messages_received,
876            ws_errors_total,
877            smtp_connections_active,
878            smtp_connections_total,
879            smtp_messages_received_total,
880            smtp_messages_stored_total,
881            smtp_errors_total,
882            mqtt_connections_active,
883            mqtt_connections_total,
884            mqtt_messages_published_total,
885            mqtt_messages_received_total,
886            mqtt_topics_active,
887            mqtt_subscriptions_active,
888            mqtt_retained_messages,
889            mqtt_errors_total,
890            kafka_connections_active,
891            kafka_messages_produced_total,
892            kafka_messages_consumed_total,
893            kafka_topics_total,
894            kafka_partitions_total,
895            kafka_consumer_groups_total,
896            kafka_errors_total,
897            amqp_connections_active,
898            amqp_channels_active,
899            amqp_messages_published_total,
900            amqp_messages_consumed_total,
901            amqp_messages_acked_total,
902            amqp_queues_total,
903            amqp_exchanges_total,
904            amqp_bindings_total,
905            amqp_errors_total,
906            memory_usage_bytes,
907            cpu_usage_percent,
908            thread_count,
909            uptime_seconds,
910            active_scenario_mode,
911            chaos_triggers_total,
912            service_availability,
913            slo_compliance,
914            successful_request_rate,
915            p95_latency_slo_compliance,
916            error_budget_remaining,
917            marketplace_publish_total,
918            marketplace_publish_duration_seconds,
919            marketplace_download_total,
920            marketplace_download_duration_seconds,
921            marketplace_search_total,
922            marketplace_search_duration_seconds,
923            marketplace_errors_total,
924            marketplace_items_total,
925            drift_percentage,
926            drift_total_changes,
927            drift_breaking_changes,
928            drift_budget_exceeded,
929        }
930    }
931
932    /// Record a contract-drift evaluation result against the workspace +
933    /// endpoint Prometheus gauges. Closes part of #678.
934    ///
935    /// Callers pass:
936    /// - `workspace_id` — empty string for un-attributed evaluations (legacy)
937    /// - `endpoint` and `method` — the request that was evaluated
938    /// - `total`, `breaking`, `potentially_breaking`, `non_breaking` — counts
939    ///   from `DriftResult`
940    /// - `budget_exceeded` — whether the configured budget tripped
941    ///
942    /// The "drift percentage" is computed as
943    /// `(breaking + potentially_breaking) / total * 100`. A zero-total
944    /// evaluation records 0% drift (the request matched the contract
945    /// exactly).
946    pub fn record_drift_evaluation(&self, sample: DriftEvaluationSample<'_>) {
947        let pct = if sample.total == 0 {
948            0.0
949        } else {
950            (sample.breaking + sample.potentially_breaking) as f64 / sample.total as f64 * 100.0
951        };
952        let labels = [sample.workspace_id, sample.endpoint, sample.method];
953        self.drift_percentage.with_label_values(&labels).set(pct);
954        self.drift_total_changes.with_label_values(&labels).set(sample.total as i64);
955        self.drift_breaking_changes
956            .with_label_values(&labels)
957            .set(sample.breaking as i64);
958        self.drift_budget_exceeded
959            .with_label_values(&labels)
960            .set(if sample.budget_exceeded { 1 } else { 0 });
961    }
962
963    /// Get the underlying Prometheus registry
964    pub fn registry(&self) -> &Registry {
965        &self.registry
966    }
967
968    /// Check if the registry is initialized
969    pub fn is_initialized(&self) -> bool {
970        true
971    }
972
973    /// Record an HTTP request
974    pub fn record_http_request(&self, method: &str, status: u16, duration_seconds: f64) {
975        self.record_http_request_with_pillar(method, status, duration_seconds, "");
976    }
977
978    /// Record an HTTP request with pillar information
979    pub fn record_http_request_with_pillar(
980        &self,
981        method: &str,
982        status: u16,
983        duration_seconds: f64,
984        pillar: &str,
985    ) {
986        let status_str = status.to_string();
987        let pillar_label = if pillar.is_empty() { "unknown" } else { pillar };
988        self.requests_total
989            .with_label_values(&["http", method, &status_str, pillar_label])
990            .inc();
991        self.requests_duration_seconds
992            .with_label_values(&["http", method, pillar_label])
993            .observe(duration_seconds);
994    }
995
996    /// Record a gRPC request
997    pub fn record_grpc_request(&self, method: &str, status: &str, duration_seconds: f64) {
998        self.record_grpc_request_with_pillar(method, status, duration_seconds, "");
999    }
1000
1001    /// Record a gRPC request with pillar information
1002    pub fn record_grpc_request_with_pillar(
1003        &self,
1004        method: &str,
1005        status: &str,
1006        duration_seconds: f64,
1007        pillar: &str,
1008    ) {
1009        let pillar_label = if pillar.is_empty() { "unknown" } else { pillar };
1010        self.requests_total
1011            .with_label_values(&["grpc", method, status, pillar_label])
1012            .inc();
1013        self.requests_duration_seconds
1014            .with_label_values(&["grpc", method, pillar_label])
1015            .observe(duration_seconds);
1016    }
1017
1018    /// Record a WebSocket message
1019    pub fn record_ws_message_sent(&self) {
1020        self.ws_messages_sent.inc();
1021    }
1022
1023    /// Record a WebSocket message received
1024    pub fn record_ws_message_received(&self) {
1025        self.ws_messages_received.inc();
1026    }
1027
1028    /// Record a GraphQL request
1029    pub fn record_graphql_request(&self, operation: &str, status: u16, duration_seconds: f64) {
1030        let status_str = status.to_string();
1031        // GraphQL requests are categorized under the "contracts" pillar
1032        self.requests_total
1033            .with_label_values(&["graphql", operation, &status_str, "contracts"])
1034            .inc();
1035        self.requests_duration_seconds
1036            .with_label_values(&["graphql", operation, "contracts"])
1037            .observe(duration_seconds);
1038    }
1039
1040    /// Record a plugin execution
1041    pub fn record_plugin_execution(&self, plugin_name: &str, success: bool, duration_seconds: f64) {
1042        let status = if success { "success" } else { "failure" };
1043        self.plugin_executions_total.with_label_values(&[plugin_name, status]).inc();
1044        self.plugin_execution_duration_seconds
1045            .with_label_values(&[plugin_name])
1046            .observe(duration_seconds);
1047    }
1048
1049    /// Increment in-flight requests
1050    pub fn increment_in_flight(&self, protocol: &str) {
1051        self.requests_in_flight.with_label_values(&[protocol]).inc();
1052    }
1053
1054    /// Decrement in-flight requests
1055    pub fn decrement_in_flight(&self, protocol: &str) {
1056        self.requests_in_flight.with_label_values(&[protocol]).dec();
1057    }
1058
1059    /// Record an error
1060    pub fn record_error(&self, protocol: &str, error_type: &str) {
1061        self.record_error_with_pillar(protocol, error_type, "");
1062    }
1063
1064    /// Record an error with pillar information
1065    pub fn record_error_with_pillar(&self, protocol: &str, error_type: &str, pillar: &str) {
1066        let pillar_label = if pillar.is_empty() { "unknown" } else { pillar };
1067        self.errors_total.with_label_values(&[protocol, error_type, pillar_label]).inc();
1068    }
1069
1070    /// Update memory usage
1071    pub fn update_memory_usage(&self, bytes: f64) {
1072        self.memory_usage_bytes.set(bytes);
1073    }
1074
1075    /// Update CPU usage
1076    pub fn update_cpu_usage(&self, percent: f64) {
1077        self.cpu_usage_percent.set(percent);
1078    }
1079
1080    /// Set active scenario mode (0=healthy, 1=degraded, 2=error, 3=chaos)
1081    pub fn set_scenario_mode(&self, mode: i64) {
1082        self.active_scenario_mode.set(mode);
1083    }
1084
1085    /// Record a chaos trigger
1086    pub fn record_chaos_trigger(&self) {
1087        self.chaos_triggers_total.inc();
1088    }
1089
1090    /// Record an HTTP request with path information
1091    pub fn record_http_request_with_path(
1092        &self,
1093        path: &str,
1094        method: &str,
1095        status: u16,
1096        duration_seconds: f64,
1097    ) {
1098        self.record_http_request_with_path_and_pillar(path, method, status, duration_seconds, "");
1099    }
1100
1101    /// Record an HTTP request with path and pillar information
1102    pub fn record_http_request_with_path_and_pillar(
1103        &self,
1104        path: &str,
1105        method: &str,
1106        status: u16,
1107        duration_seconds: f64,
1108        pillar: &str,
1109    ) {
1110        // Normalize path to avoid cardinality explosion
1111        let normalized_path = normalize_path(path);
1112        let status_str = status.to_string();
1113
1114        // Record by path
1115        self.requests_by_path_total
1116            .with_label_values(&[normalized_path.as_str(), method, status_str.as_str()])
1117            .inc();
1118        self.request_duration_by_path_seconds
1119            .with_label_values(&[normalized_path.as_str(), method])
1120            .observe(duration_seconds);
1121
1122        // Update average latency (simple moving average approximation)
1123        // Note: For production use, consider using a proper moving average or quantiles
1124        let current = self
1125            .average_latency_by_path_seconds
1126            .with_label_values(&[normalized_path.as_str(), method])
1127            .get();
1128        let new_avg = if current == 0.0 {
1129            duration_seconds
1130        } else {
1131            (current * 0.95) + (duration_seconds * 0.05)
1132        };
1133        self.average_latency_by_path_seconds
1134            .with_label_values(&[normalized_path.as_str(), method])
1135            .set(new_avg);
1136
1137        // Also record in the general metrics with pillar
1138        self.record_http_request_with_pillar(method, status, duration_seconds, pillar);
1139    }
1140
1141    /// Record a WebSocket connection established
1142    pub fn record_ws_connection_established(&self) {
1143        self.ws_connections_total.inc();
1144        self.ws_connections_active.inc();
1145    }
1146
1147    /// Record a WebSocket connection closed
1148    pub fn record_ws_connection_closed(&self, duration_seconds: f64, status: &str) {
1149        self.ws_connections_active.dec();
1150        self.ws_connection_duration_seconds
1151            .with_label_values(&[status])
1152            .observe(duration_seconds);
1153    }
1154
1155    /// Record a WebSocket error
1156    pub fn record_ws_error(&self) {
1157        self.ws_errors_total.inc();
1158    }
1159
1160    /// Record an SMTP connection established
1161    pub fn record_smtp_connection_established(&self) {
1162        self.smtp_connections_total.inc();
1163        self.smtp_connections_active.inc();
1164    }
1165
1166    /// Record an SMTP connection closed
1167    pub fn record_smtp_connection_closed(&self) {
1168        self.smtp_connections_active.dec();
1169    }
1170
1171    /// Record an SMTP message received
1172    pub fn record_smtp_message_received(&self) {
1173        self.smtp_messages_received_total.inc();
1174    }
1175
1176    /// Record an SMTP message stored
1177    pub fn record_smtp_message_stored(&self) {
1178        self.smtp_messages_stored_total.inc();
1179    }
1180
1181    /// Record an SMTP error
1182    pub fn record_smtp_error(&self, error_type: &str) {
1183        self.smtp_errors_total.with_label_values(&[error_type]).inc();
1184    }
1185
1186    /// Update thread count
1187    pub fn update_thread_count(&self, count: f64) {
1188        self.thread_count.set(count);
1189    }
1190
1191    /// Update uptime
1192    pub fn update_uptime(&self, seconds: f64) {
1193        self.uptime_seconds.set(seconds);
1194    }
1195
1196    // ==================== Workspace-specific metrics ====================
1197
1198    /// Record a workspace request
1199    pub fn record_workspace_request(
1200        &self,
1201        workspace_id: &str,
1202        method: &str,
1203        status: u16,
1204        duration_seconds: f64,
1205    ) {
1206        let status_str = status.to_string();
1207        self.workspace_requests_total
1208            .with_label_values(&[workspace_id, method, &status_str])
1209            .inc();
1210        self.workspace_requests_duration_seconds
1211            .with_label_values(&[workspace_id, method])
1212            .observe(duration_seconds);
1213    }
1214
1215    /// Update workspace active routes count
1216    pub fn update_workspace_active_routes(&self, workspace_id: &str, count: i64) {
1217        self.workspace_active_routes.with_label_values(&[workspace_id]).set(count);
1218    }
1219
1220    /// Record a workspace error
1221    pub fn record_workspace_error(&self, workspace_id: &str, error_type: &str) {
1222        self.workspace_errors_total.with_label_values(&[workspace_id, error_type]).inc();
1223    }
1224
1225    /// Increment workspace active routes
1226    pub fn increment_workspace_routes(&self, workspace_id: &str) {
1227        self.workspace_active_routes.with_label_values(&[workspace_id]).inc();
1228    }
1229
1230    /// Decrement workspace active routes
1231    pub fn decrement_workspace_routes(&self, workspace_id: &str) {
1232        self.workspace_active_routes.with_label_values(&[workspace_id]).dec();
1233    }
1234
1235    // ==================== Marketplace metrics ====================
1236
1237    /// Record a marketplace publish operation
1238    pub fn record_marketplace_publish(
1239        &self,
1240        item_type: &str,
1241        success: bool,
1242        duration_seconds: f64,
1243    ) {
1244        let status = if success { "success" } else { "error" };
1245        self.marketplace_publish_total.with_label_values(&[item_type, status]).inc();
1246        self.marketplace_publish_duration_seconds
1247            .with_label_values(&[item_type])
1248            .observe(duration_seconds);
1249    }
1250
1251    /// Record a marketplace download operation
1252    pub fn record_marketplace_download(
1253        &self,
1254        item_type: &str,
1255        success: bool,
1256        duration_seconds: f64,
1257    ) {
1258        let status = if success { "success" } else { "error" };
1259        self.marketplace_download_total.with_label_values(&[item_type, status]).inc();
1260        self.marketplace_download_duration_seconds
1261            .with_label_values(&[item_type])
1262            .observe(duration_seconds);
1263    }
1264
1265    /// Record a marketplace search operation
1266    pub fn record_marketplace_search(&self, item_type: &str, success: bool, duration_seconds: f64) {
1267        let status = if success { "success" } else { "error" };
1268        self.marketplace_search_total.with_label_values(&[item_type, status]).inc();
1269        self.marketplace_search_duration_seconds
1270            .with_label_values(&[item_type])
1271            .observe(duration_seconds);
1272    }
1273
1274    /// Record a marketplace error
1275    pub fn record_marketplace_error(&self, item_type: &str, error_code: &str) {
1276        self.marketplace_errors_total.with_label_values(&[item_type, error_code]).inc();
1277    }
1278
1279    /// Update the total number of marketplace items
1280    pub fn update_marketplace_items_total(&self, item_type: &str, count: i64) {
1281        self.marketplace_items_total.with_label_values(&[item_type]).set(count);
1282    }
1283}
1284
1285/// Normalize path to avoid high cardinality
1286///
1287/// This function replaces dynamic path segments (IDs, UUIDs, etc.) with placeholders
1288/// to prevent metric explosion.
1289fn normalize_path(path: &str) -> String {
1290    let mut segments: Vec<&str> = path.split('/').collect();
1291
1292    for segment in &mut segments {
1293        // Replace UUIDs, numeric IDs, or hex strings with :id placeholder
1294        if is_uuid(segment)
1295            || segment.parse::<i64>().is_ok()
1296            || (segment.len() > 8 && segment.chars().all(|c| c.is_ascii_hexdigit()))
1297        {
1298            *segment = ":id";
1299        }
1300    }
1301
1302    segments.join("/")
1303}
1304
/// Check whether `s` is a UUID in canonical textual form.
///
/// Accepts exactly 36 bytes laid out as `8-4-4-4-12`: hyphens at byte offsets
/// 8, 13, 18 and 23, and ASCII hex digits (either case) everywhere else.
/// Checking only the length and the hyphen count would also accept arbitrary
/// 36-byte strings that happen to contain four dashes.
fn is_uuid(s: &str) -> bool {
    let bytes = s.as_bytes();
    bytes.len() == 36
        && bytes.iter().enumerate().all(|(offset, &byte)| match offset {
            8 | 13 | 18 | 23 => byte == b'-',
            _ => byte.is_ascii_hexdigit(),
        })
}
1309
1310impl Default for MetricsRegistry {
1311    fn default() -> Self {
1312        Self::new()
1313    }
1314}
1315
1316/// Global metrics registry instance.
1317///
1318/// Held as an `Arc` so the same instance can be both written to (via
1319/// `get_global_registry()`) AND served at `GET /metrics` (via
1320/// `get_global_registry_arc()`). Previously the metrics endpoint was handed a
1321/// *separate* `MetricsRegistry::new()`, so nothing that recorded metrics was
1322/// ever exported (#743).
1323static GLOBAL_REGISTRY: Lazy<Arc<MetricsRegistry>> = Lazy::new(|| Arc::new(MetricsRegistry::new()));
1324
1325/// Get a shared reference to the global metrics registry (for recording).
1326pub fn get_global_registry() -> &'static MetricsRegistry {
1327    &GLOBAL_REGISTRY
1328}
1329
1330/// Get an owned `Arc` handle to the global metrics registry, e.g. to mount the
1331/// Prometheus `/metrics` endpoint on the same instance everything records into.
1332pub fn get_global_registry_arc() -> Arc<MetricsRegistry> {
1333    Arc::clone(&GLOBAL_REGISTRY)
1334}
1335
1336#[cfg(test)]
1337mod tests {
1338    use super::*;
1339
1340    #[test]
1341    fn test_metrics_registry_creation() {
1342        let registry = MetricsRegistry::new();
1343        assert!(registry.is_initialized());
1344    }
1345
1346    #[test]
1347    fn test_record_http_request() {
1348        let registry = MetricsRegistry::new();
1349        registry.record_http_request("GET", 200, 0.045);
1350        registry.record_http_request("POST", 201, 0.123);
1351
1352        // Verify metrics were recorded (they should not panic)
1353        assert!(registry.is_initialized());
1354    }
1355
1356    #[test]
1357    fn test_record_http_request_with_pillar() {
1358        let registry = MetricsRegistry::new();
1359        registry.record_http_request_with_pillar("GET", 200, 0.045, "reality");
1360        registry.record_http_request_with_pillar("POST", 201, 0.123, "contracts");
1361
1362        // Verify metrics were recorded (they should not panic)
1363        assert!(registry.is_initialized());
1364    }
1365
1366    #[test]
1367    fn test_global_registry() {
1368        let registry = get_global_registry();
1369        assert!(registry.is_initialized());
1370    }
1371
1372    #[test]
1373    fn test_record_drift_evaluation_basic_percentage() {
1374        let registry = MetricsRegistry::new();
1375        // 1 breaking + 2 potentially_breaking out of 10 total = 30%
1376        registry.record_drift_evaluation(DriftEvaluationSample {
1377            workspace_id: "ws-a",
1378            endpoint: "/users",
1379            method: "GET",
1380            total: 10,
1381            breaking: 1,
1382            potentially_breaking: 2,
1383            budget_exceeded: false,
1384        });
1385
1386        let pct = registry.drift_percentage.with_label_values(&["ws-a", "/users", "GET"]).get();
1387        assert!((pct - 30.0).abs() < f64::EPSILON);
1388        assert_eq!(
1389            registry.drift_total_changes.with_label_values(&["ws-a", "/users", "GET"]).get(),
1390            10
1391        );
1392        assert_eq!(
1393            registry
1394                .drift_breaking_changes
1395                .with_label_values(&["ws-a", "/users", "GET"])
1396                .get(),
1397            1
1398        );
1399        assert_eq!(
1400            registry
1401                .drift_budget_exceeded
1402                .with_label_values(&["ws-a", "/users", "GET"])
1403                .get(),
1404            0
1405        );
1406    }
1407
1408    #[test]
1409    fn test_record_drift_evaluation_zero_total_is_zero_percent() {
1410        let registry = MetricsRegistry::new();
1411        // No mismatches observed → 0% drift, not a divide-by-zero.
1412        registry.record_drift_evaluation(DriftEvaluationSample {
1413            workspace_id: "",
1414            endpoint: "/health",
1415            method: "GET",
1416            total: 0,
1417            breaking: 0,
1418            potentially_breaking: 0,
1419            budget_exceeded: false,
1420        });
1421        let pct = registry.drift_percentage.with_label_values(&["", "/health", "GET"]).get();
1422        assert_eq!(pct, 0.0);
1423    }
1424
1425    #[test]
1426    fn test_record_drift_evaluation_budget_exceeded_flag() {
1427        let registry = MetricsRegistry::new();
1428        registry.record_drift_evaluation(DriftEvaluationSample {
1429            workspace_id: "ws-b",
1430            endpoint: "/orders",
1431            method: "POST",
1432            total: 4,
1433            breaking: 2,
1434            potentially_breaking: 0,
1435            budget_exceeded: true,
1436        });
1437        assert_eq!(
1438            registry
1439                .drift_budget_exceeded
1440                .with_label_values(&["ws-b", "/orders", "POST"])
1441                .get(),
1442            1
1443        );
1444    }
1445
1446    #[test]
1447    fn test_plugin_metrics() {
1448        let registry = MetricsRegistry::new();
1449        registry.record_plugin_execution("test-plugin", true, 0.025);
1450        registry.record_plugin_execution("test-plugin", false, 0.050);
1451        assert!(registry.is_initialized());
1452    }
1453
1454    #[test]
1455    fn test_websocket_metrics() {
1456        let registry = MetricsRegistry::new();
1457        registry.record_ws_message_sent();
1458        registry.record_ws_message_received();
1459        registry.record_ws_connection_established();
1460        registry.record_ws_connection_closed(120.5, "normal");
1461        registry.record_ws_error();
1462        assert!(registry.is_initialized());
1463    }
1464
1465    #[test]
1466    fn test_path_normalization() {
1467        assert_eq!(normalize_path("/api/users/123"), "/api/users/:id");
1468        assert_eq!(
1469            normalize_path("/api/users/550e8400-e29b-41d4-a716-446655440000"),
1470            "/api/users/:id"
1471        );
1472        assert_eq!(normalize_path("/api/users/abc123def456"), "/api/users/:id");
1473        assert_eq!(normalize_path("/api/users/list"), "/api/users/list");
1474    }
1475
1476    #[test]
1477    fn test_path_based_metrics() {
1478        let registry = MetricsRegistry::new();
1479        registry.record_http_request_with_path("/api/users/123", "GET", 200, 0.045);
1480        registry.record_http_request_with_path("/api/users/456", "GET", 200, 0.055);
1481        registry.record_http_request_with_path("/api/posts", "POST", 201, 0.123);
1482        assert!(registry.is_initialized());
1483    }
1484
1485    #[test]
1486    fn test_smtp_metrics() {
1487        let registry = MetricsRegistry::new();
1488        registry.record_smtp_connection_established();
1489        registry.record_smtp_message_received();
1490        registry.record_smtp_message_stored();
1491        registry.record_smtp_connection_closed();
1492        registry.record_smtp_error("timeout");
1493        assert!(registry.is_initialized());
1494    }
1495
1496    #[test]
1497    fn test_system_metrics() {
1498        let registry = MetricsRegistry::new();
1499        registry.update_memory_usage(1024.0 * 1024.0 * 100.0); // 100 MB
1500        registry.update_cpu_usage(45.5);
1501        registry.update_thread_count(25.0);
1502        registry.update_uptime(3600.0); // 1 hour
1503        assert!(registry.is_initialized());
1504    }
1505
1506    #[test]
1507    fn test_workspace_metrics() {
1508        let registry = MetricsRegistry::new();
1509
1510        // Record workspace requests
1511        registry.record_workspace_request("workspace1", "GET", 200, 0.045);
1512        registry.record_workspace_request("workspace1", "POST", 201, 0.123);
1513        registry.record_workspace_request("workspace2", "GET", 200, 0.055);
1514
1515        // Update active routes
1516        registry.update_workspace_active_routes("workspace1", 10);
1517        registry.update_workspace_active_routes("workspace2", 5);
1518
1519        // Record errors
1520        registry.record_workspace_error("workspace1", "validation");
1521        registry.record_workspace_error("workspace2", "timeout");
1522
1523        // Test increment/decrement
1524        registry.increment_workspace_routes("workspace1");
1525        registry.decrement_workspace_routes("workspace1");
1526
1527        assert!(registry.is_initialized());
1528    }
1529
1530    #[test]
1531    fn test_workspace_metrics_isolation() {
1532        let registry = MetricsRegistry::new();
1533
1534        // Ensure metrics for different workspaces are independent
1535        registry.record_workspace_request("ws1", "GET", 200, 0.1);
1536        registry.record_workspace_request("ws2", "GET", 200, 0.2);
1537
1538        registry.update_workspace_active_routes("ws1", 5);
1539        registry.update_workspace_active_routes("ws2", 10);
1540
1541        // Both should be tracked independently
1542        assert!(registry.is_initialized());
1543    }
1544}