llm_memory_graph/observatory/
prometheus.rs

1//! Prometheus metrics integration for real-time monitoring
2//!
3//! This module provides comprehensive production-grade Prometheus metrics for monitoring
4//! memory graph operations, performance, resource utilization, and integrations.
5//!
6//! # Metrics Categories
7//!
8//! ## Core Metrics
9//! - **Counters**: Track cumulative counts of operations (nodes, edges, prompts, etc.)
10//! - **Histograms**: Measure latency distributions and batch sizes
11//! - **Gauges**: Monitor current state and resource utilization
12//!
13//! ## Production Metrics
14//! - **gRPC Metrics**: Request counts, durations, and active streams
15//! - **Plugin Metrics**: Plugin executions, durations, and error tracking
16//! - **Integration Metrics**: LLM-Registry calls and Data-Vault operations
17//!
18//! # Examples
19//!
20//! ## Basic Usage
21//!
22//! ```no_run
23//! use llm_memory_graph::observatory::prometheus::PrometheusMetrics;
24//! use prometheus::Registry;
25//!
26//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
27//! let registry = Registry::new();
28//! let metrics = PrometheusMetrics::new(&registry)?;
29//!
30//! // Record core operations
31//! metrics.record_node_created();
32//! metrics.record_write_latency(0.025);
33//! metrics.set_active_sessions(5);
34//!
35//! // Export metrics
36//! let metrics_text = prometheus::TextEncoder::new()
37//!     .encode_to_string(&registry.gather())?;
38//! # Ok(())
39//! # }
40//! ```
41//!
42//! ## Production Metrics Usage
43//!
44//! ```no_run
45//! # use llm_memory_graph::observatory::prometheus::PrometheusMetrics;
46//! # use prometheus::Registry;
47//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
48//! # let registry = Registry::new();
49//! # let metrics = PrometheusMetrics::new(&registry)?;
50//! // Record gRPC operations
51//! metrics.record_grpc_request("CreateSession", "success");
52//! metrics.record_grpc_request_duration("CreateSession", 0.015);
53//! metrics.inc_grpc_active_streams();
54//!
55//! // Record plugin operations
56//! metrics.record_plugin_execution("audit_logger", "on_node_create");
57//! metrics.record_plugin_duration("audit_logger", "on_node_create", 0.002);
58//! metrics.record_plugin_error("validator", "validation_failed");
59//!
60//! // Record integration operations
61//! metrics.record_registry_call("register_model", "success");
62//! metrics.record_vault_archive();
63//! metrics.record_vault_retrieval();
64//! # Ok(())
65//! # }
66//! ```
67
68use crate::error::Result;
69use prometheus::{
70    Histogram, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, Opts, Registry,
71};
72
73/// Prometheus metrics for MemoryGraph monitoring
74///
75/// Provides comprehensive production-grade metrics across multiple categories:
76/// - 8 Counters for tracking operations
77/// - 5 Histograms for latency and size distributions
78/// - 5 Gauges for current state monitoring
79/// - 7 Production metrics (gRPC, Plugin, Integration)
80#[derive(Clone)]
81pub struct PrometheusMetrics {
82    // Counters - Track cumulative counts
83    /// Total number of nodes created
84    pub nodes_created: IntCounter,
85    /// Total number of edges created
86    pub edges_created: IntCounter,
87    /// Total number of prompts submitted
88    pub prompts_submitted: IntCounter,
89    /// Total number of responses generated
90    pub responses_generated: IntCounter,
91    /// Total number of tools invoked
92    pub tools_invoked: IntCounter,
93    /// Total number of agent handoffs
94    pub agent_handoffs: IntCounter,
95    /// Total number of template instantiations
96    pub template_instantiations: IntCounter,
97    /// Total number of queries executed
98    pub queries_executed: IntCounter,
99
100    // Histograms - Measure distributions
101    /// Write operation latency distribution (seconds)
102    pub write_latency: Histogram,
103    /// Read operation latency distribution (seconds)
104    pub read_latency: Histogram,
105    /// Query execution duration distribution (seconds)
106    pub query_duration: Histogram,
107    /// Tool execution duration distribution (seconds)
108    pub tool_duration: Histogram,
109    /// Batch operation size distribution
110    pub batch_size: Histogram,
111
112    // Gauges - Current state metrics
113    /// Number of currently active sessions
114    pub active_sessions: IntGauge,
115    /// Total number of nodes in the graph
116    pub total_nodes: IntGauge,
117    /// Total number of edges in the graph
118    pub total_edges: IntGauge,
119    /// Current cache size in bytes
120    pub cache_size_bytes: IntGauge,
121    /// Current event buffer size
122    pub buffer_size: IntGauge,
123
124    // Production Metrics - gRPC
125    /// Total gRPC requests by method and status
126    pub grpc_requests_total: IntCounterVec,
127    /// gRPC request duration by method
128    pub grpc_request_duration: HistogramVec,
129    /// Number of active gRPC streams
130    pub grpc_active_streams: IntGauge,
131
132    // Production Metrics - Plugin System
133    /// Total plugin executions by plugin name and hook
134    pub plugin_executions_total: IntCounterVec,
135    /// Plugin execution duration by plugin and hook
136    pub plugin_duration: HistogramVec,
137    /// Total plugin errors by plugin and error type
138    pub plugin_errors_total: IntCounterVec,
139
140    // Production Metrics - Integrations
141    /// Total LLM-Registry API calls by operation and status
142    pub registry_calls_total: IntCounterVec,
143    /// Total sessions archived to Data-Vault
144    pub vault_archives_total: IntCounter,
145    /// Total sessions retrieved from Data-Vault
146    pub vault_retrievals_total: IntCounter,
147    /// Total Data-Vault errors
148    pub vault_errors_total: IntCounter,
149}
150
151impl PrometheusMetrics {
152    /// Create and register all metrics with the provided registry
153    ///
154    /// # Arguments
155    ///
156    /// * `registry` - Prometheus registry to register metrics with
157    ///
158    /// # Returns
159    ///
160    /// Returns a new PrometheusMetrics instance with all metrics registered
161    ///
162    /// # Errors
163    ///
164    /// Returns an error if metric registration fails
165    pub fn new(registry: &Registry) -> Result<Self> {
166        // Counters
167        let nodes_created = IntCounter::with_opts(Opts::new(
168            "memory_graph_nodes_created_total",
169            "Total number of nodes created in the memory graph",
170        ))?;
171        registry.register(Box::new(nodes_created.clone()))?;
172
173        let edges_created = IntCounter::with_opts(Opts::new(
174            "memory_graph_edges_created_total",
175            "Total number of edges created in the memory graph",
176        ))?;
177        registry.register(Box::new(edges_created.clone()))?;
178
179        let prompts_submitted = IntCounter::with_opts(Opts::new(
180            "memory_graph_prompts_submitted_total",
181            "Total number of prompts submitted",
182        ))?;
183        registry.register(Box::new(prompts_submitted.clone()))?;
184
185        let responses_generated = IntCounter::with_opts(Opts::new(
186            "memory_graph_responses_generated_total",
187            "Total number of responses generated",
188        ))?;
189        registry.register(Box::new(responses_generated.clone()))?;
190
191        let tools_invoked = IntCounter::with_opts(Opts::new(
192            "memory_graph_tools_invoked_total",
193            "Total number of tools invoked",
194        ))?;
195        registry.register(Box::new(tools_invoked.clone()))?;
196
197        let agent_handoffs = IntCounter::with_opts(Opts::new(
198            "memory_graph_agent_handoffs_total",
199            "Total number of agent handoffs",
200        ))?;
201        registry.register(Box::new(agent_handoffs.clone()))?;
202
203        let template_instantiations = IntCounter::with_opts(Opts::new(
204            "memory_graph_template_instantiations_total",
205            "Total number of template instantiations",
206        ))?;
207        registry.register(Box::new(template_instantiations.clone()))?;
208
209        let queries_executed = IntCounter::with_opts(Opts::new(
210            "memory_graph_queries_executed_total",
211            "Total number of queries executed",
212        ))?;
213        registry.register(Box::new(queries_executed.clone()))?;
214
215        // Histograms with appropriate buckets
216        let write_latency = Histogram::with_opts(
217            HistogramOpts::new(
218                "memory_graph_write_latency_seconds",
219                "Write operation latency in seconds",
220            )
221            .buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]),
222        )?;
223        registry.register(Box::new(write_latency.clone()))?;
224
225        let read_latency = Histogram::with_opts(
226            HistogramOpts::new(
227                "memory_graph_read_latency_seconds",
228                "Read operation latency in seconds",
229            )
230            .buckets(vec![0.0001, 0.0005, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1]),
231        )?;
232        registry.register(Box::new(read_latency.clone()))?;
233
234        let query_duration = Histogram::with_opts(
235            HistogramOpts::new(
236                "memory_graph_query_duration_seconds",
237                "Query execution duration in seconds",
238            )
239            .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]),
240        )?;
241        registry.register(Box::new(query_duration.clone()))?;
242
243        let tool_duration = Histogram::with_opts(
244            HistogramOpts::new(
245                "memory_graph_tool_duration_seconds",
246                "Tool execution duration in seconds",
247            )
248            .buckets(vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0]),
249        )?;
250        registry.register(Box::new(tool_duration.clone()))?;
251
252        let batch_size = Histogram::with_opts(
253            HistogramOpts::new(
254                "memory_graph_batch_size",
255                "Batch operation size distribution",
256            )
257            .buckets(vec![
258                1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0,
259            ]),
260        )?;
261        registry.register(Box::new(batch_size.clone()))?;
262
263        // Gauges
264        let active_sessions = IntGauge::with_opts(Opts::new(
265            "memory_graph_active_sessions",
266            "Number of currently active sessions",
267        ))?;
268        registry.register(Box::new(active_sessions.clone()))?;
269
270        let total_nodes = IntGauge::with_opts(Opts::new(
271            "memory_graph_total_nodes",
272            "Total number of nodes in the graph",
273        ))?;
274        registry.register(Box::new(total_nodes.clone()))?;
275
276        let total_edges = IntGauge::with_opts(Opts::new(
277            "memory_graph_total_edges",
278            "Total number of edges in the graph",
279        ))?;
280        registry.register(Box::new(total_edges.clone()))?;
281
282        let cache_size_bytes = IntGauge::with_opts(Opts::new(
283            "memory_graph_cache_size_bytes",
284            "Current cache size in bytes",
285        ))?;
286        registry.register(Box::new(cache_size_bytes.clone()))?;
287
288        let buffer_size = IntGauge::with_opts(Opts::new(
289            "memory_graph_buffer_size",
290            "Current event buffer size",
291        ))?;
292        registry.register(Box::new(buffer_size.clone()))?;
293
294        // Production Metrics - gRPC
295        let grpc_requests_total = IntCounterVec::new(
296            Opts::new(
297                "memory_graph_grpc_requests_total",
298                "Total number of gRPC requests by method and status",
299            ),
300            &["method", "status"],
301        )?;
302        registry.register(Box::new(grpc_requests_total.clone()))?;
303
304        let grpc_request_duration = HistogramVec::new(
305            HistogramOpts::new(
306                "memory_graph_grpc_request_duration_seconds",
307                "gRPC request duration in seconds by method",
308            )
309            .buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]),
310            &["method"],
311        )?;
312        registry.register(Box::new(grpc_request_duration.clone()))?;
313
314        let grpc_active_streams = IntGauge::with_opts(Opts::new(
315            "memory_graph_grpc_active_streams",
316            "Number of active gRPC streams",
317        ))?;
318        registry.register(Box::new(grpc_active_streams.clone()))?;
319
320        // Production Metrics - Plugin System
321        let plugin_executions_total = IntCounterVec::new(
322            Opts::new(
323                "memory_graph_plugin_executions_total",
324                "Total plugin executions by plugin name and hook",
325            ),
326            &["plugin", "hook"],
327        )?;
328        registry.register(Box::new(plugin_executions_total.clone()))?;
329
330        let plugin_duration = HistogramVec::new(
331            HistogramOpts::new(
332                "memory_graph_plugin_duration_seconds",
333                "Plugin execution duration in seconds",
334            )
335            .buckets(vec![0.001, 0.01, 0.1, 1.0, 5.0, 10.0]),
336            &["plugin", "hook"],
337        )?;
338        registry.register(Box::new(plugin_duration.clone()))?;
339
340        let plugin_errors_total = IntCounterVec::new(
341            Opts::new(
342                "memory_graph_plugin_errors_total",
343                "Total plugin errors by plugin and error type",
344            ),
345            &["plugin", "error_type"],
346        )?;
347        registry.register(Box::new(plugin_errors_total.clone()))?;
348
349        // Production Metrics - Integrations
350        let registry_calls_total = IntCounterVec::new(
351            Opts::new(
352                "memory_graph_registry_calls_total",
353                "Total LLM-Registry API calls by operation and status",
354            ),
355            &["operation", "status"],
356        )?;
357        registry.register(Box::new(registry_calls_total.clone()))?;
358
359        let vault_archives_total = IntCounter::with_opts(Opts::new(
360            "memory_graph_vault_archives_total",
361            "Total sessions archived to Data-Vault",
362        ))?;
363        registry.register(Box::new(vault_archives_total.clone()))?;
364
365        let vault_retrievals_total = IntCounter::with_opts(Opts::new(
366            "memory_graph_vault_retrievals_total",
367            "Total sessions retrieved from Data-Vault",
368        ))?;
369        registry.register(Box::new(vault_retrievals_total.clone()))?;
370
371        let vault_errors_total = IntCounter::with_opts(Opts::new(
372            "memory_graph_vault_errors_total",
373            "Total Data-Vault operation errors",
374        ))?;
375        registry.register(Box::new(vault_errors_total.clone()))?;
376
377        Ok(Self {
378            nodes_created,
379            edges_created,
380            prompts_submitted,
381            responses_generated,
382            tools_invoked,
383            agent_handoffs,
384            template_instantiations,
385            queries_executed,
386            write_latency,
387            read_latency,
388            query_duration,
389            tool_duration,
390            batch_size,
391            active_sessions,
392            total_nodes,
393            total_edges,
394            cache_size_bytes,
395            buffer_size,
396            grpc_requests_total,
397            grpc_request_duration,
398            grpc_active_streams,
399            plugin_executions_total,
400            plugin_duration,
401            plugin_errors_total,
402            registry_calls_total,
403            vault_archives_total,
404            vault_retrievals_total,
405            vault_errors_total,
406        })
407    }
408
409    /// Create metrics with a custom namespace prefix
410    pub fn with_namespace(registry: &Registry, _namespace: &str) -> Result<Self> {
411        // This is a simplified version - in production you'd apply prefix to all metrics
412        Self::new(registry)
413    }
414
415    // Counter recording methods
416
417    /// Record a node creation
418    pub fn record_node_created(&self) {
419        self.nodes_created.inc();
420    }
421
422    /// Record multiple node creations
423    pub fn record_nodes_created(&self, count: u64) {
424        self.nodes_created.inc_by(count);
425    }
426
427    /// Record an edge creation
428    pub fn record_edge_created(&self) {
429        self.edges_created.inc();
430    }
431
432    /// Record multiple edge creations
433    pub fn record_edges_created(&self, count: u64) {
434        self.edges_created.inc_by(count);
435    }
436
437    /// Record a prompt submission
438    pub fn record_prompt_submitted(&self) {
439        self.prompts_submitted.inc();
440    }
441
442    /// Record a response generation
443    pub fn record_response_generated(&self) {
444        self.responses_generated.inc();
445    }
446
447    /// Record a tool invocation
448    pub fn record_tool_invoked(&self) {
449        self.tools_invoked.inc();
450    }
451
452    /// Record an agent handoff
453    pub fn record_agent_handoff(&self) {
454        self.agent_handoffs.inc();
455    }
456
457    /// Record a template instantiation
458    pub fn record_template_instantiation(&self) {
459        self.template_instantiations.inc();
460    }
461
462    /// Record a query execution
463    pub fn record_query_executed(&self) {
464        self.queries_executed.inc();
465    }
466
467    // Histogram recording methods
468
469    /// Record write operation latency in seconds
470    pub fn record_write_latency(&self, duration_secs: f64) {
471        self.write_latency.observe(duration_secs);
472    }
473
474    /// Record read operation latency in seconds
475    pub fn record_read_latency(&self, duration_secs: f64) {
476        self.read_latency.observe(duration_secs);
477    }
478
479    /// Record query execution duration in seconds
480    pub fn record_query_duration(&self, duration_secs: f64) {
481        self.query_duration.observe(duration_secs);
482    }
483
484    /// Record tool execution duration in seconds
485    pub fn record_tool_duration(&self, duration_secs: f64) {
486        self.tool_duration.observe(duration_secs);
487    }
488
489    /// Record batch operation size
490    pub fn record_batch_size(&self, size: usize) {
491        self.batch_size.observe(size as f64);
492    }
493
494    // Gauge update methods
495
496    /// Set the number of active sessions
497    pub fn set_active_sessions(&self, count: i64) {
498        self.active_sessions.set(count);
499    }
500
501    /// Increment active sessions count
502    pub fn inc_active_sessions(&self) {
503        self.active_sessions.inc();
504    }
505
506    /// Decrement active sessions count
507    pub fn dec_active_sessions(&self) {
508        self.active_sessions.dec();
509    }
510
511    /// Set the total number of nodes
512    pub fn set_total_nodes(&self, count: i64) {
513        self.total_nodes.set(count);
514    }
515
516    /// Increment total nodes count
517    pub fn inc_total_nodes(&self) {
518        self.total_nodes.inc();
519    }
520
521    /// Increment total nodes by amount
522    pub fn inc_total_nodes_by(&self, count: i64) {
523        self.total_nodes.add(count);
524    }
525
526    /// Set the total number of edges
527    pub fn set_total_edges(&self, count: i64) {
528        self.total_edges.set(count);
529    }
530
531    /// Increment total edges count
532    pub fn inc_total_edges(&self) {
533        self.total_edges.inc();
534    }
535
536    /// Increment total edges by amount
537    pub fn inc_total_edges_by(&self, count: i64) {
538        self.total_edges.add(count);
539    }
540
541    /// Set cache size in bytes
542    pub fn set_cache_size_bytes(&self, size: i64) {
543        self.cache_size_bytes.set(size);
544    }
545
546    /// Set event buffer size
547    pub fn set_buffer_size(&self, size: i64) {
548        self.buffer_size.set(size);
549    }
550
551    // Production Metrics - gRPC Helper Methods
552
553    /// Record a gRPC request with method and status
554    pub fn record_grpc_request(&self, method: &str, status: &str) {
555        self.grpc_requests_total
556            .with_label_values(&[method, status])
557            .inc();
558    }
559
560    /// Record gRPC request duration for a method
561    pub fn record_grpc_request_duration(&self, method: &str, duration_secs: f64) {
562        self.grpc_request_duration
563            .with_label_values(&[method])
564            .observe(duration_secs);
565    }
566
567    /// Increment active gRPC streams count
568    pub fn inc_grpc_active_streams(&self) {
569        self.grpc_active_streams.inc();
570    }
571
572    /// Decrement active gRPC streams count
573    pub fn dec_grpc_active_streams(&self) {
574        self.grpc_active_streams.dec();
575    }
576
577    /// Set the number of active gRPC streams
578    pub fn set_grpc_active_streams(&self, count: i64) {
579        self.grpc_active_streams.set(count);
580    }
581
582    // Production Metrics - Plugin Helper Methods
583
584    /// Record a plugin execution
585    pub fn record_plugin_execution(&self, plugin: &str, hook: &str) {
586        self.plugin_executions_total
587            .with_label_values(&[plugin, hook])
588            .inc();
589    }
590
591    /// Record plugin execution duration
592    pub fn record_plugin_duration(&self, plugin: &str, hook: &str, duration_secs: f64) {
593        self.plugin_duration
594            .with_label_values(&[plugin, hook])
595            .observe(duration_secs);
596    }
597
598    /// Record a plugin error
599    pub fn record_plugin_error(&self, plugin: &str, error_type: &str) {
600        self.plugin_errors_total
601            .with_label_values(&[plugin, error_type])
602            .inc();
603    }
604
605    // Production Metrics - Integration Helper Methods
606
607    /// Record an LLM-Registry API call
608    pub fn record_registry_call(&self, operation: &str, status: &str) {
609        self.registry_calls_total
610            .with_label_values(&[operation, status])
611            .inc();
612    }
613
614    /// Record a successful vault archive operation
615    pub fn record_vault_archive(&self) {
616        self.vault_archives_total.inc();
617    }
618
619    /// Record multiple vault archive operations
620    pub fn record_vault_archives(&self, count: u64) {
621        self.vault_archives_total.inc_by(count);
622    }
623
624    /// Record a successful vault retrieval operation
625    pub fn record_vault_retrieval(&self) {
626        self.vault_retrievals_total.inc();
627    }
628
629    /// Record multiple vault retrieval operations
630    pub fn record_vault_retrievals(&self, count: u64) {
631        self.vault_retrievals_total.inc_by(count);
632    }
633
634    /// Record a vault error
635    pub fn record_vault_error(&self) {
636        self.vault_errors_total.inc();
637    }
638
639    /// Record multiple vault errors
640    pub fn record_vault_errors(&self, count: u64) {
641        self.vault_errors_total.inc_by(count);
642    }
643
644    /// Get a snapshot of all counter values
645    pub fn get_counter_snapshot(&self) -> MetricsCounterSnapshot {
646        MetricsCounterSnapshot {
647            nodes_created: self.nodes_created.get(),
648            edges_created: self.edges_created.get(),
649            prompts_submitted: self.prompts_submitted.get(),
650            responses_generated: self.responses_generated.get(),
651            tools_invoked: self.tools_invoked.get(),
652            agent_handoffs: self.agent_handoffs.get(),
653            template_instantiations: self.template_instantiations.get(),
654            queries_executed: self.queries_executed.get(),
655        }
656    }
657
658    /// Get a snapshot of all gauge values
659    pub fn get_gauge_snapshot(&self) -> MetricsGaugeSnapshot {
660        MetricsGaugeSnapshot {
661            active_sessions: self.active_sessions.get(),
662            total_nodes: self.total_nodes.get(),
663            total_edges: self.total_edges.get(),
664            cache_size_bytes: self.cache_size_bytes.get(),
665            buffer_size: self.buffer_size.get(),
666        }
667    }
668
669    /// Get a snapshot of gRPC metrics
670    pub fn get_grpc_snapshot(&self) -> GrpcMetricsSnapshot {
671        GrpcMetricsSnapshot {
672            active_streams: self.grpc_active_streams.get(),
673        }
674    }
675
676    /// Get a snapshot of vault metrics
677    pub fn get_vault_snapshot(&self) -> VaultMetricsSnapshot {
678        VaultMetricsSnapshot {
679            total_archives: self.vault_archives_total.get(),
680            total_retrievals: self.vault_retrievals_total.get(),
681            total_errors: self.vault_errors_total.get(),
682        }
683    }
684}
685
/// Snapshot of counter metric values
///
/// A plain value type holding one reading per core counter. It is `Copy`,
/// hashable and totally comparable, and its `Default` is the all-zero reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct MetricsCounterSnapshot {
    /// Total nodes created
    pub nodes_created: u64,
    /// Total edges created
    pub edges_created: u64,
    /// Total prompts submitted
    pub prompts_submitted: u64,
    /// Total responses generated
    pub responses_generated: u64,
    /// Total tools invoked
    pub tools_invoked: u64,
    /// Total agent handoffs
    pub agent_handoffs: u64,
    /// Total template instantiations
    pub template_instantiations: u64,
    /// Total queries executed
    pub queries_executed: u64,
}
706
/// Snapshot of gauge metric values
///
/// A plain value type holding one reading per core gauge. It is `Copy`,
/// hashable and totally comparable, and its `Default` is the all-zero reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct MetricsGaugeSnapshot {
    /// Current active sessions
    pub active_sessions: i64,
    /// Total nodes in graph
    pub total_nodes: i64,
    /// Total edges in graph
    pub total_edges: i64,
    /// Cache size in bytes
    pub cache_size_bytes: i64,
    /// Event buffer size
    pub buffer_size: i64,
}
721
/// Production metrics snapshot for gRPC operations
///
/// A plain `Copy` value type; its `Default` reports zero active streams.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct GrpcMetricsSnapshot {
    /// Number of active gRPC streams
    pub active_streams: i64,
}
728
/// Production metrics snapshot for vault operations
///
/// A plain `Copy` value type holding one reading per Data-Vault counter; its
/// `Default` is the all-zero reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct VaultMetricsSnapshot {
    /// Total archives to vault
    pub total_archives: u64,
    /// Total retrievals from vault
    pub total_retrievals: u64,
    /// Total vault errors
    pub total_errors: u64,
}
739
740#[cfg(test)]
741mod tests {
742    use super::*;
743
744    #[test]
745    fn test_metrics_creation() {
746        let registry = Registry::new();
747        let metrics = PrometheusMetrics::new(&registry).unwrap();
748
749        // Verify all metrics are initialized
750        assert_eq!(metrics.nodes_created.get(), 0);
751        assert_eq!(metrics.edges_created.get(), 0);
752        assert_eq!(metrics.active_sessions.get(), 0);
753    }
754
755    #[test]
756    fn test_counter_recording() {
757        let registry = Registry::new();
758        let metrics = PrometheusMetrics::new(&registry).unwrap();
759
760        metrics.record_node_created();
761        assert_eq!(metrics.nodes_created.get(), 1);
762
763        metrics.record_nodes_created(5);
764        assert_eq!(metrics.nodes_created.get(), 6);
765
766        metrics.record_edge_created();
767        assert_eq!(metrics.edges_created.get(), 1);
768    }
769
770    #[test]
771    fn test_all_counters() {
772        let registry = Registry::new();
773        let metrics = PrometheusMetrics::new(&registry).unwrap();
774
775        metrics.record_node_created();
776        metrics.record_edge_created();
777        metrics.record_prompt_submitted();
778        metrics.record_response_generated();
779        metrics.record_tool_invoked();
780        metrics.record_agent_handoff();
781        metrics.record_template_instantiation();
782        metrics.record_query_executed();
783
784        assert_eq!(metrics.nodes_created.get(), 1);
785        assert_eq!(metrics.edges_created.get(), 1);
786        assert_eq!(metrics.prompts_submitted.get(), 1);
787        assert_eq!(metrics.responses_generated.get(), 1);
788        assert_eq!(metrics.tools_invoked.get(), 1);
789        assert_eq!(metrics.agent_handoffs.get(), 1);
790        assert_eq!(metrics.template_instantiations.get(), 1);
791        assert_eq!(metrics.queries_executed.get(), 1);
792    }
793
794    #[test]
795    fn test_histogram_recording() {
796        let registry = Registry::new();
797        let metrics = PrometheusMetrics::new(&registry).unwrap();
798
799        metrics.record_write_latency(0.025);
800        metrics.record_read_latency(0.001);
801        metrics.record_query_duration(0.5);
802        metrics.record_tool_duration(2.0);
803        metrics.record_batch_size(50);
804
805        // Histograms don't expose simple get() - verify no panics
806        assert!(true);
807    }
808
809    #[test]
810    fn test_gauge_updates() {
811        let registry = Registry::new();
812        let metrics = PrometheusMetrics::new(&registry).unwrap();
813
814        metrics.set_active_sessions(5);
815        assert_eq!(metrics.active_sessions.get(), 5);
816
817        metrics.inc_active_sessions();
818        assert_eq!(metrics.active_sessions.get(), 6);
819
820        metrics.dec_active_sessions();
821        assert_eq!(metrics.active_sessions.get(), 5);
822
823        metrics.set_total_nodes(100);
824        assert_eq!(metrics.total_nodes.get(), 100);
825
826        metrics.inc_total_nodes();
827        assert_eq!(metrics.total_nodes.get(), 101);
828
829        metrics.inc_total_nodes_by(9);
830        assert_eq!(metrics.total_nodes.get(), 110);
831    }
832
833    #[test]
834    fn test_all_gauges() {
835        let registry = Registry::new();
836        let metrics = PrometheusMetrics::new(&registry).unwrap();
837
838        metrics.set_active_sessions(10);
839        metrics.set_total_nodes(500);
840        metrics.set_total_edges(800);
841        metrics.set_cache_size_bytes(1024 * 1024);
842        metrics.set_buffer_size(25);
843
844        assert_eq!(metrics.active_sessions.get(), 10);
845        assert_eq!(metrics.total_nodes.get(), 500);
846        assert_eq!(metrics.total_edges.get(), 800);
847        assert_eq!(metrics.cache_size_bytes.get(), 1024 * 1024);
848        assert_eq!(metrics.buffer_size.get(), 25);
849    }
850
851    #[test]
852    fn test_counter_snapshot() {
853        let registry = Registry::new();
854        let metrics = PrometheusMetrics::new(&registry).unwrap();
855
856        metrics.record_node_created();
857        metrics.record_edges_created(3);
858        metrics.record_prompt_submitted();
859
860        let snapshot = metrics.get_counter_snapshot();
861        assert_eq!(snapshot.nodes_created, 1);
862        assert_eq!(snapshot.edges_created, 3);
863        assert_eq!(snapshot.prompts_submitted, 1);
864        assert_eq!(snapshot.responses_generated, 0);
865    }
866
867    #[test]
868    fn test_gauge_snapshot() {
869        let registry = Registry::new();
870        let metrics = PrometheusMetrics::new(&registry).unwrap();
871
872        metrics.set_active_sessions(7);
873        metrics.set_total_nodes(150);
874        metrics.set_cache_size_bytes(2048);
875
876        let snapshot = metrics.get_gauge_snapshot();
877        assert_eq!(snapshot.active_sessions, 7);
878        assert_eq!(snapshot.total_nodes, 150);
879        assert_eq!(snapshot.cache_size_bytes, 2048);
880    }
881
882    #[test]
883    fn test_metrics_clone() {
884        let registry = Registry::new();
885        let metrics = PrometheusMetrics::new(&registry).unwrap();
886
887        metrics.record_node_created();
888
889        let cloned = metrics.clone();
890        cloned.record_node_created();
891
892        // Both should see the same counter
893        assert_eq!(metrics.nodes_created.get(), 2);
894        assert_eq!(cloned.nodes_created.get(), 2);
895    }
896
897    #[test]
898    fn test_concurrent_counter_updates() {
899        use std::sync::Arc;
900        use std::thread;
901
902        let registry = Registry::new();
903        let metrics = Arc::new(PrometheusMetrics::new(&registry).unwrap());
904
905        let mut handles = vec![];
906        for _ in 0..10 {
907            let metrics_clone = Arc::clone(&metrics);
908            let handle = thread::spawn(move || {
909                for _ in 0..100 {
910                    metrics_clone.record_node_created();
911                }
912            });
913            handles.push(handle);
914        }
915
916        for handle in handles {
917            handle.join().unwrap();
918        }
919
920        assert_eq!(metrics.nodes_created.get(), 1000);
921    }
922
923    #[test]
924    fn test_latency_buckets() {
925        let registry = Registry::new();
926        let metrics = PrometheusMetrics::new(&registry).unwrap();
927
928        // Test various latency ranges
929        metrics.record_write_latency(0.001); // 1ms
930        metrics.record_write_latency(0.010); // 10ms
931        metrics.record_write_latency(0.100); // 100ms
932        metrics.record_write_latency(1.000); // 1s
933
934        metrics.record_read_latency(0.0001); // 0.1ms
935        metrics.record_read_latency(0.001); // 1ms
936        metrics.record_read_latency(0.01); // 10ms
937    }
938
939    #[test]
940    fn test_batch_size_distribution() {
941        let registry = Registry::new();
942        let metrics = PrometheusMetrics::new(&registry).unwrap();
943
944        metrics.record_batch_size(1);
945        metrics.record_batch_size(10);
946        metrics.record_batch_size(50);
947        metrics.record_batch_size(100);
948        metrics.record_batch_size(500);
949    }
950
951    #[test]
952    fn test_edge_operations_tracking() {
953        let registry = Registry::new();
954        let metrics = PrometheusMetrics::new(&registry).unwrap();
955
956        metrics.set_total_edges(100);
957        metrics.record_edge_created();
958
959        // Track edge creation but gauge doesn't auto-increment
960        assert_eq!(metrics.edges_created.get(), 1);
961
962        // Manually sync gauge with counter
963        metrics.inc_total_edges();
964        assert_eq!(metrics.total_edges.get(), 101);
965    }
966
967    #[test]
968    fn test_session_lifecycle() {
969        let registry = Registry::new();
970        let metrics = PrometheusMetrics::new(&registry).unwrap();
971
972        // Session created
973        metrics.inc_active_sessions();
974        assert_eq!(metrics.active_sessions.get(), 1);
975
976        // More sessions created
977        metrics.inc_active_sessions();
978        metrics.inc_active_sessions();
979        assert_eq!(metrics.active_sessions.get(), 3);
980
981        // Session ended
982        metrics.dec_active_sessions();
983        assert_eq!(metrics.active_sessions.get(), 2);
984    }
985
986    #[test]
987    fn test_prometheus_text_export() {
988        use prometheus::TextEncoder;
989
990        let registry = Registry::new();
991        let metrics = PrometheusMetrics::new(&registry).unwrap();
992
993        metrics.record_node_created();
994        metrics.record_prompt_submitted();
995        metrics.set_active_sessions(5);
996
997        let encoder = TextEncoder::new();
998        let metric_families = registry.gather();
999        let encoded = encoder.encode_to_string(&metric_families).unwrap();
1000
1001        assert!(encoded.contains("memory_graph_nodes_created_total"));
1002        assert!(encoded.contains("memory_graph_prompts_submitted_total"));
1003        assert!(encoded.contains("memory_graph_active_sessions"));
1004    }
1005
1006    // Production Metrics Tests - gRPC
1007
1008    #[test]
1009    fn test_grpc_request_metrics() {
1010        let registry = Registry::new();
1011        let metrics = PrometheusMetrics::new(&registry).unwrap();
1012
1013        // Record various gRPC requests
1014        metrics.record_grpc_request("CreateSession", "success");
1015        metrics.record_grpc_request("CreateSession", "success");
1016        metrics.record_grpc_request("GetNode", "success");
1017        metrics.record_grpc_request("UpdateNode", "error");
1018
1019        // Verify counters are tracked per label
1020        let grpc_total = metrics
1021            .grpc_requests_total
1022            .with_label_values(&["CreateSession", "success"])
1023            .get();
1024        assert_eq!(grpc_total, 2);
1025
1026        let error_total = metrics
1027            .grpc_requests_total
1028            .with_label_values(&["UpdateNode", "error"])
1029            .get();
1030        assert_eq!(error_total, 1);
1031    }
1032
1033    #[test]
1034    fn test_grpc_request_duration() {
1035        let registry = Registry::new();
1036        let metrics = PrometheusMetrics::new(&registry).unwrap();
1037
1038        // Record request durations
1039        metrics.record_grpc_request_duration("CreateSession", 0.015);
1040        metrics.record_grpc_request_duration("Query", 0.125);
1041        metrics.record_grpc_request_duration("BatchCreateNodes", 0.45);
1042
1043        // Verify no panics - histogram values aren't directly accessible
1044        assert!(true);
1045    }
1046
1047    #[test]
1048    fn test_grpc_active_streams() {
1049        let registry = Registry::new();
1050        let metrics = PrometheusMetrics::new(&registry).unwrap();
1051
1052        assert_eq!(metrics.grpc_active_streams.get(), 0);
1053
1054        metrics.inc_grpc_active_streams();
1055        assert_eq!(metrics.grpc_active_streams.get(), 1);
1056
1057        metrics.inc_grpc_active_streams();
1058        metrics.inc_grpc_active_streams();
1059        assert_eq!(metrics.grpc_active_streams.get(), 3);
1060
1061        metrics.dec_grpc_active_streams();
1062        assert_eq!(metrics.grpc_active_streams.get(), 2);
1063
1064        metrics.set_grpc_active_streams(10);
1065        assert_eq!(metrics.grpc_active_streams.get(), 10);
1066    }
1067
1068    #[test]
1069    fn test_grpc_snapshot() {
1070        let registry = Registry::new();
1071        let metrics = PrometheusMetrics::new(&registry).unwrap();
1072
1073        metrics.set_grpc_active_streams(5);
1074
1075        let snapshot = metrics.get_grpc_snapshot();
1076        assert_eq!(snapshot.active_streams, 5);
1077    }
1078
1079    // Production Metrics Tests - Plugin System
1080
1081    #[test]
1082    fn test_plugin_execution_metrics() {
1083        let registry = Registry::new();
1084        let metrics = PrometheusMetrics::new(&registry).unwrap();
1085
1086        // Record plugin executions
1087        metrics.record_plugin_execution("audit_logger", "on_node_create");
1088        metrics.record_plugin_execution("audit_logger", "on_node_create");
1089        metrics.record_plugin_execution("validator", "on_edge_create");
1090        metrics.record_plugin_execution("audit_logger", "on_session_close");
1091
1092        // Verify counters
1093        let audit_create = metrics
1094            .plugin_executions_total
1095            .with_label_values(&["audit_logger", "on_node_create"])
1096            .get();
1097        assert_eq!(audit_create, 2);
1098
1099        let validator_edge = metrics
1100            .plugin_executions_total
1101            .with_label_values(&["validator", "on_edge_create"])
1102            .get();
1103        assert_eq!(validator_edge, 1);
1104    }
1105
1106    #[test]
1107    fn test_plugin_duration_metrics() {
1108        let registry = Registry::new();
1109        let metrics = PrometheusMetrics::new(&registry).unwrap();
1110
1111        // Record plugin durations
1112        metrics.record_plugin_duration("audit_logger", "on_node_create", 0.002);
1113        metrics.record_plugin_duration("validator", "on_edge_create", 0.015);
1114        metrics.record_plugin_duration("transformer", "on_query", 0.125);
1115
1116        // Verify no panics
1117        assert!(true);
1118    }
1119
1120    #[test]
1121    fn test_plugin_error_metrics() {
1122        let registry = Registry::new();
1123        let metrics = PrometheusMetrics::new(&registry).unwrap();
1124
1125        // Record plugin errors
1126        metrics.record_plugin_error("validator", "validation_failed");
1127        metrics.record_plugin_error("validator", "validation_failed");
1128        metrics.record_plugin_error("transformer", "timeout");
1129        metrics.record_plugin_error("audit_logger", "connection_error");
1130
1131        // Verify error counters
1132        let validation_errors = metrics
1133            .plugin_errors_total
1134            .with_label_values(&["validator", "validation_failed"])
1135            .get();
1136        assert_eq!(validation_errors, 2);
1137
1138        let timeout_errors = metrics
1139            .plugin_errors_total
1140            .with_label_values(&["transformer", "timeout"])
1141            .get();
1142        assert_eq!(timeout_errors, 1);
1143    }
1144
1145    // Production Metrics Tests - Integrations
1146
1147    #[test]
1148    fn test_registry_call_metrics() {
1149        let registry = Registry::new();
1150        let metrics = PrometheusMetrics::new(&registry).unwrap();
1151
1152        // Record registry API calls
1153        metrics.record_registry_call("register_model", "success");
1154        metrics.record_registry_call("register_model", "success");
1155        metrics.record_registry_call("get_metadata", "success");
1156        metrics.record_registry_call("update_version", "error");
1157
1158        // Verify counters
1159        let register_success = metrics
1160            .registry_calls_total
1161            .with_label_values(&["register_model", "success"])
1162            .get();
1163        assert_eq!(register_success, 2);
1164
1165        let update_error = metrics
1166            .registry_calls_total
1167            .with_label_values(&["update_version", "error"])
1168            .get();
1169        assert_eq!(update_error, 1);
1170    }
1171
1172    #[test]
1173    fn test_vault_archive_metrics() {
1174        let registry = Registry::new();
1175        let metrics = PrometheusMetrics::new(&registry).unwrap();
1176
1177        assert_eq!(metrics.vault_archives_total.get(), 0);
1178
1179        metrics.record_vault_archive();
1180        assert_eq!(metrics.vault_archives_total.get(), 1);
1181
1182        metrics.record_vault_archives(5);
1183        assert_eq!(metrics.vault_archives_total.get(), 6);
1184    }
1185
1186    #[test]
1187    fn test_vault_retrieval_metrics() {
1188        let registry = Registry::new();
1189        let metrics = PrometheusMetrics::new(&registry).unwrap();
1190
1191        assert_eq!(metrics.vault_retrievals_total.get(), 0);
1192
1193        metrics.record_vault_retrieval();
1194        assert_eq!(metrics.vault_retrievals_total.get(), 1);
1195
1196        metrics.record_vault_retrievals(3);
1197        assert_eq!(metrics.vault_retrievals_total.get(), 4);
1198    }
1199
1200    #[test]
1201    fn test_vault_error_metrics() {
1202        let registry = Registry::new();
1203        let metrics = PrometheusMetrics::new(&registry).unwrap();
1204
1205        assert_eq!(metrics.vault_errors_total.get(), 0);
1206
1207        metrics.record_vault_error();
1208        assert_eq!(metrics.vault_errors_total.get(), 1);
1209
1210        metrics.record_vault_errors(2);
1211        assert_eq!(metrics.vault_errors_total.get(), 3);
1212    }
1213
1214    #[test]
1215    fn test_vault_snapshot() {
1216        let registry = Registry::new();
1217        let metrics = PrometheusMetrics::new(&registry).unwrap();
1218
1219        metrics.record_vault_archives(10);
1220        metrics.record_vault_retrievals(5);
1221        metrics.record_vault_errors(2);
1222
1223        let snapshot = metrics.get_vault_snapshot();
1224        assert_eq!(snapshot.total_archives, 10);
1225        assert_eq!(snapshot.total_retrievals, 5);
1226        assert_eq!(snapshot.total_errors, 2);
1227    }
1228
1229    #[test]
1230    fn test_production_metrics_text_export() {
1231        use prometheus::TextEncoder;
1232
1233        let registry = Registry::new();
1234        let metrics = PrometheusMetrics::new(&registry).unwrap();
1235
1236        // Record production metrics
1237        metrics.record_grpc_request("CreateSession", "success");
1238        metrics.record_plugin_execution("audit_logger", "on_node_create");
1239        metrics.record_registry_call("register_model", "success");
1240        metrics.record_vault_archive();
1241
1242        let encoder = TextEncoder::new();
1243        let metric_families = registry.gather();
1244        let encoded = encoder.encode_to_string(&metric_families).unwrap();
1245
1246        // Verify production metrics are exported
1247        assert!(encoded.contains("memory_graph_grpc_requests_total"));
1248        assert!(encoded.contains("memory_graph_plugin_executions_total"));
1249        assert!(encoded.contains("memory_graph_registry_calls_total"));
1250        assert!(encoded.contains("memory_graph_vault_archives_total"));
1251    }
1252
1253    #[test]
1254    fn test_complete_production_workflow() {
1255        let registry = Registry::new();
1256        let metrics = PrometheusMetrics::new(&registry).unwrap();
1257
1258        // Simulate a complete gRPC request with plugin execution
1259        metrics.inc_grpc_active_streams();
1260        metrics.record_grpc_request("CreateNode", "success");
1261        metrics.record_grpc_request_duration("CreateNode", 0.025);
1262
1263        // Plugin hooks execute
1264        metrics.record_plugin_execution("validator", "on_node_create");
1265        metrics.record_plugin_duration("validator", "on_node_create", 0.003);
1266        metrics.record_plugin_execution("audit_logger", "on_node_create");
1267        metrics.record_plugin_duration("audit_logger", "on_node_create", 0.001);
1268
1269        // Registry integration
1270        metrics.record_registry_call("track_operation", "success");
1271
1272        // Core metrics
1273        metrics.record_node_created();
1274        metrics.inc_total_nodes();
1275
1276        // Stream completes
1277        metrics.dec_grpc_active_streams();
1278
1279        // Verify final state
1280        assert_eq!(metrics.grpc_active_streams.get(), 0);
1281        assert_eq!(metrics.nodes_created.get(), 1);
1282        assert_eq!(metrics.total_nodes.get(), 1);
1283    }
1284
1285    #[test]
1286    fn test_all_production_metrics_initialized() {
1287        let registry = Registry::new();
1288        let metrics = PrometheusMetrics::new(&registry).unwrap();
1289
1290        // Verify all production metrics are initialized
1291        assert_eq!(metrics.grpc_active_streams.get(), 0);
1292        assert_eq!(metrics.vault_archives_total.get(), 0);
1293        assert_eq!(metrics.vault_retrievals_total.get(), 0);
1294        assert_eq!(metrics.vault_errors_total.get(), 0);
1295    }
1296}