blueprint_qos/metrics/service.rs
1use blueprint_core::info;
2use std::sync::Arc;
3
4use crate::error::Result;
5use crate::metrics::opentelemetry::OpenTelemetryConfig;
6use crate::metrics::provider::EnhancedMetricsProvider;
7use crate::metrics::types::MetricsConfig;
8
9/// Service responsible for metrics collection, processing, and exposure.
10///
11/// The `MetricsService` orchestrates the metrics collection pipeline, managing the
12/// lifecycle of the underlying metrics provider. It serves as the main entry point
13/// for recording application metrics (like job execution statistics) and provides
14/// access to the configured metrics infrastructure. The service works with Prometheus
15/// and OpenTelemetry to provide comprehensive observability for Blueprint services.
16#[derive(Clone)]
17pub struct MetricsService {
18 /// Metrics provider
19 provider: Arc<EnhancedMetricsProvider>,
20
21 /// Configuration
22 #[allow(dead_code)]
23 config: MetricsConfig,
24}
25
26impl MetricsService {
27 /// Creates a new metrics service with default OpenTelemetry configuration.
28 ///
29 /// Initializes the metrics collection infrastructure with the specified configuration,
30 /// creating an `EnhancedMetricsProvider` with default OpenTelemetry settings. This
31 /// provider will collect system metrics, application metrics, and expose them through
32 /// Prometheus and OpenTelemetry.
33 ///
34 /// # Parameters
35 /// * `config` - Configuration for metrics collection, retention and exposure
36 ///
37 /// # Errors
38 /// Returns an error if the metrics provider initialization fails, which could occur
39 /// due to invalid configuration or resource allocation issues
40 pub fn new(config: MetricsConfig) -> Result<Self> {
41 let otel_config = OpenTelemetryConfig::default();
42 let provider = Arc::new(EnhancedMetricsProvider::new(config.clone(), &otel_config)?);
43
44 Ok(Self { provider, config })
45 }
46
47 /// Creates a new metrics service with custom OpenTelemetry configuration.
48 ///
49 /// Similar to `new()`, but allows for customized OpenTelemetry settings, enabling
50 /// fine-tuning of the tracing and metrics export behavior. Use this constructor when
51 /// you need to customize the OpenTelemetry pipeline for advanced observability requirements.
52 ///
53 /// # Parameters
54 /// * `config` - Configuration for metrics collection, retention and exposure
55 /// * `otel_config` - Custom OpenTelemetry configuration for trace and metrics export
56 ///
57 /// # Errors
58 /// Returns an error if the metrics provider initialization fails, which could occur
59 /// due to invalid configuration or resource allocation issues
60 pub fn with_otel_config(
61 config: MetricsConfig,
62 otel_config: &OpenTelemetryConfig,
63 ) -> Result<Self> {
64 let provider = Arc::new(EnhancedMetricsProvider::new(config.clone(), otel_config)?);
65
66 Ok(Self { provider, config })
67 }
68
69 /// Returns a reference to the underlying metrics provider.
70 ///
71 /// Provides access to the `EnhancedMetricsProvider` which handles the actual collection,
72 /// storage, and exposure of metrics. This can be used for advanced metrics operations
73 /// not directly exposed by the `MetricsService` interface.
74 #[must_use]
75 pub fn provider(&self) -> Arc<EnhancedMetricsProvider> {
76 self.provider.clone()
77 }
78
79 /// Returns a clone of the OpenTelemetry job executions counter.
80 ///
81 /// This counter tracks the total number of job executions across the Blueprint service.
82 /// It can be used directly to increment execution counts from components that have
83 /// access to the metrics service but not the full provider.
84 #[must_use]
85 pub fn get_otel_job_executions_counter(&self) -> opentelemetry::metrics::Counter<u64> {
86 self.provider.get_otel_job_executions_counter()
87 }
88
89 /// Records metrics for a successful job execution.
90 ///
91 /// Updates both Prometheus and OpenTelemetry metrics with information about a completed job.
92 /// This information is used to track job throughput, execution time distributions, and
93 /// success rates for the specified job, service, and blueprint IDs.
94 ///
95 /// # Parameters
96 /// * `job_id` - Unique identifier for the executed job
97 /// * `execution_time` - Duration of the job execution in seconds
98 /// * `service_id` - Identifier for the service that executed the job
99 /// * `blueprint_id` - Identifier for the blueprint that executed the job
100 pub fn record_job_execution(
101 &self,
102 job_id: u64,
103 execution_time: f64,
104 service_id: u64,
105 blueprint_id: u64,
106 ) {
107 self.provider
108 .record_job_execution(job_id, execution_time, service_id, blueprint_id);
109 }
110
111 /// Records metrics for a failed job execution.
112 ///
113 /// Updates error counters and metrics when a job fails, categorizing the error by type.
114 /// This allows for tracking error rates and the distribution of different failure modes
115 /// across jobs to help with debugging and reliability improvements.
116 ///
117 /// # Parameters
118 /// * `job_id` - Unique identifier for the failed job
119 /// * `error_type` - Classification of the error that occurred
120 pub fn record_job_error(&self, job_id: u64, error_type: &str) {
121 self.provider.record_job_error(job_id, error_type);
122 }
123}
124
125/// Runs a standalone metrics server with the given configuration.
126///
127/// This function initializes the metrics collection infrastructure and starts a
128/// metrics server that exposes collected metrics via HTTP endpoints compatible with
129/// Prometheus scraping. It also initializes the background tasks for collecting
130/// system and application metrics at regular intervals.
131///
132/// # Parameters
133/// * `config` - Configuration for metrics collection, retention, and server settings
134///
135/// # Returns
136/// A reference to the initialized and started metrics provider on success
137///
138/// # Errors
139/// Returns an error if the metrics provider initialization or server startup fails
140pub async fn run_metrics_server(config: MetricsConfig) -> Result<Arc<EnhancedMetricsProvider>> {
141 let otel_config = OpenTelemetryConfig::default();
142 let provider = Arc::new(EnhancedMetricsProvider::new(config, &otel_config)?);
143
144 // Start the metrics collection
145 provider.clone().start_collection().await?;
146
147 info!("Started metrics server");
148
149 Ok(provider)
150}
151
#[cfg(test)]
mod tests {
    use super::*;
    use crate::servers::prometheus::PrometheusServerConfig;

    /// Builds the `MetricsConfig` shared by every test in this module.
    fn sample_config() -> MetricsConfig {
        MetricsConfig {
            prometheus_server: Some(PrometheusServerConfig::default()),
            service_id: 42,
            blueprint_id: 24,
            collection_interval_secs: 60,
            max_history: 100,
        }
    }

    /// `MetricsService::new` succeeds for a valid configuration and the resulting
    /// service exposes a live provider handle.
    #[test]
    fn test_metrics_service_creation() {
        let created = MetricsService::new(sample_config());
        assert!(created.is_ok());

        let service = created.unwrap();
        let handle = service.provider();
        assert!(std::sync::Arc::strong_count(&handle) >= 1);
    }

    /// `MetricsService::with_otel_config` succeeds when given an explicit
    /// (default-valued) OpenTelemetry configuration.
    #[test]
    fn test_metrics_service_with_otel_config() {
        let otel_config = OpenTelemetryConfig::default();

        let created = MetricsService::with_otel_config(sample_config(), &otel_config);
        assert!(created.is_ok());

        let service = created.unwrap();
        let handle = service.provider();
        assert!(std::sync::Arc::strong_count(&handle) >= 1);
    }

    /// Recording a job execution through the service completes without panicking.
    #[test]
    fn test_metrics_service_record_job_execution() {
        let service = MetricsService::new(sample_config()).unwrap();

        service.record_job_execution(1, 0.5, 42, 24);
    }

    /// Recording a job error through the service completes without panicking.
    #[test]
    fn test_metrics_service_record_job_error() {
        let service = MetricsService::new(sample_config()).unwrap();

        service.record_job_error(1, "test_error");
    }
}