blueprint_qos/metrics/
service.rs

1use blueprint_core::info;
2use std::sync::Arc;
3
4use crate::error::Result;
5use crate::metrics::opentelemetry::OpenTelemetryConfig;
6use crate::metrics::provider::EnhancedMetricsProvider;
7use crate::metrics::types::MetricsConfig;
8
9/// Service responsible for metrics collection, processing, and exposure.
10///
11/// The `MetricsService` orchestrates the metrics collection pipeline, managing the
12/// lifecycle of the underlying metrics provider. It serves as the main entry point
13/// for recording application metrics (like job execution statistics) and provides
14/// access to the configured metrics infrastructure. The service works with Prometheus
15/// and OpenTelemetry to provide comprehensive observability for Blueprint services.
16#[derive(Clone)]
17pub struct MetricsService {
18    /// Metrics provider
19    provider: Arc<EnhancedMetricsProvider>,
20
21    /// Configuration
22    #[allow(dead_code)]
23    config: MetricsConfig,
24}
25
26impl MetricsService {
27    /// Creates a new metrics service with default OpenTelemetry configuration.
28    ///
29    /// Initializes the metrics collection infrastructure with the specified configuration,
30    /// creating an `EnhancedMetricsProvider` with default OpenTelemetry settings. This
31    /// provider will collect system metrics, application metrics, and expose them through
32    /// Prometheus and OpenTelemetry.
33    ///
34    /// # Parameters
35    /// * `config` - Configuration for metrics collection, retention and exposure
36    ///
37    /// # Errors
38    /// Returns an error if the metrics provider initialization fails, which could occur
39    /// due to invalid configuration or resource allocation issues
40    pub fn new(config: MetricsConfig) -> Result<Self> {
41        let otel_config = OpenTelemetryConfig::default();
42        let provider = Arc::new(EnhancedMetricsProvider::new(config.clone(), &otel_config)?);
43
44        Ok(Self { provider, config })
45    }
46
47    /// Creates a new metrics service with custom OpenTelemetry configuration.
48    ///
49    /// Similar to `new()`, but allows for customized OpenTelemetry settings, enabling
50    /// fine-tuning of the tracing and metrics export behavior. Use this constructor when
51    /// you need to customize the OpenTelemetry pipeline for advanced observability requirements.
52    ///
53    /// # Parameters
54    /// * `config` - Configuration for metrics collection, retention and exposure
55    /// * `otel_config` - Custom OpenTelemetry configuration for trace and metrics export
56    ///
57    /// # Errors
58    /// Returns an error if the metrics provider initialization fails, which could occur
59    /// due to invalid configuration or resource allocation issues
60    pub fn with_otel_config(
61        config: MetricsConfig,
62        otel_config: &OpenTelemetryConfig,
63    ) -> Result<Self> {
64        let provider = Arc::new(EnhancedMetricsProvider::new(config.clone(), otel_config)?);
65
66        Ok(Self { provider, config })
67    }
68
69    /// Returns a reference to the underlying metrics provider.
70    ///
71    /// Provides access to the `EnhancedMetricsProvider` which handles the actual collection,
72    /// storage, and exposure of metrics. This can be used for advanced metrics operations
73    /// not directly exposed by the `MetricsService` interface.
74    #[must_use]
75    pub fn provider(&self) -> Arc<EnhancedMetricsProvider> {
76        self.provider.clone()
77    }
78
79    /// Returns a clone of the OpenTelemetry job executions counter.
80    ///
81    /// This counter tracks the total number of job executions across the Blueprint service.
82    /// It can be used directly to increment execution counts from components that have
83    /// access to the metrics service but not the full provider.
84    #[must_use]
85    pub fn get_otel_job_executions_counter(&self) -> opentelemetry::metrics::Counter<u64> {
86        self.provider.get_otel_job_executions_counter()
87    }
88
89    /// Records metrics for a successful job execution.
90    ///
91    /// Updates both Prometheus and OpenTelemetry metrics with information about a completed job.
92    /// This information is used to track job throughput, execution time distributions, and
93    /// success rates for the specified job, service, and blueprint IDs.
94    ///
95    /// # Parameters
96    /// * `job_id` - Unique identifier for the executed job
97    /// * `execution_time` - Duration of the job execution in seconds
98    /// * `service_id` - Identifier for the service that executed the job
99    /// * `blueprint_id` - Identifier for the blueprint that executed the job
100    pub fn record_job_execution(
101        &self,
102        job_id: u64,
103        execution_time: f64,
104        service_id: u64,
105        blueprint_id: u64,
106    ) {
107        self.provider
108            .record_job_execution(job_id, execution_time, service_id, blueprint_id);
109    }
110
111    /// Records metrics for a failed job execution.
112    ///
113    /// Updates error counters and metrics when a job fails, categorizing the error by type.
114    /// This allows for tracking error rates and the distribution of different failure modes
115    /// across jobs to help with debugging and reliability improvements.
116    ///
117    /// # Parameters
118    /// * `job_id` - Unique identifier for the failed job
119    /// * `error_type` - Classification of the error that occurred
120    pub fn record_job_error(&self, job_id: u64, error_type: &str) {
121        self.provider.record_job_error(job_id, error_type);
122    }
123}
124
125/// Runs a standalone metrics server with the given configuration.
126///
127/// This function initializes the metrics collection infrastructure and starts a
128/// metrics server that exposes collected metrics via HTTP endpoints compatible with
129/// Prometheus scraping. It also initializes the background tasks for collecting
130/// system and application metrics at regular intervals.
131///
132/// # Parameters
133/// * `config` - Configuration for metrics collection, retention, and server settings
134///
135/// # Returns
136/// A reference to the initialized and started metrics provider on success
137///
138/// # Errors
139/// Returns an error if the metrics provider initialization or server startup fails
140pub async fn run_metrics_server(config: MetricsConfig) -> Result<Arc<EnhancedMetricsProvider>> {
141    let otel_config = OpenTelemetryConfig::default();
142    let provider = Arc::new(EnhancedMetricsProvider::new(config, &otel_config)?);
143
144    // Start the metrics collection
145    provider.clone().start_collection().await?;
146
147    info!("Started metrics server");
148
149    Ok(provider)
150}
151
#[cfg(test)]
mod tests {
    use super::*;
    use crate::servers::prometheus::PrometheusServerConfig;
    use std::sync::Arc;

    /// Builds the configuration shared by every test in this module.
    ///
    /// Centralising it keeps the tests focused on the behaviour under test and
    /// avoids four copies of the same struct literal drifting apart.
    fn sample_config() -> MetricsConfig {
        MetricsConfig {
            prometheus_server: Some(PrometheusServerConfig::default()),
            service_id: 42,
            blueprint_id: 24,
            collection_interval_secs: 60,
            max_history: 100,
        }
    }

    /// Asserts that `provider()` hands out handles to the service's own provider.
    ///
    /// Two handles must point at the same allocation, and the strong count must
    /// account for the service's reference plus both handles. `>=` is used because
    /// the provider may keep additional internal references to itself.
    fn assert_provider_is_shared(service: &MetricsService) {
        let first = service.provider();
        let second = service.provider();

        assert!(Arc::ptr_eq(&first, &second));
        assert!(Arc::strong_count(&first) >= 3);
    }

    /// Tests that a new `MetricsService` can be created with a valid configuration.
    ///
    /// ```text
    /// MetricsConfig -> MetricsService
    /// ```
    ///
    /// Expected outcome: `MetricsService` is created and exposes a shared provider
    #[test]
    fn test_metrics_service_creation() {
        let service = MetricsService::new(sample_config())
            .expect("MetricsService::new should succeed with a valid config");

        assert_provider_is_shared(&service);
    }

    /// Tests that a new `MetricsService` can be created with custom OpenTelemetry configuration.
    ///
    /// ```text
    /// MetricsConfig + OpenTelemetryConfig -> MetricsService
    /// ```
    ///
    /// Expected outcome: `MetricsService` is created and exposes a shared provider
    #[test]
    fn test_metrics_service_with_otel_config() {
        let otel_config = OpenTelemetryConfig::default();

        let service = MetricsService::with_otel_config(sample_config(), &otel_config)
            .expect("MetricsService::with_otel_config should succeed with valid configs");

        assert_provider_is_shared(&service);
    }

    /// Tests that the `MetricsService` can record job executions.
    ///
    /// ```text
    /// MetricsService.record_job_execution() -> Job execution recorded
    /// ```
    ///
    /// Expected outcome: the call completes without panicking (smoke test)
    #[test]
    fn test_metrics_service_record_job_execution() {
        let service = MetricsService::new(sample_config())
            .expect("MetricsService::new should succeed with a valid config");

        service.record_job_execution(1, 0.5, 42, 24);
    }

    /// Tests that the `MetricsService` can record job errors.
    ///
    /// ```text
    /// MetricsService.record_job_error() -> Job error recorded
    /// ```
    ///
    /// Expected outcome: the call completes without panicking (smoke test)
    #[test]
    fn test_metrics_service_record_job_error() {
        let service = MetricsService::new(sample_config())
            .expect("MetricsService::new should succeed with a valid config");

        service.record_job_error(1, "test_error");
    }
}