// clnrm_core/services/readiness.rs
//! Service readiness checks based on OTEL span detection.
//!
//! This module provides span-based health checking for services.
//! A service can name a span to wait for before it is marked ready,
//! which enables precise synchronization based on actual service behavior.
6
7use crate::error::{CleanroomError, Result};
8use std::time::{Duration, Instant};
9use tokio::time::sleep;
10
/// Number of seconds to wait for a span when no explicit timeout is supplied.
pub const DEFAULT_SPAN_WAIT_TIMEOUT_SECS: u64 = 30;

/// Pause, in milliseconds, between two consecutive checks of the span source.
const SPAN_POLL_INTERVAL_MS: u64 = 500;
16
17/// Configuration for span-based readiness checks
18#[derive(Debug, Clone)]
19pub struct SpanReadinessConfig {
20 /// Name of the span to wait for
21 pub span_name: String,
22 /// Timeout duration for waiting
23 pub timeout: Duration,
24}
25
26impl SpanReadinessConfig {
27 /// Create a new span readiness configuration
28 pub fn new(span_name: String, timeout_secs: Option<u64>) -> Self {
29 let timeout = Duration::from_secs(timeout_secs.unwrap_or(DEFAULT_SPAN_WAIT_TIMEOUT_SECS));
30 Self { span_name, timeout }
31 }
32}
33
/// Where to look for evidence that a span has been emitted.
#[derive(Debug, Clone)]
pub enum SpanSource {
    /// Captured stdout text (for the stdout exporter); the text is searched
    /// for the span name.
    Stdout(String),
    /// OTLP collector reached over HTTP.
    OtlpHttp {
        /// Base URL of the collector; `/v1/traces` is appended when querying.
        endpoint: String,
    },
    /// OTLP collector reached over gRPC.
    OtlpGrpc {
        /// Address of the collector's gRPC endpoint.
        endpoint: String,
    },
}
44
45/// Wait for a span to appear in the specified source
46///
47/// This function polls the span source until the specified span name appears
48/// or the timeout is reached. It follows core team standards with proper error
49/// handling and no unwrap/expect calls.
50///
51/// # Arguments
52///
53/// * `config` - Readiness configuration with span name and timeout
54/// * `source` - Source to check for span appearance
55///
56/// # Returns
57///
58/// * `Ok(())` if span was detected before timeout
59/// * `Err(CleanroomError)` if timeout occurred or span check failed
60///
61/// # Errors
62///
63/// Returns error if:
64/// - Timeout is reached without detecting span
65/// - Source is inaccessible
66/// - Span parsing fails
67pub async fn wait_for_span(config: &SpanReadinessConfig, source: &SpanSource) -> Result<()> {
68 let start_time = Instant::now();
69 let poll_interval = Duration::from_millis(SPAN_POLL_INTERVAL_MS);
70
71 loop {
72 // Check if timeout reached
73 if start_time.elapsed() >= config.timeout {
74 return Err(CleanroomError::timeout_error(format!(
75 "Span '{}' not detected within {} seconds",
76 config.span_name,
77 config.timeout.as_secs()
78 ))
79 .with_context("Service readiness check"));
80 }
81
82 // Check span source
83 match check_span_in_source(&config.span_name, source).await {
84 Ok(true) => {
85 // Span detected - service is ready
86 tracing::info!(
87 span_name = %config.span_name,
88 elapsed_ms = start_time.elapsed().as_millis(),
89 "Service ready: span detected"
90 );
91 return Ok(());
92 }
93 Ok(false) => {
94 // Span not found yet, continue polling
95 sleep(poll_interval).await;
96 }
97 Err(e) => {
98 // Error checking span - log and retry
99 tracing::warn!(
100 span_name = %config.span_name,
101 error = %e,
102 "Failed to check span, retrying"
103 );
104 sleep(poll_interval).await;
105 }
106 }
107 }
108}
109
110/// Check if a span exists in the specified source
111///
112/// # Arguments
113///
114/// * `span_name` - Name of the span to search for
115/// * `source` - Source to check (stdout, OTLP HTTP, or OTLP gRPC)
116///
117/// # Returns
118///
119/// * `Ok(true)` if span was found
120/// * `Ok(false)` if span was not found
121/// * `Err(CleanroomError)` if checking failed
122async fn check_span_in_source(span_name: &str, source: &SpanSource) -> Result<bool> {
123 match source {
124 SpanSource::Stdout(output) => check_span_in_stdout(span_name, output),
125 SpanSource::OtlpHttp { endpoint } => check_span_in_otlp_http(span_name, endpoint).await,
126 SpanSource::OtlpGrpc { endpoint } => check_span_in_otlp_grpc(span_name, endpoint).await,
127 }
128}
129
130/// Check if span appears in stdout output
131///
132/// This is used when services export spans to stdout (common in testing).
133/// We search for span name patterns in the captured output.
134fn check_span_in_stdout(span_name: &str, output: &str) -> Result<bool> {
135 // Check for span name in various formats that stdout exporter might use
136 let patterns = [
137 format!("\"name\":\"{}", span_name), // JSON format
138 format!("name: {}", span_name), // YAML-like format
139 format!("span.name={}", span_name), // Key-value format
140 format!("SpanName({})", span_name), // Debug format
141 span_name.to_string(), // Direct match
142 ];
143
144 for pattern in &patterns {
145 if output.contains(pattern) {
146 return Ok(true);
147 }
148 }
149
150 Ok(false)
151}
152
153/// Check if span exists in OTLP HTTP collector
154///
155/// Queries the collector's trace endpoint to see if span has been recorded.
156/// This requires the collector to expose a query API.
157async fn check_span_in_otlp_http(span_name: &str, endpoint: &str) -> Result<bool> {
158 // Build query URL for searching spans
159 let query_url = format!("{}/v1/traces?span_name={}", endpoint, span_name);
160
161 // Create HTTP client with timeout
162 let client = reqwest::Client::builder()
163 .timeout(Duration::from_secs(5))
164 .build()
165 .map_err(|e| {
166 CleanroomError::network_error("Failed to create HTTP client")
167 .with_context("OTLP HTTP span check")
168 .with_source(e.to_string())
169 })?;
170
171 // Query collector
172 match client.get(&query_url).send().await {
173 Ok(response) => {
174 if response.status().is_success() {
175 let body = response.text().await.map_err(|e| {
176 CleanroomError::network_error("Failed to read response body")
177 .with_source(e.to_string())
178 })?;
179
180 // Check if response contains our span
181 Ok(body.contains(span_name))
182 } else {
183 // Collector not ready or span not found
184 Ok(false)
185 }
186 }
187 Err(e) => {
188 // Connection failed - collector might not be ready yet
189 tracing::debug!(
190 endpoint = %endpoint,
191 error = %e,
192 "OTLP HTTP check failed, collector may not be ready"
193 );
194 Ok(false)
195 }
196 }
197}
198
199/// Check if span exists in OTLP gRPC collector
200///
201/// Queries the collector via gRPC to check for span existence.
202async fn check_span_in_otlp_grpc(_span_name: &str, endpoint: &str) -> Result<bool> {
203 // CRITICAL: Placeholder implementation
204 // Real implementation requires:
205 // 1. gRPC client setup with tonic/grpcio
206 // 2. Connection to OTLP gRPC endpoint
207 // 3. Query traces API
208 // 4. Search for span by name
209 //
210 // For MVP, we'll return false to indicate span not found
211 // This allows tests to timeout with clear error message
212 tracing::warn!(
213 endpoint = %endpoint,
214 "OTLP gRPC span checking not yet implemented, returning false"
215 );
216 Ok(false)
217}