clnrm_core/services/
readiness.rs

1//! Service readiness checks based on OTEL span detection
2//!
3//! This module provides span-based health checking for services.
4//! Services can specify a span name to wait for before being marked ready,
5//! enabling precise synchronization based on actual service behavior.
6
7use crate::error::{CleanroomError, Result};
8use std::time::{Duration, Instant};
9use tokio::time::sleep;
10
/// Default timeout, in seconds, for waiting for spans (30 seconds).
///
/// Used by `SpanReadinessConfig::new` when the caller passes no explicit timeout.
pub const DEFAULT_SPAN_WAIT_TIMEOUT_SECS: u64 = 30;

/// Poll interval, in milliseconds, between checks for span appearance (500ms).
const SPAN_POLL_INTERVAL_MS: u64 = 500;
16
17/// Configuration for span-based readiness checks
18#[derive(Debug, Clone)]
19pub struct SpanReadinessConfig {
20    /// Name of the span to wait for
21    pub span_name: String,
22    /// Timeout duration for waiting
23    pub timeout: Duration,
24}
25
26impl SpanReadinessConfig {
27    /// Create a new span readiness configuration
28    pub fn new(span_name: String, timeout_secs: Option<u64>) -> Self {
29        let timeout = Duration::from_secs(timeout_secs.unwrap_or(DEFAULT_SPAN_WAIT_TIMEOUT_SECS));
30        Self { span_name, timeout }
31    }
32}
33
/// Span source for checking span appearance
///
/// Selects which backend a readiness check inspects when looking for a span.
#[derive(Debug, Clone)]
pub enum SpanSource {
    /// Captured stdout text that is searched for the span name (for stdout exporter)
    Stdout(String),
    /// Query OTLP collector over HTTP; `endpoint` is the base URL to which
    /// `/v1/traces` is appended when querying
    OtlpHttp { endpoint: String },
    /// Query OTLP gRPC collector; `endpoint` is currently only logged because
    /// gRPC span checking is a placeholder that always reports "not found"
    OtlpGrpc { endpoint: String },
}
44
45/// Wait for a span to appear in the specified source
46///
47/// This function polls the span source until the specified span name appears
48/// or the timeout is reached. It follows core team standards with proper error
49/// handling and no unwrap/expect calls.
50///
51/// # Arguments
52///
53/// * `config` - Readiness configuration with span name and timeout
54/// * `source` - Source to check for span appearance
55///
56/// # Returns
57///
58/// * `Ok(())` if span was detected before timeout
59/// * `Err(CleanroomError)` if timeout occurred or span check failed
60///
61/// # Errors
62///
63/// Returns error if:
64/// - Timeout is reached without detecting span
65/// - Source is inaccessible
66/// - Span parsing fails
67pub async fn wait_for_span(config: &SpanReadinessConfig, source: &SpanSource) -> Result<()> {
68    let start_time = Instant::now();
69    let poll_interval = Duration::from_millis(SPAN_POLL_INTERVAL_MS);
70
71    loop {
72        // Check if timeout reached
73        if start_time.elapsed() >= config.timeout {
74            return Err(CleanroomError::timeout_error(format!(
75                "Span '{}' not detected within {} seconds",
76                config.span_name,
77                config.timeout.as_secs()
78            ))
79            .with_context("Service readiness check"));
80        }
81
82        // Check span source
83        match check_span_in_source(&config.span_name, source).await {
84            Ok(true) => {
85                // Span detected - service is ready
86                tracing::info!(
87                    span_name = %config.span_name,
88                    elapsed_ms = start_time.elapsed().as_millis(),
89                    "Service ready: span detected"
90                );
91                return Ok(());
92            }
93            Ok(false) => {
94                // Span not found yet, continue polling
95                sleep(poll_interval).await;
96            }
97            Err(e) => {
98                // Error checking span - log and retry
99                tracing::warn!(
100                    span_name = %config.span_name,
101                    error = %e,
102                    "Failed to check span, retrying"
103                );
104                sleep(poll_interval).await;
105            }
106        }
107    }
108}
109
110/// Check if a span exists in the specified source
111///
112/// # Arguments
113///
114/// * `span_name` - Name of the span to search for
115/// * `source` - Source to check (stdout, OTLP HTTP, or OTLP gRPC)
116///
117/// # Returns
118///
119/// * `Ok(true)` if span was found
120/// * `Ok(false)` if span was not found
121/// * `Err(CleanroomError)` if checking failed
122async fn check_span_in_source(span_name: &str, source: &SpanSource) -> Result<bool> {
123    match source {
124        SpanSource::Stdout(output) => check_span_in_stdout(span_name, output),
125        SpanSource::OtlpHttp { endpoint } => check_span_in_otlp_http(span_name, endpoint).await,
126        SpanSource::OtlpGrpc { endpoint } => check_span_in_otlp_grpc(span_name, endpoint).await,
127    }
128}
129
130/// Check if span appears in stdout output
131///
132/// This is used when services export spans to stdout (common in testing).
133/// We search for span name patterns in the captured output.
134fn check_span_in_stdout(span_name: &str, output: &str) -> Result<bool> {
135    // Check for span name in various formats that stdout exporter might use
136    let patterns = [
137        format!("\"name\":\"{}", span_name), // JSON format
138        format!("name: {}", span_name),      // YAML-like format
139        format!("span.name={}", span_name),  // Key-value format
140        format!("SpanName({})", span_name),  // Debug format
141        span_name.to_string(),               // Direct match
142    ];
143
144    for pattern in &patterns {
145        if output.contains(pattern) {
146            return Ok(true);
147        }
148    }
149
150    Ok(false)
151}
152
153/// Check if span exists in OTLP HTTP collector
154///
155/// Queries the collector's trace endpoint to see if span has been recorded.
156/// This requires the collector to expose a query API.
157async fn check_span_in_otlp_http(span_name: &str, endpoint: &str) -> Result<bool> {
158    // Build query URL for searching spans
159    let query_url = format!("{}/v1/traces?span_name={}", endpoint, span_name);
160
161    // Create HTTP client with timeout
162    let client = reqwest::Client::builder()
163        .timeout(Duration::from_secs(5))
164        .build()
165        .map_err(|e| {
166            CleanroomError::network_error("Failed to create HTTP client")
167                .with_context("OTLP HTTP span check")
168                .with_source(e.to_string())
169        })?;
170
171    // Query collector
172    match client.get(&query_url).send().await {
173        Ok(response) => {
174            if response.status().is_success() {
175                let body = response.text().await.map_err(|e| {
176                    CleanroomError::network_error("Failed to read response body")
177                        .with_source(e.to_string())
178                })?;
179
180                // Check if response contains our span
181                Ok(body.contains(span_name))
182            } else {
183                // Collector not ready or span not found
184                Ok(false)
185            }
186        }
187        Err(e) => {
188            // Connection failed - collector might not be ready yet
189            tracing::debug!(
190                endpoint = %endpoint,
191                error = %e,
192                "OTLP HTTP check failed, collector may not be ready"
193            );
194            Ok(false)
195        }
196    }
197}
198
199/// Check if span exists in OTLP gRPC collector
200///
201/// Queries the collector via gRPC to check for span existence.
202async fn check_span_in_otlp_grpc(_span_name: &str, endpoint: &str) -> Result<bool> {
203    // CRITICAL: Placeholder implementation
204    // Real implementation requires:
205    // 1. gRPC client setup with tonic/grpcio
206    // 2. Connection to OTLP gRPC endpoint
207    // 3. Query traces API
208    // 4. Search for span by name
209    //
210    // For MVP, we'll return false to indicate span not found
211    // This allows tests to timeout with clear error message
212    tracing::warn!(
213        endpoint = %endpoint,
214        "OTLP gRPC span checking not yet implemented, returning false"
215    );
216    Ok(false)
217}