Skip to main content

gestalt/
telemetry.rs

1//! OpenTelemetry helpers for provider-authored GenAI instrumentation.
2//!
3//! Gestalt injects standard `OTEL_*` environment variables into provider
4//! processes from the host telemetry configuration. This module records GenAI
5//! semantic-convention spans and metrics through the process-global
6//! OpenTelemetry API configured by the provider runtime.
7
8use std::any::type_name;
9use std::error::Error;
10use std::time::Instant;
11
12use opentelemetry::global;
13use opentelemetry::metrics::Histogram;
14use opentelemetry::trace::{Span, SpanKind, Status, Tracer};
15use opentelemetry::{Array, KeyValue, StringValue, Value};
16use std::sync::OnceLock;
17
/// OpenTelemetry instrumentation scope used by the Gestalt SDK.
///
/// This name is passed to `global::tracer` and `global::meter` when this
/// module creates its tracer and histograms.
pub const TELEMETRY_INSTRUMENTATION_NAME: &str = "gestalt.provider";
/// Default GenAI provider name used for Gestalt-owned agent and tool work.
///
/// Recorded as `gen_ai.provider.name` by `agent_invocation` and
/// `tool_execution`.
pub const GENAI_PROVIDER_NAME: &str = "gestalt";

/// GenAI operation name for chat/model calls.
///
/// Recorded as `gen_ai.operation.name` by `model_operation`.
pub const GENAI_OPERATION_CHAT: &str = "chat";
/// GenAI operation name for tool executions.
///
/// Recorded as `gen_ai.operation.name` by `tool_execution`.
pub const GENAI_OPERATION_EXECUTE_TOOL: &str = "execute_tool";
/// GenAI operation name for agent invocations.
///
/// Recorded as `gen_ai.operation.name` by `agent_invocation`.
pub const GENAI_OPERATION_INVOKE_AGENT: &str = "invoke_agent";

/// GenAI tool type for datastore-backed tools.
pub const GENAI_TOOL_TYPE_DATASTORE: &str = "datastore";
/// GenAI tool type for extension/plugin-backed tools.
///
/// This is the tool type `ToolExecutionOptions::new` starts with and the
/// fallback `tool_execution` uses when the configured tool type is blank.
pub const GENAI_TOOL_TYPE_EXTENSION: &str = "extension";
34
/// Instrument name of the histogram that records operation duration.
const OPERATION_DURATION_METRIC: &str = "gen_ai.client.operation.duration";
/// Instrument name of the histogram that records token usage.
const TOKEN_USAGE_METRIC: &str = "gen_ai.client.token.usage";
/// Explicit bucket boundaries for the duration histogram.
///
/// The values start at 0.01 and double at each step up to 81.92; the
/// histogram built in `operation_duration` declares its unit as seconds.
const OPERATION_DURATION_BUCKETS: &[f64] = &[
    0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92,
];
/// Explicit bucket boundaries for the token usage histogram.
///
/// The values are the powers of four from 1 up to 67,108,864.
const TOKEN_USAGE_BUCKETS: &[f64] = &[
    1.0, 4.0, 16.0, 64.0, 256.0, 1024.0, 4096.0, 16384.0, 65536.0, 262144.0, 1048576.0, 4194304.0,
    16777216.0, 67108864.0,
];

/// Duration histogram, created on first use by `operation_duration`.
static OPERATION_DURATION: OnceLock<Histogram<f64>> = OnceLock::new();
/// Token usage histogram, created on first use by `token_usage`.
static TOKEN_USAGE: OnceLock<Histogram<u64>> = OnceLock::new();
47
48/// Options for recording an upstream model SDK call.
49#[derive(Clone, Debug, Default)]
50pub struct ModelOperationOptions {
51    /// Upstream model provider name.
52    pub provider_name: String,
53    /// Requested model name.
54    pub request_model: String,
55    /// Common model request options to record.
56    pub request_options: RequestOptions,
57    /// Additional span attributes for the request.
58    pub request_attributes: Vec<KeyValue>,
59}
60
61impl ModelOperationOptions {
62    /// Creates model-operation options for a provider and model.
63    pub fn new(provider_name: impl Into<String>, request_model: impl Into<String>) -> Self {
64        Self {
65            provider_name: provider_name.into(),
66            request_model: request_model.into(),
67            ..Self::default()
68        }
69    }
70
71    /// Sets common model request options.
72    pub fn with_request_options(mut self, request_options: RequestOptions) -> Self {
73        self.request_options = request_options;
74        self
75    }
76
77    /// Adds one request span attribute.
78    pub fn with_request_attribute(mut self, attribute: KeyValue) -> Self {
79        self.request_attributes.push(attribute);
80        self
81    }
82}
83
/// Common GenAI request options that are useful as span attributes.
///
/// Every field is optional; `request_option_attributes` only emits an
/// attribute for fields that are set. The type is comparable with `==` so
/// option sets can be checked in assertions.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct RequestOptions {
    /// Number of choices requested (`gen_ai.request.choice.count`).
    pub choice_count: Option<i64>,
    /// Frequency penalty requested (`gen_ai.request.frequency_penalty`).
    pub frequency_penalty: Option<f64>,
    /// Maximum output tokens requested (`gen_ai.request.max_tokens`).
    pub max_tokens: Option<i64>,
    /// Presence penalty requested (`gen_ai.request.presence_penalty`).
    pub presence_penalty: Option<f64>,
    /// Sampling seed requested (`gen_ai.request.seed`).
    pub seed: Option<i64>,
    /// Sampling temperature requested (`gen_ai.request.temperature`).
    pub temperature: Option<f64>,
    /// Top-k sampling parameter requested (`gen_ai.request.top_k`).
    pub top_k: Option<i64>,
    /// Top-p sampling parameter requested (`gen_ai.request.top_p`).
    pub top_p: Option<f64>,
}
104
/// Describes a provider-owned agent turn that should be recorded.
#[derive(Clone, Debug, Default)]
pub struct AgentInvocationOptions {
    /// Name of the agent.
    pub agent_name: String,
    /// Identifier of the agent session.
    pub session_id: String,
    /// Identifier of the agent turn.
    pub turn_id: String,
    /// Model the agent turn runs against.
    pub model: String,
}

impl AgentInvocationOptions {
    /// Builds the options for an agent invocation span from its four
    /// identifying values, converting each into an owned `String`.
    pub fn new(
        agent_name: impl Into<String>,
        session_id: impl Into<String>,
        turn_id: impl Into<String>,
        model: impl Into<String>,
    ) -> Self {
        let agent_name: String = agent_name.into();
        let session_id: String = session_id.into();
        let turn_id: String = turn_id.into();
        let model: String = model.into();
        Self {
            agent_name,
            session_id,
            turn_id,
            model,
        }
    }
}
134
135/// Options for recording provider-owned tool execution.
136#[derive(Clone, Debug, Default)]
137pub struct ToolExecutionOptions {
138    /// Tool name.
139    pub tool_name: String,
140    /// Tool call id, when supplied by the model.
141    pub tool_call_id: String,
142    /// GenAI tool type.
143    pub tool_type: String,
144}
145
146impl ToolExecutionOptions {
147    /// Creates options for a tool execution span.
148    pub fn new(tool_name: impl Into<String>) -> Self {
149        Self {
150            tool_name: tool_name.into(),
151            tool_type: GENAI_TOOL_TYPE_EXTENSION.to_string(),
152            ..Self::default()
153        }
154    }
155
156    /// Sets the model-supplied tool call id.
157    pub fn with_tool_call_id(mut self, tool_call_id: impl Into<String>) -> Self {
158        self.tool_call_id = tool_call_id.into();
159        self
160    }
161
162    /// Sets the GenAI tool type.
163    pub fn with_tool_type(mut self, tool_type: impl Into<String>) -> Self {
164        self.tool_type = tool_type.into();
165        self
166    }
167}
168
/// GenAI token usage recorded on spans and token usage metrics.
///
/// Every count is optional; `GenAIOperation::record_usage` skips counts that
/// are `None`. The type is comparable with `==` so usage values can be
/// checked in assertions.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct TokenUsage {
    /// Input token count.
    pub input_tokens: Option<u64>,
    /// Output token count.
    pub output_tokens: Option<u64>,
    /// Anthropic cache-creation input token count.
    pub cache_creation_input_tokens: Option<u64>,
    /// Cached input token count.
    pub cache_read_input_tokens: Option<u64>,
    /// Reasoning output token count.
    pub reasoning_output_tokens: Option<u64>,
}
183
184/// Records a GenAI span plus operation duration and token usage metrics.
185#[derive(Debug)]
186pub struct GenAIOperation {
187    span: global::BoxedSpan,
188    started_at: Instant,
189    metric_attributes: Vec<KeyValue>,
190    error_type: Option<String>,
191    ended: bool,
192}
193
194impl GenAIOperation {
195    /// Ends the span and records operation duration.
196    pub fn end(&mut self) {
197        if self.ended {
198            return;
199        }
200        self.ended = true;
201
202        let mut attributes = self.metric_attributes.clone();
203        if let Some(error_type) = self.error_type.clone() {
204            append_or_replace(
205                &mut attributes,
206                KeyValue::new("error.type", error_type.to_string()),
207            );
208        }
209        operation_duration().record(self.started_at.elapsed().as_secs_f64(), &attributes);
210        self.span.end();
211    }
212
213    /// Marks the operation span and duration metric as failed.
214    pub fn mark_error(&mut self, error_type: impl Into<String>, description: impl Into<String>) {
215        let error_type = clean_string(error_type.into()).unwrap_or_else(|| "_OTHER".to_string());
216        self.error_type = Some(error_type.clone());
217        append_or_replace(
218            &mut self.metric_attributes,
219            KeyValue::new("error.type", error_type.clone()),
220        );
221        self.span
222            .set_attribute(KeyValue::new("error.type", error_type.clone()));
223        self.span.set_status(Status::error(description.into()));
224    }
225
226    /// Records an error object and marks the operation as failed.
227    pub fn record_error<E>(&mut self, err: &E)
228    where
229        E: Error + 'static,
230    {
231        self.mark_error(type_name::<E>(), err.to_string());
232        self.span.record_error(err);
233    }
234
235    /// Sets a span attribute. `gen_ai.response.model` is also added to metric attributes.
236    pub fn set_attribute(&mut self, attribute: KeyValue) {
237        if attribute.key.as_str() == "gen_ai.response.model" {
238            append_or_replace(&mut self.metric_attributes, attribute.clone());
239        }
240        self.span.set_attribute(attribute);
241    }
242
243    /// Attaches common GenAI response metadata to the span.
244    pub fn set_response_metadata(
245        &mut self,
246        response_id: Option<&str>,
247        response_model: Option<&str>,
248        finish_reasons: &[&str],
249    ) {
250        if let Some(response_id) = clean_string(response_id.unwrap_or_default()) {
251            self.set_attribute(KeyValue::new("gen_ai.response.id", response_id));
252        }
253        if let Some(response_model) = clean_string(response_model.unwrap_or_default()) {
254            self.set_attribute(KeyValue::new("gen_ai.response.model", response_model));
255        }
256        let finish_reasons = finish_reasons
257            .iter()
258            .filter_map(|reason| clean_string(*reason).map(StringValue::from))
259            .collect::<Vec<_>>();
260        if !finish_reasons.is_empty() {
261            self.set_attribute(KeyValue::new(
262                "gen_ai.response.finish_reasons",
263                Value::Array(Array::String(finish_reasons)),
264            ));
265        }
266    }
267
268    /// Records GenAI token usage on the span and token usage metric.
269    pub fn record_usage(&mut self, usage: TokenUsage) {
270        self.set_u64_attribute("gen_ai.usage.input_tokens", usage.input_tokens);
271        self.set_u64_attribute("gen_ai.usage.output_tokens", usage.output_tokens);
272        self.set_u64_attribute(
273            "gen_ai.usage.cache_creation.input_tokens",
274            usage.cache_creation_input_tokens,
275        );
276        self.set_u64_attribute(
277            "gen_ai.usage.cache_read.input_tokens",
278            usage.cache_read_input_tokens,
279        );
280        self.set_u64_attribute(
281            "gen_ai.usage.reasoning.output_tokens",
282            usage.reasoning_output_tokens,
283        );
284
285        self.record_token_usage(usage.input_tokens, "input");
286        self.record_token_usage(usage.output_tokens, "output");
287    }
288
289    fn set_u64_attribute(&mut self, key: &'static str, value: Option<u64>) {
290        let Some(value) = value else {
291            return;
292        };
293        if value <= i64::MAX as u64 {
294            self.set_attribute(KeyValue::new(key, value as i64));
295        }
296    }
297
298    fn record_token_usage(&self, tokens: Option<u64>, token_type: &'static str) {
299        let Some(tokens) = tokens else {
300            return;
301        };
302        let mut attributes = self.metric_attributes.clone();
303        append_or_replace(
304            &mut attributes,
305            KeyValue::new("gen_ai.token.type", token_type),
306        );
307        token_usage().record(tokens, &attributes);
308    }
309}
310
311impl Drop for GenAIOperation {
312    fn drop(&mut self) {
313        self.end();
314    }
315}
316
317/// Starts a GenAI client span for an upstream model SDK call.
318pub fn model_operation(options: ModelOperationOptions) -> GenAIOperation {
319    let provider_name = clean_string(options.provider_name).unwrap_or_else(|| "_OTHER".to_string());
320    let request_model = clean_string(options.request_model).unwrap_or_default();
321    let metric_attributes = vec![
322        KeyValue::new("gen_ai.operation.name", GENAI_OPERATION_CHAT),
323        KeyValue::new("gen_ai.provider.name", provider_name),
324        KeyValue::new("gen_ai.request.model", request_model.clone()),
325    ];
326    let mut span_attributes = metric_attributes.clone();
327    span_attributes.extend(request_option_attributes(options.request_options));
328    span_attributes.extend(options.request_attributes);
329
330    start_operation(
331        span_name(GENAI_OPERATION_CHAT, &request_model),
332        SpanKind::Client,
333        span_attributes,
334        metric_attributes,
335    )
336}
337
338/// Starts a GenAI internal span for provider-owned agent turn execution.
339pub fn agent_invocation(options: AgentInvocationOptions) -> GenAIOperation {
340    let agent_name = clean_string(options.agent_name).unwrap_or_else(|| "provider".to_string());
341    let model = clean_string(options.model).unwrap_or_default();
342    let span_attributes = vec![
343        KeyValue::new("gen_ai.operation.name", GENAI_OPERATION_INVOKE_AGENT),
344        KeyValue::new("gen_ai.provider.name", GENAI_PROVIDER_NAME),
345        KeyValue::new("gen_ai.agent.name", agent_name.clone()),
346        KeyValue::new(
347            "gen_ai.conversation.id",
348            clean_string(options.session_id).unwrap_or_default(),
349        ),
350        KeyValue::new("gen_ai.request.model", model.clone()),
351        KeyValue::new(
352            "gestalt.agent.turn_id",
353            clean_string(options.turn_id).unwrap_or_default(),
354        ),
355    ];
356    let metric_attributes = vec![
357        KeyValue::new("gen_ai.operation.name", GENAI_OPERATION_INVOKE_AGENT),
358        KeyValue::new("gen_ai.provider.name", GENAI_PROVIDER_NAME),
359        KeyValue::new("gen_ai.agent.name", agent_name.clone()),
360        KeyValue::new("gen_ai.request.model", model),
361    ];
362
363    start_operation(
364        span_name(GENAI_OPERATION_INVOKE_AGENT, &agent_name),
365        SpanKind::Internal,
366        span_attributes,
367        metric_attributes,
368    )
369}
370
371/// Starts a GenAI internal span for provider-owned tool execution.
372pub fn tool_execution(options: ToolExecutionOptions) -> GenAIOperation {
373    let tool_name = clean_string(options.tool_name).unwrap_or_else(|| "_OTHER".to_string());
374    let tool_type =
375        clean_string(options.tool_type).unwrap_or_else(|| GENAI_TOOL_TYPE_EXTENSION.to_string());
376    let span_attributes = vec![
377        KeyValue::new("gen_ai.operation.name", GENAI_OPERATION_EXECUTE_TOOL),
378        KeyValue::new("gen_ai.provider.name", GENAI_PROVIDER_NAME),
379        KeyValue::new("gen_ai.tool.name", tool_name.clone()),
380        KeyValue::new(
381            "gen_ai.tool.call.id",
382            clean_string(options.tool_call_id).unwrap_or_default(),
383        ),
384        KeyValue::new("gen_ai.tool.type", tool_type.clone()),
385    ];
386    let metric_attributes = vec![
387        KeyValue::new("gen_ai.operation.name", GENAI_OPERATION_EXECUTE_TOOL),
388        KeyValue::new("gen_ai.provider.name", GENAI_PROVIDER_NAME),
389        KeyValue::new("gen_ai.tool.name", tool_name.clone()),
390        KeyValue::new("gen_ai.tool.type", tool_type),
391    ];
392
393    start_operation(
394        span_name(GENAI_OPERATION_EXECUTE_TOOL, &tool_name),
395        SpanKind::Internal,
396        span_attributes,
397        metric_attributes,
398    )
399}
400
401fn start_operation(
402    name: String,
403    kind: SpanKind,
404    span_attributes: Vec<KeyValue>,
405    metric_attributes: Vec<KeyValue>,
406) -> GenAIOperation {
407    let tracer = global::tracer(TELEMETRY_INSTRUMENTATION_NAME);
408    let span = tracer
409        .span_builder(name)
410        .with_kind(kind)
411        .with_attributes(span_attributes)
412        .start(&tracer);
413
414    GenAIOperation {
415        span,
416        started_at: Instant::now(),
417        metric_attributes,
418        error_type: None,
419        ended: false,
420    }
421}
422
423fn operation_duration() -> &'static Histogram<f64> {
424    OPERATION_DURATION.get_or_init(|| {
425        global::meter(TELEMETRY_INSTRUMENTATION_NAME)
426            .f64_histogram(OPERATION_DURATION_METRIC)
427            .with_unit("s")
428            .with_description("GenAI operation duration.")
429            .with_boundaries(OPERATION_DURATION_BUCKETS.to_vec())
430            .build()
431    })
432}
433
434fn token_usage() -> &'static Histogram<u64> {
435    TOKEN_USAGE.get_or_init(|| {
436        global::meter(TELEMETRY_INSTRUMENTATION_NAME)
437            .u64_histogram(TOKEN_USAGE_METRIC)
438            .with_unit("{token}")
439            .with_description("Number of input and output tokens used.")
440            .with_boundaries(TOKEN_USAGE_BUCKETS.to_vec())
441            .build()
442    })
443}
444
445fn request_option_attributes(options: RequestOptions) -> Vec<KeyValue> {
446    let mut attributes = Vec::new();
447    push_i64(
448        &mut attributes,
449        "gen_ai.request.choice.count",
450        options.choice_count,
451    );
452    push_f64(
453        &mut attributes,
454        "gen_ai.request.frequency_penalty",
455        options.frequency_penalty,
456    );
457    push_i64(
458        &mut attributes,
459        "gen_ai.request.max_tokens",
460        options.max_tokens,
461    );
462    push_f64(
463        &mut attributes,
464        "gen_ai.request.presence_penalty",
465        options.presence_penalty,
466    );
467    push_i64(&mut attributes, "gen_ai.request.seed", options.seed);
468    push_f64(
469        &mut attributes,
470        "gen_ai.request.temperature",
471        options.temperature,
472    );
473    push_i64(&mut attributes, "gen_ai.request.top_k", options.top_k);
474    push_f64(&mut attributes, "gen_ai.request.top_p", options.top_p);
475    attributes
476}
477
478fn push_i64(attributes: &mut Vec<KeyValue>, key: &'static str, value: Option<i64>) {
479    if let Some(value) = value {
480        attributes.push(KeyValue::new(key, value));
481    }
482}
483
484fn push_f64(attributes: &mut Vec<KeyValue>, key: &'static str, value: Option<f64>) {
485    if let Some(value) = value.filter(|value| value.is_finite()) {
486        attributes.push(KeyValue::new(key, value));
487    }
488}
489
490fn append_or_replace(attributes: &mut Vec<KeyValue>, attribute: KeyValue) {
491    if let Some(existing) = attributes
492        .iter_mut()
493        .find(|existing| existing.key.as_str() == attribute.key.as_str())
494    {
495        *existing = attribute;
496    } else {
497        attributes.push(attribute);
498    }
499}
500
/// Builds a span name of the form `"<operation> <subject>"`.
///
/// The subject is trimmed first; when nothing remains the name is just the
/// operation.
fn span_name(operation: &'static str, subject: &str) -> String {
    match subject.trim() {
        "" => operation.to_owned(),
        trimmed => [operation, trimmed].join(" "),
    }
}
509
/// Trims surrounding whitespace and returns the result as an owned string,
/// or `None` when nothing is left after trimming.
fn clean_string(value: impl Into<String>) -> Option<String> {
    let owned: String = value.into();
    match owned.trim() {
        "" => None,
        trimmed => Some(trimmed.to_owned()),
    }
}
514
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt;

    /// Minimal error type used to check which type name `record_error` reports.
    #[derive(Debug)]
    struct CustomTelemetryError;

    impl fmt::Display for CustomTelemetryError {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "custom telemetry error")
        }
    }

    impl Error for CustomTelemetryError {}

    /// Returns whether `attributes` holds an entry with the given key.
    fn has_attribute(attributes: &[KeyValue], key: &str) -> bool {
        attributes.iter().any(|attr| attr.key.as_str() == key)
    }

    #[test]
    fn model_metric_attributes_exclude_request_options() {
        let operation = model_operation(
            ModelOperationOptions::new("openai", "gpt-4.1").with_request_options(RequestOptions {
                seed: Some(123),
                temperature: Some(0.2),
                ..RequestOptions::default()
            }),
        );

        assert!(!has_attribute(
            &operation.metric_attributes,
            "gen_ai.request.seed"
        ));
        assert!(!has_attribute(
            &operation.metric_attributes,
            "gen_ai.request.temperature"
        ));
    }

    /// Exercises every operation kind against the default global API, with
    /// no SDK installed by the test, and checks the state each call leaves
    /// behind.
    #[test]
    fn operations_record_without_configured_sdk() {
        let mut operation = model_operation(ModelOperationOptions::new("openai", "gpt-4.1"));
        operation.set_response_metadata(Some("resp-123"), Some("gpt-4.1"), &["stop"]);
        operation.record_usage(TokenUsage {
            input_tokens: Some(12),
            output_tokens: Some(34),
            ..TokenUsage::default()
        });
        operation.end();
        assert!(operation.ended);
        assert!(operation.error_type.is_none());
        // The response model is the one response attribute mirrored to metrics.
        assert!(has_attribute(
            &operation.metric_attributes,
            "gen_ai.response.model"
        ));
        assert!(!has_attribute(
            &operation.metric_attributes,
            "gen_ai.response.id"
        ));

        let mut agent = agent_invocation(AgentInvocationOptions::new(
            "simple",
            "session-123",
            "turn-123",
            "claude-opus-4-1",
        ));
        agent.mark_error("agent_error", "agent failed");
        agent.end();
        assert!(agent.ended);
        assert_eq!(agent.error_type.as_deref(), Some("agent_error"));

        let mut tool = tool_execution(
            ToolExecutionOptions::new("github.search").with_tool_call_id("call-123"),
        );
        tool.mark_error("tool_error", "tool failed");
        tool.end();
        assert!(tool.ended);
        assert_eq!(tool.error_type.as_deref(), Some("tool_error"));
    }

    #[test]
    fn mark_error_falls_back_for_blank_type() {
        let mut operation = model_operation(ModelOperationOptions::new("openai", "gpt-4.1"));

        operation.mark_error("   ", "failed");

        assert_eq!(operation.error_type.as_deref(), Some("_OTHER"));
    }

    #[test]
    fn record_error_uses_concrete_error_type() {
        let mut operation = model_operation(ModelOperationOptions::new("openai", "gpt-4.1"));
        let err = CustomTelemetryError;

        operation.record_error(&err);

        assert_eq!(
            operation.error_type.as_deref(),
            Some("gestalt::telemetry::tests::CustomTelemetryError")
        );
    }
}