// meerkat_core/realtime_transcript.rs
//! Typed realtime transcript append seam.
//!
//! Provider adapters translate provider-native realtime events into these
//! identity-bearing events. The session layer owns idempotency, causal ordering,
//! and the decision to materialize canonical transcript messages.
6
use serde::{Deserialize, Serialize};

use crate::types::{StopReason, Usage};
10
/// Durable session metadata key for realtime transcript append state.
///
/// The key is the literal string `realtime_transcript_state`. Because the key
/// is described as durable, treat the value as part of the persisted format
/// rather than as an implementation detail.
pub const SESSION_REALTIME_TRANSCRIPT_STATE_KEY: &str = "realtime_transcript_state";
13
14/// Provider-neutral role for a realtime transcript item.
15#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
16#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
17#[serde(rename_all = "snake_case")]
18pub enum RealtimeTranscriptRole {
19 User,
20 Assistant,
21}
22
23/// Output lane carried by an assistant realtime transcript item.
24///
25/// T9/T10: distinguishes display text (authored output the model writes,
26/// e.g. OpenAI realtime `response.output_text.delta`) from spoken transcript
27/// (text derived from audio output, e.g. `response.output_audio_transcript.*`).
28/// The materializer at [`crate::session::Session::append_realtime_transcript_event`]
29/// dispatches on this to flush either [`crate::types::AssistantBlock::Text`]
30/// (for `Display`) or [`crate::types::AssistantBlock::Transcript`] with
31/// `source: TranscriptSource::Spoken` (for `Spoken`).
32///
33/// `Display` is the default for items that arrive only via
34/// [`RealtimeTranscriptEvent::AssistantTextDelta`]; an item is upgraded to
35/// `Spoken` the first time an [`RealtimeTranscriptEvent::AssistantTranscriptDelta`]
36/// fragment arrives for it. Mixed-lane content on the same `item_id` is not
37/// expected from any provider today; if observed the **first** lane wins
38/// (the materializer cannot retroactively re-classify a partially-flushed
39/// item) and a `tracing::warn!` is emitted.
40#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
41#[derive(
42 Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, Default,
43)]
44#[serde(rename_all = "snake_case")]
45#[non_exhaustive]
46pub enum TranscriptLane {
47 #[default]
48 Display,
49 Spoken,
50}
51
52/// A typed, identity-bearing realtime transcript event consumed by the session.
53#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
54#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
55#[serde(tag = "type", rename_all = "snake_case")]
56pub enum RealtimeTranscriptEvent {
57 /// Observe a provider item and its causal predecessor without committing
58 /// content yet.
59 ItemObserved {
60 item_id: String,
61 previous_item_id: Option<String>,
62 role: RealtimeTranscriptRole,
63 response_id: Option<String>,
64 },
65 /// Observe a provider item that participates in provider causal ordering
66 /// but must not materialize transcript content.
67 ItemSkipped {
68 item_id: String,
69 previous_item_id: Option<String>,
70 },
71 /// Provider finalized the transcript for a user input item.
72 UserTranscriptFinal {
73 item_id: String,
74 previous_item_id: Option<String>,
75 content_index: u32,
76 text: String,
77 },
78 /// Provider emitted an assistant **display-text** delta for an output
79 /// item — authored text the model writes (e.g. OpenAI realtime
80 /// `response.output_text.delta`).
81 ///
82 /// Materializes as [`crate::types::AssistantBlock::Text`].
83 AssistantTextDelta {
84 response_id: String,
85 delta_id: String,
86 item_id: String,
87 previous_item_id: Option<String>,
88 content_index: u32,
89 delta: String,
90 },
91 /// Provider emitted an assistant **spoken-transcript** delta for an
92 /// output item — text derived from audio output (e.g. OpenAI realtime
93 /// `response.output_audio_transcript.delta`).
94 ///
95 /// Identity shape mirrors [`Self::AssistantTextDelta`] so the session's
96 /// idempotent ordering / staging logic owns dedup uniformly across
97 /// lanes. Materializes as [`crate::types::AssistantBlock::Transcript`]
98 /// with `source: TranscriptSource::Spoken` (T9/T10).
99 AssistantTranscriptDelta {
100 response_id: String,
101 delta_id: String,
102 item_id: String,
103 previous_item_id: Option<String>,
104 content_index: u32,
105 delta: String,
106 },
107 /// Provider reported the assistant output item was truncated to the heard
108 /// transcript prefix.
109 AssistantTranscriptTruncated {
110 response_id: String,
111 item_id: String,
112 content_index: u32,
113 text: String,
114 },
115 /// R5-7: provider supplied authoritative final transcript text for an
116 /// assistant output item, overriding any incomplete delta accumulation.
117 ///
118 /// Necessary for two cases:
119 /// 1. Final-only providers that emit a single `AssistantTranscriptFinal`
120 /// observation without prior deltas.
121 /// 2. Recovery from delta loss (R5-1: lossy media lane back-pressure
122 /// may drop transcript deltas; the final's text is the authoritative
123 /// reconciliation).
124 ///
125 /// The materializer locates the staged item by
126 /// `(response_id, item_id, content_index)`, replaces its accumulated
127 /// content with `text`, and (if no item is staged yet) creates one on
128 /// the spoken lane. Flush still happens via `AssistantTurnCompleted`;
129 /// this variant only updates the staged content.
130 AssistantTranscriptFinalText {
131 response_id: String,
132 item_id: String,
133 content_index: u32,
134 text: String,
135 },
136 /// Provider turn reached a terminal boundary. The session decides which
137 /// staged assistant items, if any, are now canonical.
138 AssistantTurnCompleted {
139 response_id: String,
140 stop_reason: StopReason,
141 usage: Usage,
142 },
143 /// Provider turn was interrupted before terminal materialization.
144 AssistantTurnInterrupted { response_id: String },
145}
146
147/// Canonical message materialized by applying a realtime transcript event.
148#[derive(Debug, Clone, PartialEq)]
149pub enum RealtimeTranscriptMaterializedMessage {
150 User {
151 item_id: String,
152 text: String,
153 },
154 Assistant {
155 item_id: String,
156 response_id: String,
157 text: String,
158 stop_reason: StopReason,
159 usage: Usage,
160 /// T9/T10: which output lane the staged content arrived on.
161 /// Drives whether the materializer flushes
162 /// [`crate::types::AssistantBlock::Text`] (Display) or
163 /// [`crate::types::AssistantBlock::Transcript`] with
164 /// `source: TranscriptSource::Spoken` (Spoken).
165 lane: TranscriptLane,
166 },
167}
168
169/// Result of applying a realtime transcript event.
170#[derive(Debug, Clone, Default, PartialEq)]
171pub struct RealtimeTranscriptApplyOutcome {
172 pub materialized_messages: Vec<RealtimeTranscriptMaterializedMessage>,
173}
174
175impl RealtimeTranscriptApplyOutcome {
176 #[must_use]
177 pub fn is_inert(&self) -> bool {
178 self.materialized_messages.is_empty()
179 }
180}