Skip to main content

meerkat_core/
realtime_transcript.rs

//! Typed realtime transcript append seam.
//!
//! Provider adapters translate provider-native realtime events into these
//! identity-bearing events. The session layer owns idempotency, causal ordering,
//! and the decision to materialize canonical transcript messages.
6
7use serde::{Deserialize, Serialize};
8
9use crate::types::{StopReason, Usage};
10
/// Key under which realtime transcript append state is kept in durable
/// session metadata.
pub const SESSION_REALTIME_TRANSCRIPT_STATE_KEY: &str = "realtime_transcript_state";
13
14/// Provider-neutral role for a realtime transcript item.
15#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
16#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
17#[serde(rename_all = "snake_case")]
18pub enum RealtimeTranscriptRole {
19    User,
20    Assistant,
21}
22
23/// Output lane carried by an assistant realtime transcript item.
24///
25/// T9/T10: distinguishes display text (authored output the model writes,
26/// e.g. OpenAI realtime `response.output_text.delta`) from spoken transcript
27/// (text derived from audio output, e.g. `response.output_audio_transcript.*`).
28/// The materializer at [`crate::session::Session::append_realtime_transcript_event`]
29/// dispatches on this to flush either [`crate::types::AssistantBlock::Text`]
30/// (for `Display`) or [`crate::types::AssistantBlock::Transcript`] with
31/// `source: TranscriptSource::Spoken` (for `Spoken`).
32///
33/// `Display` is the default for items that arrive only via
34/// [`RealtimeTranscriptEvent::AssistantTextDelta`]; an item is upgraded to
35/// `Spoken` the first time an [`RealtimeTranscriptEvent::AssistantTranscriptDelta`]
36/// fragment arrives for it. Mixed-lane content on the same `item_id` is not
37/// expected from any provider today; if observed the **first** lane wins
38/// (the materializer cannot retroactively re-classify a partially-flushed
39/// item) and a `tracing::warn!` is emitted.
40#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
41#[derive(
42    Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, Default,
43)]
44#[serde(rename_all = "snake_case")]
45#[non_exhaustive]
46pub enum TranscriptLane {
47    #[default]
48    Display,
49    Spoken,
50}
51
52/// A typed, identity-bearing realtime transcript event consumed by the session.
53#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
54#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
55#[serde(tag = "type", rename_all = "snake_case")]
56pub enum RealtimeTranscriptEvent {
57    /// Observe a provider item and its causal predecessor without committing
58    /// content yet.
59    ItemObserved {
60        item_id: String,
61        previous_item_id: Option<String>,
62        role: RealtimeTranscriptRole,
63        response_id: Option<String>,
64    },
65    /// Observe a provider item that participates in provider causal ordering
66    /// but must not materialize transcript content.
67    ItemSkipped {
68        item_id: String,
69        previous_item_id: Option<String>,
70    },
71    /// Provider finalized the transcript for a user input item.
72    UserTranscriptFinal {
73        item_id: String,
74        previous_item_id: Option<String>,
75        content_index: u32,
76        text: String,
77    },
78    /// Provider emitted an assistant **display-text** delta for an output
79    /// item — authored text the model writes (e.g. OpenAI realtime
80    /// `response.output_text.delta`).
81    ///
82    /// Materializes as [`crate::types::AssistantBlock::Text`].
83    AssistantTextDelta {
84        response_id: String,
85        delta_id: String,
86        item_id: String,
87        previous_item_id: Option<String>,
88        content_index: u32,
89        delta: String,
90    },
91    /// Provider emitted an assistant **spoken-transcript** delta for an
92    /// output item — text derived from audio output (e.g. OpenAI realtime
93    /// `response.output_audio_transcript.delta`).
94    ///
95    /// Identity shape mirrors [`Self::AssistantTextDelta`] so the session's
96    /// idempotent ordering / staging logic owns dedup uniformly across
97    /// lanes. Materializes as [`crate::types::AssistantBlock::Transcript`]
98    /// with `source: TranscriptSource::Spoken` (T9/T10).
99    AssistantTranscriptDelta {
100        response_id: String,
101        delta_id: String,
102        item_id: String,
103        previous_item_id: Option<String>,
104        content_index: u32,
105        delta: String,
106    },
107    /// Provider reported the assistant output item was truncated to the heard
108    /// transcript prefix.
109    AssistantTranscriptTruncated {
110        response_id: String,
111        item_id: String,
112        content_index: u32,
113        text: String,
114    },
115    /// R5-7: provider supplied authoritative final transcript text for an
116    /// assistant output item, overriding any incomplete delta accumulation.
117    ///
118    /// Necessary for two cases:
119    ///   1. Final-only providers that emit a single `AssistantTranscriptFinal`
120    ///      observation without prior deltas.
121    ///   2. Recovery from delta loss (R5-1: lossy media lane back-pressure
122    ///      may drop transcript deltas; the final's text is the authoritative
123    ///      reconciliation).
124    ///
125    /// The materializer locates the staged item by
126    /// `(response_id, item_id, content_index)`, replaces its accumulated
127    /// content with `text`, and (if no item is staged yet) creates one on
128    /// the spoken lane. Flush still happens via `AssistantTurnCompleted`;
129    /// this variant only updates the staged content.
130    AssistantTranscriptFinalText {
131        response_id: String,
132        item_id: String,
133        content_index: u32,
134        text: String,
135    },
136    /// Provider turn reached a terminal boundary. The session decides which
137    /// staged assistant items, if any, are now canonical.
138    AssistantTurnCompleted {
139        response_id: String,
140        stop_reason: StopReason,
141        usage: Usage,
142    },
143    /// Provider turn was interrupted before terminal materialization.
144    AssistantTurnInterrupted { response_id: String },
145}
146
147/// Canonical message materialized by applying a realtime transcript event.
148#[derive(Debug, Clone, PartialEq)]
149pub enum RealtimeTranscriptMaterializedMessage {
150    User {
151        item_id: String,
152        text: String,
153    },
154    Assistant {
155        item_id: String,
156        response_id: String,
157        text: String,
158        stop_reason: StopReason,
159        usage: Usage,
160        /// T9/T10: which output lane the staged content arrived on.
161        /// Drives whether the materializer flushes
162        /// [`crate::types::AssistantBlock::Text`] (Display) or
163        /// [`crate::types::AssistantBlock::Transcript`] with
164        /// `source: TranscriptSource::Spoken` (Spoken).
165        lane: TranscriptLane,
166    },
167}
168
169/// Result of applying a realtime transcript event.
170#[derive(Debug, Clone, Default, PartialEq)]
171pub struct RealtimeTranscriptApplyOutcome {
172    pub materialized_messages: Vec<RealtimeTranscriptMaterializedMessage>,
173}
174
175impl RealtimeTranscriptApplyOutcome {
176    #[must_use]
177    pub fn is_inert(&self) -> bool {
178        self.materialized_messages.is_empty()
179    }
180}