1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
//! Typed realtime transcript append seam.
//!
//! Provider adapters translate provider-native realtime events into these
//! identity-bearing events. The session layer owns idempotency, causal ordering,
//! and the decision to materialize canonical transcript messages.
use serde::{Deserialize, Serialize};
use crate::types::{StopReason, Usage};
/// Durable session metadata key for realtime transcript append state.
///
/// The key is the literal string `"realtime_transcript_state"`.
///
/// NOTE(review): the code that reads and writes this key lives outside this
/// module; changing the value presumably orphans state persisted under the
/// old key — confirm before renaming.
pub const SESSION_REALTIME_TRANSCRIPT_STATE_KEY: &str = "realtime_transcript_state";
/// Provider-neutral role for a realtime transcript item.
///
/// With `rename_all = "snake_case"` the serde names of the variants are
/// `user` and `assistant`.
///
/// The derived `PartialOrd`/`Ord` follow declaration order
/// (`User < Assistant`), so reordering the variants changes comparison
/// results.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeTranscriptRole {
    /// The item belongs to the user side of the conversation.
    User,
    /// The item belongs to the assistant side of the conversation.
    Assistant,
}
/// Output lane carried by an assistant realtime transcript item.
///
/// T9/T10: separates two kinds of assistant text:
///
/// * **display text** — authored output the model writes (e.g. OpenAI
///   realtime `response.output_text.delta`);
/// * **spoken transcript** — text derived from audio output (e.g.
///   `response.output_audio_transcript.*`).
///
/// The materializer at
/// [`crate::session::Session::append_realtime_transcript_event`] dispatches on
/// this value to flush either [`crate::types::AssistantBlock::Text`] (for
/// `Display`) or [`crate::types::AssistantBlock::Transcript`] with
/// `source: TranscriptSource::Spoken` (for `Spoken`).
///
/// # Lane selection
///
/// `Display` is the default for items that arrive only via
/// [`RealtimeTranscriptEvent::AssistantTextDelta`]; an item is upgraded to
/// `Spoken` the first time a
/// [`RealtimeTranscriptEvent::AssistantTranscriptDelta`] fragment arrives for
/// it. Mixed-lane content on the same `item_id` is not expected from any
/// provider today; if observed the **first** lane wins (the materializer
/// cannot retroactively re-classify a partially-flushed item) and a
/// `tracing::warn!` is emitted.
///
/// # Representation
///
/// With `rename_all = "snake_case"` the serde names of the variants are
/// `display` and `spoken`. The enum is `#[non_exhaustive]`, so matches in
/// other crates need a wildcard arm.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(
    Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, Default,
)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TranscriptLane {
    /// Authored display text; this is the `Default` lane.
    #[default]
    Display,
    /// Transcript text derived from audio output.
    Spoken,
}
/// A typed, identity-bearing realtime transcript event consumed by the session.
///
/// Serialized with serde as an internally tagged value: the `type` field
/// carries the `snake_case` variant name (for example `item_observed`) and the
/// variant's own fields sit next to it.
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum RealtimeTranscriptEvent {
    /// Observe a provider item and its causal predecessor without committing
    /// content yet.
    ItemObserved {
        /// Identifier of the observed provider item.
        item_id: String,
        /// Identifier of the item's causal predecessor, if any.
        previous_item_id: Option<String>,
        /// Role of the observed item.
        role: RealtimeTranscriptRole,
        /// Identifier of the associated provider response, if any.
        /// NOTE(review): presumably absent for user items — confirm.
        response_id: Option<String>,
    },

    /// Observe a provider item that participates in provider causal ordering
    /// but must not materialize transcript content.
    ItemSkipped {
        /// Identifier of the skipped provider item.
        item_id: String,
        /// Identifier of the item's causal predecessor, if any.
        previous_item_id: Option<String>,
    },

    /// Provider finalized the transcript for a user input item.
    UserTranscriptFinal {
        /// Identifier of the user input item.
        item_id: String,
        /// Identifier of the item's causal predecessor, if any.
        previous_item_id: Option<String>,
        /// Index of the content part inside the item.
        /// NOTE(review): presumably the provider's own content index —
        /// confirm against the adapters.
        content_index: u32,
        /// Final transcript text for the item.
        text: String,
    },

    /// Provider emitted an assistant **display-text** delta for an output
    /// item — authored text the model writes (e.g. OpenAI realtime
    /// `response.output_text.delta`).
    ///
    /// Materializes as [`crate::types::AssistantBlock::Text`].
    AssistantTextDelta {
        /// Identifier of the provider response the delta belongs to.
        response_id: String,
        /// Identifier of this delta fragment.
        /// NOTE(review): presumably the dedup key used by the session —
        /// confirm.
        delta_id: String,
        /// Identifier of the output item the delta belongs to.
        item_id: String,
        /// Identifier of the item's causal predecessor, if any.
        previous_item_id: Option<String>,
        /// Index of the content part inside the item.
        content_index: u32,
        /// Text fragment carried by this delta.
        delta: String,
    },

    /// Provider emitted an assistant **spoken-transcript** delta for an
    /// output item — text derived from audio output (e.g. OpenAI realtime
    /// `response.output_audio_transcript.delta`).
    ///
    /// Identity shape mirrors [`Self::AssistantTextDelta`] so the session's
    /// idempotent ordering / staging logic owns dedup uniformly across
    /// lanes. Materializes as [`crate::types::AssistantBlock::Transcript`]
    /// with `source: TranscriptSource::Spoken` (T9/T10).
    AssistantTranscriptDelta {
        /// Identifier of the provider response the delta belongs to.
        response_id: String,
        /// Identifier of this delta fragment.
        delta_id: String,
        /// Identifier of the output item the delta belongs to.
        item_id: String,
        /// Identifier of the item's causal predecessor, if any.
        previous_item_id: Option<String>,
        /// Index of the content part inside the item.
        content_index: u32,
        /// Transcript fragment carried by this delta.
        delta: String,
    },

    /// Provider reported the assistant output item was truncated to the heard
    /// transcript prefix.
    AssistantTranscriptTruncated {
        /// Identifier of the provider response the item belongs to.
        response_id: String,
        /// Identifier of the truncated output item.
        item_id: String,
        /// Index of the content part inside the item.
        content_index: u32,
        /// Transcript text carried by the truncation report.
        /// NOTE(review): presumably the heard prefix itself rather than the
        /// removed remainder — confirm.
        text: String,
    },

    /// R5-7: provider supplied authoritative final transcript text for an
    /// assistant output item, overriding any incomplete delta accumulation.
    ///
    /// Necessary for two cases:
    /// 1. Final-only providers that emit a single
    ///    [`Self::AssistantTranscriptFinalText`] observation without prior
    ///    deltas.
    /// 2. Recovery from delta loss (R5-1: lossy media lane back-pressure
    ///    may drop transcript deltas; the final's text is the authoritative
    ///    reconciliation).
    ///
    /// The materializer locates the staged item by
    /// `(response_id, item_id, content_index)`, replaces its accumulated
    /// content with `text`, and (if no item is staged yet) creates one on
    /// the spoken lane. Flush still happens via
    /// [`Self::AssistantTurnCompleted`]; this variant only updates the staged
    /// content.
    AssistantTranscriptFinalText {
        /// Identifier of the provider response the item belongs to.
        response_id: String,
        /// Identifier of the output item being finalized.
        item_id: String,
        /// Index of the content part inside the item.
        content_index: u32,
        /// Authoritative final transcript text.
        text: String,
    },

    /// Provider turn reached a terminal boundary. The session decides which
    /// staged assistant items, if any, are now canonical.
    AssistantTurnCompleted {
        /// Identifier of the provider response that completed.
        response_id: String,
        /// Stop reason reported for the turn.
        stop_reason: StopReason,
        /// Usage reported for the turn.
        usage: Usage,
    },

    /// Provider turn was interrupted before terminal materialization.
    AssistantTurnInterrupted {
        /// Identifier of the provider response that was interrupted.
        response_id: String,
    },
}
/// Canonical message materialized by applying a realtime transcript event.
///
/// Unlike [`RealtimeTranscriptEvent`], this type derives neither `Serialize`
/// nor `Deserialize`.
#[derive(Debug, Clone, PartialEq)]
pub enum RealtimeTranscriptMaterializedMessage {
    /// A materialized user message.
    User {
        /// Identifier of the provider item the message came from.
        item_id: String,
        /// Text of the message.
        text: String,
    },

    /// A materialized assistant message.
    Assistant {
        /// Identifier of the provider item the message came from.
        item_id: String,
        /// Identifier of the provider response the item belongs to.
        response_id: String,
        /// Text of the message.
        text: String,
        /// Stop reason attached to the message.
        /// NOTE(review): presumably copied from the completing
        /// `AssistantTurnCompleted` event — confirm in the session layer.
        stop_reason: StopReason,
        /// Usage attached to the message.
        usage: Usage,
        /// T9/T10: which output lane the staged content arrived on.
        /// Drives whether the materializer flushes
        /// [`crate::types::AssistantBlock::Text`] (Display) or
        /// [`crate::types::AssistantBlock::Transcript`] with
        /// `source: TranscriptSource::Spoken` (Spoken).
        lane: TranscriptLane,
    },
}
/// Result of applying a realtime transcript event.
///
/// The `Default` value carries no materialized messages.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct RealtimeTranscriptApplyOutcome {
    /// Canonical messages materialized by the application; empty when the
    /// event materialized nothing.
    pub materialized_messages: Vec<RealtimeTranscriptMaterializedMessage>,
}
impl RealtimeTranscriptApplyOutcome {
    /// Returns `true` when applying the event materialized no canonical
    /// messages, i.e. `materialized_messages` is empty.
    #[must_use]
    pub fn is_inert(&self) -> bool {
        let produced = &self.materialized_messages;
        produced.is_empty()
    }
}