1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
//! Local speech-to-text for inbound voice notes.
//!
//! Lifts `agent-creator-microapp::stt` into a reusable SDK
//! feature. Two pieces:
//!
//! - [`transcribe::transcribe_file`] — decodes an ogg-opus audio
//!   file (the format both WhatsApp and Telegram voice notes use)
//!   to 16 kHz mono s16le PCM via `ogg` + `opus-wave` (pure Rust,
//!   no `ffmpeg` subprocess) and runs whisper.cpp on the result.
//! - [`tool::InboundTransformHandler`] — `ToolHandler` impl ready to
//!   hand to `Microapp::with_tool("audio_stt_inbound_transform", …)`.
//!   Implements the framework's auto-discovered
//!   `*_inbound_transform` wire shape (see `nexo-core`'s LLM behavior
//!   pipeline): non-audio passthrough, missing-file → graceful
//!   `{ ok: false }`, transcription failure → passthrough so the
//!   chat turn isn't dropped.
//!
//! # Quick start
//!
//! ```no_run
//! # use std::sync::Arc;
//! # use nexo_microapp_sdk::Microapp;
//! # use nexo_microapp_sdk::stt::{InboundTransformHandler, TranscribeConfig};
//! let cfg = Arc::new(TranscribeConfig {
//!     model_path: "/var/lib/myapp/whisper/ggml-tiny-q5_1.bin".into(),
//!     lang_hint: Some("es".into()),
//!     ..Default::default()
//! });
//! let app = Microapp::new("voice-microapp", env!("CARGO_PKG_VERSION"))
//!     .with_tool(
//!         "audio_stt_inbound_transform",
//!         InboundTransformHandler::new(cfg),
//!     );
//! # let _ = app;
//! ```
// Backend dispatch. Both backends speak the same
// `transcribe_file(path, cfg) -> Result<String>` signature so the
// public re-export at the bottom of this file picks the right one
// at compile time.
//
// Precedence when both `stt` and `stt-candle` are enabled
// simultaneously (typically only for the parity tests):
// `stt-candle` wins as the public re-export; parity tests reach
// the whisper-rs path directly through
// `super::stt::transcribe::transcribe_file` and compare it against
// `super::stt::transcribe_candle::transcribe_file`.
//
// Production builds should pick exactly one backend. The legacy
// `stt` feature is slated for removal after a stability window;
// the CHANGELOG flags the migration path.
// `tool` only wires the `*_inbound_transform` tool that calls
// `transcribe_file(path, cfg)` — a local-file API. Cloud-only
// builds (`stt-cloud` without `stt` / `stt-candle`) don't pull
// it; cloud consumers call the provider trait directly with
// audio bytes already in memory.
// Shared audio decode chain (ogg-opus → s16 PCM → f32) — used by
// both backends. Available whenever either feature is on, since
// the parent module is gated on `any(...)`.
pub
// Legacy whisper-rs path.
// Candle path — the submodules carry their own
// `#[cfg(feature = "stt-candle")]` guard so a stale build cache
// doesn't surface unrelated errors when toggling features.
pub
// Cloud STT backends (OpenAI Whisper, Groq Whisper-large-v3,
// Anthropic voice_stream) + `CompositeProvider` fallback chain.
//
// The workspace dependency pin is trimmed so Cargo's feature
// resolver v2 doesn't unify wasm-broken reqwest features into the
// SDK's wasm32 graph. With that done, the cloud REST legs compile
// cleanly on `wasm32-unknown-unknown` via reqwest's browser fetch
// backend. Two routes are available:
//
// - `stt-cloud-wasm` — wasm-compatible base (no rustls-tls)
// - `stt-cloud` — native default (stacks rustls-tls on top)
//
// Either feature enables this module. Inner submodules carry
// per-target cfg gates for native-only transports:
//
// - `local_candle` — needs Candle inference (`stt-candle`
//   feature; Candle deps don't compile for wasm32)
// - `anthropic` — tokio-tungstenite drags in TCP types absent
//   on wasm32; carries its own `cfg(not(wasm32))`. A future
//   swap to gloo-net would unblock browser microapps that
//   demand voice_stream.
use PathBuf;
use Error;
pub use InboundTransformHandler;
// Backend dispatch — re-export the same `transcribe_file`
// symbol from whichever backend is active. Downstream microapps
// don't need to know which one is in effect. When both are on
// (parity tests), `stt-candle` wins as the default re-export;
// the legacy whisper-rs entry stays reachable through
// `super::stt::transcribe::transcribe_file`.
pub use transcribe_file;
pub use transcribe_file;
/// Result alias for STT operations.
pub type Result<T> = Result;
/// Typed error surface for the `stt` feature.
/// Knobs for [`transcribe_file`] / [`InboundTransformHandler`].
///
/// Cheap to clone and intended to be wrapped in `Arc` and shared
/// across handler invocations.