rs_voice_toolkit_stt/
lib.rs

1//! # STT (Speech-to-Text) Module - 语音转文本模块
2//! 
3//! 这个模块提供了基于 OpenAI Whisper 模型的高质量语音识别功能，
4//! 支持文件转录、实时流式处理、语音活动检测等多种功能。
5//! 
6//! ## 主要功能
7//! 
8//! ### 核心特性
9//! - **高精度识别**: 基于 Whisper 模型，支持多种语言
10//! - **文件转录**: 支持多种音频格式的批量转录
11//! - **实时流式处理**: 支持音频流的实时转录
12//! - **语音活动检测**: 智能检测语音片段，提高处理效率
13//! - **多模型支持**: 支持 tiny、base、small、medium、large 等不同规模的模型
14//! - **性能监控**: 提供详细的性能指标和基准测试
15//! 
16//! ### 支持的音频格式
17//! - **WAV**: 原生支持，无需转换
18//! - **MP3**: 自动转换为兼容格式
19//! - **FLAC**: 自动转换为兼容格式
20//! - **M4A**: 自动转换为兼容格式
21//! - **OGG**: 自动转换为兼容格式
22//! 
23//! ## 快速开始
24//! 
25//! ### 基本文件转录
26//! 
//! ```rust,no_run
28//! use rs_voice_toolkit_stt::{transcribe_file, WhisperConfig, SttError};
29//! 
30//! #[tokio::main]
31//! async fn main() -> Result<(), SttError> {
32//!     let model_path = "models/ggml-base.bin";
33//!     let audio_path = "audio/hello.wav";
34//!     
35//!     // 基本转录
36//!     let result = transcribe_file(model_path, audio_path).await?;
37//!     println!("转录结果: {}", result.text);
38//!     println!("处理时间: {:?}", result.processing_time);
39//!     
40//!     Ok(())
41//! }
42//! ```
43//! 
44//! ### 自定义配置转录
45//! 
//! ```rust,no_run
47//! use rs_voice_toolkit_stt::{transcribe_file_with_config, WhisperConfig, SttError};
48//! 
49//! #[tokio::main]
50//! async fn main() -> Result<(), SttError> {
51//!     let model_path = "models/ggml-base.bin";
52//!     let audio_path = "audio/hello.wav";
53//!     
54//!     // 自定义配置
55//!     let config = WhisperConfig::new(model_path)
56//!         .with_language("zh")          // 指定中文
57//!         .with_temperature(0.2)         // 降低温度
58//!         .with_vad(true)               // 启用语音活动检测
59//!         .with_translate(false);        // 禁用翻译
60//!     
61//!     let result = transcribe_file_with_config(model_path, audio_path, Some(config)).await?;
62//!     println!("转录结果: {}", result.text);
63//!     
64//!     Ok(())
65//! }
66//! ```
67//! 
//! ### 流式转录
//!
//! 流式转录相关类型仅在启用 `streaming` feature 时可用。
//!
//! ```rust,ignore
71//! use rs_voice_toolkit_stt::{StreamingTranscriber, StreamingConfig, SttError};
72//! 
73//! #[tokio::main]
74//! async fn main() -> Result<(), SttError> {
75//!     let model_path = "models/ggml-base.bin";
76//!     
77//!     // 创建流式转录器
78//!     let mut transcriber = StreamingTranscriber::new(model_path).await?;
79//!     
80//!     // 配置参数
81//!     transcriber.set_language("auto")?;
82//!     transcriber.set_task("transcribe")?;
83//!     transcriber.enable_vad(true)?;
84//!     
85//!     // 模拟音频流处理
//!     let audio_chunks: Vec<Vec<f32>> = vec![/* 音频数据块 */];
87//!     
88//!     for chunk in audio_chunks {
89//!         let segments = transcriber.process_audio(&chunk).await?;
90//!         for segment in segments {
91//!             println!("[{}s-{}s] {}", segment.start_time, segment.end_time, segment.text);
92//!         }
93//!     }
94//!     
95//!     // 获取最终结果
96//!     let final_result = transcriber.finalize().await?;
97//!     println!("最终转录: {}", final_result.text);
98//!     
99//!     Ok(())
100//! }
101//! ```
102//! 
103//! ## 模型选择指南
104//! 
//! | 模型 | 参数量 | 速度 | 准确度 | 适用场景 |
//! |------|--------|------|--------|----------|
//! | tiny | ~39M | 极快 | 一般 | 快速测试、实时应用 |
//! | base | ~74M | 快 | 良好 | 日常应用、平衡性能 |
//! | small | ~244M | 中等 | 很好 | 高要求应用 |
//! | medium | ~769M | 较慢 | 优秀 | 专业应用 |
//! | large | ~1550M | 慢 | 最佳 | 最高精度要求 |
112//! 
113//! ## 性能优化
114//! 
115//! ### 模型加载优化
116//! - 首次加载模型后保持实例，避免重复加载
117//! - 对于长期运行的应用，预加载常用模型
118//! - 使用模型缓存减少启动时间
119//! 
120//! ### 音频处理优化
121//! - 启用 VAD (语音活动检测) 跳过静音部分
122//! - 预转换音频为 Whisper 兼容格式
123//! - 批量处理多个文件减少初始化开销
124//! 
125//! ### 系统资源优化
126//! - 启用 GPU 加速 (CUDA/Vulkan/Metal)
127//! - 调整线程数以优化 CPU 使用率
128//! - 监控内存使用，避免大文件处理时的内存溢出
129//! 
130//! ## 错误处理
131//! 
132//! 模块提供了详细的错误类型，帮助快速定位问题：
133//! 
//! ```rust,no_run
//! use rs_voice_toolkit_stt::{SttError, transcribe_file};
//!
//! #[tokio::main]
//! async fn main() {
//!     match transcribe_file("model.bin", "audio.wav").await {
//!         Ok(result) => println!("转录成功: {}", result.text),
//!         Err(SttError::ModelLoadError(e)) => println!("模型加载失败: {}", e),
//!         Err(SttError::AudioProcessingError(e)) => println!("音频处理失败: {}", e),
//!         Err(SttError::WhisperError(e)) => println!("Whisper 处理失败: {}", e),
//!         Err(SttError::IoError(e)) => println!("IO 错误: {}", e),
//!         Err(e) => println!("其他错误: {}", e),
//!     }
//! }
//! ```
146//! 
147//! ## 系统要求
148//! 
149//! - **内存**: 
150//!   - tiny 模型: ~200MB
151//!   - base 模型: ~400MB
152//!   - small 模型: ~800MB
153//!   - medium 模型: ~1.5GB
154//!   - large 模型: ~3GB
155//! 
156//! - **CPU**: 支持多线程处理，推荐 4 核以上
157//! - **GPU**: 可选，支持 CUDA/Vulkan/Metal 加速
158//! - **磁盘**: 模型文件存储空间
159//! 
160//! ## 注意事项
161//! 
162//! - 首次使用需要下载 Whisper 模型文件
163//! - 建议在使用前验证音频文件格式
164//! - 长音频文件建议使用流式处理
165//! - 实时应用建议使用 tiny 或 base 模型
166
167// 模块导出集中于下方；避免未使用导入
168
169// 导入错误处理模块
170pub mod error;
171pub use error::{SttError, SttResult};
172
173// 导入音频处理模块
174pub mod audio;
175pub use audio::{AudioConfig, AudioData, AudioFormat};
176
177// 导入Whisper转录模块
178pub mod whisper;
179pub use whisper::{
180    transcribe_file, transcribe_file_with_config, transcribe_file_with_language,
181    transcribe_file_with_transcriber, TranscriptionResult, TranscriptionSegment, WhisperConfig,
182    WhisperTranscriber,
183};
184
185// 导入VAD模块
186pub mod vad;
187pub use vad::SimpleVad;
188
#[cfg(test)]
mod integration_tests {
    //! Fixture-driven integration tests for the file transcription API.
    //!
    //! Fixtures are looked up under `fixtures/` in the parent directory of this crate's
    //! manifest directory. When a required model or audio file is missing, the test (or the
    //! affected model iteration) is skipped with a notice instead of failing.

    use super::*;
    use std::fmt::Display;
    use std::path::PathBuf;

    /// Model file names (inside `fixtures/models/`) tried by the `bank_audio.m4a` tests.
    const BANK_AUDIO_MODELS: [&str; 3] = ["ggml-tiny.bin", "ggml-small.bin", "ggml-medium.bin"];

    /// Returns the parent of this crate's manifest directory, which is where the tests expect
    /// the `fixtures/` tree to live.
    ///
    /// # Panics
    /// Panics if the manifest directory has no parent.
    fn fixtures_parent_dir() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .expect("stt crate has parent")
            .to_path_buf()
    }

    /// Prints `label` followed by the transcript, or by the failure reason when the
    /// transcription returned an error, so a failing run can be diagnosed from the output.
    fn print_outcome<E: Display>(label: &str, outcome: Result<&str, E>) {
        match outcome {
            Ok(text) => println!("{}: {}", label, text),
            Err(err) => println!("{}: 失败 ({})", label, err),
        }
    }

    /// Transcribes `jfk.wav` with the tiny model and checks that text and a positive audio
    /// duration come back.
    #[tokio::test]
    async fn test_end_to_end_transcription_on_fixture() {
        // Locate the fixture model and audio file.
        let root_dir = fixtures_parent_dir();
        let model = root_dir.join("fixtures/models/ggml-tiny.bin");
        let audio = root_dir.join("fixtures/audio/jfk.wav");

        if !model.exists() || !audio.exists() {
            eprintln!(
                "跳过: 缺少 fixtures 模型或音频 ({} , {})",
                model.display(),
                audio.display()
            );
            return;
        }

        let result = transcribe_file(&model, &audio)
            .await
            .expect("端到端转录应成功");

        assert!(!result.text.trim().is_empty(), "应产生非空文本");
        assert!(result.audio_duration > 0);
    }

    /// Runs `bank_audio.m4a` through each available model with three different call styles
    /// and prints every outcome. Nothing is asserted; the output is meant for manual
    /// comparison.
    #[tokio::test]
    async fn test_transcription_bank_audio() {
        let root_dir = fixtures_parent_dir();
        let audio = root_dir.join("fixtures/audio/bank_audio.m4a");

        if !audio.exists() {
            eprintln!("跳过: 缺少音频文件: {}", audio.display());
            return;
        }

        for model_name in BANK_AUDIO_MODELS {
            let model = root_dir.join("fixtures/models/").join(model_name);

            if !model.exists() {
                eprintln!("跳过: 缺少模型文件: {}", model.display());
                continue;
            }

            println!("\n开始测试bank_audio.m4a文件的转录，使用模型: {}", model.display());

            // Variant 1: default configuration.
            let default_result = transcribe_file(&model, &audio).await;
            print_outcome(
                "默认配置结果",
                default_result.as_ref().map(|r| r.text.as_str()),
            );

            // Variant 2: explicitly request Chinese.
            let with_lang_result = transcribe_file_with_language(&model, &audio, "zh").await;
            print_outcome(
                "指定中文结果",
                with_lang_result.as_ref().map(|r| r.text.as_str()),
            );

            // Variant 3: explicit config — Chinese language hint, temperature 0.2, and VAD
            // disabled (presumably so no audio is filtered out before decoding).
            let custom_config = WhisperConfig::new(&model)
                .with_language("zh")
                .with_temperature(0.2)
                .with_vad(false);

            let custom_result =
                transcribe_file_with_config(&model, &audio, Some(custom_config)).await;
            print_outcome(
                "自定义配置结果",
                custom_result.as_ref().map(|r| r.text.as_str()),
            );
        }
    }

    /// Compares the available models on `bank_audio.m4a`, printing the transcript and the
    /// timing/language/segment details of each run. Nothing is asserted.
    #[tokio::test]
    async fn test_different_models_on_bank_audio() {
        let root_dir = fixtures_parent_dir();
        let audio = root_dir.join("fixtures/audio/bank_audio.m4a");

        if !audio.exists() {
            eprintln!("跳过: 缺少音频文件: {}", audio.display());
            return;
        }

        for model_name in BANK_AUDIO_MODELS {
            let model = root_dir.join("fixtures/models/").join(model_name);

            if !model.exists() {
                eprintln!("跳过: 缺少模型文件: {}", model.display());
                continue;
            }

            println!("\n测试模型: {}", model_name);
            match transcribe_file(&model, &audio).await {
                Ok(result) => {
                    println!("  转录结果: {}", result.text);
                    println!("  音频时长: {}毫秒", result.audio_duration);
                    println!("  处理时长: {}毫秒", result.processing_time);
                    println!("  实时因子: {:.2}x", result.real_time_factor());
                    println!("  检测到的语言: {:?}", result.language);
                    println!("  分段数量: {}", result.segments.len());
                }
                Err(err) => {
                    println!("  转录失败: {}", err);
                }
            }
        }
    }
}
309
310// 导入流式转录模块
311#[cfg(feature = "streaming")]
312pub mod streaming;
313#[cfg(feature = "streaming")]
314pub use streaming::{
315    create_custom_streaming_transcriber, create_streaming_transcriber, StreamingConfig,
316    StreamingEvent, StreamingTranscriber,
317};
rs_voice_toolkit_stt/lib.rs

rs_voice_toolkit_stt/
lib.rs