//! Speech recognition, phoneme alignment, and audio analysis components for
//! the VoiRS ecosystem, built on top of [`voirs_sdk`].

#![warn(missing_docs)]
#![warn(clippy::all)]
// Pedantic, nursery, and style lints intentionally relaxed for this crate.
#![allow(clippy::similar_names)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::unreadable_literal)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::unused_async)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::missing_panics_doc)]
#![allow(clippy::unused_self)]
#![allow(clippy::must_use_candidate)]
#![allow(clippy::missing_const_for_fn)]
#![allow(clippy::doc_markdown)]
#![allow(clippy::unnecessary_wraps)]
#![allow(clippy::format_push_string)]
#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::cast_lossless)]
#![allow(clippy::ptr_as_ptr)]
#![allow(clippy::struct_excessive_bools)]
#![allow(clippy::fn_params_excessive_bools)]
#![allow(clippy::too_many_lines)]
#![allow(clippy::redundant_closure)]
#![allow(clippy::float_cmp)]
#![allow(clippy::match_same_arms)]
#![allow(clippy::manual_let_else)]
#![allow(clippy::wildcard_imports)]
#![allow(clippy::items_after_statements)]
#![allow(clippy::return_self_not_must_use)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::uninlined_format_args)]
#![allow(clippy::needless_pass_by_value)]
#![allow(clippy::manual_clamp)]
#![allow(clippy::redundant_closure_for_method_calls)]
#![allow(clippy::await_holding_lock)]
#![allow(clippy::trivially_copy_pass_by_ref)]
#![allow(clippy::no_effect_underscore_binding)]
#![allow(clippy::field_reassign_with_default)]
#![allow(clippy::vec_init_then_push)]
#![allow(clippy::type_complexity)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::should_implement_trait)]
#![allow(clippy::derivable_impls)]
#![allow(clippy::wrong_self_convention)]
#![allow(clippy::useless_vec)]
#![allow(clippy::unnecessary_unwrap)]
#![allow(clippy::unnecessary_cast)]
#![allow(clippy::manual_map)]
#![allow(clippy::excessive_precision)]
#![allow(clippy::double_must_use)]
#![allow(clippy::doc_lazy_continuation)]
#![allow(clippy::cloned_ref_to_slice_refs)]

pub use voirs_sdk::{AudioBuffer, LanguageCode, Phoneme, VoirsError};

/// Internal synchronization helpers shared across the crate.
mod sync_utils {
    use super::RecognitionError;
    use std::sync::{Mutex, MutexGuard, PoisonError};

    /// Extension trait adding poison-safe locking to [`Mutex`].
    pub trait MutexExt<T> {
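        /// Acquires the lock, converting a poisoned mutex into a
        /// [`RecognitionError::SynchronizationError`] instead of panicking.
        ///
        /// A minimal usage sketch (caller code, not from this file):
        ///
        /// ```ignore
        /// let counter = std::sync::Mutex::new(0_u32);
        /// let mut guard = counter.lock_safe()?;
        /// *guard += 1;
        /// ```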
        fn lock_safe(&self) -> Result<MutexGuard<'_, T>, RecognitionError>;
    }

    impl<T> MutexExt<T> for Mutex<T> {
        fn lock_safe(&self) -> Result<MutexGuard<'_, T>, RecognitionError> {
            self.lock().map_err(|e: PoisonError<MutexGuard<'_, T>>| {
                RecognitionError::SynchronizationError {
                    message: format!("Mutex lock poisoned: {e}"),
                }
            })
        }
    }
}

pub(crate) use sync_utils::MutexExt;

/// Audio analysis (quality metrics, prosody, and related measurements).
pub mod analysis;
/// Automatic speech recognition backends and utilities.
pub mod asr;
/// Loading and decoding of common audio file formats.
pub mod audio_formats;
/// Higher-level audio helpers built on the format loaders.
pub mod audio_utilities;
/// Caching layers for models and intermediate results.
pub mod caching;
/// Cloud storage integration.
pub mod cloud_storage;
/// Configuration types and loading.
pub mod config;
/// Disaster-recovery support.
pub mod disaster_recovery;
/// Bridging between crate-local and SDK error types.
pub mod error_bridge;
/// Error-message enhancement and quick-fix suggestions.
pub mod error_enhancement;
/// Automatic error-recovery strategies.
pub mod error_recovery;
/// High-availability support.
pub mod high_availability;
/// Integration with the wider VoiRS pipeline.
pub mod integration;
/// Logging utilities.
pub mod logging;
/// Memory usage optimization.
pub mod memory_optimization;
/// Mobile platform support.
pub mod mobile;
/// Runtime monitoring and metrics.
pub mod monitoring;
/// Multimodal processing.
pub mod multimodal;
/// Performance measurement and validation.
pub mod performance;
/// Phoneme recognition and forced alignment.
pub mod phoneme;
/// Audio preprocessing.
pub mod preprocessing;
/// Privacy-preserving processing utilities.
pub mod privacy;
/// Bridge types for the VoiRS SDK.
pub mod sdk_bridge;
/// Security auditing utilities.
pub mod security_audit;
/// Serverless deployment support (requires the `rest-api` feature).
#[cfg(feature = "rest-api")]
pub mod serverless;
/// SLA guarantee tracking.
pub mod sla_guarantees;
/// Model training utilities.
pub mod training;
/// Core traits and shared types for recognition.
pub mod traits;
/// Wake-word detection and training.
pub mod wake_word;

/// WebAssembly bindings (requires the `wasm` feature).
#[cfg(feature = "wasm")]
pub mod wasm;

/// C API bindings (requires the `c-api` feature).
#[cfg(feature = "c-api")]
pub mod c_api;

/// REST API server (requires the `rest-api` feature).
#[cfg(feature = "rest-api")]
pub mod rest_api;

/// Python bindings (requires the `python` feature).
#[cfg(feature = "python")]
pub mod python;

#[cfg(feature = "python")]
pub use python::*;

pub use traits::{
    ASRConfig, ASRFeature, ASRMetadata, ASRModel, AudioAnalysis, AudioAnalysisConfig,
    AudioAnalyzer, AudioAnalyzerMetadata, AudioStream, PhonemeAlignment, PhonemeRecognitionConfig,
    PhonemeRecognizer, PhonemeRecognizerMetadata, RecognitionResult, Transcript, TranscriptChunk,
    TranscriptStream,
};

pub use analysis::AudioAnalyzerImpl;
pub use asr::{ASRBackend, ASRBenchmarkingSuite, AccuracyValidator, IntelligentASRFallback};

pub use asr::advanced_optimization::{
    AdvancedOptimizationConfig, KnowledgeDistillationOptimizer, MixedPrecisionOptimizer,
    OptimizationObjective, OptimizationPlatform, ProgressivePruningOptimizer,
};
pub use asr::optimization_integration::{
    ModelStats, OptimizationPipeline, OptimizationResults, OptimizationSummary,
};
pub use audio_formats::{
    load_audio, load_audio_with_sample_rate, AudioFormat, AudioLoadConfig, UniversalAudioLoader,
};
pub use audio_utilities::{
    analyze_audio_quality, extract_speech_segments, load_and_preprocess, optimize_for_recognition,
    split_audio_smart, AudioQualityReport, AudioUtilities,
};
pub use performance::{
    PerformanceMetrics, PerformanceRequirements, PerformanceValidator, ValidationResult,
};
#[cfg(feature = "forced-align")]
pub use phoneme::ForcedAlignModel;

#[cfg(feature = "mfa")]
pub use phoneme::MFAModel;
pub use preprocessing::{AudioPreprocessingConfig, AudioPreprocessor};
pub use wake_word::{
    EnergyOptimizer, NeuralWakeWordModel, TemplateWakeWordModel, TrainingPhase, TrainingProgress,
    TrainingValidationReport, WakeWordConfig, WakeWordDetection, WakeWordDetector,
    WakeWordDetectorImpl, WakeWordModel, WakeWordStats, WakeWordTrainer, WakeWordTrainerImpl,
    WakeWordTrainingData,
};

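/// Loads an audio file and returns its raw `f32` samples, discarding
/// sample-rate and channel metadata.
///
/// A minimal sketch (`speech.wav` is a hypothetical path):
///
/// ```ignore
/// let samples = load_audio_simple("speech.wav")?;
/// println!("loaded {} samples", samples.len());
/// ```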
pub fn load_audio_simple(path: &str) -> Result<Vec<f32>, RecognitionError> {
    let audio_buffer = load_audio(path)?;
    Ok(audio_buffer.samples().to_vec())
}

/// Crate version string, taken from `CARGO_PKG_VERSION` at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

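/// Commonly used types, traits, and helpers re-exported for glob import.
///
/// A typical import, assuming this crate is named `voirs_recognizer`
/// (the crate name is not visible from this file):
///
/// ```ignore
/// use voirs_recognizer::prelude::*;
/// ```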
pub mod prelude {
    pub use crate::traits::{
        ASRConfig, ASRMetadata, ASRModel, AudioAnalysis, AudioAnalysisConfig, AudioAnalyzer,
        AudioAnalyzerMetadata, AudioStream, PhonemeAlignment, PhonemeRecognitionConfig,
        PhonemeRecognizer, PhonemeRecognizerMetadata, RecognitionResult, Transcript,
        TranscriptChunk, TranscriptStream,
    };

    #[cfg(feature = "whisper-pure")]
    pub use crate::asr::PureRustWhisper;

    #[cfg(feature = "deepspeech")]
    pub use crate::asr::DeepSpeechModel;

    #[cfg(feature = "wav2vec2")]
    pub use crate::asr::Wav2Vec2Model;

    #[cfg(feature = "forced-align")]
    pub use crate::phoneme::ForcedAlignModel;

    #[cfg(feature = "mfa")]
    pub use crate::phoneme::MFAModel;

    pub use crate::analysis::AudioAnalyzerImpl;

    pub use crate::asr::{
        ASRBackend, ASRBenchmarkingSuite, AccuracyValidator, IntelligentASRFallback,
    };

    pub use crate::audio_formats::{
        load_audio, load_audio_with_sample_rate, AudioFormat, AudioLoadConfig,
        UniversalAudioLoader,
    };

    pub use crate::audio_utilities::{
        analyze_audio_quality, extract_speech_segments, load_and_preprocess,
        optimize_for_recognition, split_audio_smart, AudioQualityReport, AudioUtilities,
    };

    pub use crate::performance::{
        PerformanceMetrics, PerformanceRequirements, PerformanceValidator, ValidationResult,
    };

    pub use crate::integration::{
        ComponentInfo, IntegratedPerformanceMonitor, IntegrationConfig, PipelineProcessingConfig,
        UnifiedVoirsPipeline, VoirsIntegrationManager,
    };

    pub use crate::wake_word::{
        EnergyOptimizer, NeuralWakeWordModel, TemplateWakeWordModel, TrainingPhase,
        TrainingProgress, TrainingValidationReport, WakeWordConfig, WakeWordDetection,
        WakeWordDetector, WakeWordDetectorImpl, WakeWordModel, WakeWordStats, WakeWordTrainer,
        WakeWordTrainerImpl, WakeWordTrainingData,
    };

    pub use voirs_sdk::{AudioBuffer, LanguageCode, Phoneme, VoirsError};

    pub use async_trait::async_trait;

    pub use crate::error_enhancement::{
        enhance_recognition_error, get_quick_fixes, is_error_recoverable, ErrorEnhancer,
    };
}
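
/// Errors produced by recognition, analysis, and model management in this
/// crate.
///
/// A minimal handling sketch (the `recognize` call is hypothetical):
///
/// ```ignore
/// match recognize(&audio) {
///     Ok(transcript) => println!("{}", transcript.text),
///     Err(RecognitionError::ModelLoadError { message, .. }) => eprintln!("model: {message}"),
///     Err(other) => eprintln!("recognition failed: {other}"),
/// }
/// ```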
#[derive(Debug, thiserror::Error)]
pub enum RecognitionError {
    /// A model failed to load.
    #[error("Failed to load model: {message}")]
    ModelLoadError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// A loaded model failed during operation.
    #[error("Model error: {message}")]
    ModelError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Audio could not be processed.
    #[error("Audio processing error: {message}")]
    AudioProcessingError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Speech-to-text transcription failed.
    #[error("Transcription failed: {message}")]
    TranscriptionError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Phoneme recognition or alignment failed.
    #[error("Phoneme recognition failed: {message}")]
    PhonemeRecognitionError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Audio analysis failed.
    #[error("Audio analysis failed: {message}")]
    AudioAnalysisError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// A configuration value was missing or invalid.
    #[error("Configuration error: {message}")]
    ConfigurationError {
        /// Description of the problem.
        message: String,
    },

    /// A requested feature is not supported by the active backend.
    #[error("Feature not supported: {feature}")]
    FeatureNotSupported {
        /// Name of the unsupported feature.
        feature: String,
    },

    /// Caller-provided input was invalid.
    #[error("Invalid input: {message}")]
    InvalidInput {
        /// Description of the problem.
        message: String,
    },

    /// A required resource was unavailable.
    #[error("Resource error: {message}")]
    ResourceError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// The audio format is not supported.
    #[error("Unsupported audio format: {0}")]
    UnsupportedFormat(String),

    /// The audio format is recognized but malformed.
    #[error("Invalid audio format: {0}")]
    InvalidFormat(String),

    /// The requested model is not installed or known.
    #[error("Model '{model}' not found. Available models: {available:?}")]
    ModelNotFound {
        /// Name of the requested model.
        model: String,
        /// Models that are available.
        available: Vec<String>,
        /// Close matches that may have been intended.
        suggestions: Vec<String>,
    },

    /// The requested language is not supported.
    #[error("Language '{language}' not supported. Supported languages: {supported:?}")]
    LanguageNotSupported {
        /// The requested language.
        language: String,
        /// Languages that are supported.
        supported: Vec<String>,
        /// Close matches that may have been intended.
        suggestions: Vec<String>,
    },

    /// The requested compute device is unavailable.
    #[error("Device '{device}' not available: {reason}. Fallback: {fallback}")]
    DeviceNotAvailable {
        /// Name of the requested device.
        device: String,
        /// Why the device is unavailable.
        reason: String,
        /// Device that will be used instead.
        fallback: String,
    },

    /// There is not enough memory to complete the operation.
    #[error("Insufficient memory: need {required_mb}MB, have {available_mb}MB. Recommendation: {recommendation}")]
    InsufficientMemory {
        /// Memory required, in megabytes.
        required_mb: u64,
        /// Memory available, in megabytes.
        available_mb: u64,
        /// Suggested remediation.
        recommendation: String,
    },

    /// Recognition exceeded its time budget.
    #[error("Recognition timed out after {timeout_ms}ms. Audio duration: {audio_duration_ms}ms. Suggestion: {suggestion}")]
    RecognitionTimeout {
        /// Configured timeout, in milliseconds.
        timeout_ms: u64,
        /// Duration of the input audio, in milliseconds.
        audio_duration_ms: u64,
        /// Suggested remediation.
        suggestion: String,
    },

    /// A memory allocation or management operation failed.
    #[error("Memory error: {message}")]
    MemoryError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Model training failed.
    #[error("Training error: {message}")]
    TrainingError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// A lock or other synchronization primitive failed.
    #[error("Synchronization error: {message}")]
    SynchronizationError {
        /// Description of the failure.
        message: String,
    },
}

impl From<RecognitionError> for VoirsError {
    fn from(err: RecognitionError) -> Self {
        match err {
            RecognitionError::ModelLoadError { message, source }
            | RecognitionError::ModelError { message, source }
            | RecognitionError::TranscriptionError { message, source }
            | RecognitionError::PhonemeRecognitionError { message, source } => {
                VoirsError::ModelError {
                    model_type: voirs_sdk::error::ModelType::ASR,
                    message,
                    source,
                }
            }
            RecognitionError::AudioProcessingError { message, source: _ }
            | RecognitionError::AudioAnalysisError { message, source: _ } => {
                VoirsError::AudioError {
                    message,
                    buffer_info: None,
                }
            }
            RecognitionError::ConfigurationError { message } => VoirsError::ConfigError {
                field: "ASR".to_string(),
                message,
            },
            RecognitionError::FeatureNotSupported { feature } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Feature not supported: {feature}"),
                source: None,
            },
            RecognitionError::InvalidInput { message } => VoirsError::ConfigError {
                field: "Input".to_string(),
                message: format!("Invalid input: {message}"),
            },
            RecognitionError::ResourceError { message, source } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Resource error: {message}"),
                source,
            },
            RecognitionError::UnsupportedFormat(format) => VoirsError::AudioError {
                message: format!("Unsupported audio format: {format}"),
                buffer_info: None,
            },
            RecognitionError::InvalidFormat(format) => VoirsError::AudioError {
                message: format!("Invalid audio format: {format}"),
                buffer_info: None,
            },
            RecognitionError::ModelNotFound {
                model,
                available,
                suggestions,
            } => VoirsError::VoiceNotFound {
                voice: model,
                available,
                suggestions,
            },
            RecognitionError::LanguageNotSupported {
                language,
                supported,
                suggestions: _,
            } => VoirsError::LanguageNotSupported {
                language,
                supported,
            },
            RecognitionError::DeviceNotAvailable {
                device,
                reason,
                fallback,
            } => VoirsError::DeviceError {
                device,
                message: format!("Device not available: {reason}"),
                recovery_hint: Some(format!("Use fallback device: {fallback}")),
            },
            RecognitionError::InsufficientMemory {
                required_mb,
                available_mb,
                recommendation: _,
            } => VoirsError::GpuOutOfMemory {
                device: "ASR".to_string(),
                used_mb: required_mb as u32,
                available_mb: available_mb as u32,
            },
            RecognitionError::RecognitionTimeout {
                timeout_ms,
                audio_duration_ms,
                suggestion,
            } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!(
                    "Recognition timed out after {timeout_ms}ms. Audio duration: {audio_duration_ms}ms. Suggestion: {suggestion}"
                ),
                source: None,
            },
            RecognitionError::MemoryError { message, source } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Memory error: {message}"),
                source,
            },
            RecognitionError::TrainingError { message, source } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Training error: {message}"),
                source,
            },
            RecognitionError::SynchronizationError { message } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Synchronization error: {message}"),
                source: None,
            },
        }
    }
}

impl From<VoirsError> for RecognitionError {
    fn from(err: VoirsError) -> Self {
        match err {
            VoirsError::ModelError {
                model_type: _,
                message,
                source,
            } => RecognitionError::ModelError { message, source },
            VoirsError::AudioError {
                message,
                buffer_info: _,
            } => RecognitionError::AudioProcessingError {
                message,
                source: None,
            },
            VoirsError::ConfigError { field: _, message } => {
                RecognitionError::ConfigurationError { message }
            }
            VoirsError::SerializationError { format: _, message } => {
                RecognitionError::InvalidInput { message }
            }
            VoirsError::NetworkError {
                message,
                source,
                retry_count: _,
                max_retries: _,
            } => RecognitionError::ResourceError { message, source },
            VoirsError::IoError {
                path: _,
                operation: _,
                source,
            } => RecognitionError::ResourceError {
                message: format!("I/O error: {source}"),
                source: Some(Box::new(source)),
            },
            _ => RecognitionError::ModelError {
                message: format!("VoiRS error: {err}"),
                source: Some(Box::new(err)),
            },
        }
    }
}

impl From<candle_core::Error> for RecognitionError {
    fn from(err: candle_core::Error) -> Self {
        RecognitionError::ModelError {
            message: format!("Candle error: {err}"),
            source: Some(Box::new(err)),
        }
    }
}
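
// A minimal round-trip sketch of the `From` conversions above; this test
// module is an illustrative addition, not part of the original API surface.
#[cfg(test)]
mod conversion_round_trip {
    use super::*;

    #[test]
    fn configuration_error_round_trips_through_voirs_error() {
        let err = RecognitionError::ConfigurationError {
            message: "bad sample rate".to_string(),
        };
        // RecognitionError -> VoirsError maps ConfigurationError to ConfigError.
        let sdk_err: VoirsError = err.into();
        // VoirsError -> RecognitionError maps ConfigError back again.
        let back: RecognitionError = sdk_err.into();
        assert!(matches!(back, RecognitionError::ConfigurationError { .. }));
    }
}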
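
/// Validates that a model path exists and points to a regular file before it
/// is handed to a loader.
///
/// A minimal sketch (the model path is hypothetical):
///
/// ```ignore
/// use std::path::Path;
/// validate_model_file(Path::new("models/whisper-base.bin"))?;
/// ```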
pub fn validate_model_file(path: &std::path::Path) -> RecognitionResult<()> {
    if !path.exists() {
        return Err(RecognitionError::ModelLoadError {
            message: format!("Model file not found: {}", path.display()),
            source: None,
        }
        .into());
    }

    if !path.is_file() {
        return Err(RecognitionError::ModelLoadError {
            message: format!("Path is not a file: {}", path.display()),
            source: None,
        }
        .into());
    }

    Ok(())
}

/// Returns an [`ASRConfig`] for the given language, with default settings otherwise.
#[must_use]
pub fn default_asr_config(language: LanguageCode) -> ASRConfig {
    ASRConfig {
        language: Some(language),
        ..Default::default()
    }
}

/// Returns a [`PhonemeRecognitionConfig`] for the given language, with default settings otherwise.
#[must_use]
pub fn default_phoneme_config(language: LanguageCode) -> PhonemeRecognitionConfig {
    PhonemeRecognitionConfig {
        language,
        ..Default::default()
    }
}

/// Returns the default [`AudioAnalysisConfig`].
#[must_use]
pub fn default_analysis_config() -> AudioAnalysisConfig {
    AudioAnalysisConfig::default()
}
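
// A quick usage sketch of the helpers above (mirrored by `test_default_configs`
// in the tests at the bottom of this file):
//
//     let asr = default_asr_config(LanguageCode::EnUs);
//     assert_eq!(asr.language, Some(LanguageCode::EnUs));
//
//     let phonemes = default_phoneme_config(LanguageCode::EnUs);
//     assert_eq!(phonemes.language, LanguageCode::EnUs);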
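
/// Maps a confidence score in `[0.0, 1.0]` to a coarse human-readable label.
///
/// Thresholds: `>= 0.9` is "Very High", `>= 0.7` "High", `>= 0.5` "Medium",
/// `>= 0.3` "Low", and anything lower "Very Low". For example:
///
/// ```ignore
/// assert_eq!(confidence_to_label(0.8), "High");
/// ```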
#[must_use]
pub fn confidence_to_label(confidence: f32) -> &'static str {
    match confidence {
        c if c >= 0.9 => "Very High",
        c if c >= 0.7 => "High",
        c if c >= 0.5 => "Medium",
        c if c >= 0.3 => "Low",
        _ => "Very Low",
    }
}
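
/// Merges transcripts in order: text is joined with single spaces, confidence
/// is the arithmetic mean, word timestamps and sentence boundaries are
/// concatenated as-is (they are not re-offset), and processing durations are
/// summed. The merged transcript inherits the first transcript's language.
///
/// An empty input yields an empty `EnUs` transcript with zero confidence.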
#[must_use]
pub fn merge_transcripts(transcripts: &[Transcript]) -> Transcript {
    if transcripts.is_empty() {
        return Transcript {
            text: String::new(),
            language: LanguageCode::EnUs,
            confidence: 0.0,
            word_timestamps: Vec::new(),
            sentence_boundaries: Vec::new(),
            processing_duration: None,
        };
    }

    let mut merged_text = String::new();
    let mut all_word_timestamps = Vec::new();
    let mut all_sentence_boundaries = Vec::new();
    let mut total_confidence = 0.0;
    let mut total_duration = std::time::Duration::ZERO;

    for transcript in transcripts {
        if !merged_text.is_empty() {
            merged_text.push(' ');
        }
        merged_text.push_str(&transcript.text);

        all_word_timestamps.extend(transcript.word_timestamps.clone());
        all_sentence_boundaries.extend(transcript.sentence_boundaries.clone());
        total_confidence += transcript.confidence;

        if let Some(duration) = transcript.processing_duration {
            total_duration += duration;
        }
    }

    Transcript {
        text: merged_text,
        language: transcripts[0].language,
        confidence: total_confidence / transcripts.len() as f32,
        word_timestamps: all_word_timestamps,
        sentence_boundaries: all_sentence_boundaries,
        processing_duration: Some(total_duration),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    #[allow(clippy::const_is_empty)]
    fn test_version() {
        assert!(!VERSION.is_empty(), "VERSION should not be empty");
    }

    #[test]
    fn test_confidence_to_label() {
        assert_eq!(confidence_to_label(0.95), "Very High");
        assert_eq!(confidence_to_label(0.8), "High");
        assert_eq!(confidence_to_label(0.6), "Medium");
        assert_eq!(confidence_to_label(0.4), "Low");
        assert_eq!(confidence_to_label(0.2), "Very Low");
    }

    #[test]
    fn test_default_configs() {
        let asr_config = default_asr_config(LanguageCode::EnUs);
        assert_eq!(asr_config.language, Some(LanguageCode::EnUs));
        assert!(asr_config.word_timestamps);

        let phoneme_config = default_phoneme_config(LanguageCode::EnUs);
        assert_eq!(phoneme_config.language, LanguageCode::EnUs);
        assert!(phoneme_config.word_alignment);

        let analysis_config = default_analysis_config();
        assert!(analysis_config.quality_metrics);
        assert!(analysis_config.prosody_analysis);
    }

    #[test]
    fn test_merge_transcripts() {
        let transcript1 = Transcript {
            text: "Hello".to_string(),
            language: LanguageCode::EnUs,
            confidence: 0.9,
            word_timestamps: vec![],
            sentence_boundaries: vec![],
            processing_duration: Some(std::time::Duration::from_millis(100)),
        };

        let transcript2 = Transcript {
            text: "world".to_string(),
            language: LanguageCode::EnUs,
            confidence: 0.8,
            word_timestamps: vec![],
            sentence_boundaries: vec![],
            processing_duration: Some(std::time::Duration::from_millis(150)),
        };

        let merged = merge_transcripts(&[transcript1, transcript2]);
        assert_eq!(merged.text, "Hello world");
        assert!((merged.confidence - 0.85).abs() < f32::EPSILON);
        assert_eq!(
            merged.processing_duration,
            Some(std::time::Duration::from_millis(250))
        );
    }
}