//! Speech recognition, phoneme alignment, and audio analysis components for
//! the VoiRS ecosystem, built on top of [`voirs_sdk`].

#![warn(missing_docs)]
#![warn(clippy::all)]
// Pedantic, nursery, and style lints intentionally relaxed for this crate.
#![allow(clippy::similar_names)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::unreadable_literal)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::unused_async)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::missing_panics_doc)]
#![allow(clippy::unused_self)]
#![allow(clippy::must_use_candidate)]
#![allow(clippy::missing_const_for_fn)]
#![allow(clippy::doc_markdown)]
#![allow(clippy::unnecessary_wraps)]
#![allow(clippy::format_push_string)]
#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::cast_lossless)]
#![allow(clippy::ptr_as_ptr)]
#![allow(clippy::struct_excessive_bools)]
#![allow(clippy::fn_params_excessive_bools)]
#![allow(clippy::too_many_lines)]
#![allow(clippy::redundant_closure)]
#![allow(clippy::float_cmp)]
#![allow(clippy::match_same_arms)]
#![allow(clippy::manual_let_else)]
#![allow(clippy::wildcard_imports)]
#![allow(clippy::items_after_statements)]
#![allow(clippy::return_self_not_must_use)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::uninlined_format_args)]
#![allow(clippy::needless_pass_by_value)]
#![allow(clippy::manual_clamp)]
#![allow(clippy::redundant_closure_for_method_calls)]
#![allow(clippy::await_holding_lock)]
#![allow(clippy::trivially_copy_pass_by_ref)]
#![allow(clippy::no_effect_underscore_binding)]
#![allow(clippy::field_reassign_with_default)]
#![allow(clippy::vec_init_then_push)]
#![allow(clippy::type_complexity)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::should_implement_trait)]
#![allow(clippy::derivable_impls)]
#![allow(clippy::wrong_self_convention)]
#![allow(clippy::useless_vec)]
#![allow(clippy::unnecessary_unwrap)]
#![allow(clippy::unnecessary_cast)]
#![allow(clippy::manual_map)]
#![allow(clippy::excessive_precision)]
#![allow(clippy::double_must_use)]
#![allow(clippy::doc_lazy_continuation)]
#![allow(clippy::cloned_ref_to_slice_refs)]

pub use voirs_sdk::{AudioBuffer, LanguageCode, Phoneme, VoirsError};

/// Internal synchronization helpers shared across the crate.
mod sync_utils {
    use super::RecognitionError;
    use std::sync::{Mutex, MutexGuard, PoisonError};

    /// Extension trait adding poison-safe locking to [`Mutex`].
    pub trait MutexExt<T> {
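        /// Acquires the lock, converting a poisoned mutex into a
        /// [`RecognitionError::SynchronizationError`] instead of panicking.
        ///
        /// A minimal usage sketch (caller code, not from this file):
        ///
        /// ```ignore
        /// let counter = std::sync::Mutex::new(0_u32);
        /// let mut guard = counter.lock_safe()?;
        /// *guard += 1;
        /// ```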
        fn lock_safe(&self) -> Result<MutexGuard<'_, T>, RecognitionError>;
    }

    impl<T> MutexExt<T> for Mutex<T> {
        fn lock_safe(&self) -> Result<MutexGuard<'_, T>, RecognitionError> {
            self.lock().map_err(|e: PoisonError<MutexGuard<'_, T>>| {
                RecognitionError::SynchronizationError {
                    message: format!("Mutex lock poisoned: {e}"),
                }
            })
        }
    }
}

pub(crate) use sync_utils::MutexExt;

/// Audio analysis (quality metrics, prosody, and related measurements).
pub mod analysis;
/// Automatic speech recognition backends and utilities.
pub mod asr;
/// Loading and decoding of common audio file formats.
pub mod audio_formats;
/// Higher-level audio helpers built on the format loaders.
pub mod audio_utilities;
/// Caching layers for models and intermediate results.
pub mod caching;
/// Cloud storage integration.
pub mod cloud_storage;
/// Configuration types and loading.
pub mod config;
/// Disaster-recovery support.
pub mod disaster_recovery;
/// Bridging between crate-local and SDK error types.
pub mod error_bridge;
/// Error-message enhancement and quick-fix suggestions.
pub mod error_enhancement;
/// Automatic error-recovery strategies.
pub mod error_recovery;
/// High-availability support.
pub mod high_availability;
/// Integration with the wider VoiRS pipeline.
pub mod integration;
/// Logging utilities.
pub mod logging;
/// Memory usage optimization.
pub mod memory_optimization;
/// Mobile platform support.
pub mod mobile;
/// Runtime monitoring and metrics.
pub mod monitoring;
/// Multimodal processing.
pub mod multimodal;
/// Performance measurement and validation.
pub mod performance;
/// Phoneme recognition and forced alignment.
pub mod phoneme;
/// Audio preprocessing.
pub mod preprocessing;
/// Privacy-preserving processing utilities.
pub mod privacy;
/// Bridge types for the VoiRS SDK.
pub mod sdk_bridge;
/// Security auditing utilities.
pub mod security_audit;
/// Serverless deployment support (requires the `rest-api` feature).
#[cfg(feature = "rest-api")]
pub mod serverless;
/// SLA guarantee tracking.
pub mod sla_guarantees;
/// Model training utilities.
pub mod training;
/// Core traits and shared types for recognition.
pub mod traits;
/// Wake-word detection and training.
pub mod wake_word;

/// WebAssembly bindings (requires the `wasm` feature).
#[cfg(feature = "wasm")]
pub mod wasm;

/// C API bindings (requires the `c-api` feature).
#[cfg(feature = "c-api")]
pub mod c_api;

/// REST API server (requires the `rest-api` feature).
#[cfg(feature = "rest-api")]
pub mod rest_api;

/// Python bindings (requires the `python` feature).
#[cfg(feature = "python")]
pub mod python;

#[cfg(feature = "python")]
pub use python::*;

pub use traits::{
    ASRConfig, ASRFeature, ASRMetadata, ASRModel, AudioAnalysis, AudioAnalysisConfig,
    AudioAnalyzer, AudioAnalyzerMetadata, AudioStream, PhonemeAlignment, PhonemeRecognitionConfig,
    PhonemeRecognizer, PhonemeRecognizerMetadata, RecognitionResult, Transcript, TranscriptChunk,
    TranscriptStream,
};

pub use analysis::AudioAnalyzerImpl;
pub use asr::{ASRBackend, ASRBenchmarkingSuite, AccuracyValidator, IntelligentASRFallback};

pub use asr::advanced_optimization::{
    AdvancedOptimizationConfig, KnowledgeDistillationOptimizer, MixedPrecisionOptimizer,
    OptimizationObjective, OptimizationPlatform, ProgressivePruningOptimizer,
};
pub use asr::optimization_integration::{
    ModelStats, OptimizationPipeline, OptimizationResults, OptimizationSummary,
};
pub use audio_formats::{
    load_audio, load_audio_with_sample_rate, AudioFormat, AudioLoadConfig, UniversalAudioLoader,
};
pub use audio_utilities::{
    analyze_audio_quality, extract_speech_segments, load_and_preprocess, optimize_for_recognition,
    split_audio_smart, AudioQualityReport, AudioUtilities,
};
pub use performance::{
    PerformanceMetrics, PerformanceRequirements, PerformanceValidator, ValidationResult,
};
#[cfg(feature = "forced-align")]
pub use phoneme::ForcedAlignModel;

#[cfg(feature = "mfa")]
pub use phoneme::MFAModel;
pub use preprocessing::{AudioPreprocessingConfig, AudioPreprocessor};
pub use wake_word::{
    EnergyOptimizer, NeuralWakeWordModel, TemplateWakeWordModel, TrainingPhase, TrainingProgress,
    TrainingValidationReport, WakeWordConfig, WakeWordDetection, WakeWordDetector,
    WakeWordDetectorImpl, WakeWordModel, WakeWordStats, WakeWordTrainer, WakeWordTrainerImpl,
    WakeWordTrainingData,
};

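/// Loads an audio file and returns its raw `f32` samples, discarding
/// sample-rate and channel metadata.
///
/// A minimal sketch (`speech.wav` is a hypothetical path):
///
/// ```ignore
/// let samples = load_audio_simple("speech.wav")?;
/// println!("loaded {} samples", samples.len());
/// ```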
pub fn load_audio_simple(path: &str) -> Result<Vec<f32>, RecognitionError> {
    let audio_buffer = load_audio(path)?;
    Ok(audio_buffer.samples().to_vec())
}

/// Crate version string, taken from `CARGO_PKG_VERSION` at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

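/// Commonly used types, traits, and helpers re-exported for glob import.
///
/// A typical import, assuming this crate is named `voirs_recognizer`
/// (the crate name is not visible from this file):
///
/// ```ignore
/// use voirs_recognizer::prelude::*;
/// ```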
pub mod prelude {
    pub use crate::traits::{
        ASRConfig, ASRMetadata, ASRModel, AudioAnalysis, AudioAnalysisConfig, AudioAnalyzer,
        AudioAnalyzerMetadata, AudioStream, PhonemeAlignment, PhonemeRecognitionConfig,
        PhonemeRecognizer, PhonemeRecognizerMetadata, RecognitionResult, Transcript,
        TranscriptChunk, TranscriptStream,
    };

    #[cfg(feature = "whisper-pure")]
    pub use crate::asr::PureRustWhisper;

    #[cfg(feature = "deepspeech")]
    pub use crate::asr::DeepSpeechModel;

    #[cfg(feature = "wav2vec2")]
    pub use crate::asr::Wav2Vec2Model;

    #[cfg(feature = "forced-align")]
    pub use crate::phoneme::ForcedAlignModel;

    #[cfg(feature = "mfa")]
    pub use crate::phoneme::MFAModel;

    pub use crate::analysis::AudioAnalyzerImpl;

    pub use crate::asr::{
        ASRBackend, ASRBenchmarkingSuite, AccuracyValidator, IntelligentASRFallback,
    };

    pub use crate::audio_formats::{
        load_audio, load_audio_with_sample_rate, AudioFormat, AudioLoadConfig,
        UniversalAudioLoader,
    };

    pub use crate::audio_utilities::{
        analyze_audio_quality, extract_speech_segments, load_and_preprocess,
        optimize_for_recognition, split_audio_smart, AudioQualityReport, AudioUtilities,
    };

    pub use crate::performance::{
        PerformanceMetrics, PerformanceRequirements, PerformanceValidator, ValidationResult,
    };

    pub use crate::integration::{
        ComponentInfo, IntegratedPerformanceMonitor, IntegrationConfig, PipelineProcessingConfig,
        UnifiedVoirsPipeline, VoirsIntegrationManager,
    };

    pub use crate::wake_word::{
        EnergyOptimizer, NeuralWakeWordModel, TemplateWakeWordModel, TrainingPhase,
        TrainingProgress, TrainingValidationReport, WakeWordConfig, WakeWordDetection,
        WakeWordDetector, WakeWordDetectorImpl, WakeWordModel, WakeWordStats, WakeWordTrainer,
        WakeWordTrainerImpl, WakeWordTrainingData,
    };

    pub use voirs_sdk::{AudioBuffer, LanguageCode, Phoneme, VoirsError};

    pub use async_trait::async_trait;

    pub use crate::error_enhancement::{
        enhance_recognition_error, get_quick_fixes, is_error_recoverable, ErrorEnhancer,
    };
}
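
/// Errors produced by recognition, analysis, and model management in this
/// crate.
///
/// A minimal handling sketch (the `recognize` call is hypothetical):
///
/// ```ignore
/// match recognize(&audio) {
///     Ok(transcript) => println!("{}", transcript.text),
///     Err(RecognitionError::ModelLoadError { message, .. }) => eprintln!("model: {message}"),
///     Err(other) => eprintln!("recognition failed: {other}"),
/// }
/// ```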
#[derive(Debug, thiserror::Error)]
pub enum RecognitionError {
    /// A model failed to load.
    #[error("Failed to load model: {message}")]
    ModelLoadError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// A loaded model failed during operation.
    #[error("Model error: {message}")]
    ModelError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Audio could not be processed.
    #[error("Audio processing error: {message}")]
    AudioProcessingError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Speech-to-text transcription failed.
    #[error("Transcription failed: {message}")]
    TranscriptionError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Phoneme recognition or alignment failed.
    #[error("Phoneme recognition failed: {message}")]
    PhonemeRecognitionError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Audio analysis failed.
    #[error("Audio analysis failed: {message}")]
    AudioAnalysisError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// A configuration value was missing or invalid.
    #[error("Configuration error: {message}")]
    ConfigurationError {
        /// Description of the problem.
        message: String,
    },

    /// A requested feature is not supported by the active backend.
    #[error("Feature not supported: {feature}")]
    FeatureNotSupported {
        /// Name of the unsupported feature.
        feature: String,
    },

    /// Caller-provided input was invalid.
    #[error("Invalid input: {message}")]
    InvalidInput {
        /// Description of the problem.
        message: String,
    },

    /// A required resource was unavailable.
    #[error("Resource error: {message}")]
    ResourceError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// The audio format is not supported.
    #[error("Unsupported audio format: {0}")]
    UnsupportedFormat(String),

    /// The audio format is recognized but malformed.
    #[error("Invalid audio format: {0}")]
    InvalidFormat(String),

    /// The requested model is not installed or known.
    #[error("Model '{model}' not found. Available models: {available:?}")]
    ModelNotFound {
        /// Name of the requested model.
        model: String,
        /// Models that are available.
        available: Vec<String>,
        /// Close matches that may have been intended.
        suggestions: Vec<String>,
    },

    /// The requested language is not supported.
    #[error("Language '{language}' not supported. Supported languages: {supported:?}")]
    LanguageNotSupported {
        /// The requested language.
        language: String,
        /// Languages that are supported.
        supported: Vec<String>,
        /// Close matches that may have been intended.
        suggestions: Vec<String>,
    },

    /// The requested compute device is unavailable.
    #[error("Device '{device}' not available: {reason}. Fallback: {fallback}")]
    DeviceNotAvailable {
        /// Name of the requested device.
        device: String,
        /// Why the device is unavailable.
        reason: String,
        /// Device that will be used instead.
        fallback: String,
    },

    /// There is not enough memory to complete the operation.
    #[error("Insufficient memory: need {required_mb}MB, have {available_mb}MB. Recommendation: {recommendation}")]
    InsufficientMemory {
        /// Memory required, in megabytes.
        required_mb: u64,
        /// Memory available, in megabytes.
        available_mb: u64,
        /// Suggested remediation.
        recommendation: String,
    },

    /// Recognition exceeded its time budget.
    #[error("Recognition timed out after {timeout_ms}ms. Audio duration: {audio_duration_ms}ms. Suggestion: {suggestion}")]
    RecognitionTimeout {
        /// Configured timeout, in milliseconds.
        timeout_ms: u64,
        /// Duration of the input audio, in milliseconds.
        audio_duration_ms: u64,
        /// Suggested remediation.
        suggestion: String,
    },

    /// A memory allocation or management operation failed.
    #[error("Memory error: {message}")]
    MemoryError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Model training failed.
    #[error("Training error: {message}")]
    TrainingError {
        /// Description of the failure.
        message: String,
        /// Underlying cause, if any.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// A lock or other synchronization primitive failed.
    #[error("Synchronization error: {message}")]
    SynchronizationError {
        /// Description of the failure.
        message: String,
    },
}

impl From<RecognitionError> for VoirsError {
    fn from(err: RecognitionError) -> Self {
        match err {
            RecognitionError::ModelLoadError { message, source }
            | RecognitionError::ModelError { message, source }
            | RecognitionError::TranscriptionError { message, source }
            | RecognitionError::PhonemeRecognitionError { message, source } => {
                VoirsError::ModelError {
                    model_type: voirs_sdk::error::ModelType::ASR,
                    message,
                    source,
                }
            }
            RecognitionError::AudioProcessingError { message, source: _ }
            | RecognitionError::AudioAnalysisError { message, source: _ } => {
                VoirsError::AudioError {
                    message,
                    buffer_info: None,
                }
            }
            RecognitionError::ConfigurationError { message } => VoirsError::ConfigError {
                field: "ASR".to_string(),
                message,
            },
            RecognitionError::FeatureNotSupported { feature } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Feature not supported: {feature}"),
                source: None,
            },
            RecognitionError::InvalidInput { message } => VoirsError::ConfigError {
                field: "Input".to_string(),
                message: format!("Invalid input: {message}"),
            },
            RecognitionError::ResourceError { message, source } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Resource error: {message}"),
                source,
            },
            RecognitionError::UnsupportedFormat(format) => VoirsError::AudioError {
                message: format!("Unsupported audio format: {format}"),
                buffer_info: None,
            },
            RecognitionError::InvalidFormat(format) => VoirsError::AudioError {
                message: format!("Invalid audio format: {format}"),
                buffer_info: None,
            },
            RecognitionError::ModelNotFound {
                model,
                available,
                suggestions,
            } => VoirsError::VoiceNotFound {
                voice: model,
                available,
                suggestions,
            },
            RecognitionError::LanguageNotSupported {
                language,
                supported,
                suggestions: _,
            } => VoirsError::LanguageNotSupported {
                language,
                supported,
            },
            RecognitionError::DeviceNotAvailable {
                device,
                reason,
                fallback,
            } => VoirsError::DeviceError {
                device,
                message: format!("Device not available: {reason}"),
                recovery_hint: Some(format!("Use fallback device: {fallback}")),
            },
            RecognitionError::InsufficientMemory {
                required_mb,
                available_mb,
                recommendation: _,
            } => VoirsError::GpuOutOfMemory {
                device: "ASR".to_string(),
                used_mb: required_mb as u32,
                available_mb: available_mb as u32,
            },
            RecognitionError::RecognitionTimeout {
                timeout_ms,
                audio_duration_ms,
                suggestion,
            } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!(
                    "Recognition timed out after {timeout_ms}ms. Audio duration: {audio_duration_ms}ms. Suggestion: {suggestion}"
                ),
                source: None,
            },
            RecognitionError::MemoryError { message, source } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Memory error: {message}"),
                source,
            },
            RecognitionError::TrainingError { message, source } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Training error: {message}"),
                source,
            },
            RecognitionError::SynchronizationError { message } => VoirsError::ModelError {
                model_type: voirs_sdk::error::ModelType::ASR,
                message: format!("Synchronization error: {message}"),
                source: None,
            },
        }
    }
}

impl From<VoirsError> for RecognitionError {
    fn from(err: VoirsError) -> Self {
        match err {
            VoirsError::ModelError {
                model_type: _,
                message,
                source,
            } => RecognitionError::ModelError { message, source },
            VoirsError::AudioError {
                message,
                buffer_info: _,
            } => RecognitionError::AudioProcessingError {
                message,
                source: None,
            },
            VoirsError::ConfigError { field: _, message } => {
                RecognitionError::ConfigurationError { message }
            }
            VoirsError::SerializationError { format: _, message } => {
                RecognitionError::InvalidInput { message }
            }
            VoirsError::NetworkError {
                message,
                source,
                retry_count: _,
                max_retries: _,
            } => RecognitionError::ResourceError { message, source },
            VoirsError::IoError {
                path: _,
                operation: _,
                source,
            } => RecognitionError::ResourceError {
                message: format!("I/O error: {source}"),
                source: Some(Box::new(source)),
            },
            _ => RecognitionError::ModelError {
                message: format!("VoiRS error: {err}"),
                source: Some(Box::new(err)),
            },
        }
    }
}

impl From<candle_core::Error> for RecognitionError {
    fn from(err: candle_core::Error) -> Self {
        RecognitionError::ModelError {
            message: format!("Candle error: {err}"),
            source: Some(Box::new(err)),
        }
    }
}
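
// A minimal round-trip sketch of the `From` conversions above; this test
// module is an illustrative addition, not part of the original API surface.
#[cfg(test)]
mod conversion_round_trip {
    use super::*;

    #[test]
    fn configuration_error_round_trips_through_voirs_error() {
        let err = RecognitionError::ConfigurationError {
            message: "bad sample rate".to_string(),
        };
        // RecognitionError -> VoirsError maps ConfigurationError to ConfigError.
        let sdk_err: VoirsError = err.into();
        // VoirsError -> RecognitionError maps ConfigError back again.
        let back: RecognitionError = sdk_err.into();
        assert!(matches!(back, RecognitionError::ConfigurationError { .. }));
    }
}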
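
/// Validates that a model path exists and points to a regular file before it
/// is handed to a loader.
///
/// A minimal sketch (the model path is hypothetical):
///
/// ```ignore
/// use std::path::Path;
/// validate_model_file(Path::new("models/whisper-base.bin"))?;
/// ```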
pub fn validate_model_file(path: &std::path::Path) -> RecognitionResult<()> {
    if !path.exists() {
        return Err(RecognitionError::ModelLoadError {
            message: format!("Model file not found: {}", path.display()),
            source: None,
        }
        .into());
    }

    if !path.is_file() {
        return Err(RecognitionError::ModelLoadError {
            message: format!("Path is not a file: {}", path.display()),
            source: None,
        }
        .into());
    }

    Ok(())
}

/// Returns an [`ASRConfig`] for the given language, with default settings otherwise.
#[must_use]
pub fn default_asr_config(language: LanguageCode) -> ASRConfig {
    ASRConfig {
        language: Some(language),
        ..Default::default()
    }
}

/// Returns a [`PhonemeRecognitionConfig`] for the given language, with default settings otherwise.
#[must_use]
pub fn default_phoneme_config(language: LanguageCode) -> PhonemeRecognitionConfig {
    PhonemeRecognitionConfig {
        language,
        ..Default::default()
    }
}

/// Returns the default [`AudioAnalysisConfig`].
#[must_use]
pub fn default_analysis_config() -> AudioAnalysisConfig {
    AudioAnalysisConfig::default()
}
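
// A quick usage sketch of the helpers above (mirrored by `test_default_configs`
// in the tests at the bottom of this file):
//
//     let asr = default_asr_config(LanguageCode::EnUs);
//     assert_eq!(asr.language, Some(LanguageCode::EnUs));
//
//     let phonemes = default_phoneme_config(LanguageCode::EnUs);
//     assert_eq!(phonemes.language, LanguageCode::EnUs);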
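
/// Maps a confidence score in `[0.0, 1.0]` to a coarse human-readable label.
///
/// Thresholds: `>= 0.9` is "Very High", `>= 0.7` "High", `>= 0.5` "Medium",
/// `>= 0.3` "Low", and anything lower "Very Low". For example:
///
/// ```ignore
/// assert_eq!(confidence_to_label(0.8), "High");
/// ```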
#[must_use]
pub fn confidence_to_label(confidence: f32) -> &'static str {
    match confidence {
        c if c >= 0.9 => "Very High",
        c if c >= 0.7 => "High",
        c if c >= 0.5 => "Medium",
        c if c >= 0.3 => "Low",
        _ => "Very Low",
    }
}
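
/// Merges transcripts in order: text is joined with single spaces, confidence
/// is the arithmetic mean, word timestamps and sentence boundaries are
/// concatenated as-is (they are not re-offset), and processing durations are
/// summed. The merged transcript inherits the first transcript's language.
///
/// An empty input yields an empty `EnUs` transcript with zero confidence.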
#[must_use]
pub fn merge_transcripts(transcripts: &[Transcript]) -> Transcript {
    if transcripts.is_empty() {
        return Transcript {
            text: String::new(),
            language: LanguageCode::EnUs,
            confidence: 0.0,
            word_timestamps: Vec::new(),
            sentence_boundaries: Vec::new(),
            processing_duration: None,
        };
    }

    let mut merged_text = String::new();
    let mut all_word_timestamps = Vec::new();
    let mut all_sentence_boundaries = Vec::new();
    let mut total_confidence = 0.0;
    let mut total_duration = std::time::Duration::ZERO;

    for transcript in transcripts {
        if !merged_text.is_empty() {
            merged_text.push(' ');
        }
        merged_text.push_str(&transcript.text);

        all_word_timestamps.extend(transcript.word_timestamps.clone());
        all_sentence_boundaries.extend(transcript.sentence_boundaries.clone());
        total_confidence += transcript.confidence;

        if let Some(duration) = transcript.processing_duration {
            total_duration += duration;
        }
    }

    Transcript {
        text: merged_text,
        language: transcripts[0].language,
        confidence: total_confidence / transcripts.len() as f32,
        word_timestamps: all_word_timestamps,
        sentence_boundaries: all_sentence_boundaries,
        processing_duration: Some(total_duration),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    #[allow(clippy::const_is_empty)]
    fn test_version() {
        assert!(!VERSION.is_empty(), "VERSION should not be empty");
    }

    #[test]
    fn test_confidence_to_label() {
        assert_eq!(confidence_to_label(0.95), "Very High");
        assert_eq!(confidence_to_label(0.8), "High");
        assert_eq!(confidence_to_label(0.6), "Medium");
        assert_eq!(confidence_to_label(0.4), "Low");
        assert_eq!(confidence_to_label(0.2), "Very Low");
    }

    #[test]
    fn test_default_configs() {
        let asr_config = default_asr_config(LanguageCode::EnUs);
        assert_eq!(asr_config.language, Some(LanguageCode::EnUs));
        assert!(asr_config.word_timestamps);

        let phoneme_config = default_phoneme_config(LanguageCode::EnUs);
        assert_eq!(phoneme_config.language, LanguageCode::EnUs);
        assert!(phoneme_config.word_alignment);

        let analysis_config = default_analysis_config();
        assert!(analysis_config.quality_metrics);
        assert!(analysis_config.prosody_analysis);
    }

    #[test]
    fn test_merge_transcripts() {
        let transcript1 = Transcript {
            text: "Hello".to_string(),
            language: LanguageCode::EnUs,
            confidence: 0.9,
            word_timestamps: vec![],
            sentence_boundaries: vec![],
            processing_duration: Some(std::time::Duration::from_millis(100)),
        };

        let transcript2 = Transcript {
            text: "world".to_string(),
            language: LanguageCode::EnUs,
            confidence: 0.8,
            word_timestamps: vec![],
            sentence_boundaries: vec![],
            processing_duration: Some(std::time::Duration::from_millis(150)),
        };

        let merged = merge_transcripts(&[transcript1, transcript2]);
        assert_eq!(merged.text, "Hello world");
        assert!((merged.confidence - 0.85).abs() < f32::EPSILON);
        assert_eq!(
            merged.processing_duration,
            Some(std::time::Duration::from_millis(250))
        );
    }
}