
voirs_recognizer/traits.rs

//! Core traits for the `VoiRS` recognition system
//!
//! This module defines the fundamental interfaces for automatic speech recognition (ASR),
//! phoneme recognition, and audio analysis capabilities.

use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::pin::Pin;
use std::time::Duration;
use tokio_stream::Stream;
use voirs_sdk::{AudioBuffer, LanguageCode, Phoneme, VoirsError};

/// Result type for recognition operations
pub type RecognitionResult<T> = Result<T, VoirsError>;

/// Stream type for real-time audio processing
pub type AudioStream = Pin<Box<dyn Stream<Item = AudioBuffer> + Send>>;

/// Stream type for real-time transcript processing
pub type TranscriptStream = Pin<Box<dyn Stream<Item = RecognitionResult<TranscriptChunk>> + Send>>;
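
// Illustrative sketch: one way to adapt an in-memory list of `AudioBuffer`s
// into the boxed `AudioStream` alias above via `tokio_stream::iter`. Real
// callers would typically wrap a microphone or network source instead; the
// helper name here is arbitrary.
#[allow(dead_code)]
fn buffers_into_audio_stream(buffers: Vec<AudioBuffer>) -> AudioStream {
    Box::pin(tokio_stream::iter(buffers))
}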

// ============================================================================
// Core Data Types
// ============================================================================

/// Transcribed text with associated metadata
#[derive(Debug, Clone, PartialEq)]
pub struct Transcript {
    /// The transcribed text
    pub text: String,
    /// Detected or specified language
    pub language: LanguageCode,
    /// Overall confidence score [0.0, 1.0]
    pub confidence: f32,
    /// Word-level timestamps
    pub word_timestamps: Vec<WordTimestamp>,
    /// Sentence boundaries
    pub sentence_boundaries: Vec<SentenceBoundary>,
    /// Processing duration
    pub processing_duration: Option<Duration>,
}

/// Word-level timestamp information
#[derive(Debug, Clone, PartialEq)]
pub struct WordTimestamp {
    /// The word text
    pub word: String,
    /// Start time in seconds
    pub start_time: f32,
    /// End time in seconds
    pub end_time: f32,
    /// Confidence score for this word [0.0, 1.0]
    pub confidence: f32,
}
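
// Illustrative sketch: deriving an approximate words-per-minute figure from a
// transcript's word-level timestamps. The helper is hypothetical and assumes
// the timestamps are ordered by time.
#[allow(dead_code)]
fn approximate_words_per_minute(transcript: &Transcript) -> Option<f32> {
    let first = transcript.word_timestamps.first()?;
    let last = transcript.word_timestamps.last()?;
    let elapsed_minutes = (last.end_time - first.start_time) / 60.0;
    if elapsed_minutes > 0.0 {
        Some(transcript.word_timestamps.len() as f32 / elapsed_minutes)
    } else {
        None
    }
}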

/// Sentence boundary information
#[derive(Debug, Clone, PartialEq)]
pub struct SentenceBoundary {
    /// Start time in seconds
    pub start_time: f32,
    /// End time in seconds
    pub end_time: f32,
    /// Sentence text
    pub text: String,
    /// Confidence score [0.0, 1.0]
    pub confidence: f32,
}

/// Chunk of transcript from streaming recognition
#[derive(Debug, Clone, PartialEq)]
pub struct TranscriptChunk {
    /// Partial or complete text
    pub text: String,
    /// Whether this chunk is final
    pub is_final: bool,
    /// Start time of this chunk
    pub start_time: f32,
    /// End time of this chunk (may be provisional)
    pub end_time: f32,
    /// Confidence score [0.0, 1.0]
    pub confidence: f32,
}

/// Phoneme alignment result
#[derive(Debug, Clone, PartialEq)]
pub struct PhonemeAlignment {
    /// Aligned phonemes with timing
    pub phonemes: Vec<AlignedPhoneme>,
    /// Total duration of the audio
    pub total_duration: f32,
    /// Overall alignment confidence [0.0, 1.0]
    pub alignment_confidence: f32,
    /// Word-level alignment information
    pub word_alignments: Vec<WordAlignment>,
}

/// Individual phoneme with timing information
#[derive(Debug, Clone, PartialEq)]
pub struct AlignedPhoneme {
    /// The phoneme
    pub phoneme: Phoneme,
    /// Start time in seconds
    pub start_time: f32,
    /// End time in seconds
    pub end_time: f32,
    /// Alignment confidence [0.0, 1.0]
    pub confidence: f32,
}

/// Word-level alignment information
#[derive(Debug, Clone, PartialEq)]
pub struct WordAlignment {
    /// The word text
    pub word: String,
    /// Start time in seconds
    pub start_time: f32,
    /// End time in seconds
    pub end_time: f32,
    /// Phonemes that make up this word
    pub phonemes: Vec<AlignedPhoneme>,
    /// Alignment confidence [0.0, 1.0]
    pub confidence: f32,
}

/// Comprehensive audio analysis result
#[derive(Debug, Clone, PartialEq)]
pub struct AudioAnalysis {
    /// Quality metrics (SNR, THD, etc.)
    pub quality_metrics: HashMap<String, f32>,
    /// Prosody analysis
    pub prosody: ProsodyAnalysis,
    /// Speaker characteristics
    pub speaker_characteristics: SpeakerCharacteristics,
    /// Emotional analysis
    pub emotional_analysis: EmotionalAnalysis,
    /// Processing duration
    pub processing_duration: Option<Duration>,
}

/// Prosody analysis results
#[derive(Debug, Clone, PartialEq, Default)]
pub struct ProsodyAnalysis {
    /// Pitch information
    pub pitch: PitchAnalysis,
    /// Rhythm information
    pub rhythm: RhythmAnalysis,
    /// Stress patterns
    pub stress: StressAnalysis,
    /// Intonation patterns
    pub intonation: IntonationAnalysis,
    /// Energy information
    pub energy: EnergyAnalysis,
}

/// Pitch analysis
#[derive(Debug, Clone, PartialEq, Default)]
pub struct PitchAnalysis {
    /// Mean fundamental frequency (Hz)
    pub mean_f0: f32,
    /// Standard deviation of F0
    pub f0_std: f32,
    /// F0 range (max - min)
    pub f0_range: f32,
    /// Pitch contour over time
    pub pitch_contour: Vec<f32>,
}

/// Rhythm analysis
#[derive(Debug, Clone, PartialEq, Default)]
pub struct RhythmAnalysis {
    /// Speaking rate (syllables per second)
    pub speaking_rate: f32,
    /// Pause duration statistics
    pub pause_statistics: PauseStatistics,
    /// Rhythm regularity score
    pub regularity_score: f32,
}

/// Pause statistics
#[derive(Debug, Clone, PartialEq, Default)]
pub struct PauseStatistics {
    /// Total pause duration
    pub total_pause_duration: f32,
    /// Average pause duration
    pub average_pause_duration: f32,
    /// Number of pauses
    pub pause_count: usize,
    /// Pause positions
    pub pause_positions: Vec<f32>,
}

/// Stress analysis
#[derive(Debug, Clone, PartialEq, Default)]
pub struct StressAnalysis {
    /// Stress pattern over time
    pub stress_pattern: Vec<f32>,
    /// Primary stress locations
    pub primary_stress: Vec<f32>,
    /// Secondary stress locations
    pub secondary_stress: Vec<f32>,
}

/// Intonation analysis
#[derive(Debug, Clone, PartialEq, Default)]
pub struct IntonationAnalysis {
    /// Intonation pattern type
    pub pattern_type: IntonationPattern,
    /// Boundary tones
    pub boundary_tones: Vec<BoundaryTone>,
    /// Pitch accents
    pub pitch_accents: Vec<PitchAccent>,
}

/// Energy analysis
#[derive(Debug, Clone, PartialEq, Default)]
pub struct EnergyAnalysis {
    /// Mean energy level
    pub mean_energy: f32,
    /// Energy standard deviation
    pub energy_std: f32,
    /// Energy range (max - min)
    pub energy_range: f32,
    /// Energy contour over time
    pub energy_contour: Vec<f32>,
}

/// Intonation pattern types
#[derive(Debug, Clone, PartialEq, Default)]
pub enum IntonationPattern {
    /// Declarative statement pattern (falling intonation)
    #[default]
    Declarative,
    /// Interrogative/question pattern (rising intonation)
    Interrogative,
    /// Exclamative pattern (dramatic intonation)
    Exclamative,
    /// Imperative/command pattern (firm intonation)
    Imperative,
    /// Mixed or unclear pattern
    Mixed,
}

/// Boundary tone information
#[derive(Debug, Clone, PartialEq)]
pub struct BoundaryTone {
    /// Time position
    pub time: f32,
    /// Tone type (rising, falling, level)
    pub tone_type: ToneType,
    /// Confidence score
    pub confidence: f32,
}

/// Pitch accent information
#[derive(Debug, Clone, PartialEq)]
pub struct PitchAccent {
    /// Time position
    pub time: f32,
    /// Accent type
    pub accent_type: AccentType,
    /// Confidence score
    pub confidence: f32,
}

/// Tone types
#[derive(Debug, Clone, PartialEq)]
pub enum ToneType {
    /// Rising tone (pitch increases)
    Rising,
    /// Falling tone (pitch decreases)
    Falling,
    /// Level tone (pitch remains stable)
    Level,
    /// Rising-falling tone (pitch rises then falls)
    RisingFalling,
    /// Falling-rising tone (pitch falls then rises)
    FallingRising,
}

/// Accent types
#[derive(Debug, Clone, PartialEq)]
pub enum AccentType {
    /// Primary accent (strongest stress)
    Primary,
    /// Secondary accent (moderate stress)
    Secondary,
    /// Tertiary accent (weakest stress)
    Tertiary,
}

/// Speaker characteristics
#[derive(Debug, Clone, PartialEq, Default)]
pub struct SpeakerCharacteristics {
    /// Estimated speaker gender
    pub gender: Option<Gender>,
    /// Estimated age range
    pub age_range: Option<AgeRange>,
    /// Voice characteristics
    pub voice_characteristics: VoiceCharacteristics,
    /// Accent information
    pub accent: Option<AccentInfo>,
}

/// Gender classification
#[derive(Debug, Clone, PartialEq)]
pub enum Gender {
    /// Male voice classification
    Male,
    /// Female voice classification
    Female,
    /// Other/non-binary voice classification
    Other,
}

/// Age range classification
#[derive(Debug, Clone, PartialEq)]
pub enum AgeRange {
    /// Child voice (0-12 years)
    Child,
    /// Teen voice (13-19 years)
    Teen,
    /// Adult voice (20-59 years)
    Adult,
    /// Senior voice (60+ years)
    Senior,
}

/// Voice characteristics
#[derive(Debug, Clone, PartialEq, Default)]
pub struct VoiceCharacteristics {
    /// Fundamental frequency range
    pub f0_range: (f32, f32),
    /// Formant frequencies
    pub formants: Vec<f32>,
    /// Voice quality measures
    pub voice_quality: VoiceQuality,
}

/// Voice quality measures
#[derive(Debug, Clone, PartialEq, Default)]
pub struct VoiceQuality {
    /// Jitter (pitch perturbation)
    pub jitter: f32,
    /// Shimmer (amplitude perturbation)
    pub shimmer: f32,
    /// Harmonic-to-noise ratio
    pub hnr: f32,
}

/// Accent information
#[derive(Debug, Clone, PartialEq)]
pub struct AccentInfo {
    /// Detected accent type
    pub accent_type: String,
    /// Confidence score
    pub confidence: f32,
    /// Regional indicators
    pub regional_indicators: Vec<String>,
}

/// Emotional analysis results
#[derive(Debug, Clone, PartialEq)]
pub struct EmotionalAnalysis {
    /// Primary emotion
    pub primary_emotion: Emotion,
    /// Secondary emotions with scores
    pub emotion_scores: HashMap<Emotion, f32>,
    /// Emotional intensity [0.0, 1.0]
    pub intensity: f32,
    /// Emotional valence [-1.0, 1.0] (negative to positive)
    pub valence: f32,
    /// Emotional arousal [0.0, 1.0] (calm to excited)
    pub arousal: f32,
}

/// Emotion types
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Emotion {
    /// Joy/happiness emotion
    Joy,
    /// Sadness/melancholy emotion
    Sadness,
    /// Anger/irritation emotion
    Anger,
    /// Fear/anxiety emotion
    Fear,
    /// Surprise/astonishment emotion
    Surprise,
    /// Disgust/revulsion emotion
    Disgust,
    /// Neutral/calm emotion (default)
    Neutral,
    /// Contempt/disdain emotion
    Contempt,
    /// Pride/satisfaction emotion
    Pride,
    /// Shame/embarrassment emotion
    Shame,
}
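
// Illustrative sketch: picking the highest-scoring entry in `emotion_scores`
// as a fallback primary emotion. The helper is hypothetical; analyzers
// normally populate `primary_emotion` themselves.
#[allow(dead_code)]
fn dominant_emotion(analysis: &EmotionalAnalysis) -> Option<Emotion> {
    analysis
        .emotion_scores
        .iter()
        .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
        .map(|(emotion, _)| emotion.clone())
}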

// ============================================================================
// Configuration Types
// ============================================================================

/// Configuration for ASR models
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ASRConfig {
    /// Target language (None for auto-detection)
    pub language: Option<LanguageCode>,
    /// Enable word-level timestamps
    pub word_timestamps: bool,
    /// Enable sentence segmentation
    pub sentence_segmentation: bool,
    /// Minimum confidence threshold
    pub confidence_threshold: f32,
    /// Maximum audio duration (seconds)
    pub max_duration: Option<f32>,
    /// Enable language detection
    pub language_detection: bool,
    /// Custom vocabulary
    pub custom_vocabulary: Option<Vec<String>>,
    /// Model variant (if supported)
    pub model_variant: Option<String>,
    /// Whisper model size (tiny, base, small, medium, large)
    pub whisper_model_size: Option<String>,
    /// ASR backends to try, in order of preference
    pub preferred_models: Vec<String>,
    /// Enable voice activity detection
    pub enable_voice_activity_detection: bool,
    /// Chunk duration in milliseconds for streaming
    pub chunk_duration_ms: u32,
}

impl Default for ASRConfig {
    fn default() -> Self {
        Self {
            language: None,
            word_timestamps: true,
            sentence_segmentation: true,
            confidence_threshold: 0.5,
            max_duration: Some(60.0),
            language_detection: true,
            custom_vocabulary: None,
            model_variant: None,
            whisper_model_size: Some("base".to_string()),
            preferred_models: vec!["whisper".to_string()],
            enable_voice_activity_detection: true,
            chunk_duration_ms: 30000,
        }
    }
}
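
// Illustrative sketch: starting from the defaults and overriding a few fields
// with struct update syntax. The specific values shown are arbitrary examples.
#[allow(dead_code)]
fn example_asr_config() -> ASRConfig {
    ASRConfig {
        language: Some(LanguageCode::EnUs),
        whisper_model_size: Some("small".to_string()),
        confidence_threshold: 0.7,
        ..ASRConfig::default()
    }
}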

/// Configuration for phoneme recognition
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct PhonemeRecognitionConfig {
    /// Target language
    pub language: LanguageCode,
    /// Enable word-level alignment
    pub word_alignment: bool,
    /// Alignment method
    pub alignment_method: AlignmentMethod,
    /// Minimum confidence threshold
    pub confidence_threshold: f32,
    /// Custom pronunciation dictionary
    pub pronunciation_dict: Option<HashMap<String, Vec<String>>>,
}

impl Default for PhonemeRecognitionConfig {
    fn default() -> Self {
        Self {
            language: LanguageCode::EnUs,
            word_alignment: true,
            alignment_method: AlignmentMethod::Forced,
            confidence_threshold: 0.3,
            pronunciation_dict: None,
        }
    }
}
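
// Illustrative sketch: supplying a small custom pronunciation dictionary on
// top of the defaults. The ARPABET-style phoneme strings are arbitrary
// examples, not entries from a real lexicon.
#[allow(dead_code)]
fn example_phoneme_config() -> PhonemeRecognitionConfig {
    let mut dict = HashMap::new();
    dict.insert(
        "voirs".to_string(),
        vec!["V".to_string(), "OY".to_string(), "R".to_string(), "S".to_string()],
    );
    PhonemeRecognitionConfig {
        pronunciation_dict: Some(dict),
        ..PhonemeRecognitionConfig::default()
    }
}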

/// Alignment methods for phoneme recognition
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum AlignmentMethod {
    /// Forced alignment using acoustic models
    Forced,
    /// Automatic alignment using neural networks
    Automatic,
    /// Hybrid approach combining forced and automatic methods
    Hybrid,
}

/// Configuration for audio analysis
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct AudioAnalysisConfig {
    /// Enable quality metrics
    pub quality_metrics: bool,
    /// Enable prosody analysis
    pub prosody_analysis: bool,
    /// Enable speaker analysis
    pub speaker_analysis: bool,
    /// Enable emotional analysis
    pub emotional_analysis: bool,
    /// Quality metrics to compute
    pub quality_metrics_list: Vec<AudioMetric>,
    /// Frame size for analysis
    pub frame_size: usize,
    /// Hop size for analysis
    pub hop_size: usize,
}

impl Default for AudioAnalysisConfig {
    fn default() -> Self {
        Self {
            quality_metrics: true,
            prosody_analysis: true,
            speaker_analysis: true,
            emotional_analysis: true,
            quality_metrics_list: vec![
                AudioMetric::SNR,
                AudioMetric::THD,
                AudioMetric::SpectralCentroid,
                AudioMetric::SpectralRolloff,
            ],
            frame_size: 1024,
            hop_size: 512,
        }
    }
}

/// Audio quality metrics
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum AudioMetric {
    /// Signal-to-Noise Ratio
    SNR,
    /// Total Harmonic Distortion
    THD,
    /// Spectral Centroid
    SpectralCentroid,
    /// Spectral Rolloff
    SpectralRolloff,
    /// Zero Crossing Rate
    ZeroCrossingRate,
    /// Mel Frequency Cepstral Coefficients
    MelFrequencyCepstralCoefficients,
    /// Chroma Features
    ChromaFeatures,
    /// Spectral Contrast
    SpectralContrast,
    /// Tonnetz Features
    TonnetzFeatures,
    /// Root Mean Square Energy
    RootMeanSquare,
}

// ============================================================================
// Metadata Types
// ============================================================================

/// Metadata for ASR models
#[derive(Debug, Clone, PartialEq)]
pub struct ASRMetadata {
    /// Model name
    pub name: String,
    /// Model version
    pub version: String,
    /// Model description
    pub description: String,
    /// Supported languages
    pub supported_languages: Vec<LanguageCode>,
    /// Model architecture
    pub architecture: String,
    /// Model size in MB
    pub model_size_mb: f32,
    /// Expected inference speed (relative to audio duration)
    pub inference_speed: f32,
    /// Word Error Rate benchmarks
    pub wer_benchmarks: HashMap<LanguageCode, f32>,
    /// Supported features
    pub supported_features: Vec<ASRFeature>,
}

/// ASR model features
#[derive(Debug, Clone, PartialEq)]
pub enum ASRFeature {
    /// Word-level timestamps
    WordTimestamps,
    /// Sentence boundary detection
    SentenceSegmentation,
    /// Language detection
    LanguageDetection,
    /// Noise robustness
    NoiseRobustness,
    /// Streaming inference
    StreamingInference,
    /// Custom vocabulary support
    CustomVocabulary,
    /// Speaker diarization
    SpeakerDiarization,
    /// Emotion recognition
    EmotionRecognition,
}

/// Metadata for phoneme recognizers
#[derive(Debug, Clone, PartialEq)]
pub struct PhonemeRecognizerMetadata {
    /// Model name
    pub name: String,
    /// Model version
    pub version: String,
    /// Model description
    pub description: String,
    /// Supported languages
    pub supported_languages: Vec<LanguageCode>,
    /// Alignment methods supported
    pub alignment_methods: Vec<AlignmentMethod>,
    /// Average alignment accuracy
    pub alignment_accuracy: f32,
    /// Supported features
    pub supported_features: Vec<PhonemeRecognitionFeature>,
}

/// Phoneme recognition features
#[derive(Debug, Clone, PartialEq)]
pub enum PhonemeRecognitionFeature {
    /// Word alignment
    WordAlignment,
    /// Custom pronunciation
    CustomPronunciation,
    /// Multi-language support
    MultiLanguage,
    /// Real-time alignment
    RealTimeAlignment,
    /// Confidence scoring
    ConfidenceScoring,
    /// Pronunciation assessment
    PronunciationAssessment,
}

/// Metadata for audio analyzers
#[derive(Debug, Clone, PartialEq)]
pub struct AudioAnalyzerMetadata {
    /// Analyzer name
    pub name: String,
    /// Analyzer version
    pub version: String,
    /// Analyzer description
    pub description: String,
    /// Supported metrics
    pub supported_metrics: Vec<AudioMetric>,
    /// Processing capabilities
    pub capabilities: Vec<AnalysisCapability>,
    /// Expected processing speed
    pub processing_speed: f32,
}

/// Analysis capabilities
#[derive(Debug, Clone, PartialEq)]
pub enum AnalysisCapability {
    /// Quality metrics
    QualityMetrics,
    /// Prosody analysis
    ProsodyAnalysis,
    /// Speaker characteristics
    SpeakerCharacteristics,
    /// Emotional analysis
    EmotionalAnalysis,
    /// Real-time analysis
    RealtimeAnalysis,
    /// Batch processing
    BatchProcessing,
    /// Streaming analysis
    StreamingAnalysis,
}

// ============================================================================
// Core Traits
// ============================================================================

/// Trait for Automatic Speech Recognition (ASR) models
#[async_trait]
pub trait ASRModel: Send + Sync {
    /// Transcribe audio to text
    async fn transcribe(
        &self,
        audio: &AudioBuffer,
        config: Option<&ASRConfig>,
    ) -> RecognitionResult<Transcript>;

    /// Stream-based transcription for real-time processing
    async fn transcribe_streaming(
        &self,
        audio_stream: AudioStream,
        config: Option<&ASRConfig>,
    ) -> RecognitionResult<TranscriptStream>;

    /// Get supported languages
    fn supported_languages(&self) -> Vec<LanguageCode>;

    /// Get model metadata
    fn metadata(&self) -> ASRMetadata;

    /// Check if a feature is supported
    fn supports_feature(&self, feature: ASRFeature) -> bool;

    /// Detect language from audio
    async fn detect_language(&self, _audio: &AudioBuffer) -> RecognitionResult<LanguageCode> {
        // Default implementation - models can override
        Err(VoirsError::ModelError {
            model_type: voirs_sdk::error::ModelType::ASR,
            message: "Language detection not implemented for this model".to_string(),
            source: None,
        })
    }
}
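
// Illustrative sketch: driving any `ASRModel` implementation generically and
// keeping only the words at or above the configured confidence threshold. The
// helper name and the filtering policy are arbitrary.
#[allow(dead_code)]
async fn transcribe_confident_words<M: ASRModel>(
    model: &M,
    audio: &AudioBuffer,
) -> RecognitionResult<Vec<String>> {
    let config = ASRConfig::default();
    let transcript = model.transcribe(audio, Some(&config)).await?;
    Ok(transcript
        .word_timestamps
        .into_iter()
        .filter(|w| w.confidence >= config.confidence_threshold)
        .map(|w| w.word)
        .collect())
}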

/// Trait for phoneme recognition and alignment
#[async_trait]
pub trait PhonemeRecognizer: Send + Sync {
    /// Recognize phonemes from audio
    async fn recognize_phonemes(
        &self,
        audio: &AudioBuffer,
        config: Option<&PhonemeRecognitionConfig>,
    ) -> RecognitionResult<Vec<Phoneme>>;

    /// Align phonemes with expected sequence
    async fn align_phonemes(
        &self,
        audio: &AudioBuffer,
        expected: &[Phoneme],
        config: Option<&PhonemeRecognitionConfig>,
    ) -> RecognitionResult<PhonemeAlignment>;

    /// Align text with audio (forced alignment)
    async fn align_text(
        &self,
        audio: &AudioBuffer,
        text: &str,
        config: Option<&PhonemeRecognitionConfig>,
    ) -> RecognitionResult<PhonemeAlignment>;

    /// Get model metadata
    fn metadata(&self) -> PhonemeRecognizerMetadata;

    /// Check if a feature is supported
    fn supports_feature(&self, feature: PhonemeRecognitionFeature) -> bool;
}
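
// Illustrative sketch: forced alignment of known text against audio with any
// `PhonemeRecognizer`, reduced to per-word durations in seconds. The helper
// is hypothetical.
#[allow(dead_code)]
async fn word_durations<R: PhonemeRecognizer>(
    recognizer: &R,
    audio: &AudioBuffer,
    text: &str,
) -> RecognitionResult<Vec<(String, f32)>> {
    let alignment = recognizer.align_text(audio, text, None).await?;
    Ok(alignment
        .word_alignments
        .into_iter()
        .map(|w| {
            let duration = w.end_time - w.start_time;
            (w.word, duration)
        })
        .collect())
}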

/// Trait for audio analysis
#[async_trait]
pub trait AudioAnalyzer: Send + Sync {
    /// Analyze audio for various characteristics
    async fn analyze(
        &self,
        audio: &AudioBuffer,
        config: Option<&AudioAnalysisConfig>,
    ) -> RecognitionResult<AudioAnalysis>;

    /// Analyze audio in streaming mode
    async fn analyze_streaming(
        &self,
        audio_stream: AudioStream,
        config: Option<&AudioAnalysisConfig>,
    ) -> RecognitionResult<Pin<Box<dyn Stream<Item = RecognitionResult<AudioAnalysis>> + Send>>>;

    /// Get supported metrics
    fn supported_metrics(&self) -> Vec<AudioMetric>;

    /// Get analyzer metadata
    fn metadata(&self) -> AudioAnalyzerMetadata;

    /// Check if a capability is supported
    fn supports_capability(&self, capability: AnalysisCapability) -> bool;
}
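
// Illustrative sketch: requesting only quality metrics from an `AudioAnalyzer`
// and reading back a signal-to-noise figure. The "snr" key is an assumption;
// concrete analyzers define their own keys in `quality_metrics`.
#[allow(dead_code)]
async fn example_snr<A: AudioAnalyzer>(
    analyzer: &A,
    audio: &AudioBuffer,
) -> RecognitionResult<Option<f32>> {
    let config = AudioAnalysisConfig {
        prosody_analysis: false,
        speaker_analysis: false,
        emotional_analysis: false,
        ..AudioAnalysisConfig::default()
    };
    let analysis = analyzer.analyze(audio, Some(&config)).await?;
    Ok(analysis.quality_metrics.get("snr").copied())
}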

// ============================================================================
// Utility Traits
// ============================================================================

/// Trait for models that can be configured
pub trait Configurable {
    /// Configuration type for this model
    type Config;

    /// Apply configuration to the model
    fn configure(&mut self, config: &Self::Config) -> RecognitionResult<()>;

    /// Get current configuration
    fn get_config(&self) -> &Self::Config;
}

/// Trait for models that support batching
#[async_trait]
pub trait BatchProcessing<Input, Output> {
    /// Process multiple inputs in a batch
    async fn process_batch(&self, inputs: &[Input]) -> RecognitionResult<Vec<Output>>;

    /// Get optimal batch size
    fn optimal_batch_size(&self) -> usize;
}
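
// Illustrative sketch: splitting a large workload into chunks of the size the
// implementation reports as optimal, then concatenating the results. The
// helper is hypothetical.
#[allow(dead_code)]
async fn process_in_optimal_batches<I, O, P: BatchProcessing<I, O>>(
    processor: &P,
    inputs: &[I],
) -> RecognitionResult<Vec<O>> {
    let batch_size = processor.optimal_batch_size().max(1);
    let mut outputs = Vec::with_capacity(inputs.len());
    for chunk in inputs.chunks(batch_size) {
        outputs.extend(processor.process_batch(chunk).await?);
    }
    Ok(outputs)
}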

/// Trait for resource management
pub trait ResourceManager {
    /// Load model resources
    fn load_resources(&mut self) -> RecognitionResult<()>;

    /// Unload model resources
    fn unload_resources(&mut self) -> RecognitionResult<()>;

    /// Check if resources are loaded
    fn is_loaded(&self) -> bool;

    /// Get resource usage statistics
    fn resource_usage(&self) -> ResourceUsage;
}

/// Resource usage statistics
#[derive(Debug, Clone)]
pub struct ResourceUsage {
    /// Memory usage in MB
    pub memory_mb: f32,
    /// CPU usage percentage
    pub cpu_percent: f32,
    /// GPU usage percentage (if applicable)
    pub gpu_percent: Option<f32>,
    /// GPU memory usage in MB (if applicable)
    pub gpu_memory_mb: Option<f32>,
}
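
// Illustrative sketch: lazily loading a `ResourceManager` before use and
// inspecting its reported memory footprint. The helper and the `eprintln!`
// reporting are arbitrary; applications would use their own logging.
#[allow(dead_code)]
fn ensure_loaded<R: ResourceManager>(manager: &mut R) -> RecognitionResult<()> {
    if !manager.is_loaded() {
        manager.load_resources()?;
    }
    let usage: ResourceUsage = manager.resource_usage();
    eprintln!("model resident memory: {:.1} MB", usage.memory_mb);
    Ok(())
}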