use super::{FeatureExtractor, FeatureExtractorConfig};
use crate::auto::types::{FeatureInput, FeatureOutput};
use crate::error::{Result, TrustformersError};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Extracts frame-level feature vectors from raw audio samples.
///
/// The extractor resamples and (optionally) peak-normalizes incoming audio
/// to match its configuration before computing features.
#[derive(Debug, Clone)]
pub struct AudioFeatureExtractor {
    // Extraction parameters: target sampling rate, frame geometry,
    // normalization flag, and optional batch limit.
    config: AudioFeatureConfig,
}
impl AudioFeatureExtractor {
pub fn new(config: AudioFeatureConfig) -> Self {
Self { config }
}
pub fn get_config(&self) -> &AudioFeatureConfig {
&self.config
}
fn preprocess_audio(&self, samples: &[f32], sample_rate: u32) -> Result<Vec<f32>> {
let mut processed = samples.to_vec();
if sample_rate != self.config.sampling_rate {
processed = self.resample(samples, sample_rate, self.config.sampling_rate)?;
}
if self.config.normalize {
let max_val = processed.iter().map(|&x| x.abs()).fold(0.0f32, f32::max);
if max_val > 0.0 {
for sample in &mut processed {
*sample /= max_val;
}
}
}
Ok(processed)
}
fn resample(&self, samples: &[f32], from_rate: u32, to_rate: u32) -> Result<Vec<f32>> {
let ratio = to_rate as f32 / from_rate as f32;
let new_length = (samples.len() as f32 * ratio) as usize;
let mut resampled = Vec::with_capacity(new_length);
for i in 0..new_length {
let src_index = i as f32 / ratio;
let index = src_index as usize;
if index + 1 < samples.len() {
let fraction = src_index - index as f32;
let interpolated =
samples[index] * (1.0 - fraction) + samples[index + 1] * fraction;
resampled.push(interpolated);
} else {
resampled.push(samples[samples.len() - 1]);
}
}
Ok(resampled)
}
fn extract_audio_features(&self, audio: &[f32]) -> Result<Vec<f32>> {
let n_frames = audio.len() / self.config.hop_length;
let n_features = self.config.feature_size;
let mut features = Vec::with_capacity(n_frames * n_features);
for frame_idx in 0..n_frames {
let start = frame_idx * self.config.hop_length;
let end = std::cmp::min(start + self.config.n_fft, audio.len());
for feat_idx in 0..n_features {
let mut feature_val = 0.0f32;
for sample_idx in start..end {
feature_val += audio[sample_idx] * (feat_idx as f32 / n_features as f32).cos();
}
features.push(feature_val / (end - start) as f32);
}
}
Ok(features)
}
}
impl FeatureExtractor for AudioFeatureExtractor {
    /// Extracts frame-level features from an audio input, attaching
    /// sample-rate/length metadata (plus duration, channels, and bit depth
    /// when audio metadata is present) to the output.
    ///
    /// # Errors
    /// Returns an invalid-input error for any non-audio `FeatureInput` variant.
    fn extract_features(&self, input: &FeatureInput) -> Result<FeatureOutput> {
        match input {
            FeatureInput::Audio {
                samples,
                sample_rate,
                metadata,
            } => {
                let audio = self.preprocess_audio(samples, *sample_rate)?;
                let features = self.extract_audio_features(&audio)?;

                let mut info = HashMap::new();
                info.insert(
                    "sample_rate".to_string(),
                    serde_json::Value::Number((*sample_rate).into()),
                );
                info.insert(
                    "original_length".to_string(),
                    serde_json::Value::Number(samples.len().into()),
                );
                info.insert(
                    "processed_length".to_string(),
                    serde_json::Value::Number(audio.len().into()),
                );

                if let Some(meta) = metadata {
                    // Duration is an f64; fall back to a string representation
                    // when it is not a finite JSON number (NaN/inf).
                    let duration_value = match serde_json::Number::from_f64(meta.duration) {
                        Some(num) => serde_json::Value::Number(num),
                        None => serde_json::Value::String(format!("{}", meta.duration)),
                    };
                    info.insert("duration".to_string(), duration_value);
                    info.insert(
                        "channels".to_string(),
                        serde_json::Value::Number(meta.channels.into()),
                    );
                    if let Some(bit_depth) = meta.bit_depth {
                        info.insert(
                            "bit_depth".to_string(),
                            serde_json::Value::Number(bit_depth.into()),
                        );
                    }
                }

                let frame_count = audio.len() / self.config.hop_length;
                Ok(FeatureOutput {
                    features,
                    shape: vec![frame_count, self.config.feature_size],
                    metadata: info,
                    attention_mask: None,
                    special_tokens: vec![],
                })
            },
            _ => Err(TrustformersError::invalid_input_simple(
                "Audio feature extractor requires audio input".to_string(),
            )),
        }
    }

    /// Returns the configuration as a trait object.
    fn config(&self) -> &dyn FeatureExtractorConfig {
        &self.config
    }
}
/// Configuration for [`AudioFeatureExtractor`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioFeatureConfig {
    /// Target sampling rate in Hz; inputs at other rates are resampled to it.
    pub sampling_rate: u32,
    /// Number of feature values produced per frame.
    pub feature_size: usize,
    /// Analysis window length in samples (upper bound on the samples per frame).
    pub n_fft: usize,
    /// Step between consecutive frames, in samples.
    pub hop_length: usize,
    /// Whether to peak-normalize audio before feature extraction.
    pub normalize: bool,
    /// Optional upper bound on batch size; `None` means no limit.
    pub max_batch_size: Option<usize>,
}
impl AudioFeatureConfig {
    /// Builds a config from a JSON object, falling back to defaults
    /// (16 kHz, 80 features, 512-point window, hop of 160, normalization on)
    /// for any missing or mistyped field.
    pub fn from_config(config: &serde_json::Value) -> Result<Self> {
        Ok(Self {
            sampling_rate: config
                .get("sampling_rate")
                .and_then(|v| v.as_u64())
                .unwrap_or(16000) as u32,
            feature_size: config
                .get("feature_size")
                .and_then(|v| v.as_u64())
                .unwrap_or(80) as usize,
            n_fft: config.get("n_fft").and_then(|v| v.as_u64()).unwrap_or(512) as usize,
            hop_length: config
                .get("hop_length")
                .and_then(|v| v.as_u64())
                .unwrap_or(160) as usize,
            normalize: config
                .get("normalize")
                .and_then(|v| v.as_bool())
                .unwrap_or(true),
            max_batch_size: config
                .get("max_batch_size")
                .and_then(|v| v.as_u64())
                .map(|v| v as usize),
        })
    }

    /// Validates every field, returning a descriptive error for the first
    /// violated constraint.
    ///
    /// # Errors
    /// Returns an invalid-input error when any size/rate field is zero,
    /// when `hop_length > n_fft`, or when `max_batch_size` is `Some(0)`.
    pub fn validate_config(&self) -> Result<()> {
        if self.sampling_rate == 0 {
            return Err(TrustformersError::invalid_input(
                "sampling_rate must be greater than 0".to_string(),
                Some("sampling_rate".to_string()),
                Some("positive integer > 0".to_string()),
                Some("0".to_string()),
            ));
        }
        if self.feature_size == 0 {
            return Err(TrustformersError::invalid_input(
                "feature_size must be greater than 0".to_string(),
                Some("feature_size".to_string()),
                Some("positive integer > 0".to_string()),
                Some("0".to_string()),
            ));
        }
        if self.n_fft == 0 {
            return Err(TrustformersError::invalid_input(
                "n_fft must be greater than 0".to_string(),
                Some("n_fft".to_string()),
                Some("positive integer > 0".to_string()),
                Some("0".to_string()),
            ));
        }
        if self.hop_length == 0 {
            return Err(TrustformersError::invalid_input(
                "hop_length must be greater than 0".to_string(),
                Some("hop_length".to_string()),
                Some("positive integer > 0".to_string()),
                Some("0".to_string()),
            ));
        }
        // Frames should overlap or tile, never leave gaps between windows.
        if self.hop_length > self.n_fft {
            return Err(TrustformersError::invalid_input(
                "hop_length should not exceed n_fft".to_string(),
                Some("hop_length".to_string()),
                Some("value <= n_fft".to_string()),
                Some("value > n_fft".to_string()),
            ));
        }
        if let Some(batch_size) = self.max_batch_size {
            if batch_size == 0 {
                return Err(TrustformersError::invalid_input(
                    "max_batch_size must be greater than 0 if specified".to_string(),
                    Some("max_batch_size".to_string()),
                    Some("positive integer > 0".to_string()),
                    Some("0".to_string()),
                ));
            }
        }
        // A non-power-of-two n_fft is deliberately accepted (it is merely
        // suboptimal for FFT-based backends, not invalid). An empty
        // `if !self.n_fft.is_power_of_two() {}` branch previously sat here
        // as dead code; removed.
        Ok(())
    }

    /// Returns the output feature-matrix shape `(n_frames, feature_size)` for
    /// an input of `input_length` samples. Frame count is floor division by
    /// `hop_length`, matching the extractor's framing.
    pub fn get_output_shape(&self, input_length: usize) -> (usize, usize) {
        let n_frames = input_length / self.hop_length;
        (n_frames, self.feature_size)
    }

    /// Estimates peak memory in bytes for processing `input_length` samples:
    /// input copy + preprocessed copy + output matrix + one window-sized
    /// scratch buffer, all as f32 (4 bytes each).
    pub fn estimate_memory_usage(&self, input_length: usize) -> usize {
        let (n_frames, feature_size) = self.get_output_shape(input_length);
        let input_memory = input_length * 4;
        let processed_memory = input_length * 4;
        let output_memory = n_frames * feature_size * 4;
        let temp_memory = self.n_fft * 4;
        input_memory + processed_memory + output_memory + temp_memory
    }
}
impl FeatureExtractorConfig for AudioFeatureConfig {
    /// Number of feature values per frame.
    fn feature_size(&self) -> usize {
        self.feature_size
    }

    /// Audio extraction always supports batching.
    fn supports_batching(&self) -> bool {
        true
    }

    /// Optional configured batch-size limit.
    fn max_batch_size(&self) -> Option<usize> {
        self.max_batch_size
    }

    /// Exposes the remaining extraction parameters as a JSON-value map.
    fn additional_params(&self) -> HashMap<String, serde_json::Value> {
        HashMap::from([
            (
                "sampling_rate".to_string(),
                serde_json::Value::Number(self.sampling_rate.into()),
            ),
            (
                "n_fft".to_string(),
                serde_json::Value::Number(self.n_fft.into()),
            ),
            (
                "hop_length".to_string(),
                serde_json::Value::Number(self.hop_length.into()),
            ),
            (
                "normalize".to_string(),
                serde_json::Value::Bool(self.normalize),
            ),
        ])
    }

    /// Delegates to the inherent `validate_config`.
    fn validate(&self) -> Result<()> {
        self.validate_config()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::auto::types::AudioMetadata;

    /// Standard 16 kHz / 80-feature configuration shared by the tests below
    /// (previously duplicated verbatim in every test body).
    fn test_config() -> AudioFeatureConfig {
        AudioFeatureConfig {
            sampling_rate: 16000,
            feature_size: 80,
            n_fft: 512,
            hop_length: 160,
            normalize: true,
            max_batch_size: Some(16),
        }
    }

    #[test]
    fn test_audio_feature_extractor_creation() {
        let extractor = AudioFeatureExtractor::new(test_config());
        assert_eq!(extractor.config().feature_size(), 80);
        assert!(extractor.config().supports_batching());
        assert_eq!(extractor.config().max_batch_size(), Some(16));
    }

    #[test]
    fn test_audio_feature_extraction() {
        let extractor = AudioFeatureExtractor::new(test_config());
        // One second of a low-frequency sine at 16 kHz.
        let samples: Vec<f32> = (0..16000).map(|i| (i as f32 * 0.001).sin()).collect();
        let input = FeatureInput::Audio {
            samples,
            sample_rate: 16000,
            metadata: Some(AudioMetadata {
                duration: 1.0,
                channels: 1,
                bit_depth: Some(16),
            }),
        };
        let result = extractor.extract_features(&input);
        assert!(result.is_ok());
        let output = result.expect("operation failed in test");
        let expected_frames = 16000 / 160;
        assert_eq!(output.shape, vec![expected_frames, 80]);
        assert_eq!(output.features.len(), expected_frames * 80);
    }

    #[test]
    fn test_resampling() {
        let extractor = AudioFeatureExtractor::new(test_config());
        // Upsampling 8 kHz -> 16 kHz should exactly double the length.
        let samples_8k: Vec<f32> = (0..8000).map(|i| (i as f32 * 0.001).sin()).collect();
        let resampled = extractor
            .resample(&samples_8k, 8000, 16000)
            .expect("operation failed in test");
        assert_eq!(resampled.len(), 16000);
    }

    #[test]
    fn test_audio_config_from_json() {
        let config_json = serde_json::json!({
            "sampling_rate": 22050,
            "feature_size": 128,
            "n_fft": 1024,
            "hop_length": 256,
            "normalize": false
        });
        let config =
            AudioFeatureConfig::from_config(&config_json).expect("operation failed in test");
        assert_eq!(config.sampling_rate, 22050);
        assert_eq!(config.feature_size, 128);
        assert_eq!(config.n_fft, 1024);
        assert_eq!(config.hop_length, 256);
        assert!(!config.normalize);
    }

    #[test]
    fn test_config_validation() {
        assert!(test_config().validate_config().is_ok());

        // hop_length larger than n_fft must be rejected.
        let invalid_config = AudioFeatureConfig {
            hop_length: 1024,
            ..test_config()
        };
        assert!(invalid_config.validate_config().is_err());

        // A zero sampling rate must be rejected.
        let zero_rate_config = AudioFeatureConfig {
            sampling_rate: 0,
            ..test_config()
        };
        assert!(zero_rate_config.validate_config().is_err());
    }

    #[test]
    fn test_output_shape_calculation() {
        let (n_frames, feature_size) = test_config().get_output_shape(16000);
        assert_eq!(n_frames, 100);
        assert_eq!(feature_size, 80);
    }

    #[test]
    fn test_memory_estimation() {
        let config = test_config();
        let memory_usage = config.estimate_memory_usage(16000);
        assert!(memory_usage > 0);
        // Two input-sized copies + output matrix + one window scratch buffer.
        let expected_min = 16000 * 4 * 2 + 100 * 80 * 4 + 512 * 4;
        assert!(memory_usage >= expected_min);
    }

    #[test]
    fn test_invalid_input_type() {
        let extractor = AudioFeatureExtractor::new(test_config());
        let input = FeatureInput::Text {
            content: "This is not audio".to_string(),
            metadata: None,
        };
        let result = extractor.extract_features(&input);
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            TrustformersError::InvalidInput { .. }
        ));
    }
}