tekken/
audio.rs

1use crate::errors::{Result, TokenizerError};
2use base64::Engine;
3use ndarray::Array1;
4use serde::{Deserialize, Serialize};
5use std::path::Path;
6
/// Configuration for generating audio spectrograms.
///
/// Holds the short-time Fourier transform (STFT) and mel-filter parameters
/// used when converting raw waveforms into mel-scale spectrograms for audio
/// tokenization. Values are validated by [`AudioSpectrogramConfig::new`],
/// which rejects zero for any field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioSpectrogramConfig {
    /// Number of mel-frequency bins (typically 80 or 128).
    pub num_mel_bins: usize,
    /// Hop length in samples between successive STFT windows (typically 160).
    pub hop_length: usize,
    /// Window size in samples for the Fourier transform (typically 400).
    pub window_size: usize,
}
23
24impl AudioSpectrogramConfig {
25    /// Creates a new `AudioSpectrogramConfig` with validation.
26    ///
27    /// # Arguments
28    ///
29    /// * `num_mel_bins` - Number of mel-frequency bins (must be > 0)
30    /// * `hop_length` - Length of overlapping windows for STFT (must be > 0)
31    /// * `window_size` - Window size for Fourier transform (must be > 0)
32    ///
33    /// # Returns
34    ///
35    /// A new `AudioSpectrogramConfig` instance.
36    ///
37    /// # Errors
38    ///
39    /// Returns an error if any parameter is zero or invalid.
40    ///
41    /// # Examples
42    ///
43    /// ```rust
44    /// use tekken::audio::AudioSpectrogramConfig;
45    ///
46    /// let config = AudioSpectrogramConfig::new(80, 160, 400)?;
47    /// # Ok::<(), Box<dyn std::error::Error>>(())
48    /// ```
49    pub fn new(num_mel_bins: usize, hop_length: usize, window_size: usize) -> Result<Self> {
50        if num_mel_bins == 0 {
51            return Err(TokenizerError::InvalidConfig(
52                "num_mel_bins must be > 0".to_string(),
53            ));
54        }
55        if hop_length == 0 {
56            return Err(TokenizerError::InvalidConfig(
57                "hop_length must be > 0".to_string(),
58            ));
59        }
60        if window_size == 0 {
61            return Err(TokenizerError::InvalidConfig(
62                "window_size must be > 0".to_string(),
63            ));
64        }
65
66        Ok(Self {
67            num_mel_bins,
68            hop_length,
69            window_size,
70        })
71    }
72}
73
/// Configuration for audio processing and tokenization.
///
/// Contains every parameter needed to process audio files and convert them
/// into token sequences that can be mixed with text tokens. Validated by
/// [`AudioConfig::new`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioConfig {
    /// Target sampling rate in Hz (e.g., 16000).
    pub sampling_rate: usize,
    /// Number of frames per second expected by the tokenizer model.
    pub frame_rate: f64,
    /// Spectrogram generation parameters.
    pub audio_encoding_config: AudioSpectrogramConfig,
    /// Optional chunk length in seconds; when set, audio is padded up to
    /// whole multiples of this chunk.
    pub chunk_length_s: Option<f64>,
}
92
93impl AudioConfig {
94    /// Creates a new `AudioConfig` with validation.
95    ///
96    /// # Arguments
97    ///
98    /// * `sampling_rate` - Target sampling rate in Hz (must be > 0)
99    /// * `frame_rate` - Number of frames per second (must be > 0)
100    /// * `encoding_config` - Spectrogram configuration
101    /// * `chunk_length_s` - Optional chunk length in seconds (must be > 0 if provided)
102    ///
103    /// # Returns
104    ///
105    /// A new `AudioConfig` instance.
106    ///
107    /// # Errors
108    ///
109    /// Returns an error if any parameter is invalid.
110    pub fn new(
111        sampling_rate: usize,
112        frame_rate: f64,
113        encoding_config: AudioSpectrogramConfig,
114        chunk_length_s: Option<f64>,
115    ) -> Result<Self> {
116        if sampling_rate == 0 {
117            return Err(TokenizerError::InvalidConfig(
118                "sampling_rate must be > 0".to_string(),
119            ));
120        }
121        if frame_rate <= 0.0 {
122            return Err(TokenizerError::InvalidConfig(
123                "frame_rate must be > 0".to_string(),
124            ));
125        }
126
127        if let Some(chunk_length) = chunk_length_s {
128            if chunk_length <= 0.0 {
129                return Err(TokenizerError::InvalidConfig(
130                    "chunk_length_s must be > 0".to_string(),
131                ));
132            }
133        }
134
135        Ok(Self {
136            sampling_rate,
137            frame_rate,
138            audio_encoding_config: encoding_config,
139            chunk_length_s,
140        })
141    }
142
143    /// Calculates the number of audio frames per chunk.
144    ///
145    /// # Returns
146    ///
147    /// The number of frames per chunk based on chunk length and sampling rate.
148    ///
149    /// # Errors
150    ///
151    /// Returns an error if `chunk_length_s` is not set.
152    #[allow(
153        clippy::cast_possible_truncation,
154        clippy::cast_sign_loss,
155        clippy::cast_precision_loss
156    )]
157    pub fn chunk_frames(&self) -> Result<usize> {
158        match self.chunk_length_s {
159            Some(chunk_length) =>
160            {
161                #[allow(
162                    clippy::cast_possible_truncation,
163                    clippy::cast_sign_loss,
164                    clippy::cast_precision_loss
165                )]
166                Ok((chunk_length * self.sampling_rate as f64) as usize)
167            }
168            None => Err(TokenizerError::InvalidConfig(
169                "chunk_length_s not set".to_string(),
170            )),
171        }
172    }
173
174    /// Calculates the length of audio (in samples) represented by each token.
175    ///
176    /// This determines the downsampling factor from audio samples to tokens
177    /// based on the frame rate and spectrogram hop length.
178    ///
179    /// # Returns
180    ///
181    /// Number of audio samples per token.
182    #[must_use]
183    #[allow(
184        clippy::cast_possible_truncation,
185        clippy::cast_sign_loss,
186        clippy::cast_precision_loss
187    )]
188    pub fn audio_length_per_tok(&self) -> usize {
189        #[allow(clippy::cast_precision_loss)]
190        let mut downsample_factor = self.sampling_rate as f64 / self.frame_rate;
191        #[allow(clippy::cast_precision_loss)]
192        {
193            downsample_factor /= self.audio_encoding_config.hop_length as f64;
194        }
195        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
196        {
197            downsample_factor as usize
198        }
199    }
200}
201
/// Represents audio data with metadata.
///
/// Holds an audio waveform along with its sampling rate and source format.
/// Provides constructors for loading from files, bytes, and base64 strings,
/// plus processing helpers (`resample`, `pad`, `duration`).
#[derive(Debug, Clone)]
pub struct Audio {
    /// Audio waveform as a 1D array of f32 samples (mono after loading).
    pub audio_array: Array1<f32>,
    /// Sampling rate in Hz.
    pub sampling_rate: usize,
    /// Audio format string (e.g., "wav").
    pub format: String,
}
218
219impl Audio {
220    /// Creates a new Audio instance.
221    ///
222    /// # Arguments
223    ///
224    /// * `audio_array` - Audio waveform data as a 1D array
225    /// * `sampling_rate` - Sampling rate in Hz
226    /// * `format` - Audio format string
227    ///
228    /// # Returns
229    ///
230    /// A new Audio instance.
231    #[must_use]
232    pub fn new(audio_array: Array1<f32>, sampling_rate: usize, format: String) -> Self {
233        Self {
234            audio_array,
235            sampling_rate,
236            format,
237        }
238    }
239
240    /// Loads audio data from a WAV file.
241    ///
242    /// # Arguments
243    ///
244    /// * `path` - Path to the audio file
245    ///
246    /// # Returns
247    ///
248    /// A new Audio instance with the loaded data.
249    ///
250    /// # Errors
251    ///
252    /// Returns an error if:
253    /// - File cannot be opened
254    /// - File format is not supported
255    /// - Audio data cannot be read
256    ///
257    /// # Examples
258    ///
259    /// ```rust,no_run
260    /// use tekken::audio::Audio;
261    ///
262    /// let audio = Audio::from_file("audio.wav")?;
263    /// println!("Loaded audio: {} samples at {} Hz", audio.audio_array.len(), audio.sampling_rate);
264    /// # Ok::<(), Box<dyn std::error::Error>>(())
265    /// ```
266    #[allow(clippy::cast_precision_loss)]
267    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
268        let mut reader = hound::WavReader::open(path)
269            .map_err(|e| TokenizerError::Audio(format!("Failed to open audio file: {e}")))?;
270
271        let spec = reader.spec();
272        let sampling_rate = spec.sample_rate as usize;
273
274        // Read samples and convert to f32
275        let samples: std::result::Result<Vec<f32>, _> = match spec.sample_format {
276            hound::SampleFormat::Float => reader.samples::<f32>().collect(),
277            hound::SampleFormat::Int => reader
278                .samples::<i32>()
279                .map(|s| {
280                    s.map(|v| {
281                        #[allow(clippy::cast_precision_loss)]
282                        {
283                            v as f32 / i32::MAX as f32
284                        }
285                    })
286                })
287                .collect(),
288        };
289
290        let samples =
291            samples.map_err(|e| TokenizerError::Audio(format!("Failed to read samples: {e}")))?;
292
293        // Handle stereo to mono conversion (average channels)
294        let audio_array = if spec.channels == 1 {
295            Array1::from_vec(samples)
296        } else {
297            let mono_samples: Vec<f32> = samples
298                .chunks(spec.channels as usize)
299                .map(|chunk| {
300                    #[allow(clippy::cast_precision_loss)]
301                    {
302                        chunk.iter().sum::<f32>() / chunk.len() as f32
303                    }
304                })
305                .collect();
306            Array1::from_vec(mono_samples)
307        };
308
309        Ok(Self::new(audio_array, sampling_rate, "wav".to_string()))
310    }
311
312    /// Loads audio data from a base64-encoded string.
313    ///
314    /// # Arguments
315    ///
316    /// * `data` - Base64-encoded audio data
317    ///
318    /// # Returns
319    ///
320    /// A new Audio instance with the decoded data.
321    ///
322    /// # Errors
323    ///
324    /// Returns an error if decoding or parsing fails.
325    pub fn from_base64(data: &str) -> Result<Self> {
326        let audio_bytes = base64::engine::general_purpose::STANDARD.decode(data)?;
327        Self::from_bytes(&audio_bytes)
328    }
329
330    /// Loads audio data from raw bytes.
331    ///
332    /// # Arguments
333    ///
334    /// * `bytes` - Raw audio file data
335    ///
336    /// # Returns
337    ///
338    /// A new Audio instance parsed from the bytes.
339    ///
340    /// # Errors
341    ///
342    /// Returns an error if the bytes cannot be parsed as audio.
343    #[allow(clippy::cast_precision_loss)]
344    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
345        let cursor = std::io::Cursor::new(bytes);
346        let mut reader = hound::WavReader::new(cursor)
347            .map_err(|e| TokenizerError::Audio(format!("Failed to parse audio bytes: {e}")))?;
348
349        let spec = reader.spec();
350        let sampling_rate = spec.sample_rate as usize;
351
352        let samples: std::result::Result<Vec<f32>, _> = match spec.sample_format {
353            hound::SampleFormat::Float => reader.samples::<f32>().collect(),
354            hound::SampleFormat::Int => reader
355                .samples::<i32>()
356                .map(|s| {
357                    s.map(|v| {
358                        #[allow(clippy::cast_precision_loss)]
359                        {
360                            v as f32 / i32::MAX as f32
361                        }
362                    })
363                })
364                .collect(),
365        };
366
367        let samples =
368            samples.map_err(|e| TokenizerError::Audio(format!("Failed to read samples: {e}")))?;
369
370        let audio_array = if spec.channels == 1 {
371            Array1::from_vec(samples)
372        } else {
373            let mono_samples: Vec<f32> = samples
374                .chunks(spec.channels as usize)
375                .map(|chunk| {
376                    #[allow(clippy::cast_precision_loss)]
377                    {
378                        chunk.iter().sum::<f32>() / chunk.len() as f32
379                    }
380                })
381                .collect();
382            Array1::from_vec(mono_samples)
383        };
384
385        Ok(Self::new(audio_array, sampling_rate, "wav".to_string()))
386    }
387
388    /// Calculates the duration of the audio in seconds.
389    ///
390    /// # Returns
391    ///
392    /// Audio duration in seconds.
393    #[must_use]
394    #[allow(clippy::cast_precision_loss)]
395    pub fn duration(&self) -> f64 {
396        #[allow(clippy::cast_precision_loss)]
397        {
398            self.audio_array.len() as f64 / self.sampling_rate as f64
399        }
400    }
401
402    /// Resamples the audio to a target sampling rate.
403    ///
404    /// # Arguments
405    ///
406    /// * `target_rate` - Target sampling rate in Hz
407    ///
408    /// # Errors
409    ///
410    /// Currently returns an error as resampling is not yet implemented.
411    ///
412    /// # Note
413    ///
414    /// This is a placeholder implementation that needs proper resampling logic.
415    pub fn resample(&mut self, target_rate: usize) -> Result<()> {
416        if self.sampling_rate == target_rate {
417            return Ok(());
418        }
419
420        // For now, return an error for resampling - this would need proper implementation
421        Err(TokenizerError::Audio(
422            "Resampling not yet implemented".to_string(),
423        ))
424    }
425
426    /// Pads the audio to meet minimum length requirements.
427    ///
428    /// This method ensures the audio is long enough for processing by padding
429    /// with zeros if necessary. Padding is applied based on chunk length or
430    /// minimum window size requirements.
431    ///
432    /// # Arguments
433    ///
434    /// * `config` - Audio configuration specifying padding requirements
435    ///
436    /// # Errors
437    ///
438    /// Returns an error if configuration is invalid.
439    pub fn pad(&mut self, config: &AudioConfig) -> Result<()> {
440        let current_length = self.audio_array.len();
441
442        let target_length = if let Some(_chunk_length_s) = config.chunk_length_s {
443            let chunk_frames = config.chunk_frames()?;
444
445            current_length.div_ceil(chunk_frames) * chunk_frames
446        } else if current_length < config.audio_encoding_config.window_size {
447            config.audio_encoding_config.window_size
448        } else {
449            return Ok(());
450        };
451
452        if target_length > current_length {
453            let padding_length = target_length - current_length;
454            let _ = padding_length; // Padding length calculated but not used in debug
455            let mut padded = Array1::zeros(target_length);
456            padded
457                .slice_mut(ndarray::s![..current_length])
458                .assign(&self.audio_array);
459            self.audio_array = padded;
460        }
461
462        Ok(())
463    }
464}
465
/// Result of audio tokenization containing tokens and processed audio.
///
/// Encapsulates the output of audio encoding: the token sequence and the
/// audio data as it looked after resampling and padding.
#[derive(Debug, Clone)]
pub struct AudioEncoding {
    /// Token sequence representing the audio: the `begin_audio` token
    /// followed by one audio token per covered frame group.
    pub tokens: Vec<u32>,
    /// Processed audio data after resampling and padding.
    pub audio: Audio,
}
480
/// Encoder for converting audio data into token sequences.
///
/// Processes audio waveforms into token sequences that can be mixed with
/// text tokens in multimodal applications.
#[derive(Debug, Clone)]
pub struct AudioEncoder {
    /// Audio processing configuration (resampling, padding, spectrogram).
    pub config: AudioConfig,
    /// Token ID emitted once per group of audio frames.
    pub audio_token_id: u32,
    /// Token ID marking the start of an audio sequence.
    pub begin_audio_token_id: u32,
}
497
498impl AudioEncoder {
499    /// Creates a new `AudioEncoder`.
500    ///
501    /// # Arguments
502    ///
503    /// * `config` - Audio processing configuration
504    /// * `audio_token_id` - Token ID (u32) representing audio content
505    /// * `begin_audio_token_id` - Token ID (u32) marking the start of audio sequence
506    ///
507    /// # Returns
508    ///
509    /// A new `AudioEncoder` instance.
510    #[must_use]
511    pub fn new(config: AudioConfig, audio_token_id: u32, begin_audio_token_id: u32) -> Self {
512        Self {
513            config,
514            audio_token_id,
515            begin_audio_token_id,
516        }
517    }
518
519    /// Encodes audio data into a token sequence.
520    ///
521    /// This method processes the audio through resampling, padding, and tokenization
522    /// to produce a sequence of tokens that represents the audio content.
523    ///
524    /// # Arguments
525    ///
526    /// * `audio` - The audio data to encode
527    ///
528    /// # Returns
529    ///
530    /// An `AudioEncoding` containing the token sequence and processed audio.
531    ///
532    /// # Errors
533    ///
534    /// Returns an error if audio processing fails.
535    ///
536    /// # Examples
537    ///
538    /// ```rust,no_run
539    /// use tekken::audio::{Audio, AudioConfig, AudioSpectrogramConfig, AudioEncoder};
540    ///
541    /// let audio = Audio::from_file("audio.wav")?;
542    /// let spectrogram_config = AudioSpectrogramConfig::new(80, 160, 400)?;
543    /// let audio_config = AudioConfig::new(16000, 12.5, spectrogram_config, None)?;
544    /// let encoder = AudioEncoder::new(audio_config, 1000, 1001);
545    ///
546    /// let encoding = encoder.encode(audio)?;
547    /// println!("Audio encoded to {} tokens", encoding.tokens.len());
548    /// # Ok::<(), Box<dyn std::error::Error>>(())
549    /// ```
550    #[allow(
551        clippy::cast_possible_truncation,
552        clippy::cast_sign_loss,
553        clippy::cast_precision_loss
554    )]
555    pub fn encode(&self, mut audio: Audio) -> Result<AudioEncoding> {
556        // Resample to target sampling rate
557        audio.resample(self.config.sampling_rate)?;
558
559        // Pad audio if needed
560        audio.pad(&self.config)?;
561
562        let signal_length = audio.audio_array.len();
563
564        // Calculate signal length after downsampling for spectrogram
565        let signal_length = if signal_length % self.config.audio_encoding_config.hop_length != 0 {
566            #[allow(
567                clippy::cast_possible_truncation,
568                clippy::cast_sign_loss,
569                clippy::cast_precision_loss
570            )]
571            {
572                (signal_length as f64 / self.config.audio_encoding_config.hop_length as f64 - 1.0)
573                    .ceil() as usize
574            }
575        } else {
576            signal_length / self.config.audio_encoding_config.hop_length
577        };
578
579        #[allow(
580            clippy::cast_possible_truncation,
581            clippy::cast_sign_loss,
582            clippy::cast_precision_loss
583        )]
584        let num_audio_tokens =
585            (signal_length as f64 / self.config.audio_length_per_tok() as f64).ceil() as usize;
586
587        let mut tokens = vec![self.begin_audio_token_id];
588        tokens.extend(vec![self.audio_token_id; num_audio_tokens]);
589
590        Ok(AudioEncoding { tokens, audio })
591    }
592}
593
/// Converts a frequency in Hertz to the Slaney-style mel scale.
///
/// The mel scale is a perceptual scale that better matches human auditory
/// resolution: linear below 1 kHz and logarithmic above it.
///
/// # Arguments
///
/// * `freq` - Frequency in Hertz
///
/// # Returns
///
/// Frequency in mel-scale units.
///
/// # References
///
/// Based on the Slaney mel-scale conversion used in audio processing libraries.
#[must_use]
pub fn hertz_to_mel(freq: f64) -> f64 {
    const MIN_LOG_HERTZ: f64 = 1000.0;
    const MIN_LOG_MEL: f64 = 15.0;

    // Linear region below 1 kHz.
    if freq < MIN_LOG_HERTZ {
        return 3.0 * freq / 200.0;
    }

    // Logarithmic region at and above 1 kHz.
    let logstep = 27.0 / 6.4_f64.ln();
    (freq / MIN_LOG_HERTZ).ln() * logstep + MIN_LOG_MEL
}
622
/// Converts a frequency from the mel scale back to Hertz.
///
/// Inverse of `hertz_to_mel`: linear below 15 mel (1 kHz) and exponential
/// at or above it.
///
/// # Arguments
///
/// * `mel` - Frequency in mel-scale units
///
/// # Returns
///
/// Frequency in Hertz.
#[must_use]
pub fn mel_to_hertz(mel: f64) -> f64 {
    const MIN_LOG_HERTZ: f64 = 1000.0;
    const MIN_LOG_MEL: f64 = 15.0;

    // Linear region below the 15-mel breakpoint.
    if mel < MIN_LOG_MEL {
        return 200.0 * mel / 3.0;
    }

    // Exponential region at and above the breakpoint.
    let logstep = 6.4_f64.ln() / 27.0;
    MIN_LOG_HERTZ * ((mel - MIN_LOG_MEL) * logstep).exp()
}
647
648/// Creates a mel-scale filter bank for spectrogram processing.
649///
650/// This function generates a matrix of triangular filters distributed on the mel-scale
651/// that can be used to convert linear frequency spectrograms to mel-scale spectrograms.
652/// The implementation follows the Slaney-style mel filter bank construction.
653///
654/// # Arguments
655///
656/// * `num_frequency_bins` - Number of frequency bins in the input spectrogram
657/// * `num_mel_bins` - Number of desired mel-frequency bins in the output
658/// * `min_frequency` - Minimum frequency in Hz to consider
659/// * `max_frequency` - Maximum frequency in Hz to consider
660/// * `sampling_rate` - Audio sampling rate in Hz
661///
662/// # Returns
663///
664/// A 2D array of shape `(num_frequency_bins, num_mel_bins)` containing the filter bank.
665/// Each column represents a mel filter that can be applied to frequency bins.
666///
667/// # Errors
668///
669/// Returns an error if:
670/// - `num_frequency_bins` is less than 2
671/// - `min_frequency` is greater than `max_frequency`
672/// - Any mel filter has all zero values
673///
674/// # Examples
675///
676/// ```rust
677/// use tekken::audio::mel_filter_bank;
678///
679/// let filter_bank = mel_filter_bank(201, 80, 0.0, 8000.0, 16000)?;
680/// println!("Filter bank shape: {:?}", filter_bank.dim());
681/// # Ok::<(), Box<dyn std::error::Error>>(())
682/// ```
683#[allow(clippy::cast_precision_loss)]
684pub fn mel_filter_bank(
685    num_frequency_bins: usize,
686    num_mel_bins: usize,
687    min_frequency: f64,
688    max_frequency: f64,
689    sampling_rate: usize,
690) -> Result<ndarray::Array2<f64>> {
691    if num_frequency_bins < 2 {
692        return Err(TokenizerError::InvalidConfig(format!(
693            "num_frequency_bins must be >= 2, got {num_frequency_bins}"
694        )));
695    }
696
697    if min_frequency > max_frequency {
698        return Err(TokenizerError::InvalidConfig(format!(
699            "min_frequency ({min_frequency}) must be <= max_frequency ({max_frequency})"
700        )));
701    }
702
703    // Center points of the triangular mel filters
704    let mel_min = hertz_to_mel(min_frequency);
705    let mel_max = hertz_to_mel(max_frequency);
706    #[allow(clippy::cast_precision_loss)]
707    let mel_freqs: Vec<f64> = (0..=num_mel_bins + 1)
708        .map(|i| mel_min + (mel_max - mel_min) * i as f64 / (num_mel_bins + 1) as f64)
709        .collect();
710    let filter_freqs: Vec<f64> = mel_freqs.iter().map(|&mel| mel_to_hertz(mel)).collect();
711
712    // Frequencies of FFT bins in Hz
713    #[allow(clippy::cast_precision_loss)]
714    let fft_freqs: Vec<f64> = (0..num_frequency_bins)
715        .map(|i| i as f64 * sampling_rate as f64 / 2.0 / (num_frequency_bins - 1) as f64)
716        .collect();
717
718    // Create triangular filter bank - shape (num_frequency_bins, num_mel_bins) to match Python
719    let mut filter_bank = ndarray::Array2::zeros((num_frequency_bins, num_mel_bins));
720
721    for mel_idx in 0..num_mel_bins {
722        let left_freq = filter_freqs[mel_idx];
723        let center_freq = filter_freqs[mel_idx + 1];
724        let right_freq = filter_freqs[mel_idx + 2];
725
726        for (freq_idx, &fft_freq) in fft_freqs.iter().enumerate() {
727            let value = if fft_freq >= left_freq && fft_freq <= center_freq {
728                (fft_freq - left_freq) / (center_freq - left_freq)
729            } else if fft_freq > center_freq && fft_freq <= right_freq {
730                (right_freq - fft_freq) / (right_freq - center_freq)
731            } else {
732                0.0
733            };
734
735            filter_bank[[freq_idx, mel_idx]] = value.max(0.0);
736        }
737    }
738
739    // Apply Slaney-style energy normalization
740    for mel_idx in 0..num_mel_bins {
741        let enorm = 2.0 / (filter_freqs[mel_idx + 2] - filter_freqs[mel_idx]);
742        for freq_idx in 0..num_frequency_bins {
743            filter_bank[[freq_idx, mel_idx]] *= enorm;
744        }
745    }
746
747    Ok(filter_bank)
748}