// tekken/audio.rs
1use crate::errors::{Result, TokenizerError};
2use base64::Engine;
3use ndarray::Array1;
4use serde::{Deserialize, Serialize};
5use std::path::Path;
6
/// Configuration for generating audio spectrograms.
///
/// This struct contains the parameters needed to compute mel-scale spectrograms
/// from audio waveforms, which are used in audio tokenization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioSpectrogramConfig {
    /// Number of mel-frequency bins (typically 80 or 128).
    pub num_mel_bins: usize,
    /// Hop length in samples between overlapping STFT windows (typically 160).
    pub hop_length: usize,
    /// Window size in samples for the Fourier transform (typically 400).
    pub window_size: usize,
}
23
24impl AudioSpectrogramConfig {
25 /// Creates a new `AudioSpectrogramConfig` with validation.
26 ///
27 /// # Arguments
28 ///
29 /// * `num_mel_bins` - Number of mel-frequency bins (must be > 0)
30 /// * `hop_length` - Length of overlapping windows for STFT (must be > 0)
31 /// * `window_size` - Window size for Fourier transform (must be > 0)
32 ///
33 /// # Returns
34 ///
35 /// A new `AudioSpectrogramConfig` instance.
36 ///
37 /// # Errors
38 ///
39 /// Returns an error if any parameter is zero or invalid.
40 ///
41 /// # Examples
42 ///
43 /// ```rust
44 /// use tekken::audio::AudioSpectrogramConfig;
45 ///
46 /// let config = AudioSpectrogramConfig::new(80, 160, 400)?;
47 /// # Ok::<(), Box<dyn std::error::Error>>(())
48 /// ```
49 pub fn new(num_mel_bins: usize, hop_length: usize, window_size: usize) -> Result<Self> {
50 if num_mel_bins == 0 {
51 return Err(TokenizerError::InvalidConfig(
52 "num_mel_bins must be > 0".to_string(),
53 ));
54 }
55 if hop_length == 0 {
56 return Err(TokenizerError::InvalidConfig(
57 "hop_length must be > 0".to_string(),
58 ));
59 }
60 if window_size == 0 {
61 return Err(TokenizerError::InvalidConfig(
62 "window_size must be > 0".to_string(),
63 ));
64 }
65
66 Ok(Self {
67 num_mel_bins,
68 hop_length,
69 window_size,
70 })
71 }
72}
73
/// Configuration for audio processing and tokenization.
///
/// This struct contains all parameters needed to process audio files and convert
/// them into token sequences that can be mixed with text tokens.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioConfig {
    /// Target sampling rate in Hz (e.g., 16000).
    pub sampling_rate: usize,
    /// Number of frames per second for the tokenizer model.
    pub frame_rate: f64,
    /// Spectrogram generation parameters.
    pub audio_encoding_config: AudioSpectrogramConfig,
    /// Optional chunk length in seconds used for padding; `None` disables
    /// chunk-based padding.
    pub chunk_length_s: Option<f64>,
}
92
93impl AudioConfig {
94 /// Creates a new `AudioConfig` with validation.
95 ///
96 /// # Arguments
97 ///
98 /// * `sampling_rate` - Target sampling rate in Hz (must be > 0)
99 /// * `frame_rate` - Number of frames per second (must be > 0)
100 /// * `encoding_config` - Spectrogram configuration
101 /// * `chunk_length_s` - Optional chunk length in seconds (must be > 0 if provided)
102 ///
103 /// # Returns
104 ///
105 /// A new `AudioConfig` instance.
106 ///
107 /// # Errors
108 ///
109 /// Returns an error if any parameter is invalid.
110 pub fn new(
111 sampling_rate: usize,
112 frame_rate: f64,
113 encoding_config: AudioSpectrogramConfig,
114 chunk_length_s: Option<f64>,
115 ) -> Result<Self> {
116 if sampling_rate == 0 {
117 return Err(TokenizerError::InvalidConfig(
118 "sampling_rate must be > 0".to_string(),
119 ));
120 }
121 if frame_rate <= 0.0 {
122 return Err(TokenizerError::InvalidConfig(
123 "frame_rate must be > 0".to_string(),
124 ));
125 }
126
127 if let Some(chunk_length) = chunk_length_s {
128 if chunk_length <= 0.0 {
129 return Err(TokenizerError::InvalidConfig(
130 "chunk_length_s must be > 0".to_string(),
131 ));
132 }
133 }
134
135 Ok(Self {
136 sampling_rate,
137 frame_rate,
138 audio_encoding_config: encoding_config,
139 chunk_length_s,
140 })
141 }
142
143 /// Calculates the number of audio frames per chunk.
144 ///
145 /// # Returns
146 ///
147 /// The number of frames per chunk based on chunk length and sampling rate.
148 ///
149 /// # Errors
150 ///
151 /// Returns an error if `chunk_length_s` is not set.
152 #[allow(
153 clippy::cast_possible_truncation,
154 clippy::cast_sign_loss,
155 clippy::cast_precision_loss
156 )]
157 pub fn chunk_frames(&self) -> Result<usize> {
158 match self.chunk_length_s {
159 Some(chunk_length) =>
160 {
161 #[allow(
162 clippy::cast_possible_truncation,
163 clippy::cast_sign_loss,
164 clippy::cast_precision_loss
165 )]
166 Ok((chunk_length * self.sampling_rate as f64) as usize)
167 }
168 None => Err(TokenizerError::InvalidConfig(
169 "chunk_length_s not set".to_string(),
170 )),
171 }
172 }
173
174 /// Calculates the length of audio (in samples) represented by each token.
175 ///
176 /// This determines the downsampling factor from audio samples to tokens
177 /// based on the frame rate and spectrogram hop length.
178 ///
179 /// # Returns
180 ///
181 /// Number of audio samples per token.
182 #[must_use]
183 #[allow(
184 clippy::cast_possible_truncation,
185 clippy::cast_sign_loss,
186 clippy::cast_precision_loss
187 )]
188 pub fn audio_length_per_tok(&self) -> usize {
189 #[allow(clippy::cast_precision_loss)]
190 let mut downsample_factor = self.sampling_rate as f64 / self.frame_rate;
191 #[allow(clippy::cast_precision_loss)]
192 {
193 downsample_factor /= self.audio_encoding_config.hop_length as f64;
194 }
195 #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
196 {
197 downsample_factor as usize
198 }
199 }
200}
201
/// Represents audio data with metadata.
///
/// This struct holds audio waveform data along with its sampling rate and format.
/// It provides methods for loading, processing, and converting audio data.
#[derive(Debug, Clone)]
pub struct Audio {
    /// Audio waveform as a 1D array of f32 samples (mono).
    pub audio_array: Array1<f32>,
    /// Sampling rate in Hz.
    pub sampling_rate: usize,
    /// Audio format string (e.g., "wav").
    pub format: String,
}
218
219impl Audio {
220 /// Creates a new Audio instance.
221 ///
222 /// # Arguments
223 ///
224 /// * `audio_array` - Audio waveform data as a 1D array
225 /// * `sampling_rate` - Sampling rate in Hz
226 /// * `format` - Audio format string
227 ///
228 /// # Returns
229 ///
230 /// A new Audio instance.
231 #[must_use]
232 pub fn new(audio_array: Array1<f32>, sampling_rate: usize, format: String) -> Self {
233 Self {
234 audio_array,
235 sampling_rate,
236 format,
237 }
238 }
239
240 /// Loads audio data from a WAV file.
241 ///
242 /// # Arguments
243 ///
244 /// * `path` - Path to the audio file
245 ///
246 /// # Returns
247 ///
248 /// A new Audio instance with the loaded data.
249 ///
250 /// # Errors
251 ///
252 /// Returns an error if:
253 /// - File cannot be opened
254 /// - File format is not supported
255 /// - Audio data cannot be read
256 ///
257 /// # Examples
258 ///
259 /// ```rust,no_run
260 /// use tekken::audio::Audio;
261 ///
262 /// let audio = Audio::from_file("audio.wav")?;
263 /// println!("Loaded audio: {} samples at {} Hz", audio.audio_array.len(), audio.sampling_rate);
264 /// # Ok::<(), Box<dyn std::error::Error>>(())
265 /// ```
266 #[allow(clippy::cast_precision_loss)]
267 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
268 let mut reader = hound::WavReader::open(path)
269 .map_err(|e| TokenizerError::Audio(format!("Failed to open audio file: {e}")))?;
270
271 let spec = reader.spec();
272 let sampling_rate = spec.sample_rate as usize;
273
274 // Read samples and convert to f32
275 let samples: std::result::Result<Vec<f32>, _> = match spec.sample_format {
276 hound::SampleFormat::Float => reader.samples::<f32>().collect(),
277 hound::SampleFormat::Int => reader
278 .samples::<i32>()
279 .map(|s| {
280 s.map(|v| {
281 #[allow(clippy::cast_precision_loss)]
282 {
283 v as f32 / i32::MAX as f32
284 }
285 })
286 })
287 .collect(),
288 };
289
290 let samples =
291 samples.map_err(|e| TokenizerError::Audio(format!("Failed to read samples: {e}")))?;
292
293 // Handle stereo to mono conversion (average channels)
294 let audio_array = if spec.channels == 1 {
295 Array1::from_vec(samples)
296 } else {
297 let mono_samples: Vec<f32> = samples
298 .chunks(spec.channels as usize)
299 .map(|chunk| {
300 #[allow(clippy::cast_precision_loss)]
301 {
302 chunk.iter().sum::<f32>() / chunk.len() as f32
303 }
304 })
305 .collect();
306 Array1::from_vec(mono_samples)
307 };
308
309 Ok(Self::new(audio_array, sampling_rate, "wav".to_string()))
310 }
311
312 /// Loads audio data from a base64-encoded string.
313 ///
314 /// # Arguments
315 ///
316 /// * `data` - Base64-encoded audio data
317 ///
318 /// # Returns
319 ///
320 /// A new Audio instance with the decoded data.
321 ///
322 /// # Errors
323 ///
324 /// Returns an error if decoding or parsing fails.
325 pub fn from_base64(data: &str) -> Result<Self> {
326 let audio_bytes = base64::engine::general_purpose::STANDARD.decode(data)?;
327 Self::from_bytes(&audio_bytes)
328 }
329
330 /// Loads audio data from raw bytes.
331 ///
332 /// # Arguments
333 ///
334 /// * `bytes` - Raw audio file data
335 ///
336 /// # Returns
337 ///
338 /// A new Audio instance parsed from the bytes.
339 ///
340 /// # Errors
341 ///
342 /// Returns an error if the bytes cannot be parsed as audio.
343 #[allow(clippy::cast_precision_loss)]
344 pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
345 let cursor = std::io::Cursor::new(bytes);
346 let mut reader = hound::WavReader::new(cursor)
347 .map_err(|e| TokenizerError::Audio(format!("Failed to parse audio bytes: {e}")))?;
348
349 let spec = reader.spec();
350 let sampling_rate = spec.sample_rate as usize;
351
352 let samples: std::result::Result<Vec<f32>, _> = match spec.sample_format {
353 hound::SampleFormat::Float => reader.samples::<f32>().collect(),
354 hound::SampleFormat::Int => reader
355 .samples::<i32>()
356 .map(|s| {
357 s.map(|v| {
358 #[allow(clippy::cast_precision_loss)]
359 {
360 v as f32 / i32::MAX as f32
361 }
362 })
363 })
364 .collect(),
365 };
366
367 let samples =
368 samples.map_err(|e| TokenizerError::Audio(format!("Failed to read samples: {e}")))?;
369
370 let audio_array = if spec.channels == 1 {
371 Array1::from_vec(samples)
372 } else {
373 let mono_samples: Vec<f32> = samples
374 .chunks(spec.channels as usize)
375 .map(|chunk| {
376 #[allow(clippy::cast_precision_loss)]
377 {
378 chunk.iter().sum::<f32>() / chunk.len() as f32
379 }
380 })
381 .collect();
382 Array1::from_vec(mono_samples)
383 };
384
385 Ok(Self::new(audio_array, sampling_rate, "wav".to_string()))
386 }
387
388 /// Calculates the duration of the audio in seconds.
389 ///
390 /// # Returns
391 ///
392 /// Audio duration in seconds.
393 #[must_use]
394 #[allow(clippy::cast_precision_loss)]
395 pub fn duration(&self) -> f64 {
396 #[allow(clippy::cast_precision_loss)]
397 {
398 self.audio_array.len() as f64 / self.sampling_rate as f64
399 }
400 }
401
402 /// Resamples the audio to a target sampling rate.
403 ///
404 /// # Arguments
405 ///
406 /// * `target_rate` - Target sampling rate in Hz
407 ///
408 /// # Errors
409 ///
410 /// Currently returns an error as resampling is not yet implemented.
411 ///
412 /// # Note
413 ///
414 /// This is a placeholder implementation that needs proper resampling logic.
415 pub fn resample(&mut self, target_rate: usize) -> Result<()> {
416 if self.sampling_rate == target_rate {
417 return Ok(());
418 }
419
420 // For now, return an error for resampling - this would need proper implementation
421 Err(TokenizerError::Audio(
422 "Resampling not yet implemented".to_string(),
423 ))
424 }
425
426 /// Pads the audio to meet minimum length requirements.
427 ///
428 /// This method ensures the audio is long enough for processing by padding
429 /// with zeros if necessary. Padding is applied based on chunk length or
430 /// minimum window size requirements.
431 ///
432 /// # Arguments
433 ///
434 /// * `config` - Audio configuration specifying padding requirements
435 ///
436 /// # Errors
437 ///
438 /// Returns an error if configuration is invalid.
439 pub fn pad(&mut self, config: &AudioConfig) -> Result<()> {
440 let current_length = self.audio_array.len();
441
442 let target_length = if let Some(_chunk_length_s) = config.chunk_length_s {
443 let chunk_frames = config.chunk_frames()?;
444
445 current_length.div_ceil(chunk_frames) * chunk_frames
446 } else if current_length < config.audio_encoding_config.window_size {
447 config.audio_encoding_config.window_size
448 } else {
449 return Ok(());
450 };
451
452 if target_length > current_length {
453 let padding_length = target_length - current_length;
454 let _ = padding_length; // Padding length calculated but not used in debug
455 let mut padded = Array1::zeros(target_length);
456 padded
457 .slice_mut(ndarray::s![..current_length])
458 .assign(&self.audio_array);
459 self.audio_array = padded;
460 }
461
462 Ok(())
463 }
464}
465
/// Result of audio tokenization containing tokens and processed audio.
///
/// This struct encapsulates the output of audio encoding, containing both
/// the token sequence and the processed audio data.
#[derive(Debug, Clone)]
pub struct AudioEncoding {
    /// Token sequence representing the audio: a `begin_audio` token followed
    /// by one audio token per audio-length unit.
    pub tokens: Vec<u32>,
    /// Processed audio data after resampling and padding.
    pub audio: Audio,
}
480
/// Encoder for converting audio data into token sequences.
///
/// The `AudioEncoder` processes audio waveforms and converts them into token
/// sequences that can be mixed with text tokens in multimodal applications.
#[derive(Debug, Clone)]
pub struct AudioEncoder {
    /// Audio processing configuration.
    pub config: AudioConfig,
    /// Token ID emitted once per audio-length unit of content.
    pub audio_token_id: u32,
    /// Token ID emitted once to mark the start of an audio segment.
    pub begin_audio_token_id: u32,
}
497
498impl AudioEncoder {
499 /// Creates a new `AudioEncoder`.
500 ///
501 /// # Arguments
502 ///
503 /// * `config` - Audio processing configuration
504 /// * `audio_token_id` - Token ID (u32) representing audio content
505 /// * `begin_audio_token_id` - Token ID (u32) marking the start of audio sequence
506 ///
507 /// # Returns
508 ///
509 /// A new `AudioEncoder` instance.
510 #[must_use]
511 pub fn new(config: AudioConfig, audio_token_id: u32, begin_audio_token_id: u32) -> Self {
512 Self {
513 config,
514 audio_token_id,
515 begin_audio_token_id,
516 }
517 }
518
519 /// Encodes audio data into a token sequence.
520 ///
521 /// This method processes the audio through resampling, padding, and tokenization
522 /// to produce a sequence of tokens that represents the audio content.
523 ///
524 /// # Arguments
525 ///
526 /// * `audio` - The audio data to encode
527 ///
528 /// # Returns
529 ///
530 /// An `AudioEncoding` containing the token sequence and processed audio.
531 ///
532 /// # Errors
533 ///
534 /// Returns an error if audio processing fails.
535 ///
536 /// # Examples
537 ///
538 /// ```rust,no_run
539 /// use tekken::audio::{Audio, AudioConfig, AudioSpectrogramConfig, AudioEncoder};
540 ///
541 /// let audio = Audio::from_file("audio.wav")?;
542 /// let spectrogram_config = AudioSpectrogramConfig::new(80, 160, 400)?;
543 /// let audio_config = AudioConfig::new(16000, 12.5, spectrogram_config, None)?;
544 /// let encoder = AudioEncoder::new(audio_config, 1000, 1001);
545 ///
546 /// let encoding = encoder.encode(audio)?;
547 /// println!("Audio encoded to {} tokens", encoding.tokens.len());
548 /// # Ok::<(), Box<dyn std::error::Error>>(())
549 /// ```
550 #[allow(
551 clippy::cast_possible_truncation,
552 clippy::cast_sign_loss,
553 clippy::cast_precision_loss
554 )]
555 pub fn encode(&self, mut audio: Audio) -> Result<AudioEncoding> {
556 // Resample to target sampling rate
557 audio.resample(self.config.sampling_rate)?;
558
559 // Pad audio if needed
560 audio.pad(&self.config)?;
561
562 let signal_length = audio.audio_array.len();
563
564 // Calculate signal length after downsampling for spectrogram
565 let signal_length = if signal_length % self.config.audio_encoding_config.hop_length != 0 {
566 #[allow(
567 clippy::cast_possible_truncation,
568 clippy::cast_sign_loss,
569 clippy::cast_precision_loss
570 )]
571 {
572 (signal_length as f64 / self.config.audio_encoding_config.hop_length as f64 - 1.0)
573 .ceil() as usize
574 }
575 } else {
576 signal_length / self.config.audio_encoding_config.hop_length
577 };
578
579 #[allow(
580 clippy::cast_possible_truncation,
581 clippy::cast_sign_loss,
582 clippy::cast_precision_loss
583 )]
584 let num_audio_tokens =
585 (signal_length as f64 / self.config.audio_length_per_tok() as f64).ceil() as usize;
586
587 let mut tokens = vec![self.begin_audio_token_id];
588 tokens.extend(vec![self.audio_token_id; num_audio_tokens]);
589
590 Ok(AudioEncoding { tokens, audio })
591 }
592}
593
/// Converts frequency from Hertz to the mel-scale using the Slaney formula.
///
/// The mel-scale is a perceptual scale that better represents human auditory
/// perception. Below 1 kHz the mapping is linear; above it, logarithmic.
///
/// # Arguments
///
/// * `freq` - Frequency in Hertz
///
/// # Returns
///
/// Frequency in mel-scale units.
///
/// # References
///
/// Based on the Slaney mel-scale conversion used in audio processing libraries.
#[must_use]
pub fn hertz_to_mel(freq: f64) -> f64 {
    // Breakpoint between the linear and logarithmic regions.
    const BREAK_FREQ: f64 = 1000.0;
    const BREAK_MEL: f64 = 15.0;

    if freq < BREAK_FREQ {
        // Linear region: 3 mel per 200 Hz.
        return 3.0 * freq / 200.0;
    }
    let log_step = 27.0 / 6.4_f64.ln();
    BREAK_MEL + (freq / BREAK_FREQ).ln() * log_step
}
622
/// Converts frequency from the mel-scale back to Hertz.
///
/// This is the inverse operation of `hertz_to_mel`: linear below 15 mel
/// (1 kHz), exponential above.
///
/// # Arguments
///
/// * `mel` - Frequency in mel-scale units
///
/// # Returns
///
/// Frequency in Hertz.
#[must_use]
pub fn mel_to_hertz(mel: f64) -> f64 {
    // Breakpoint between the linear and exponential regions.
    const BREAK_MEL: f64 = 15.0;
    const BREAK_FREQ: f64 = 1000.0;

    if mel < BREAK_MEL {
        // Linear region: 200 Hz per 3 mel.
        return 200.0 * mel / 3.0;
    }
    let log_step = 6.4_f64.ln() / 27.0;
    BREAK_FREQ * ((mel - BREAK_MEL) * log_step).exp()
}
647
648/// Creates a mel-scale filter bank for spectrogram processing.
649///
650/// This function generates a matrix of triangular filters distributed on the mel-scale
651/// that can be used to convert linear frequency spectrograms to mel-scale spectrograms.
652/// The implementation follows the Slaney-style mel filter bank construction.
653///
654/// # Arguments
655///
656/// * `num_frequency_bins` - Number of frequency bins in the input spectrogram
657/// * `num_mel_bins` - Number of desired mel-frequency bins in the output
658/// * `min_frequency` - Minimum frequency in Hz to consider
659/// * `max_frequency` - Maximum frequency in Hz to consider
660/// * `sampling_rate` - Audio sampling rate in Hz
661///
662/// # Returns
663///
664/// A 2D array of shape `(num_frequency_bins, num_mel_bins)` containing the filter bank.
665/// Each column represents a mel filter that can be applied to frequency bins.
666///
667/// # Errors
668///
669/// Returns an error if:
670/// - `num_frequency_bins` is less than 2
671/// - `min_frequency` is greater than `max_frequency`
672/// - Any mel filter has all zero values
673///
674/// # Examples
675///
676/// ```rust
677/// use tekken::audio::mel_filter_bank;
678///
679/// let filter_bank = mel_filter_bank(201, 80, 0.0, 8000.0, 16000)?;
680/// println!("Filter bank shape: {:?}", filter_bank.dim());
681/// # Ok::<(), Box<dyn std::error::Error>>(())
682/// ```
683#[allow(clippy::cast_precision_loss)]
684pub fn mel_filter_bank(
685 num_frequency_bins: usize,
686 num_mel_bins: usize,
687 min_frequency: f64,
688 max_frequency: f64,
689 sampling_rate: usize,
690) -> Result<ndarray::Array2<f64>> {
691 if num_frequency_bins < 2 {
692 return Err(TokenizerError::InvalidConfig(format!(
693 "num_frequency_bins must be >= 2, got {num_frequency_bins}"
694 )));
695 }
696
697 if min_frequency > max_frequency {
698 return Err(TokenizerError::InvalidConfig(format!(
699 "min_frequency ({min_frequency}) must be <= max_frequency ({max_frequency})"
700 )));
701 }
702
703 // Center points of the triangular mel filters
704 let mel_min = hertz_to_mel(min_frequency);
705 let mel_max = hertz_to_mel(max_frequency);
706 #[allow(clippy::cast_precision_loss)]
707 let mel_freqs: Vec<f64> = (0..=num_mel_bins + 1)
708 .map(|i| mel_min + (mel_max - mel_min) * i as f64 / (num_mel_bins + 1) as f64)
709 .collect();
710 let filter_freqs: Vec<f64> = mel_freqs.iter().map(|&mel| mel_to_hertz(mel)).collect();
711
712 // Frequencies of FFT bins in Hz
713 #[allow(clippy::cast_precision_loss)]
714 let fft_freqs: Vec<f64> = (0..num_frequency_bins)
715 .map(|i| i as f64 * sampling_rate as f64 / 2.0 / (num_frequency_bins - 1) as f64)
716 .collect();
717
718 // Create triangular filter bank - shape (num_frequency_bins, num_mel_bins) to match Python
719 let mut filter_bank = ndarray::Array2::zeros((num_frequency_bins, num_mel_bins));
720
721 for mel_idx in 0..num_mel_bins {
722 let left_freq = filter_freqs[mel_idx];
723 let center_freq = filter_freqs[mel_idx + 1];
724 let right_freq = filter_freqs[mel_idx + 2];
725
726 for (freq_idx, &fft_freq) in fft_freqs.iter().enumerate() {
727 let value = if fft_freq >= left_freq && fft_freq <= center_freq {
728 (fft_freq - left_freq) / (center_freq - left_freq)
729 } else if fft_freq > center_freq && fft_freq <= right_freq {
730 (right_freq - fft_freq) / (right_freq - center_freq)
731 } else {
732 0.0
733 };
734
735 filter_bank[[freq_idx, mel_idx]] = value.max(0.0);
736 }
737 }
738
739 // Apply Slaney-style energy normalization
740 for mel_idx in 0..num_mel_bins {
741 let enorm = 2.0 / (filter_freqs[mel_idx + 2] - filter_freqs[mel_idx]);
742 for freq_idx in 0..num_frequency_bins {
743 filter_bank[[freq_idx, mel_idx]] *= enorm;
744 }
745 }
746
747 Ok(filter_bank)
748}