use anyhow::{Context, Result};
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
#[cfg(feature = "ffmpeg")]
use std::io::Read;
use std::path::Path;
use std::sync::{Arc, Mutex};
use crate::resample::{FrameResampler, WHISPER_SAMPLE_RATE};
#[cfg(feature = "vad")]
use crate::vad::VadProcessor;
#[cfg(target_os = "linux")]
// Suppresses ALSA's C-library error spam on Linux: cpal device enumeration
// triggers snd_lib_error messages that alsa-lib prints straight to stderr,
// so a no-op error handler is installed once before any ALSA call.
mod alsa_suppress {
    use std::os::raw::{c_char, c_int};
    use std::sync::Once;

    // Signature of alsa-lib's error-handler callback
    // (file, line, function, errno, printf-style format).
    type SndLibErrorHandlerT =
        unsafe extern "C" fn(*const c_char, c_int, *const c_char, c_int, *const c_char);

    #[link(name = "asound")]
    unsafe extern "C" {
        // Replaces ALSA's default stderr printer; returns 0 on success.
        fn snd_lib_error_set_handler(handler: Option<SndLibErrorHandlerT>) -> c_int;
    }

    // Handler that deliberately ignores every error report.
    unsafe extern "C" fn silent_error_handler(
        _file: *const c_char,
        _line: c_int,
        _function: *const c_char,
        _err: c_int,
        _fmt: *const c_char,
    ) {
    }

    static INIT: Once = Once::new();

    /// Installs the silent handler exactly once; later calls are no-ops.
    pub fn init() {
        INIT.call_once(|| {
            // SAFETY: silent_error_handler matches the expected C callback
            // signature and reads none of its arguments; the FFI call only
            // requires a valid function pointer.
            unsafe {
                snd_lib_error_set_handler(Some(silent_error_handler));
            }
        });
    }
}
#[cfg(not(target_os = "linux"))]
mod alsa_suppress {
    /// No-op on non-Linux targets; ALSA error suppression is Linux-only.
    pub fn init() {}
}
const CHUNK_THRESHOLD_BYTES: usize = 20 * 1024 * 1024; const CHUNK_DURATION_SECS: usize = 300; const CHUNK_OVERLAP_SECS: usize = 2;
/// One MP3-encoded segment of a recording that was too large to emit whole.
#[derive(Clone)]
pub struct AudioChunk {
    /// Encoded MP3 bytes for this segment.
    pub data: Vec<u8>,
    /// Zero-based position of the chunk within the recording.
    pub index: usize,
    /// True for every chunk after the first: its start repeats the tail
    /// (CHUNK_OVERLAP_SECS) of the previous chunk.
    pub has_leading_overlap: bool,
}
/// Result of finalizing a recording: one MP3 payload, or several overlapping
/// chunks when the encoded audio exceeds CHUNK_THRESHOLD_BYTES.
pub enum RecordingOutput {
    /// Entire recording as a single MP3 buffer.
    Single(Vec<u8>),
    /// Recording split into ordered, overlapping MP3 chunks.
    Chunked(Vec<AudioChunk>),
}
/// Raw PCM captured by an AudioRecorder, ready to be encoded to MP3.
pub struct RecordingData {
    // f32 PCM samples, nominally in [-1.0, 1.0] (clamped during encoding);
    // interleaved if channels > 1.
    samples: Vec<f32>,
    // Samples per second (WHISPER_SAMPLE_RATE after the recorder's pipeline).
    sample_rate: u32,
    // Channel count (1 after the recorder's pipeline).
    channels: u16,
}
/// Voice-activity-detection settings (compiled only with the "vad" feature).
#[cfg(feature = "vad")]
#[derive(Debug, Clone, Copy)]
pub struct VadConfig {
    /// Whether captured audio is filtered through the VAD processor.
    pub enabled: bool,
    /// Detection threshold in [0.0, 1.0] (clamped in set_vad).
    pub threshold: f32,
}

#[cfg(feature = "vad")]
impl Default for VadConfig {
    /// VAD disabled by default, with a mid-range 0.5 threshold.
    fn default() -> Self {
        Self {
            enabled: false,
            threshold: 0.5,
        }
    }
}
pub type AudioStreamSender = tokio::sync::mpsc::Sender<Vec<f32>>;
/// Captures microphone input via cpal and resamples it to
/// WHISPER_SAMPLE_RATE mono.
pub struct AudioRecorder {
    // Accumulated post-pipeline samples, shared with the stream callback.
    samples: Arc<Mutex<Vec<f32>>>,
    // Output sample rate; set to WHISPER_SAMPLE_RATE when recording starts.
    sample_rate: u32,
    // Output channel count; set to 1 when recording starts.
    channels: u16,
    // Live cpal stream; dropping it stops capture.
    stream: Option<cpal::Stream>,
    // Resampler shared with the callback; flushed in stop_recording.
    resampler: Option<Arc<Mutex<FrameResampler>>>,
    // VAD processor shared with the callback, present only while enabled.
    #[cfg(feature = "vad")]
    vad: Option<Arc<Mutex<VadProcessor>>>,
    // Requested VAD settings, applied at the next start_recording.
    #[cfg(feature = "vad")]
    vad_config: VadConfig,
    // Optional live-streaming channel fed from the audio callback.
    stream_tx: Option<Arc<AudioStreamSender>>,
}

// SAFETY: asserts that moving the recorder (and its cpal::Stream) across
// threads is sound on macOS.
// NOTE(review): cpal::Stream is the reason this impl is needed at all —
// confirm the recorder is only started/stopped from threads where Core Audio
// tolerates it; this cannot be verified from this file alone.
#[cfg(target_os = "macos")]
unsafe impl Send for AudioRecorder {}
impl AudioRecorder {
/// Creates an idle recorder; no audio device is touched until
/// `start_recording` is called.
pub fn new() -> Result<Self> {
    Ok(AudioRecorder {
        samples: Arc::new(Mutex::new(Vec::new())),
        // Defaults match the post-resample output format.
        sample_rate: WHISPER_SAMPLE_RATE,
        channels: 1,
        stream: None,
        resampler: None,
        #[cfg(feature = "vad")]
        vad: None,
        #[cfg(feature = "vad")]
        vad_config: VadConfig::default(),
        stream_tx: None,
    })
}
/// Stores VAD settings to be applied by the next `start_recording` call.
/// `threshold` is clamped into the valid range [0.0, 1.0].
#[cfg(feature = "vad")]
pub fn set_vad(&mut self, enabled: bool, threshold: f32) {
    // Clamp before storing so downstream code can rely on the range.
    let threshold = threshold.clamp(0.0, 1.0);
    self.vad_config = VadConfig { enabled, threshold };
}
/// Starts capturing from the system default input device.
pub fn start_recording(&mut self) -> Result<()> {
    self.start_recording_with_device(None)
}
/// Opens the named input device (or the default when `device_name` is None),
/// builds a resampling (and optionally VAD-gated) capture pipeline, and
/// starts recording. Capture continues until `stop_recording`.
///
/// # Errors
/// Fails when the device is missing, its input config cannot be read, the
/// resampler or VAD cannot be created, or the stream cannot be built/played.
pub fn start_recording_with_device(&mut self, device_name: Option<&str>) -> Result<()> {
    // Install the ALSA stderr silencer before the first cpal call (Linux).
    alsa_suppress::init();
    let host = cpal::default_host();
    let device = if let Some(name) = device_name {
        // Exact match against the device description string, as shown by
        // list_audio_devices().
        host.input_devices()?
            .find(|d| {
                d.description()
                    .map(|n| n.to_string() == name)
                    .unwrap_or(false)
            })
            .with_context(|| format!("Audio device '{}' not found", name))?
    } else {
        host.default_input_device()
            .context("No input device available")?
    };
    let actual_device_name = device
        .description()
        .map(|d| d.to_string())
        .unwrap_or_else(|_| "<unknown>".to_string());
    crate::verbose!("Audio device: {}", actual_device_name);
    let config = device
        .default_input_config()
        .context("Failed to get default input config")?;
    // Channel count is forced to mono on Android.
    // NOTE(review): presumably a workaround for unreliable multi-channel
    // capture there — confirm.
    #[cfg(target_os = "android")]
    let device_channels = 1u16;
    #[cfg(not(target_os = "android"))]
    let device_channels = config.channels();
    let device_sample_rate = config.sample_rate();
    crate::verbose!(
        "Audio device: {} Hz, {} channel(s) -> resampling to {} Hz mono",
        device_sample_rate,
        device_channels,
        WHISPER_SAMPLE_RATE
    );
    // The resampler converts the device format to WHISPER_SAMPLE_RATE mono;
    // it is shared with the callback and flushed in stop_recording.
    let resampler = FrameResampler::new(device_sample_rate, device_channels)
        .context("Failed to create resampler")?;
    let resampler = Arc::new(Mutex::new(resampler));
    self.resampler = Some(resampler.clone());
    #[cfg(feature = "vad")]
    let vad = if self.vad_config.enabled {
        crate::verbose!("VAD enabled (threshold: {:.2})", self.vad_config.threshold);
        let vad_processor = VadProcessor::new(true, self.vad_config.threshold)
            .context("Failed to create VAD processor")?;
        let vad = Arc::new(Mutex::new(vad_processor));
        self.vad = Some(vad.clone());
        Some(vad)
    } else {
        self.vad = None;
        None
    };
    // Whatever the device delivers, the pipeline output is always
    // WHISPER_SAMPLE_RATE mono.
    self.sample_rate = WHISPER_SAMPLE_RATE;
    self.channels = 1;
    let stream_config = cpal::StreamConfig {
        channels: device_channels,
        sample_rate: config.sample_rate(),
        buffer_size: cpal::BufferSize::Default,
    };
    let samples = self.samples.clone();
    // Discard anything left over from a previous recording.
    samples.lock().unwrap().clear();
    // Build the typed stream matching the device's native sample format.
    #[cfg(feature = "vad")]
    let stream = match config.sample_format() {
        cpal::SampleFormat::F32 => {
            self.build_stream::<f32>(&device, &stream_config, samples, resampler, vad)?
        }
        cpal::SampleFormat::I16 => {
            self.build_stream::<i16>(&device, &stream_config, samples, resampler, vad)?
        }
        cpal::SampleFormat::U16 => {
            self.build_stream::<u16>(&device, &stream_config, samples, resampler, vad)?
        }
        _ => anyhow::bail!("Unsupported sample format"),
    };
    #[cfg(not(feature = "vad"))]
    let stream = match config.sample_format() {
        cpal::SampleFormat::F32 => {
            self.build_stream::<f32>(&device, &stream_config, samples, resampler)?
        }
        cpal::SampleFormat::I16 => {
            self.build_stream::<i16>(&device, &stream_config, samples, resampler)?
        }
        cpal::SampleFormat::U16 => {
            self.build_stream::<u16>(&device, &stream_config, samples, resampler)?
        }
        _ => anyhow::bail!("Unsupported sample format"),
    };
    stream.play()?;
    // Keep the stream alive; dropping it (in stop_recording) ends capture.
    self.stream = Some(stream);
    Ok(())
}
/// Starts capturing from the default device and returns a channel receiving
/// each captured batch of samples as it arrives.
pub fn start_recording_streaming(&mut self) -> Result<tokio::sync::mpsc::Receiver<Vec<f32>>> {
    self.start_recording_streaming_with_device(None)
}
/// Like `start_recording_with_device`, but also returns a channel that
/// receives each resampled (and VAD-filtered, when enabled) batch of samples
/// as it is captured.
///
/// # Errors
/// Propagates any failure from `start_recording_with_device`. On failure the
/// streaming sender installed by this call is removed again, so a later
/// non-streaming recording does not inherit a dead channel.
pub fn start_recording_streaming_with_device(
    &mut self,
    device_name: Option<&str>,
) -> Result<tokio::sync::mpsc::Receiver<Vec<f32>>> {
    // Bounded channel: the audio callback uses try_send, so a slow consumer
    // drops batches instead of blocking the real-time thread.
    let (tx, rx) = tokio::sync::mpsc::channel(64);
    self.stream_tx = Some(Arc::new(tx));
    // Fix: previously a failed start left the stale sender installed, which
    // leaked into a subsequent plain start_recording() call.
    if let Err(err) = self.start_recording_with_device(device_name) {
        self.stream_tx = None;
        return Err(err);
    }
    Ok(rx)
}
/// Builds the cpal input stream for sample type `T` (VAD-enabled variant).
///
/// The real-time callback converts incoming samples to f32, resamples them,
/// optionally gates them through the VAD, appends survivors to the shared
/// buffer, and — when a streaming channel is set — forwards a copy.
#[cfg(feature = "vad")]
fn build_stream<T>(
    &self,
    device: &cpal::Device,
    config: &cpal::StreamConfig,
    samples: Arc<Mutex<Vec<f32>>>,
    resampler: Arc<Mutex<FrameResampler>>,
    vad: Option<Arc<Mutex<VadProcessor>>>,
) -> Result<cpal::Stream>
where
    T: cpal::Sample + cpal::SizedSample,
    f32: cpal::FromSample<T>,
{
    let err_fn = |err| eprintln!("Error in audio stream: {err}");
    let stream_tx = self.stream_tx.clone();
    let stream = device.build_input_stream(
        config,
        move |data: &[T], _: &cpal::InputCallbackInfo| {
            // Normalize whatever the device delivers to f32.
            let f32_samples: Vec<f32> =
                data.iter().map(|&s| cpal::Sample::from_sample(s)).collect();
            let resampled = resampler.lock().unwrap().process(&f32_samples);
            if resampled.is_empty() {
                // The resampler may buffer internally and emit nothing yet
                // (it exposes flush() for the tail).
                return;
            }
            let final_samples = if let Some(ref vad) = vad {
                vad.lock().unwrap().process(&resampled)
            } else {
                resampled
            };
            if !final_samples.is_empty() {
                samples.lock().unwrap().extend_from_slice(&final_samples);
                if let Some(ref tx) = stream_tx {
                    // try_send: never block the audio thread; batches are
                    // dropped when the consumer lags.
                    let _ = tx.try_send(final_samples.clone());
                }
            }
        },
        err_fn,
        None,
    )?;
    Ok(stream)
}
/// Builds the cpal input stream for sample type `T` (no-VAD variant).
///
/// The real-time callback converts incoming samples to f32, resamples them,
/// appends them to the shared buffer, and — when a streaming channel is
/// set — forwards a copy.
#[cfg(not(feature = "vad"))]
fn build_stream<T>(
    &self,
    device: &cpal::Device,
    config: &cpal::StreamConfig,
    samples: Arc<Mutex<Vec<f32>>>,
    resampler: Arc<Mutex<FrameResampler>>,
) -> Result<cpal::Stream>
where
    T: cpal::Sample + cpal::SizedSample,
    f32: cpal::FromSample<T>,
{
    let err_fn = |err| eprintln!("Error in audio stream: {err}");
    let stream_tx = self.stream_tx.clone();
    let stream = device.build_input_stream(
        config,
        move |data: &[T], _: &cpal::InputCallbackInfo| {
            // Normalize whatever the device delivers to f32.
            let f32_samples: Vec<f32> =
                data.iter().map(|&s| cpal::Sample::from_sample(s)).collect();
            let resampled = resampler.lock().unwrap().process(&f32_samples);
            if !resampled.is_empty() {
                samples.lock().unwrap().extend_from_slice(&resampled);
                if let Some(ref tx) = stream_tx {
                    // try_send: never block the audio thread; batches are
                    // dropped when the consumer lags.
                    let _ = tx.try_send(resampled.clone());
                }
            }
        },
        err_fn,
        None,
    )?;
    Ok(stream)
}
/// Stops capture and returns everything recorded since start.
///
/// Drops the stream (ending the callback), flushes the resampler's buffered
/// tail through the VAD (when enabled), appends the tail to the captured
/// samples, and resets internal state so the recorder can be reused.
///
/// # Errors
/// Fails when no samples at all were captured.
pub fn stop_recording(&mut self) -> Result<RecordingData> {
    // Dropping the cpal stream stops the callback before the buffer is read.
    self.stream = None;
    self.stream_tx = None;
    // Flush samples still buffered inside the resampler.
    let flushed_resampler = if let Some(resampler) = &self.resampler {
        resampler.lock().unwrap().flush()
    } else {
        Vec::new()
    };
    self.resampler = None;
    // Run the resampler tail through the VAD, then flush the VAD itself.
    #[cfg(feature = "vad")]
    let flushed_samples = if let Some(vad) = &self.vad {
        let mut vad = vad.lock().unwrap();
        let mut remaining = vad.process(&flushed_resampler);
        remaining.extend(vad.flush());
        remaining
    } else {
        flushed_resampler
    };
    #[cfg(not(feature = "vad"))]
    let flushed_samples = flushed_resampler;
    #[cfg(feature = "vad")]
    {
        self.vad = None;
    }
    // Take ownership of the captured samples, leaving an empty buffer behind.
    let mut samples: Vec<f32> = {
        let mut guard = self.samples.lock().unwrap();
        std::mem::take(&mut *guard)
    };
    samples.extend_from_slice(&flushed_samples);
    if samples.is_empty() {
        crate::verbose!("No audio samples captured");
        anyhow::bail!("No audio data recorded");
    }
    let duration_secs = samples.len() as f32 / self.sample_rate as f32;
    crate::verbose!(
        "Recorded {} samples ({:.1}s at {} Hz mono)",
        samples.len(),
        duration_secs,
        self.sample_rate
    );
    Ok(RecordingData {
        samples,
        sample_rate: self.sample_rate,
        channels: self.channels,
    })
}
/// Stops capture and encodes the recording to MP3 (chunked when large).
pub fn finalize_recording(&mut self) -> Result<RecordingOutput> {
    self.stop_recording()?.finalize()
}
}
impl RecordingData {
/// Consumes the recording and returns the raw f32 PCM samples, unencoded.
pub fn finalize_raw(self) -> Vec<f32> {
    self.samples
}
/// Encodes the recording to MP3.
///
/// The whole recording is encoded first; if the result fits within
/// CHUNK_THRESHOLD_BYTES it is returned as `Single`. Otherwise the raw
/// samples are re-encoded as ~CHUNK_DURATION_SECS chunks, each starting
/// CHUNK_OVERLAP_SECS before the previous chunk's end (the full-file
/// encoding is discarded in that case).
pub fn finalize(self) -> Result<RecordingOutput> {
    let mp3_data = self.samples_to_mp3(&self.samples, "main")?;
    if mp3_data.len() <= CHUNK_THRESHOLD_BYTES {
        return Ok(RecordingOutput::Single(mp3_data));
    }
    // Convert second-based sizes into sample counts.
    let samples_per_second = self.sample_rate as usize * self.channels as usize;
    let chunk_samples = CHUNK_DURATION_SECS * samples_per_second;
    let overlap_samples = CHUNK_OVERLAP_SECS * samples_per_second;
    let mut chunks = Vec::new();
    let mut chunk_start = 0usize;
    let mut chunk_index = 0usize;
    while chunk_start < self.samples.len() {
        let chunk_end = (chunk_start + chunk_samples).min(self.samples.len());
        let chunk_slice = &self.samples[chunk_start..chunk_end];
        let chunk_mp3 = self.samples_to_mp3(chunk_slice, &format!("chunk{chunk_index}"))?;
        chunks.push(AudioChunk {
            data: chunk_mp3,
            index: chunk_index,
            has_leading_overlap: chunk_index > 0,
        });
        chunk_index += 1;
        if chunk_end >= self.samples.len() {
            break;
        }
        // Back up so consecutive chunks share overlap_samples of audio.
        chunk_start = chunk_end.saturating_sub(overlap_samples);
    }
    Ok(RecordingOutput::Chunked(chunks))
}
// Encoder dispatch: exactly one samples_to_mp3 is compiled, chosen by feature.

/// Encode via the external ffmpeg binary ("ffmpeg" feature takes priority).
#[cfg(feature = "ffmpeg")]
fn samples_to_mp3(&self, samples: &[f32], suffix: &str) -> Result<Vec<u8>> {
    self.samples_to_mp3_ffmpeg(samples, suffix)
}

/// Encode in-process with LAME when only "embedded-encoder" is enabled.
#[cfg(all(feature = "embedded-encoder", not(feature = "ffmpeg")))]
fn samples_to_mp3(&self, samples: &[f32], _suffix: &str) -> Result<Vec<u8>> {
    self.samples_to_mp3_embedded(samples)
}

/// No encoder compiled in: always an error.
#[cfg(not(any(feature = "ffmpeg", feature = "embedded-encoder")))]
fn samples_to_mp3(&self, _samples: &[f32], _suffix: &str) -> Result<Vec<u8>> {
    anyhow::bail!(
        "No MP3 encoder available. Enable either 'ffmpeg' or 'embedded-encoder' feature."
    )
}
/// Encodes `samples` to 128 kbps MP3 by writing a temporary WAV and shelling
/// out to ffmpeg. `suffix` disambiguates temp file names when chunking.
/// Both temp files are removed before returning.
///
/// # Errors
/// Fails if the WAV cannot be written, ffmpeg is not installed, ffmpeg exits
/// non-zero (its stderr is included), or the output cannot be read back.
#[cfg(feature = "ffmpeg")]
fn samples_to_mp3_ffmpeg(&self, samples: &[f32], suffix: &str) -> Result<Vec<u8>> {
    // Convert f32 (clamped to [-1, 1]) to i16 PCM for the intermediate WAV.
    let i16_samples: Vec<i16> = samples
        .iter()
        .map(|&s| {
            let clamped = s.clamp(-1.0, 1.0);
            (clamped * i16::MAX as f32) as i16
        })
        .collect();
    let temp_dir = std::env::temp_dir();
    // PID + nanosecond timestamp + suffix keeps concurrent encodes distinct.
    let unique_id = format!(
        "{}_{}_{suffix}",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos(),
    );
    let wav_path = temp_dir.join(format!("whis_{unique_id}.wav"));
    let mp3_path = temp_dir.join(format!("whis_{unique_id}.mp3"));
    {
        let spec = hound::WavSpec {
            channels: self.channels,
            sample_rate: self.sample_rate,
            bits_per_sample: 16,
            sample_format: hound::SampleFormat::Int,
        };
        let mut writer = hound::WavWriter::create(&wav_path, spec)?;
        for sample in i16_samples {
            writer.write_sample(sample)?;
        }
        writer.finalize()?;
    }
    // Fix: pass paths as OsStr via .arg() — the previous to_str().unwrap()
    // panicked when the temp dir was not valid UTF-8 (possible on Unix).
    let output = std::process::Command::new("ffmpeg")
        .args(["-hide_banner", "-loglevel", "error", "-i"])
        .arg(&wav_path)
        .args(["-codec:a", "libmp3lame", "-b:a", "128k", "-y"])
        .arg(&mp3_path)
        .output()
        .context("Failed to execute ffmpeg. Make sure ffmpeg is installed.")?;
    // The WAV is no longer needed regardless of ffmpeg's outcome.
    let _ = std::fs::remove_file(&wav_path);
    if !output.status.success() {
        let _ = std::fs::remove_file(&mp3_path);
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("FFmpeg conversion failed: {stderr}");
    }
    let mp3_data = std::fs::read(&mp3_path).context("Failed to read converted MP3 file")?;
    let _ = std::fs::remove_file(&mp3_path);
    Ok(mp3_data)
}
/// Encodes `samples` to 128 kbps MP3 in-process via LAME (mp3lame-encoder
/// crate); no external tools required.
#[cfg(feature = "embedded-encoder")]
#[allow(dead_code)] // unused when the "ffmpeg" feature is also enabled
fn samples_to_mp3_embedded(&self, samples: &[f32]) -> Result<Vec<u8>> {
    use mp3lame_encoder::{Builder, FlushNoGap, InterleavedPcm, MonoPcm};
    // LAME consumes i16 PCM: clamp to [-1, 1], then scale.
    let i16_samples: Vec<i16> = samples
        .iter()
        .map(|&s| {
            let clamped = s.clamp(-1.0, 1.0);
            (clamped * i16::MAX as f32) as i16
        })
        .collect();
    let mut builder = Builder::new().context("Failed to create LAME builder")?;
    builder
        .set_num_channels(self.channels as u8)
        .map_err(|e| anyhow::anyhow!("Failed to set channels: {:?}", e))?;
    builder
        .set_sample_rate(self.sample_rate)
        .map_err(|e| anyhow::anyhow!("Failed to set sample rate: {:?}", e))?;
    builder
        .set_brate(mp3lame_encoder::Bitrate::Kbps128)
        .map_err(|e| anyhow::anyhow!("Failed to set bitrate: {:?}", e))?;
    builder
        .set_quality(mp3lame_encoder::Quality::Best)
        .map_err(|e| anyhow::anyhow!("Failed to set quality: {:?}", e))?;
    let mut encoder = builder
        .build()
        .map_err(|e| anyhow::anyhow!("Failed to initialize LAME encoder: {:?}", e))?;
    // Reserve the worst-case output size so encode/flush can write into
    // spare capacity without reallocating.
    let mut mp3_data = Vec::new();
    let max_size = mp3lame_encoder::max_required_buffer_size(i16_samples.len());
    mp3_data.reserve(max_size);
    let encoded_size = if self.channels == 1 {
        let input = MonoPcm(&i16_samples);
        encoder
            .encode(input, mp3_data.spare_capacity_mut())
            .map_err(|e| anyhow::anyhow!("Failed to encode MP3: {:?}", e))?
    } else {
        let input = InterleavedPcm(&i16_samples);
        encoder
            .encode(input, mp3_data.spare_capacity_mut())
            .map_err(|e| anyhow::anyhow!("Failed to encode MP3: {:?}", e))?
    };
    // SAFETY: encode() returns the number of bytes it wrote into the spare
    // capacity, so the first `encoded_size` bytes are initialized
    // (per the mp3lame-encoder API contract — confirm against crate docs).
    unsafe {
        mp3_data.set_len(encoded_size);
    }
    let flush_size = encoder
        .flush::<FlushNoGap>(mp3_data.spare_capacity_mut())
        .map_err(|e| anyhow::anyhow!("Failed to flush MP3 encoder: {:?}", e))?;
    // SAFETY: flush() likewise reports how many further bytes it initialized.
    unsafe {
        mp3_data.set_len(mp3_data.len() + flush_size);
    }
    Ok(mp3_data)
}
}
/// Loads an audio file from disk as MP3 bytes, converting with ffmpeg when
/// the file is not already MP3. The extension (case-insensitive) selects the
/// handling path.
///
/// # Errors
/// Fails on unsupported extensions, unreadable files, or conversion errors.
#[cfg(feature = "ffmpeg")]
pub fn load_audio_file(path: &Path) -> Result<RecordingOutput> {
    // Formats ffmpeg is asked to transcode to MP3.
    const CONVERTIBLE: [&str; 7] = ["wav", "m4a", "ogg", "flac", "webm", "aac", "opus"];
    let extension = path
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("")
        .to_lowercase();
    let mp3_data = if extension == "mp3" {
        // Already MP3: pass the bytes through untouched.
        std::fs::read(path).context("Failed to read MP3 file")?
    } else if CONVERTIBLE.contains(&extension.as_str()) {
        convert_file_to_mp3(path)?
    } else {
        anyhow::bail!(
            "Unsupported audio format: '{}'. Supported: mp3, wav, m4a, ogg, flac, webm, aac, opus",
            extension
        );
    };
    classify_recording_output(mp3_data)
}
/// Stub used when the "ffmpeg" feature is off: file input is unavailable.
#[cfg(not(feature = "ffmpeg"))]
pub fn load_audio_file(_path: &Path) -> Result<RecordingOutput> {
    anyhow::bail!("File input requires the 'ffmpeg' feature (not available in mobile builds)")
}
/// Reads audio from stdin and returns it as MP3 bytes, converting with
/// ffmpeg when `format` (case-insensitive) is not "mp3".
///
/// # Errors
/// Fails when stdin is empty/unreadable, the format is unsupported, or
/// conversion fails.
#[cfg(feature = "ffmpeg")]
pub fn load_audio_stdin(format: &str) -> Result<RecordingOutput> {
    let mut data = Vec::new();
    std::io::stdin()
        .read_to_end(&mut data)
        .context("Failed to read audio from stdin")?;
    if data.is_empty() {
        anyhow::bail!("No audio data received from stdin");
    }
    let normalized = format.to_lowercase();
    let mp3_data = if normalized == "mp3" {
        // Already MP3: pass the bytes through untouched.
        data
    } else if matches!(
        normalized.as_str(),
        "wav" | "m4a" | "ogg" | "flac" | "webm" | "aac" | "opus"
    ) {
        convert_stdin_to_mp3(&data, format)?
    } else {
        anyhow::bail!(
            "Unsupported stdin format: '{}'. Supported: mp3, wav, m4a, ogg, flac, webm, aac, opus",
            format
        );
    };
    classify_recording_output(mp3_data)
}
/// Stub used when the "ffmpeg" feature is off: stdin input is unavailable.
#[cfg(not(feature = "ffmpeg"))]
pub fn load_audio_stdin(_format: &str) -> Result<RecordingOutput> {
    anyhow::bail!("Stdin input requires the 'ffmpeg' feature (not available in mobile builds)")
}
/// Wraps externally loaded MP3 bytes as a RecordingOutput.
///
/// Loaded files are always returned as `Single` — chunking applies only to
/// in-process recordings; oversized inputs merely emit a verbose log.
#[cfg(feature = "ffmpeg")]
fn classify_recording_output(mp3_data: Vec<u8>) -> Result<RecordingOutput> {
    if mp3_data.len() > CHUNK_THRESHOLD_BYTES {
        crate::verbose!(
            "Large file ({:.1} MB) - processing as single file",
            mp3_data.len() as f64 / 1024.0 / 1024.0
        );
    }
    Ok(RecordingOutput::Single(mp3_data))
}
/// Converts an on-disk audio file to 128 kbps MP3 via the external ffmpeg
/// binary and returns the encoded bytes. The temporary output file is always
/// removed.
///
/// # Errors
/// Fails if ffmpeg is not installed, exits non-zero (its stderr is
/// included), or the converted file cannot be read back.
#[cfg(feature = "ffmpeg")]
fn convert_file_to_mp3(input_path: &Path) -> Result<Vec<u8>> {
    let temp_dir = std::env::temp_dir();
    // PID + nanosecond timestamp keeps concurrent conversions distinct.
    let unique_id = format!(
        "{}_{}",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos(),
    );
    let mp3_path = temp_dir.join(format!("whis_convert_{unique_id}.mp3"));
    crate::verbose!("Converting {} to MP3...", input_path.display());
    // Fix: pass paths as OsStr via .arg() — the previous to_str().unwrap()
    // panicked on non-UTF-8 paths (possible on Unix filesystems).
    let output = std::process::Command::new("ffmpeg")
        .args(["-hide_banner", "-loglevel", "error", "-i"])
        .arg(input_path)
        .args(["-codec:a", "libmp3lame", "-b:a", "128k", "-y"])
        .arg(&mp3_path)
        .output()
        .context("Failed to execute ffmpeg. Make sure ffmpeg is installed.")?;
    if !output.status.success() {
        let _ = std::fs::remove_file(&mp3_path);
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("FFmpeg conversion failed: {stderr}");
    }
    let mp3_data = std::fs::read(&mp3_path).context("Failed to read converted MP3 file")?;
    let _ = std::fs::remove_file(&mp3_path);
    crate::verbose!("Converted to {:.1} KB MP3", mp3_data.len() as f64 / 1024.0);
    Ok(mp3_data)
}
/// Converts audio bytes (already read from stdin) to 128 kbps MP3 by piping
/// them into ffmpeg's stdin. `format` tells ffmpeg how to demux the input.
/// The temporary output file is always removed.
///
/// # Errors
/// Fails if ffmpeg cannot be spawned, the data cannot be written to its
/// stdin, it exits non-zero (stderr included), or the output is unreadable.
#[cfg(feature = "ffmpeg")]
fn convert_stdin_to_mp3(data: &[u8], format: &str) -> Result<Vec<u8>> {
    use std::io::Write;
    use std::process::{Command, Stdio};
    let temp_dir = std::env::temp_dir();
    // PID + nanosecond timestamp keeps concurrent conversions distinct.
    let unique_id = format!(
        "{}_{}",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos(),
    );
    let mp3_path = temp_dir.join(format!("whis_stdin_{unique_id}.mp3"));
    crate::verbose!("Converting stdin ({} format) to MP3...", format);
    // Fix: pass the output path as OsStr via .arg() — the previous
    // to_str().unwrap() panicked when the temp dir was not valid UTF-8.
    let mut child = Command::new("ffmpeg")
        .args(["-hide_banner", "-loglevel", "error", "-f"])
        .arg(format)
        .args(["-i", "pipe:0", "-codec:a", "libmp3lame", "-b:a", "128k", "-y"])
        .arg(&mp3_path)
        .stdin(Stdio::piped())
        .stdout(Stdio::null())
        .stderr(Stdio::piped())
        .spawn()
        .context("Failed to spawn ffmpeg process")?;
    // Write all input; dropping stdin at the end of this block closes the
    // pipe so ffmpeg sees EOF before we wait on it.
    if let Some(mut stdin) = child.stdin.take() {
        stdin
            .write_all(data)
            .context("Failed to write audio data to ffmpeg")?;
    }
    let output = child.wait_with_output()?;
    if !output.status.success() {
        let _ = std::fs::remove_file(&mp3_path);
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("FFmpeg stdin conversion failed: {stderr}");
    }
    let mp3_data = std::fs::read(&mp3_path).context("Failed to read converted MP3 file")?;
    let _ = std::fs::remove_file(&mp3_path);
    crate::verbose!("Converted to {:.1} KB MP3", mp3_data.len() as f64 / 1024.0);
    Ok(mp3_data)
}
/// A selectable audio input device, as reported by `list_audio_devices`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct AudioDeviceInfo {
    /// Device description string; pass it to `start_recording_with_device`.
    pub name: String,
    /// True when this is the host's default input device.
    pub is_default: bool,
}
/// Enumerates audio input devices, marking the system default.
///
/// # Errors
/// Fails if device enumeration errors, or when no input device is found.
pub fn list_audio_devices() -> Result<Vec<AudioDeviceInfo>> {
    // Silence ALSA's stderr chatter during enumeration (Linux only).
    alsa_suppress::init();
    let host = cpal::default_host();
    let default_name = host
        .default_input_device()
        .and_then(|d| d.description().ok())
        .map(|d| d.to_string());
    // Devices whose description cannot be read are skipped.
    let devices: Vec<AudioDeviceInfo> = host
        .input_devices()?
        .filter_map(|device| device.description().ok())
        .map(|desc| {
            let name = desc.to_string();
            AudioDeviceInfo {
                is_default: default_name.as_deref() == Some(name.as_str()),
                name,
            }
        })
        .collect();
    if devices.is_empty() {
        anyhow::bail!("No audio input devices found");
    }
    Ok(devices)
}