use crate::config::Config;
use crate::error::TranscribeError;
use std::path::Path;
#[cfg(any(feature = "whisper", feature = "parakeet"))]
use std::path::PathBuf;
pub use whisper_guard::audio::{normalize_audio, resample, strip_silence};
#[cfg(feature = "whisper")]
pub use whisper_guard::params::{default_whisper_params, streaming_whisper_params};
pub use whisper_guard::segments::{clean_transcript, CleanStats};
#[derive(Debug, Clone, Default)]
pub struct FilterStats {
pub audio_duration_secs: f64,
pub samples_after_silence_strip: usize,
pub raw_segments: usize,
pub skipped_no_speech: usize,
pub after_no_speech_filter: usize,
pub after_dedup: usize,
pub after_interleaved: usize,
pub after_script_filter: usize,
pub after_noise_markers: usize,
pub after_trailing_trim: usize,
pub rescued_no_speech: usize,
pub final_words: usize,
}
impl FilterStats {
pub fn diagnosis(&self) -> String {
let mut parts = Vec::new();
parts.push(format!("audio: {:.1}s", self.audio_duration_secs));
if self.samples_after_silence_strip == 0 {
parts.push("silence strip removed ALL audio".into());
return parts.join(", ");
}
parts.push(format!("whisper produced {} segments", self.raw_segments));
if self.raw_segments == 0 {
return parts.join(", ");
}
if self.rescued_no_speech > 0 {
parts.push(format!(
"no_speech rescue: {} segments saved (would have been blank)",
self.rescued_no_speech
));
} else if self.skipped_no_speech > 0 {
parts.push(format!(
"no_speech filter: -{} → {}",
self.skipped_no_speech, self.after_no_speech_filter
));
}
if self.after_dedup < self.after_no_speech_filter {
parts.push(format!(
"dedup: -{} → {}",
self.after_no_speech_filter - self.after_dedup,
self.after_dedup
));
}
if self.after_interleaved < self.after_dedup {
parts.push(format!(
"interleaved: -{} → {}",
self.after_dedup - self.after_interleaved,
self.after_interleaved
));
}
if self.after_script_filter < self.after_interleaved {
parts.push(format!(
"script filter: -{} → {}",
self.after_interleaved - self.after_script_filter,
self.after_script_filter
));
}
if self.after_noise_markers < self.after_script_filter {
parts.push(format!(
"noise markers: -{} → {}",
self.after_script_filter - self.after_noise_markers,
self.after_noise_markers
));
}
if self.after_trailing_trim < self.after_noise_markers {
parts.push(format!(
"trailing trim: -{} → {}",
self.after_noise_markers - self.after_trailing_trim,
self.after_trailing_trim
));
}
parts.push(format!("final: {} words", self.final_words));
parts.join(", ")
}
}
#[derive(Debug, Clone)]
pub struct TranscribeResult {
pub text: String,
pub stats: FilterStats,
}
pub fn transcribe(audio_path: &Path, config: &Config) -> Result<TranscribeResult, TranscribeError> {
match config.transcription.engine.as_str() {
"whisper" => transcribe_whisper_dispatch(audio_path, config),
"parakeet" => transcribe_parakeet_dispatch(audio_path, config),
other => {
tracing::warn!(
engine = other,
"unknown transcription engine — falling back to whisper"
);
transcribe_whisper_dispatch(audio_path, config)
}
}
}
fn transcribe_whisper_dispatch(
audio_path: &Path,
config: &Config,
) -> Result<TranscribeResult, TranscribeError> {
let mut stats = FilterStats::default();
let samples = load_audio_samples(audio_path)?;
stats.audio_duration_secs = samples.len() as f64 / 16000.0;
if samples.is_empty() {
return Err(TranscribeError::EmptyAudio);
}
#[cfg(feature = "denoise")]
let samples = if config.transcription.noise_reduction {
denoise_audio(&samples, 16000)
} else {
samples
};
#[cfg(feature = "whisper")]
let use_integrated_vad = resolve_vad_model_path(config).is_some();
#[cfg(not(feature = "whisper"))]
let use_integrated_vad = false;
let samples = if use_integrated_vad {
tracing::debug!("Silero VAD available — skipping energy-based silence stripping");
samples
} else {
strip_silence(&samples, 16000)
};
stats.samples_after_silence_strip = samples.len();
if samples.is_empty() {
tracing::warn!(
audio_duration_secs = stats.audio_duration_secs,
"silence stripping removed all audio — entire recording was below energy threshold"
);
return Err(TranscribeError::EmptyAudio);
}
#[cfg(feature = "whisper")]
{
let result = transcribe_with_whisper(&samples, audio_path, config, stats, false)?;
if result.stats.final_words == 0
&& use_integrated_vad
&& result.stats.audio_duration_secs > 60.0
{
tracing::warn!(
audio_secs = format!("{:.0}", result.stats.audio_duration_secs),
raw_segments = result.stats.raw_segments,
"blank transcript from long audio with VAD — retrying without VAD"
);
let mut retry_stats = FilterStats {
audio_duration_secs: result.stats.audio_duration_secs,
..Default::default()
};
let stripped = strip_silence(&samples, 16000);
retry_stats.samples_after_silence_strip = stripped.len();
if !stripped.is_empty() {
return transcribe_with_whisper(&stripped, audio_path, config, retry_stats, true);
}
}
Ok(result)
}
#[cfg(not(feature = "whisper"))]
{
let _ = config; let duration_secs = samples.len() as f64 / 16000.0;
let text = format!(
"[Transcription placeholder — whisper feature not enabled]\n\
Audio file: {}\n\
Duration: {:.1}s ({} samples at 16kHz)\n\
\n\
Build with `cargo build --features whisper` and download a model\n\
via `minutes setup` to enable real transcription.",
audio_path.display(),
duration_secs,
samples.len(),
);
Ok(TranscribeResult { text, stats })
}
}
fn transcribe_parakeet_dispatch(
audio_path: &Path,
config: &Config,
) -> Result<TranscribeResult, TranscribeError> {
#[cfg(feature = "parakeet")]
{
transcribe_with_parakeet(audio_path, config)
}
#[cfg(not(feature = "parakeet"))]
{
let _ = (audio_path, config);
Err(TranscribeError::EngineNotAvailable("parakeet".into()))
}
}
#[cfg(feature = "whisper")]
fn transcribe_with_whisper(
samples: &[f32],
_audio_path: &Path,
config: &Config,
mut stats: FilterStats,
force_disable_vad: bool,
) -> Result<TranscribeResult, TranscribeError> {
let model_path = resolve_model_path(config)?;
tracing::info!(model = %model_path.display(), vad_disabled = force_disable_vad, "loading whisper model");
let ctx = whisper_rs::WhisperContext::new_with_params(
model_path
.to_str()
.ok_or_else(|| TranscribeError::ModelLoadError("invalid model path encoding".into()))?,
whisper_rs::WhisperContextParameters::default(),
)
.map_err(|e| TranscribeError::ModelLoadError(format!("{}", e)))?;
tracing::info!(
samples = samples.len(),
duration_secs = samples.len() as f64 / 16000.0,
"starting whisper transcription"
);
let mut state = ctx
.create_state()
.map_err(|e| TranscribeError::TranscriptionFailed(format!("create state: {}", e)))?;
let vad_path = if force_disable_vad {
None
} else {
resolve_vad_model_path(config)
};
let vad_path_str = vad_path.as_ref().and_then(|p| p.to_str());
let mut params = default_whisper_params(vad_path_str);
params.set_n_threads(num_cpus());
params.set_language(config.transcription.language.as_deref());
params.set_token_timestamps(true);
let audio_duration_secs = samples.len() as f64 / 16000.0;
let timeout_secs = (300.0 + (audio_duration_secs * 3.0)).min(3600.0);
let deadline = std::time::Instant::now() + std::time::Duration::from_secs_f64(timeout_secs);
params.set_abort_callback_safe(move || {
let exceeded = std::time::Instant::now() > deadline;
if exceeded {
tracing::warn!(
timeout_secs = format!("{:.0}", timeout_secs),
"whisper transcription timed out — aborting"
);
}
exceeded
});
state.full(params, samples).map_err(|e| {
let msg = format!("{}", e);
if msg.contains("abort") {
TranscribeError::TranscriptionFailed(format!(
"transcription timed out after {:.0}s (audio was {:.0}s). \
Try a smaller model or ensure Silero VAD is installed: minutes setup",
timeout_secs, audio_duration_secs
))
} else {
TranscribeError::TranscriptionFailed(msg)
}
})?;
let num_segments = state.full_n_segments();
stats.raw_segments = num_segments as usize;
let mut lines: Vec<String> = Vec::new();
let mut rescued_lines: Vec<String> = Vec::new();
let mut skipped_no_speech = 0u32;
for i in 0..num_segments {
let segment = match state.get_segment(i) {
Some(seg) => seg,
None => continue,
};
let start_ts = segment.start_timestamp();
let text = segment
.to_str_lossy()
.map_err(|e| TranscribeError::TranscriptionFailed(format!("get text: {}", e)))?;
let text = text.trim();
if text.is_empty() {
continue;
}
let mins = start_ts / 6000;
let secs = (start_ts % 6000) / 100;
let line = format!("[{}:{:02}] {}", mins, secs, text);
let no_speech_prob = segment.no_speech_probability();
if no_speech_prob > 0.8 {
skipped_no_speech += 1;
tracing::debug!(
segment = i,
no_speech_prob = format!("{:.2}", no_speech_prob),
"flagged segment — high no_speech probability"
);
rescued_lines.push(line);
continue;
}
rescued_lines.push(line.clone());
lines.push(line);
}
if lines.is_empty() && !rescued_lines.is_empty() {
let rescued_count = rescued_lines.len();
tracing::warn!(
rescued = rescued_count,
skipped_no_speech = skipped_no_speech,
audio_secs = format!("{:.0}", stats.audio_duration_secs),
"no_speech filter would blank the transcript — rescuing all segments"
);
lines = rescued_lines;
stats.rescued_no_speech = rescued_count;
skipped_no_speech = 0;
}
stats.skipped_no_speech = skipped_no_speech as usize;
stats.after_no_speech_filter = lines.len();
if skipped_no_speech > 0 {
tracing::info!(
skipped = skipped_no_speech,
remaining = lines.len(),
"filtered segments with high no_speech probability"
);
}
let lines = dedup_segments(lines);
stats.after_dedup = lines.len();
let lines = dedup_interleaved(lines);
stats.after_interleaved = lines.len();
let lines = strip_foreign_script(lines);
stats.after_script_filter = lines.len();
let lines = collapse_noise_markers(lines);
stats.after_noise_markers = lines.len();
let lines = trim_trailing_noise(lines);
stats.after_trailing_trim = lines.len();
let transcript = lines.join("\n");
let transcript = if transcript.is_empty() {
transcript
} else {
format!("{}\n", transcript)
};
let word_count = transcript.split_whitespace().count();
stats.final_words = word_count;
tracing::info!(
segments = num_segments,
words = word_count,
diagnosis = stats.diagnosis(),
"transcription complete"
);
if word_count == 0 && num_segments > 0 {
tracing::warn!(
diagnosis = stats.diagnosis(),
"all segments filtered out — transcript is blank"
);
}
Ok(TranscribeResult {
text: transcript,
stats,
})
}
fn load_audio_samples(path: &Path) -> Result<Vec<f32>, TranscribeError> {
let ext = path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("")
.to_lowercase();
match ext.as_str() {
"wav" => load_wav(path),
"m4a" | "mp3" | "ogg" | "webm" | "mp4" | "mov" | "aac" => {
match decode_with_ffmpeg(path) {
Ok(samples) => Ok(samples),
Err(e) => {
let is_not_found = e.to_string().contains("not available")
|| e.to_string().contains("not found");
if is_not_found {
tracing::warn!(
"ffmpeg not found — falling back to symphonia for {} decoding. \
Non-English audio may produce poor results. \
Install ffmpeg: brew install ffmpeg (macOS) / apt install ffmpeg (Linux)",
ext
);
} else {
tracing::warn!(
error = %e,
"ffmpeg decode failed — falling back to symphonia"
);
}
decode_with_symphonia(path)
}
}
}
other => Err(TranscribeError::UnsupportedFormat(other.to_string())),
}
}
fn load_wav(path: &Path) -> Result<Vec<f32>, TranscribeError> {
let reader = hound::WavReader::open(path).map_err(|e| {
if e.to_string().contains("Not a WAVE file") || e.to_string().contains("unexpected EOF") {
TranscribeError::UnsupportedFormat("corrupt or invalid WAV file".into())
} else {
TranscribeError::Io(std::io::Error::other(e.to_string()))
}
})?;
let spec = reader.spec();
let sample_rate = spec.sample_rate;
let channels = spec.channels as usize;
let bits = spec.bits_per_sample;
let max_val = (1_i64 << (bits - 1)) as f32; let raw_samples: Vec<f32> = match spec.sample_format {
hound::SampleFormat::Int => reader
.into_samples::<i32>()
.filter_map(|s| s.ok())
.map(|s| s as f32 / max_val)
.collect(),
hound::SampleFormat::Float => reader
.into_samples::<f32>()
.filter_map(|s| s.ok())
.collect(),
};
if raw_samples.is_empty() {
return Err(TranscribeError::EmptyAudio);
}
let mono = if channels > 1 {
raw_samples
.chunks(channels)
.map(|frame| frame.iter().sum::<f32>() / channels as f32)
.collect()
} else {
raw_samples
};
let resampled = if sample_rate != 16000 {
resample(&mono, sample_rate, 16000)
} else {
mono
};
Ok(normalize_audio(&resampled))
}
fn decode_with_ffmpeg(path: &Path) -> Result<Vec<f32>, TranscribeError> {
use std::process::Command;
let tmp_dir = std::env::temp_dir();
let tmp_wav = tmp_dir.join(format!("minutes-ffmpeg-{}.wav", std::process::id()));
#[cfg(unix)]
{
if let Ok(f) = std::fs::File::create(&tmp_wav) {
drop(f);
use std::os::unix::fs::PermissionsExt;
std::fs::set_permissions(&tmp_wav, std::fs::Permissions::from_mode(0o600)).ok();
}
}
let output = Command::new("ffmpeg")
.args([
"-i",
path.to_str().unwrap_or(""),
"-ar",
"16000", "-ac",
"1", "-f",
"wav", "-y", ])
.arg(&tmp_wav)
.stdin(std::process::Stdio::null())
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::piped())
.output()
.map_err(|e| {
TranscribeError::TranscriptionFailed(format!("ffmpeg not available: {}", e))
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
let _ = std::fs::remove_file(&tmp_wav);
return Err(TranscribeError::TranscriptionFailed(format!(
"ffmpeg conversion failed: {}",
stderr.lines().last().unwrap_or("unknown error")
)));
}
tracing::info!(
source = %path.display(),
"decoded audio with ffmpeg (16kHz mono WAV)"
);
let result = load_wav(&tmp_wav);
let _ = std::fs::remove_file(&tmp_wav);
result
}
fn decode_with_symphonia(path: &Path) -> Result<Vec<f32>, TranscribeError> {
use symphonia::core::audio::SampleBuffer;
use symphonia::core::codecs::DecoderOptions;
use symphonia::core::formats::FormatOptions;
use symphonia::core::io::MediaSourceStream;
use symphonia::core::meta::MetadataOptions;
use symphonia::core::probe::Hint;
let file = std::fs::File::open(path)?;
let mss = MediaSourceStream::new(Box::new(file), Default::default());
let mut hint = Hint::new();
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
hint.with_extension(ext);
}
let format_opts = FormatOptions::default();
let metadata_opts = MetadataOptions::default();
let probed = symphonia::default::get_probe()
.format(&hint, mss, &format_opts, &metadata_opts)
.map_err(|e| TranscribeError::UnsupportedFormat(format!("probe failed: {}", e)))?;
let mut format = probed.format;
let track = format
.tracks()
.iter()
.find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
.ok_or_else(|| TranscribeError::UnsupportedFormat("no audio track found".into()))?;
let track_id = track.id;
let sample_rate = track.codec_params.sample_rate.unwrap_or(44100);
let channels = track.codec_params.channels.map(|c| c.count()).unwrap_or(1);
let decoder_opts = DecoderOptions::default();
let mut decoder = symphonia::default::get_codecs()
.make(&track.codec_params, &decoder_opts)
.map_err(|e| TranscribeError::UnsupportedFormat(format!("decoder: {}", e)))?;
let mut all_samples: Vec<f32> = Vec::new();
loop {
let packet = match format.next_packet() {
Ok(packet) => packet,
Err(symphonia::core::errors::Error::IoError(ref e))
if e.kind() == std::io::ErrorKind::UnexpectedEof =>
{
break; }
Err(_) => break,
};
if packet.track_id() != track_id {
continue;
}
let decoded = match decoder.decode(&packet) {
Ok(decoded) => decoded,
Err(_) => continue, };
let spec = *decoded.spec();
let duration = decoded.capacity();
let mut sample_buf = SampleBuffer::<f32>::new(duration as u64, spec);
sample_buf.copy_interleaved_ref(decoded);
let samples = sample_buf.samples();
if channels > 1 {
for chunk in samples.chunks(channels) {
let mono_sample = chunk.iter().sum::<f32>() / channels as f32;
all_samples.push(mono_sample);
}
} else {
all_samples.extend_from_slice(samples);
}
}
if all_samples.is_empty() {
return Err(TranscribeError::EmptyAudio);
}
let resampled = if sample_rate != 16000 {
resample(&all_samples, sample_rate, 16000)
} else {
all_samples
};
Ok(normalize_audio(&resampled))
}
use whisper_guard::segments as wg_segments;
fn dedup_segments(lines: Vec<String>) -> Vec<String> {
wg_segments::dedup_segments(&lines)
}
fn dedup_interleaved(lines: Vec<String>) -> Vec<String> {
wg_segments::dedup_interleaved(&lines)
}
fn trim_trailing_noise(lines: Vec<String>) -> Vec<String> {
wg_segments::trim_trailing_noise(&lines)
}
fn strip_foreign_script(lines: Vec<String>) -> Vec<String> {
wg_segments::strip_foreign_script(&lines)
}
fn collapse_noise_markers(lines: Vec<String>) -> Vec<String> {
wg_segments::collapse_noise_markers(&lines)
}
#[cfg(feature = "denoise")]
fn denoise_audio(samples: &[f32], sample_rate: u32) -> Vec<f32> {
use nnnoiseless::{DenoiseState, FRAME_SIZE};
if samples.is_empty() {
return samples.to_vec();
}
let (samples_48k, original_rate) = if sample_rate != 48000 {
(resample(samples, sample_rate, 48000), Some(sample_rate))
} else {
(samples.to_vec(), None)
};
let scaled: Vec<f32> = samples_48k.iter().map(|s| s * 32767.0).collect();
let mut state = DenoiseState::new();
let mut output = Vec::with_capacity(scaled.len());
let mut frame_out = [0.0f32; FRAME_SIZE];
let silence = [0.0f32; FRAME_SIZE];
state.process_frame(&mut frame_out, &silence);
for chunk in scaled.chunks(FRAME_SIZE) {
if chunk.len() == FRAME_SIZE {
state.process_frame(&mut frame_out, chunk);
output.extend_from_slice(&frame_out);
} else {
let mut padded = [0.0f32; FRAME_SIZE];
padded[..chunk.len()].copy_from_slice(chunk);
state.process_frame(&mut frame_out, &padded);
output.extend_from_slice(&frame_out[..chunk.len()]);
}
}
let denoised: Vec<f32> = output.iter().map(|s| s / 32767.0).collect();
let denoised = if let Some(orig) = original_rate {
resample(&denoised, 48000, orig)
} else {
denoised
};
let original_rms: f32 =
(samples.iter().map(|s| s * s).sum::<f32>() / samples.len() as f32).sqrt();
let denoised_rms: f32 =
(denoised.iter().map(|s| s * s).sum::<f32>() / denoised.len() as f32).sqrt();
tracing::info!(
original_rms = format!("{:.4}", original_rms),
denoised_rms = format!("{:.4}", denoised_rms),
reduction_db = format!(
"{:.1}",
20.0 * (denoised_rms / original_rms.max(0.0001)).log10()
),
"noise reduction applied"
);
denoised
}
#[cfg(feature = "whisper")]
pub fn resolve_model_path_for_dictation(config: &Config) -> Result<PathBuf, TranscribeError> {
let model_name = &config.dictation.model;
let model_dir = &config.transcription.model_path;
let candidates = [
model_dir.join(format!("ggml-{}.bin", model_name)),
model_dir.join(format!("whisper-{}.bin", model_name)),
model_dir.join(format!("{}.bin", model_name)),
];
for candidate in &candidates {
if candidate.exists() {
return Ok(candidate.clone());
}
}
let direct = PathBuf::from(model_name);
if direct.exists() {
return Ok(direct);
}
Err(TranscribeError::ModelNotFound(format!(
"Expected model file \"ggml-{}.bin\" in {}",
model_name,
model_dir.display(),
)))
}
#[cfg(feature = "whisper")]
pub fn resolve_model_path_by_name(
model_name: &str,
config: &Config,
) -> Result<PathBuf, TranscribeError> {
let model_dir = &config.transcription.model_path;
let candidates = [
model_dir.join(format!("ggml-{}.bin", model_name)),
model_dir.join(format!("whisper-{}.bin", model_name)),
model_dir.join(format!("{}.bin", model_name)),
];
for candidate in &candidates {
if candidate.exists() {
return Ok(candidate.clone());
}
}
let direct = PathBuf::from(model_name);
if direct.exists() {
return Ok(direct);
}
let model_dir_display = model_dir.display().to_string();
let requested = model_name.to_string();
let dictation_model = &config.dictation.model;
tracing::warn!(
requested = %requested,
fallback = %dictation_model,
"live transcript model not found, falling back to dictation model"
);
resolve_model_path_for_dictation(config).map_err(|_| {
TranscribeError::ModelNotFound(format!(
"Expected model file \"ggml-{}.bin\" in {}",
requested, model_dir_display,
))
})
}
#[cfg(feature = "whisper")]
fn resolve_model_path(config: &Config) -> Result<PathBuf, TranscribeError> {
let model_name = &config.transcription.model;
let model_dir = &config.transcription.model_path;
let candidates = [
model_dir.join(format!("ggml-{}.bin", model_name)),
model_dir.join(format!("whisper-{}.bin", model_name)),
model_dir.join(format!("{}.bin", model_name)),
];
for candidate in &candidates {
if candidate.exists() {
return Ok(candidate.clone());
}
}
let direct = PathBuf::from(model_name);
if direct.exists() {
return Ok(direct);
}
Err(TranscribeError::ModelNotFound(format!(
"Expected model file \"ggml-{}.bin\" in {}",
model_name,
model_dir.display(),
)))
}
#[cfg(feature = "whisper")]
fn resolve_vad_model_path(config: &Config) -> Option<PathBuf> {
let vad_model = &config.transcription.vad_model;
if vad_model.is_empty() {
return None;
}
let model_dir = &config.transcription.model_path;
let mut candidates = vec![
model_dir.join(format!("ggml-{}.bin", vad_model)),
model_dir.join(format!("{}.bin", vad_model)),
];
if vad_model.starts_with("silero") {
candidates.push(model_dir.join("ggml-silero-vad.bin"));
}
for candidate in &candidates {
if candidate.exists() {
return Some(candidate.clone());
}
}
let direct = PathBuf::from(vad_model);
if direct.exists() {
return Some(direct);
}
tracing::debug!(
vad_model = vad_model,
"VAD model not found — falling back to energy-based silence stripping"
);
None
}
#[cfg(feature = "whisper")]
fn num_cpus() -> i32 {
whisper_guard::params::num_cpus()
}
#[cfg(feature = "parakeet")]
const VALID_PARAKEET_MODELS: &[&str] = &["tdt-ctc-110m", "tdt-600m"];
#[cfg(feature = "parakeet")]
fn transcribe_with_parakeet(
audio_path: &Path,
config: &Config,
) -> Result<TranscribeResult, TranscribeError> {
use std::process::Command;
let mut stats = FilterStats::default();
if !VALID_PARAKEET_MODELS.contains(&config.transcription.parakeet_model.as_str()) {
return Err(TranscribeError::ParakeetFailed(format!(
"unknown parakeet model '{}'. Valid: {}",
config.transcription.parakeet_model,
VALID_PARAKEET_MODELS.join(", ")
)));
}
let samples = load_audio_samples(audio_path)?;
stats.audio_duration_secs = samples.len() as f64 / 16000.0;
if samples.is_empty() {
return Err(TranscribeError::EmptyAudio);
}
if config.transcription.noise_reduction {
tracing::debug!(
"noise_reduction is enabled but not applied for parakeet engine \
(nnnoiseless only supports the whisper path)"
);
}
let samples = strip_silence(&samples, 16000);
stats.samples_after_silence_strip = samples.len();
if samples.is_empty() {
return Err(TranscribeError::EmptyAudio);
}
let tmp_wav = tempfile::Builder::new()
.prefix("minutes-parakeet-")
.suffix(".wav")
.tempfile()
.map_err(TranscribeError::Io)?;
write_wav_16k_mono(tmp_wav.path(), &samples)?;
let model_path = resolve_parakeet_model_path(config)?;
let vocab_path = resolve_parakeet_vocab_path(config)?;
let binary = &config.transcription.parakeet_binary;
tracing::info!(
binary = %binary,
model = %model_path.display(),
vocab = %vocab_path.display(),
audio = %audio_path.display(),
"starting parakeet transcription"
);
let model_str = model_path
.to_str()
.ok_or_else(|| TranscribeError::ParakeetFailed("model path is not valid UTF-8".into()))?;
let wav_str = tmp_wav.path().to_str().ok_or_else(|| {
TranscribeError::ParakeetFailed("temp WAV path is not valid UTF-8".into())
})?;
let vocab_str = vocab_path
.to_str()
.ok_or_else(|| TranscribeError::ParakeetFailed("vocab path is not valid UTF-8".into()))?;
let output = Command::new(binary)
.arg(model_str)
.arg(wav_str)
.args(["--vocab", vocab_str])
.args(["--model", &config.transcription.parakeet_model])
.arg("--timestamps")
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.output()
.map_err(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
TranscribeError::ParakeetNotFound
} else {
TranscribeError::ParakeetFailed(format!("spawn error: {}", e))
}
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(TranscribeError::ParakeetFailed(
stderr.lines().last().unwrap_or("unknown error").to_string(),
));
}
let stdout = String::from_utf8_lossy(&output.stdout);
let (transcript, pstats) = parse_parakeet_output(&stdout, config)?;
stats.raw_segments = pstats.raw_segments;
stats.after_no_speech_filter = pstats.raw_segments; stats.after_dedup = pstats.after_dedup;
stats.after_interleaved = pstats.after_interleaved;
stats.after_script_filter = pstats.after_script_filter;
stats.after_noise_markers = pstats.after_noise_markers;
stats.after_trailing_trim = pstats.after_trailing_trim;
let word_count = transcript.split_whitespace().count();
stats.final_words = word_count;
tracing::info!(
words = word_count,
diagnosis = stats.diagnosis(),
"parakeet transcription complete"
);
Ok(TranscribeResult {
text: transcript,
stats,
})
}
#[cfg(feature = "parakeet")]
struct ParakeetFilterStats {
raw_segments: usize,
after_dedup: usize,
after_interleaved: usize,
after_script_filter: usize,
after_noise_markers: usize,
after_trailing_trim: usize,
}
#[cfg(feature = "parakeet")]
fn parse_parakeet_output(
raw_output: &str,
config: &Config,
) -> Result<(String, ParakeetFilterStats), TranscribeError> {
let raw = raw_output.trim();
if raw.is_empty() {
return Err(TranscribeError::EmptyTranscript(
config.transcription.min_words,
));
}
let mut lines = Vec::new();
let mut has_timestamps = false;
for line in raw.lines() {
let line = line.trim();
if line.is_empty() {
continue;
}
if let Some(rest) = line.strip_prefix('[') {
if let Some(bracket_end) = rest.find(']') {
let timestamp_part = &rest[..bracket_end];
let text = rest[bracket_end + 1..].trim();
if let Some((start_str, _end_str)) = timestamp_part.split_once('-') {
if let Ok(start_secs) = start_str.trim().parse::<f64>() {
let mins = (start_secs / 60.0) as u64;
let secs = (start_secs % 60.0) as u64;
if !text.is_empty() {
lines.push(format!("[{}:{:02}] {}", mins, secs, text));
has_timestamps = true;
}
continue;
}
}
}
}
}
let raw_segments = lines.len();
if lines.is_empty() {
if !has_timestamps {
let preview: String = raw.chars().take(200).collect();
return Err(TranscribeError::ParakeetFailed(format!(
"could not parse parakeet output (no [start - end] timestamps found). \
First 200 chars: {}",
preview
)));
}
return Err(TranscribeError::EmptyTranscript(
config.transcription.min_words,
));
}
let lines = dedup_segments(lines);
let after_dedup = lines.len();
let lines = dedup_interleaved(lines);
let after_interleaved = lines.len();
let lines = strip_foreign_script(lines);
let after_script_filter = lines.len();
let lines = collapse_noise_markers(lines);
let after_noise_markers = lines.len();
let lines = trim_trailing_noise(lines);
let after_trailing_trim = lines.len();
let pstats = ParakeetFilterStats {
raw_segments,
after_dedup,
after_interleaved,
after_script_filter,
after_noise_markers,
after_trailing_trim,
};
let transcript = lines.join("\n");
if transcript.is_empty() {
return Err(TranscribeError::EmptyTranscript(
config.transcription.min_words,
));
}
Ok((format!("{}\n", transcript), pstats))
}
#[cfg(feature = "parakeet")]
fn write_wav_16k_mono(path: &Path, samples: &[f32]) -> Result<(), TranscribeError> {
let spec = hound::WavSpec {
channels: 1,
sample_rate: 16000,
bits_per_sample: 16,
sample_format: hound::SampleFormat::Int,
};
let mut writer = hound::WavWriter::create(path, spec)
.map_err(|e| TranscribeError::Io(std::io::Error::other(e.to_string())))?;
for &s in samples {
let sample = (s * 32767.0).clamp(-32768.0, 32767.0) as i16;
writer
.write_sample(sample)
.map_err(|e| TranscribeError::Io(std::io::Error::other(e.to_string())))?;
}
writer
.finalize()
.map_err(|e| TranscribeError::Io(std::io::Error::other(e.to_string())))?;
Ok(())
}
#[cfg(feature = "parakeet")]
fn resolve_parakeet_model_path(config: &Config) -> Result<PathBuf, TranscribeError> {
let model_dir = config.transcription.model_path.join("parakeet");
let model_name = &config.transcription.parakeet_model;
let candidates = [
model_dir.join(format!("{}.safetensors", model_name)),
model_dir.join(format!("parakeet-{}.safetensors", model_name)),
model_dir.join("model.safetensors"),
];
for candidate in &candidates {
if candidate.exists() {
return Ok(candidate.clone());
}
}
let direct = PathBuf::from(model_name);
if direct.exists() {
return Ok(direct);
}
Err(TranscribeError::ModelNotFound(format!(
"Expected parakeet model \"{}\" in {}. Run: minutes setup --parakeet",
model_name,
model_dir.display(),
)))
}
#[cfg(feature = "parakeet")]
fn resolve_parakeet_vocab_path(config: &Config) -> Result<PathBuf, TranscribeError> {
let model_dir = config.transcription.model_path.join("parakeet");
let vocab_name = &config.transcription.parakeet_vocab;
let candidates = [model_dir.join(vocab_name), model_dir.join("vocab.txt")];
for candidate in &candidates {
if candidate.exists() {
return Ok(candidate.clone());
}
}
let direct = PathBuf::from(vocab_name);
if direct.exists() {
return Ok(direct);
}
Err(TranscribeError::ModelNotFound(format!(
"Expected parakeet vocab file \"{}\" in {}. Generated during model conversion.",
vocab_name,
model_dir.display(),
)))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[cfg(feature = "whisper")]
fn resolve_model_path_returns_error_for_missing() {
let config = Config {
transcription: crate::config::TranscriptionConfig {
model: "nonexistent".into(),
model_path: PathBuf::from("/tmp/no-such-dir"),
min_words: 10,
language: Some("en".into()),
vad_model: String::new(),
noise_reduction: false,
..crate::config::TranscriptionConfig::default()
},
..Config::default()
};
let result = resolve_model_path(&config);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(
err.contains("minutes setup --model tiny"),
"error should tell user how to fix it: {}",
err
);
assert!(
err.contains("ggml-nonexistent.bin"),
"error should include expected model filename: {}",
err
);
assert!(
err.contains("/tmp/no-such-dir"),
"error should include the model directory: {}",
err
);
}
#[test]
fn load_wav_rejects_empty_file() {
let dir = tempfile::TempDir::new().unwrap();
let path = dir.path().join("empty.wav");
std::fs::write(&path, "").unwrap();
let result = load_wav(&path);
assert!(result.is_err());
}
#[test]
fn load_wav_reads_valid_wav() {
let dir = tempfile::TempDir::new().unwrap();
let path = dir.path().join("test.wav");
let spec = hound::WavSpec {
channels: 1,
sample_rate: 16000,
bits_per_sample: 16,
sample_format: hound::SampleFormat::Int,
};
let mut writer = hound::WavWriter::create(&path, spec).unwrap();
for i in 0..16000 {
let sample =
(10000.0 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin()) as i16;
writer.write_sample(sample).unwrap();
}
writer.finalize().unwrap();
let samples = load_wav(&path).unwrap();
assert!(!samples.is_empty());
assert_eq!(samples.len(), 16000);
}
#[test]
fn load_audio_rejects_unknown_extension() {
let dir = tempfile::TempDir::new().unwrap();
let path = dir.path().join("test.xyz");
std::fs::write(&path, "not audio").unwrap();
let result = load_audio_samples(&path);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("xyz"));
}
#[test]
fn strip_silence_preserves_speech() {
let speech: Vec<f32> = (0..16000)
.map(|i| 0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin())
.collect();
let result = strip_silence(&speech, 16000);
assert_eq!(result.len(), speech.len());
}
#[test]
fn strip_silence_trims_long_silence() {
let mut samples = Vec::new();
for i in 0..16000 {
samples.push(0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin());
}
samples.extend(vec![0.0f32; 16000 * 5]);
for i in 0..16000 {
samples.push(0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin());
}
let result = strip_silence(&samples, 16000);
let original_secs = samples.len() as f64 / 16000.0;
let result_secs = result.len() as f64 / 16000.0;
assert!(
result_secs < original_secs * 0.7,
"expected significant trimming: {:.1}s → {:.1}s",
original_secs,
result_secs
);
assert!(
result_secs > 2.0,
"should preserve both speech segments: {:.1}s",
result_secs
);
}
#[test]
fn strip_silence_keeps_short_pauses() {
let mut samples = Vec::new();
for i in 0..16000 {
samples.push(0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin());
}
samples.extend(vec![0.0f32; 6400]);
for i in 0..16000 {
samples.push(0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin());
}
let result = strip_silence(&samples, 16000);
let ratio = result.len() as f64 / samples.len() as f64;
assert!(
ratio > 0.9,
"short pauses should be preserved: ratio {:.2}",
ratio
);
}
#[test]
fn strip_silence_handles_all_silence() {
let samples = vec![0.0f32; 16000 * 10]; let result = strip_silence(&samples, 16000);
assert!(result.len() < samples.len() / 2, "should trim most silence");
}
#[test]
fn sinc_resample_no_aliasing() {
let n = 44100;
let samples: Vec<f32> = (0..n)
.map(|i| (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 44100.0).sin())
.collect();
let resampled = resample(&samples, 44100, 16000);
let peak = resampled.iter().map(|s| s.abs()).fold(0.0f32, f32::max);
assert!(
peak > 0.8,
"440Hz tone should survive resampling with peak > 0.8, got {}",
peak
);
}
#[test]
fn dedup_no_repetition() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] How are you".into(),
"[0:06] Fine thanks".into(),
];
let result = dedup_segments(lines.clone());
assert_eq!(result, lines);
}
#[test]
fn dedup_collapses_exact_repetition() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] Hello world".into(),
"[0:06] Hello world".into(),
"[0:09] Hello world".into(),
"[0:12] Something different".into(),
];
let result = dedup_segments(lines);
assert_eq!(result.len(), 3); assert!(result[0].contains("Hello world"));
assert!(result[1].contains("repeated audio removed"));
assert!(result[2].contains("Something different"));
}
#[test]
fn dedup_collapses_near_identical() {
let lines = vec![
"[0:00] Ok bene le macedi diesel".into(),
"[0:03] Ok, bene le macedi diesel".into(),
"[0:06] Ok bene, le macedi diesel".into(),
"[0:09] Good morning".into(),
];
let result = dedup_segments(lines);
assert_eq!(result.len(), 3); assert!(result[1].contains("repeated audio removed"));
}
#[test]
fn dedup_leaves_two_similar_alone() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] Hello world".into(),
"[0:06] Something else".into(),
];
let result = dedup_segments(lines.clone());
assert_eq!(result, lines);
}
#[test]
fn dedup_handles_empty() {
let result = dedup_segments(vec![]);
assert!(result.is_empty());
}
#[test]
fn dedup_handles_single_line() {
let lines = vec!["[0:00] Hello".into()];
let result = dedup_segments(lines.clone());
assert_eq!(result, lines);
}
#[test]
fn dedup_multiple_runs() {
let lines = vec![
"[0:00] First phrase".into(),
"[0:03] First phrase".into(),
"[0:06] First phrase".into(),
"[0:09] Second phrase".into(),
"[0:12] Second phrase".into(),
"[0:15] Second phrase".into(),
"[0:18] Second phrase".into(),
"[0:21] Normal text".into(),
];
let result = dedup_segments(lines);
assert_eq!(result.len(), 5); assert!(result[1].contains("2 identical"));
assert!(result[3].contains("3 identical"));
}
#[test]
fn engine_defaults_to_whisper_dispatch() {
let config = Config::default();
assert_eq!(config.transcription.engine, "whisper");
}
#[test]
fn engine_not_available_without_feature() {
#[cfg(not(feature = "parakeet"))]
{
let config = Config {
transcription: crate::config::TranscriptionConfig {
engine: "parakeet".into(),
..crate::config::TranscriptionConfig::default()
},
..Config::default()
};
let result = transcribe(Path::new("/nonexistent/test.wav"), &config);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(
err.contains("parakeet"),
"error should mention parakeet: {}",
err
);
}
}
#[test]
#[cfg(feature = "parakeet")]
fn parse_parakeet_text_basic() {
let text = "[0.00 - 2.50] Hello world\n[3.00 - 5.10] How are you\n";
let config = Config::default();
let result = parse_parakeet_output(text, &config).unwrap();
let lines: Vec<&str> = result.trim().lines().collect();
assert_eq!(lines.len(), 2, "should have 2 lines: {:?}", lines);
assert!(
lines[0].contains("[0:00] Hello world"),
"first: {}",
lines[0]
);
assert!(
lines[1].contains("[0:03] How are you"),
"second: {}",
lines[1]
);
}
#[test]
#[cfg(feature = "parakeet")]
fn parse_parakeet_empty_input() {
let config = Config::default();
let result = parse_parakeet_output("", &config);
assert!(result.is_err(), "empty input should fail");
}
#[test]
#[cfg(feature = "parakeet")]
fn parse_parakeet_plain_text_rejected() {
let text = "this is plain text output without timestamps";
let config = Config::default();
let result = parse_parakeet_output(text, &config);
assert!(result.is_err(), "plain text without timestamps should fail");
let err = result.unwrap_err().to_string();
assert!(
err.contains("no [start - end] timestamps"),
"error should explain the issue: {}",
err
);
}
#[test]
#[cfg(feature = "parakeet")]
fn parse_parakeet_timestamp_formatting() {
let text = "[125.00 - 126.00] late segment\n";
let config = Config::default();
let result = parse_parakeet_output(text, &config).unwrap();
assert!(
result.contains("[2:05]"),
"125s should be [2:05]: {}",
result
);
}
#[test]
#[cfg(feature = "parakeet")]
fn parse_parakeet_dedup_applied() {
let text = "[0.00 - 1.00] Hello world\n\
[1.00 - 2.00] Hello world\n\
[2.00 - 3.00] Hello world\n\
[3.00 - 4.00] Hello world\n\
[5.00 - 6.00] Something different\n";
let config = Config::default();
let result = parse_parakeet_output(text, &config).unwrap();
let lines: Vec<&str> = result.trim().lines().collect();
assert!(
lines.len() < 5,
"dedup should collapse repetitions: {:?}",
lines
);
assert!(lines.last().unwrap().contains("Something different"));
}
#[test]
#[cfg(feature = "parakeet")]
fn parse_parakeet_model_validation() {
let config = Config {
transcription: crate::config::TranscriptionConfig {
engine: "parakeet".into(),
parakeet_model: "totally-fake-model".into(),
..crate::config::TranscriptionConfig::default()
},
..Config::default()
};
let result = transcribe(std::path::Path::new("/nonexistent.wav"), &config);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(
err.contains("unknown parakeet model"),
"should reject invalid model: {}",
err
);
}
#[test]
#[cfg(feature = "parakeet")]
fn write_wav_roundtrip() {
let dir = tempfile::TempDir::new().unwrap();
let path = dir.path().join("test.wav");
let samples: Vec<f32> = (0..16000)
.map(|i| 0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin())
.collect();
write_wav_16k_mono(&path, &samples).unwrap();
let reader = hound::WavReader::open(&path).unwrap();
let spec = reader.spec();
assert_eq!(spec.channels, 1);
assert_eq!(spec.sample_rate, 16000);
assert_eq!(spec.bits_per_sample, 16);
let read_samples: Vec<i16> = reader.into_samples().filter_map(|s| s.ok()).collect();
assert_eq!(read_samples.len(), 16000);
}
#[test]
#[cfg(feature = "parakeet")]
fn resolve_parakeet_model_missing() {
let config = Config {
transcription: crate::config::TranscriptionConfig {
model_path: PathBuf::from("/tmp/no-such-dir"),
parakeet_model: "tdt-600m".into(),
..crate::config::TranscriptionConfig::default()
},
..Config::default()
};
let result = resolve_parakeet_model_path(&config);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(
err.contains("minutes setup --parakeet"),
"error should tell user how to fix it: {}",
err
);
}
#[test]
fn diagnosis_shows_rescue_when_all_segments_would_be_filtered() {
let stats = FilterStats {
audio_duration_secs: 2585.0,
samples_after_silence_strip: 16000 * 2585,
raw_segments: 1,
skipped_no_speech: 0, after_no_speech_filter: 1,
after_dedup: 1,
after_interleaved: 1,
after_script_filter: 1,
after_noise_markers: 1,
after_trailing_trim: 1,
rescued_no_speech: 1,
final_words: 5,
};
let d = stats.diagnosis();
assert!(
d.contains("no_speech rescue: 1 segments saved"),
"should report rescue: {}",
d
);
assert!(
!d.contains("no_speech filter:"),
"should not show filter when rescue happened: {}",
d
);
assert!(
d.contains("final: 5 words"),
"should show final words: {}",
d
);
}
#[test]
fn diagnosis_shows_normal_no_speech_filter_when_some_segments_survive() {
let stats = FilterStats {
audio_duration_secs: 300.0,
samples_after_silence_strip: 16000 * 300,
raw_segments: 50,
skipped_no_speech: 5,
after_no_speech_filter: 45,
after_dedup: 45,
after_interleaved: 45,
after_script_filter: 45,
after_noise_markers: 45,
after_trailing_trim: 45,
rescued_no_speech: 0,
final_words: 500,
};
let d = stats.diagnosis();
assert!(
d.contains("no_speech filter: -5 → 45"),
"should show normal filter: {}",
d
);
assert!(!d.contains("rescue"), "should not mention rescue: {}", d);
}
#[test]
fn timeout_cap_limits_to_one_hour() {
let audio_secs = 2580.0_f64;
let timeout = (300.0 + (audio_secs * 3.0)).min(3600.0);
assert_eq!(timeout, 3600.0, "should cap at 1 hour for long audio");
let short_audio = 30.0_f64;
let timeout = (300.0 + (short_audio * 3.0)).min(3600.0);
assert_eq!(timeout, 390.0, "short audio should not be capped");
}
}