use crate::config::Config;
use crate::error::TranscribeError;
use std::path::Path;
#[cfg(feature = "whisper")]
use std::path::PathBuf;
/// Transcribes the audio file at `audio_path` into text.
///
/// The file is decoded with `load_audio_samples`. When the `whisper` feature
/// is enabled and a VAD model resolves from `config`, the samples are passed
/// on untouched; otherwise `strip_silence` is applied first. With the
/// `whisper` feature the samples go to `transcribe_with_whisper`; without it a
/// placeholder message describing the audio is returned instead.
///
/// # Errors
/// Returns `TranscribeError::EmptyAudio` when no samples are left after
/// decoding or after silence stripping, and propagates any decoding or
/// transcription error.
pub fn transcribe(audio_path: &Path, config: &Config) -> Result<String, TranscribeError> {
    let decoded = load_audio_samples(audio_path)?;
    if decoded.is_empty() {
        return Err(TranscribeError::EmptyAudio);
    }

    // The VAD resolver only exists in whisper builds, hence the cfg split.
    #[cfg(feature = "whisper")]
    let use_integrated_vad = resolve_vad_model_path(config).is_some();
    #[cfg(not(feature = "whisper"))]
    let use_integrated_vad = false;

    let prepared = if !use_integrated_vad {
        strip_silence(&decoded)
    } else {
        tracing::debug!("Silero VAD available — skipping energy-based silence stripping");
        decoded
    };
    if prepared.is_empty() {
        return Err(TranscribeError::EmptyAudio);
    }

    #[cfg(feature = "whisper")]
    {
        transcribe_with_whisper(&prepared, audio_path, config)
    }
    #[cfg(not(feature = "whisper"))]
    {
        // `config` is only consumed by the whisper path.
        let _ = config;
        let sample_count = prepared.len();
        let duration_secs = sample_count as f64 / 16000.0;
        Ok(format!(
            "[Transcription placeholder — whisper feature not enabled]\n\
             Audio file: {}\n\
             Duration: {:.1}s ({} samples at 16kHz)\n\
             \n\
             Build with `cargo build --features whisper` and download a model\n\
             via `minutes setup` to enable real transcription.",
            audio_path.display(),
            duration_secs,
            sample_count,
        ))
    }
}
/// Runs whisper over `samples` and returns a timestamped transcript.
///
/// The model file comes from `resolve_model_path`; a VAD model path, when
/// `resolve_vad_model_path` finds one, is handed to `default_whisper_params`.
/// Each surviving segment becomes one `[m:ss] text` line; the lines are then
/// passed through `dedup_segments`, `dedup_interleaved` and
/// `trim_trailing_noise`. A non-empty transcript ends with a trailing newline.
///
/// `_audio_path` is accepted but not used by this function.
///
/// # Errors
/// `ModelLoadError` when the model path is not valid UTF-8 or the context
/// cannot be created; `TranscriptionFailed` when creating the state, running
/// inference, or reading a segment's text fails; plus whatever
/// `resolve_model_path` returns.
#[cfg(feature = "whisper")]
fn transcribe_with_whisper(
samples: &[f32],
_audio_path: &Path,
config: &Config,
) -> Result<String, TranscribeError> {
let model_path = resolve_model_path(config)?;
tracing::info!(model = %model_path.display(), "loading whisper model");
let ctx = whisper_rs::WhisperContext::new_with_params(
model_path
.to_str()
.ok_or_else(|| TranscribeError::ModelLoadError("invalid model path encoding".into()))?,
whisper_rs::WhisperContextParameters::default(),
)
.map_err(|e| TranscribeError::ModelLoadError(format!("{}", e)))?;
tracing::info!(
samples = samples.len(),
duration_secs = samples.len() as f64 / 16000.0,
"starting whisper transcription"
);
let mut state = ctx
.create_state()
.map_err(|e| TranscribeError::TranscriptionFailed(format!("create state: {}", e)))?;
// A VAD path that is not valid UTF-8 is treated the same as "no VAD model".
let vad_path = resolve_vad_model_path(config);
let vad_path_str = vad_path.as_ref().and_then(|p| p.to_str());
let mut params = default_whisper_params(vad_path_str);
params.set_n_threads(num_cpus());
params.set_language(config.transcription.language.as_deref());
params.set_token_timestamps(true);
state
.full(params, samples)
.map_err(|e| TranscribeError::TranscriptionFailed(format!("{}", e)))?;
let num_segments = state.full_n_segments();
let mut lines: Vec<String> = Vec::new();
let mut skipped_no_speech = 0u32;
for i in 0..num_segments {
let segment = match state.get_segment(i) {
Some(seg) => seg,
None => continue,
};
// Drop segments the model itself rates as very likely non-speech.
let no_speech_prob = segment.no_speech_probability();
if no_speech_prob > 0.8 {
skipped_no_speech += 1;
tracing::debug!(
segment = i,
no_speech_prob = format!("{:.2}", no_speech_prob),
"skipping segment — high no_speech probability"
);
continue;
}
let start_ts = segment.start_timestamp();
let text = segment
.to_str_lossy()
.map_err(|e| TranscribeError::TranscriptionFailed(format!("get text: {}", e)))?;
let text = text.trim();
if text.is_empty() {
continue;
}
// NOTE(review): the arithmetic treats `start_ts` as hundredths of a second
// (6000 per minute, 100 per second) — confirm against whisper_rs.
let mins = start_ts / 6000;
let secs = (start_ts % 6000) / 100;
lines.push(format!("[{}:{:02}] {}", mins, secs, text));
}
if skipped_no_speech > 0 {
tracing::info!(
skipped = skipped_no_speech,
"filtered segments with high no_speech probability"
);
}
// Post-processing passes that collapse repetition and trailing noise.
let lines = dedup_segments(lines);
let lines = dedup_interleaved(lines);
let lines = trim_trailing_noise(lines);
let transcript = lines.join("\n");
let transcript = if transcript.is_empty() {
transcript
} else {
format!("{}\n", transcript)
};
// `segments` below is the raw whisper segment count, not the number of
// lines that survived filtering; the word count includes timestamp tokens.
let word_count = transcript.split_whitespace().count();
tracing::info!(
segments = num_segments,
words = word_count,
"transcription complete"
);
Ok(transcript)
}
/// Decodes `path` into samples, choosing the decoder from the file extension.
///
/// The extension is compared case-insensitively. `wav` files go to
/// `load_wav`. `m4a`, `mp3`, `ogg`, `webm`, `mp4` and `aac` are first handed
/// to `decode_with_ffmpeg`; if that fails for any reason the file is retried
/// with `decode_with_symphonia` after logging a warning.
///
/// # Errors
/// `TranscribeError::UnsupportedFormat` carrying the lowercased extension
/// (empty when there is none) for anything else, or the error of the decoder
/// that was ultimately used.
fn load_audio_samples(path: &Path) -> Result<Vec<f32>, TranscribeError> {
    let extension = path
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    match extension.as_str() {
        "wav" => load_wav(path),
        "m4a" | "mp3" | "ogg" | "webm" | "mp4" | "aac" => {
            decode_with_ffmpeg(path).or_else(|err| {
                // The ffmpeg error is only available as text, so a missing
                // binary is recognised by its message.
                let message = err.to_string();
                let ffmpeg_missing =
                    message.contains("not available") || message.contains("not found");
                if !ffmpeg_missing {
                    tracing::warn!(
                        error = %err,
                        "ffmpeg decode failed — falling back to symphonia"
                    );
                } else {
                    tracing::warn!(
                        "ffmpeg not found — falling back to symphonia for {} decoding. \
                         Non-English audio may produce poor results. \
                         Install ffmpeg: brew install ffmpeg (macOS) / apt install ffmpeg (Linux)",
                        extension
                    );
                }
                decode_with_symphonia(path)
            })
        }
        unsupported => Err(TranscribeError::UnsupportedFormat(unsupported.to_string())),
    }
}
fn load_wav(path: &Path) -> Result<Vec<f32>, TranscribeError> {
let reader = hound::WavReader::open(path).map_err(|e| {
if e.to_string().contains("Not a WAVE file") || e.to_string().contains("unexpected EOF") {
TranscribeError::UnsupportedFormat("corrupt or invalid WAV file".into())
} else {
TranscribeError::Io(std::io::Error::other(e.to_string()))
}
})?;
let spec = reader.spec();
let sample_rate = spec.sample_rate;
let channels = spec.channels as usize;
let bits = spec.bits_per_sample;
let max_val = (1_i64 << (bits - 1)) as f32; let raw_samples: Vec<f32> = match spec.sample_format {
hound::SampleFormat::Int => reader
.into_samples::<i32>()
.filter_map(|s| s.ok())
.map(|s| s as f32 / max_val)
.collect(),
hound::SampleFormat::Float => reader
.into_samples::<f32>()
.filter_map(|s| s.ok())
.collect(),
};
if raw_samples.is_empty() {
return Err(TranscribeError::EmptyAudio);
}
let mono = if channels > 1 {
raw_samples
.chunks(channels)
.map(|frame| frame.iter().sum::<f32>() / channels as f32)
.collect()
} else {
raw_samples
};
let resampled = if sample_rate != 16000 {
resample(&mono, sample_rate, 16000)
} else {
mono
};
Ok(normalize_audio(resampled))
}
/// Decodes `path` by shelling out to `ffmpeg`, converting it to a temporary
/// 16 kHz mono WAV file that is then read with `load_wav`.
///
/// The input path is passed to ffmpeg as an OS string, so paths that are not
/// valid UTF-8 are forwarded unchanged instead of being replaced by an empty
/// argument. The temporary file name combines the process id with a
/// per-process counter so concurrent decodes in the same process do not
/// overwrite or delete each other's output. The temporary file is removed on
/// both the success and the failure path (best effort).
///
/// # Errors
/// `TranscriptionFailed("ffmpeg not available: …")` when the process cannot be
/// spawned, `TranscriptionFailed("ffmpeg conversion failed: …")` with the last
/// stderr line when ffmpeg exits unsuccessfully, or any error from `load_wav`.
fn decode_with_ffmpeg(path: &Path) -> Result<Vec<f32>, TranscribeError> {
    use std::process::{Command, Stdio};
    use std::sync::atomic::{AtomicUsize, Ordering};

    // Distinguishes temp files created by concurrent calls within one process.
    static NEXT_TEMP_ID: AtomicUsize = AtomicUsize::new(0);

    let tmp_wav = std::env::temp_dir().join(format!(
        "minutes-ffmpeg-{}-{}.wav",
        std::process::id(),
        NEXT_TEMP_ID.fetch_add(1, Ordering::Relaxed)
    ));

    let output = Command::new("ffmpeg")
        .arg("-i")
        .arg(path)
        .args(["-ar", "16000", "-ac", "1", "-f", "wav", "-y"])
        .arg(&tmp_wav)
        .stdin(Stdio::null())
        .stdout(Stdio::null())
        .stderr(Stdio::piped())
        .output()
        .map_err(|e| {
            // Callers match on the "not available" wording; keep it stable.
            TranscribeError::TranscriptionFailed(format!("ffmpeg not available: {}", e))
        })?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        // ffmpeg may have left a partial file behind.
        let _ = std::fs::remove_file(&tmp_wav);
        return Err(TranscribeError::TranscriptionFailed(format!(
            "ffmpeg conversion failed: {}",
            stderr.lines().last().unwrap_or("unknown error")
        )));
    }

    tracing::info!(
        source = %path.display(),
        "decoded audio with ffmpeg (16kHz mono WAV)"
    );

    let result = load_wav(&tmp_wav);
    let _ = std::fs::remove_file(&tmp_wav);
    result
}
/// Decodes `path` with symphonia and returns mono samples at 16 kHz.
///
/// The first track whose codec is not `CODEC_TYPE_NULL` is decoded packet by
/// packet. Multi-channel audio is averaged per frame, the result is resampled
/// to 16 kHz when needed and passed through `normalize_audio`.
///
/// # Errors
/// An I/O error when the file cannot be opened, `UnsupportedFormat` when
/// probing fails, no audio track exists, or no decoder can be built, and
/// `EmptyAudio` when nothing was decoded.
fn decode_with_symphonia(path: &Path) -> Result<Vec<f32>, TranscribeError> {
use symphonia::core::audio::SampleBuffer;
use symphonia::core::codecs::DecoderOptions;
use symphonia::core::formats::FormatOptions;
use symphonia::core::io::MediaSourceStream;
use symphonia::core::meta::MetadataOptions;
use symphonia::core::probe::Hint;
let file = std::fs::File::open(path)?;
let mss = MediaSourceStream::new(Box::new(file), Default::default());
// Give the prober the file extension as a format hint when there is one.
let mut hint = Hint::new();
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
hint.with_extension(ext);
}
let format_opts = FormatOptions::default();
let metadata_opts = MetadataOptions::default();
let probed = symphonia::default::get_probe()
.format(&hint, mss, &format_opts, &metadata_opts)
.map_err(|e| TranscribeError::UnsupportedFormat(format!("probe failed: {}", e)))?;
let mut format = probed.format;
let track = format
.tracks()
.iter()
.find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
.ok_or_else(|| TranscribeError::UnsupportedFormat("no audio track found".into()))?;
let track_id = track.id;
// Fall back to 44.1 kHz / one channel when the codec parameters omit them.
let sample_rate = track.codec_params.sample_rate.unwrap_or(44100);
let channels = track.codec_params.channels.map(|c| c.count()).unwrap_or(1);
let decoder_opts = DecoderOptions::default();
let mut decoder = symphonia::default::get_codecs()
.make(&track.codec_params, &decoder_opts)
.map_err(|e| TranscribeError::UnsupportedFormat(format!("decoder: {}", e)))?;
let mut all_samples: Vec<f32> = Vec::new();
loop {
// Any packet-read error ends decoding; the explicit UnexpectedEof arm
// and the catch-all arm both break, so whatever was decoded so far is kept.
let packet = match format.next_packet() {
Ok(packet) => packet,
Err(symphonia::core::errors::Error::IoError(ref e))
if e.kind() == std::io::ErrorKind::UnexpectedEof =>
{
break; }
Err(_) => break,
};
if packet.track_id() != track_id {
continue;
}
// A packet that fails to decode is skipped rather than aborting the file.
let decoded = match decoder.decode(&packet) {
Ok(decoded) => decoded,
Err(_) => continue, };
let spec = *decoded.spec();
let duration = decoded.capacity();
let mut sample_buf = SampleBuffer::<f32>::new(duration as u64, spec);
sample_buf.copy_interleaved_ref(decoded);
let samples = sample_buf.samples();
// Downmix using the channel count taken from the track's codec parameters.
if channels > 1 {
for chunk in samples.chunks(channels) {
let mono_sample = chunk.iter().sum::<f32>() / channels as f32;
all_samples.push(mono_sample);
}
} else {
all_samples.extend_from_slice(samples);
}
}
if all_samples.is_empty() {
return Err(TranscribeError::EmptyAudio);
}
let resampled = if sample_rate != 16000 {
resample(&all_samples, sample_rate, 16000)
} else {
all_samples
};
Ok(normalize_audio(resampled))
}
/// Resamples `samples` from `from_rate` to `to_rate` with a Hann-windowed
/// sinc kernel spanning 32 source samples per output sample.
///
/// When downsampling, the kernel's cutoff is lowered to `to_rate / from_rate`
/// so content above the new Nyquist frequency is attenuated. Each output
/// sample is normalised by the sum of the tap weights, so constant input is
/// reproduced exactly, including near the ends where taps fall off the
/// signal.
///
/// Returns a copy of the input when the rates are equal, and an empty vector
/// when either rate is zero (no meaningful conversion ratio exists).
fn resample(samples: &[f32], from_rate: u32, to_rate: u32) -> Vec<f32> {
    if from_rate == to_rate {
        return samples.to_vec();
    }
    if from_rate == 0 || to_rate == 0 {
        // A zero source rate would otherwise produce an infinite output length.
        return Vec::new();
    }

    const HALF_WIDTH: i64 = 16;

    let ratio = from_rate as f64 / to_rate as f64;
    let output_len = (samples.len() as f64 / ratio) as usize;
    let cutoff = if to_rate < from_rate {
        to_rate as f64 / from_rate as f64
    } else {
        1.0
    };
    // 64-bit indices so very long recordings cannot wrap the tap positions.
    let sample_count = samples.len() as i64;

    let mut output = Vec::with_capacity(output_len);
    for i in 0..output_len {
        let src_pos = i as f64 * ratio;
        let src_center = src_pos as i64;
        // Clamp the tap range to the part that overlaps the signal.
        let first_tap = (src_center - HALF_WIDTH + 1).max(0);
        let last_tap = (src_center + HALF_WIDTH).min(sample_count - 1);

        let mut sum = 0.0f64;
        let mut weight_sum = 0.0f64;
        for j in first_tap..=last_tap {
            let delta = src_pos - j as f64;
            let sinc = if delta.abs() < 1e-10 {
                // Limit of the expression below as delta -> 0. Using plain
                // `cutoff` here over-weights the centre tap by 1/cutoff and
                // lets aliasing through whenever src_pos is an integer.
                cutoff * cutoff
            } else {
                let x = std::f64::consts::PI * delta * cutoff;
                (x.sin() / (std::f64::consts::PI * delta)) * cutoff
            };
            // Hann window centred on delta == 0, zero at |delta| == HALF_WIDTH.
            let window_pos = (delta / HALF_WIDTH as f64 + 1.0) * 0.5;
            let window = if (0.0..=1.0).contains(&window_pos) {
                0.5 * (1.0 - (2.0 * std::f64::consts::PI * window_pos).cos())
            } else {
                0.0
            };
            let w = sinc * window;
            sum += samples[j as usize] as f64 * w;
            weight_sum += w;
        }

        let sample = if weight_sum.abs() > 1e-10 {
            sum / weight_sum
        } else {
            0.0
        };
        output.push(sample as f32);
    }
    output
}
/// Boosts quiet recordings so their peak reaches 0.5.
///
/// Gain is applied only when the absolute peak lies strictly between the
/// noise floor (0.0001) and the quiet threshold (0.1). The gain is capped at
/// 100x and every boosted sample is clamped to `[-1.0, 1.0]`. All other
/// input, including an empty vector, is returned unchanged.
fn normalize_audio(mut samples: Vec<f32>) -> Vec<f32> {
    const TARGET_PEAK: f32 = 0.5;
    const QUIET_THRESHOLD: f32 = 0.1;
    const NOISE_FLOOR: f32 = 0.0001;
    const MAX_GAIN: f32 = 100.0;

    if samples.is_empty() {
        return samples;
    }

    let peak = samples
        .iter()
        .fold(0.0f32, |running_max, sample| running_max.max(sample.abs()));

    let needs_boost = peak > NOISE_FLOOR && peak < QUIET_THRESHOLD;
    if !needs_boost {
        return samples;
    }

    let gain = (TARGET_PEAK / peak).min(MAX_GAIN);
    tracing::info!(
        peak = format!("{:.4}", peak),
        gain = format!("{:.1}x", gain),
        "auto-normalizing quiet audio"
    );
    samples
        .iter_mut()
        .for_each(|sample| *sample = (*sample * gain).clamp(-1.0, 1.0));
    samples
}
/// Collapses runs of three or more consecutive near-identical transcript
/// lines.
///
/// Lines are compared on the text after the first `"] "` (the timestamp
/// prefix), case-insensitively. Every line in a run is compared against the
/// run's first line and counts as a repeat when the similarity is at least
/// 0.8. A collapsed run is replaced by its first line followed by a
/// `[...] [repeated audio removed — N identical segments collapsed]` marker.
/// Inputs shorter than three lines are returned untouched.
#[allow(dead_code)]
fn dedup_segments(lines: Vec<String>) -> Vec<String> {
    const SIMILARITY_THRESHOLD: f64 = 0.8;
    const MIN_RUN_LEN: usize = 3;

    /// Strips the leading `[timestamp] ` prefix, if any.
    fn text_part(line: &str) -> &str {
        match line.find("] ") {
            Some(bracket) => &line[bracket + 2..],
            None => line,
        }
    }

    /// Case-insensitive similarity in `[0, 1]`: 1.0 for equal text, the
    /// length ratio when one text contains the other, otherwise the share of
    /// words of `a` found in `b` relative to the longer word list.
    fn similarity(a: &str, b: &str) -> f64 {
        if a.is_empty() || b.is_empty() {
            return 0.0;
        }
        let lhs = a.to_lowercase();
        let rhs = b.to_lowercase();
        if lhs == rhs {
            return 1.0;
        }

        let (shorter, longer) = if lhs.len() > rhs.len() {
            (&rhs, &lhs)
        } else {
            (&lhs, &rhs)
        };
        if longer.contains(shorter.as_str()) {
            return shorter.len() as f64 / longer.len() as f64;
        }

        let lhs_words: Vec<&str> = lhs.split_whitespace().collect();
        let rhs_words: Vec<&str> = rhs.split_whitespace().collect();
        let denominator = lhs_words.len().max(rhs_words.len());
        if denominator == 0 {
            return 0.0;
        }
        let shared = lhs_words
            .iter()
            .filter(|word| rhs_words.contains(word))
            .count();
        shared as f64 / denominator as f64
    }

    if lines.len() < MIN_RUN_LEN {
        return lines;
    }

    let mut collapsed = Vec::with_capacity(lines.len());
    let mut start = 0;
    while start < lines.len() {
        let anchor = text_part(&lines[start]);
        // Length of the run beginning at `start`, counting the anchor itself.
        let run_len = 1 + lines[start + 1..]
            .iter()
            .take_while(|line| similarity(anchor, text_part(line)) >= SIMILARITY_THRESHOLD)
            .count();

        if run_len < MIN_RUN_LEN {
            collapsed.push(lines[start].clone());
            start += 1;
            continue;
        }

        tracing::warn!(
            first_segment = start,
            repeated_count = run_len,
            text = anchor,
            "detected repetition loop in whisper output — collapsing {} segments",
            run_len
        );
        collapsed.push(lines[start].clone());
        collapsed.push(format!(
            "[...] [repeated audio removed — {} identical segments collapsed]",
            run_len - 1
        ));
        start += run_len;
    }
    collapsed
}
/// Runs the three transcript clean-up passes over `transcript` and reports
/// how many lines each pass left behind.
///
/// The passes run in order: `dedup_segments`, `dedup_interleaved`,
/// `trim_trailing_noise`. The cleaned lines are joined with `\n` (no trailing
/// newline) and returned together with a [`CleanStats`] summary.
pub fn clean_transcript(transcript: &str) -> (String, CleanStats) {
    let input: Vec<String> = transcript.lines().map(str::to_owned).collect();
    let original_lines = input.len();

    let deduped = dedup_segments(input);
    let after_consecutive_dedup = deduped.len();

    let untangled = dedup_interleaved(deduped);
    let after_interleaved_dedup = untangled.len();

    let trimmed = trim_trailing_noise(untangled);
    let after_trailing_trim = trimmed.len();

    let stats = CleanStats {
        original_lines,
        after_consecutive_dedup,
        after_interleaved_dedup,
        after_trailing_trim,
        // Saturating: marker lines added by the passes must never underflow this.
        lines_removed: original_lines.saturating_sub(after_trailing_trim),
    };
    (trimmed.join("\n"), stats)
}
/// Line counts recorded by `clean_transcript` after each clean-up pass.
#[derive(Debug)]
pub struct CleanStats {
/// Number of lines in the input transcript.
pub original_lines: usize,
/// Line count after `dedup_segments`.
pub after_consecutive_dedup: usize,
/// Line count after `dedup_interleaved`.
pub after_interleaved_dedup: usize,
/// Line count after `trim_trailing_noise`.
pub after_trailing_trim: usize,
/// `original_lines` minus `after_trailing_trim`, saturating at zero.
pub lines_removed: usize,
}
/// Removes regions where one phrase keeps recurring, possibly interleaved
/// with filler words (e.g. "phrase / Okay. / phrase / Okay. / ...").
///
/// A sliding window of 10 lines is scanned for a non-filler phrase that
/// occurs at least 5 times. When one is found the region is extended forward
/// and, if it holds at least 5 occurrences over at least 6 lines, every line
/// in it except the first occurrence of the phrase is dropped and replaced by
/// a single `[...] [hallucinated repetition removed — N lines collapsed]`
/// marker per removed run. Inputs shorter than 6 lines are returned untouched.
#[allow(dead_code)] fn dedup_interleaved(lines: Vec<String>) -> Vec<String> {
if lines.len() < 6 {
return lines;
}
// Text after the first "] ", i.e. the line without its timestamp prefix.
fn text_part(line: &str) -> &str {
line.find("] ").map(|i| &line[i + 2..]).unwrap_or(line)
}
// Lowercases, drops everything except alphanumerics and whitespace, and
// collapses whitespace runs to single spaces.
fn normalize(text: &str) -> String {
text.to_lowercase()
.chars()
.filter(|c| c.is_alphanumeric() || c.is_whitespace())
.collect::<String>()
.split_whitespace()
.collect::<Vec<_>>()
.join(" ")
}
// True when the whole text is one of a fixed set of filler words.
fn is_filler(text: &str) -> bool {
let normalized = text.trim().to_lowercase();
let normalized = normalized.trim_matches(|c: char| !c.is_alphanumeric());
matches!(
normalized,
"okay"
| "ok"
| "yeah"
| "yes"
| "right"
| "so"
| "and"
| "but"
| "well"
| "uh"
| "um"
| "hmm"
| "mhm"
)
}
// Per-line normalized text and filler flag, indexed like `lines`.
let texts: Vec<String> = lines.iter().map(|l| normalize(text_part(l))).collect();
let fillers: Vec<bool> = texts.iter().map(|t| is_filler(t)).collect();
let mut remove = vec![false; lines.len()];
let window_size = 10;
let min_occurrences = 5;
let mut i = 0;
while i + window_size <= lines.len() {
// Positions of every non-filler, non-empty phrase inside the window.
let mut freq: std::collections::BTreeMap<&str, Vec<usize>> =
std::collections::BTreeMap::new();
for j in i..i + window_size {
if !fillers[j] && !texts[j].is_empty() {
freq.entry(&texts[j]).or_default().push(j);
}
}
// Most frequent phrase; ties go to the lexicographically greater phrase.
let dominant = freq
.iter()
.max_by(|(phrase_a, pos_a), (phrase_b, pos_b)| {
pos_a
.len()
.cmp(&pos_b.len())
.then_with(|| phrase_a.cmp(phrase_b))
})
.filter(|(_, positions)| positions.len() >= min_occurrences);
if let Some((phrase, _)) = dominant {
let phrase = phrase.to_string();
// Grow the region past the window while lines are the phrase or filler.
let mut region_end = i + window_size;
while region_end < lines.len() {
let t = &texts[region_end];
if *t == phrase || fillers[region_end] {
region_end += 1;
} else {
// Look at most 3 lines ahead (starting at `region_end`) for the
// phrase resuming; `gap` counts the lines before it.
let mut gap = 0;
let mut found_resume = false;
for t in texts
.iter()
.take(lines.len().min(region_end + 3))
.skip(region_end)
{
if *t == phrase {
found_resume = true;
break;
}
gap += 1;
}
if found_resume && gap <= 2 {
// Absorb the gap lines and the resumed phrase into the region.
region_end += gap + 1;
} else {
break;
}
}
}
let region_len = region_end - i;
let actual_count = (i..region_end).filter(|&j| texts[j] == phrase).count();
if actual_count >= min_occurrences && region_len >= 6 {
tracing::warn!(
region_start = i,
region_end = region_end,
occurrences = actual_count,
filler_count = (i..region_end).filter(|&j| fillers[j]).count(),
phrase = phrase,
"detected interleaved hallucination loop — marking {} lines for removal",
region_len
);
// Keep only the first occurrence of the phrase; everything else in the
// region (fillers and gap lines included) is marked for removal.
let mut kept_first = false;
for j in i..region_end {
if !kept_first && texts[j] == phrase {
kept_first = true; } else {
remove[j] = true;
}
}
i = region_end;
continue;
}
}
i += 1;
}
let removed_count = remove.iter().filter(|&&r| r).count();
if removed_count > 0 {
let mut result = Vec::with_capacity(lines.len() - removed_count + 1);
let mut in_removed_run = false;
for (idx, line) in lines.iter().enumerate() {
if remove[idx] {
// Emit one marker at the start of each contiguous removed run.
if !in_removed_run {
in_removed_run = true;
let run_len = (idx..lines.len()).take_while(|&j| remove[j]).count();
result.push(format!(
"[...] [hallucinated repetition removed — {} lines collapsed]",
run_len
));
}
} else {
in_removed_run = false;
result.push(line.clone());
}
}
tracing::info!(
original = lines.len(),
removed = removed_count,
remaining = result.len(),
"interleaved dedup complete"
);
result
} else {
lines
}
}
/// Cuts a long tail of noise-only lines off the end of a transcript.
///
/// A line counts as noise when its text (after the first `"] "`), trimmed and
/// lowercased, is one of a fixed set of markers such as `[music]` or `you`.
/// When at least 5 such lines end the transcript they are replaced by a
/// single `[Recording ended — N lines of trailing noise removed]` line (which
/// starts with a newline character); shorter tails are left in place.
#[allow(dead_code)]
fn trim_trailing_noise(lines: Vec<String>) -> Vec<String> {
    const MIN_TRAILING_NOISE: usize = 5;

    /// Strips the leading `[timestamp] ` prefix, if any.
    fn text_part(line: &str) -> &str {
        match line.find("] ") {
            Some(bracket) => &line[bracket + 2..],
            None => line,
        }
    }

    /// True for the known noise-only segment texts.
    fn is_noise(text: &str) -> bool {
        let lowered = text.trim().to_lowercase();
        matches!(
            lowered.as_str(),
            "[music]" | "[blank_audio]" | "[silence]" | "music" | "you" | "okay." | "yeah."
        )
    }

    if lines.is_empty() {
        return lines;
    }

    let trimmed_count = lines
        .iter()
        .rev()
        .take_while(|line| is_noise(text_part(line)))
        .count();
    if trimmed_count < MIN_TRAILING_NOISE {
        return lines;
    }

    tracing::info!(
        trimmed = trimmed_count,
        "removed trailing noise from transcript"
    );
    let kept_len = lines.len() - trimmed_count;
    let mut result = lines;
    result.truncate(kept_len);
    result.push(format!(
        "\n[Recording ended — {} lines of trailing noise removed]",
        trimmed_count
    ));
    result
}
/// Energy-based silence removal over 100 ms chunks of 16 kHz audio.
///
/// The noise floor is the mean RMS of the quietest fifth of the chunks,
/// clamped to `[0.0001, 0.02]`; a chunk is speech when its RMS exceeds four
/// times that floor, and the five chunks after speech are treated as speech
/// too. Two chunks of context are kept on either side of every speech chunk.
/// Within a silent stretch only the first five chunks are kept, and when
/// audio resumes after a longer stretch three chunks of digital silence are
/// inserted. Samples beyond the last full chunk are always kept, and inputs
/// shorter than three chunks are returned unchanged.
fn strip_silence(samples: &[f32]) -> Vec<f32> {
    const SAMPLE_RATE: usize = 16000;
    const CHUNK_SIZE: usize = SAMPLE_RATE / 10;
    const MAX_SILENCE_CHUNKS: usize = 5;
    const PAD_CHUNKS: usize = 3;
    const CONTEXT_CHUNKS: usize = 2;
    const ENERGY_MULTIPLIER: f32 = 4.0;
    const HANGOVER_CHUNKS: u32 = 5;

    if samples.len() < CHUNK_SIZE * 3 {
        return samples.to_vec();
    }

    let num_chunks = samples.len() / CHUNK_SIZE;
    let full_chunks = samples.chunks_exact(CHUNK_SIZE);
    let tail = full_chunks.remainder();

    // Root-mean-square energy of every full chunk.
    let rms_values: Vec<f32> = full_chunks
        .clone()
        .map(|chunk| {
            let energy = chunk.iter().map(|s| s * s).sum::<f32>();
            (energy / chunk.len() as f32).sqrt()
        })
        .collect();

    // Adaptive threshold derived from the quietest 20% of chunks.
    let mut ascending = rms_values.clone();
    ascending.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let quiet_count = (num_chunks / 5).max(1);
    let quiet_mean = ascending[..quiet_count].iter().sum::<f32>() / quiet_count as f32;
    let noise_floor = quiet_mean.clamp(0.0001, 0.02);
    let threshold = noise_floor * ENERGY_MULTIPLIER;

    // Speech detection with a hangover that bridges brief dips.
    let mut hangover_left = 0u32;
    let is_speech: Vec<bool> = rms_values
        .iter()
        .map(|&rms| {
            if rms > threshold {
                hangover_left = HANGOVER_CHUNKS;
                true
            } else if hangover_left > 0 {
                hangover_left -= 1;
                true
            } else {
                false
            }
        })
        .collect();

    // Widen every speech chunk by the context margin on both sides.
    let mut keep = is_speech.clone();
    for (idx, _) in is_speech.iter().enumerate().filter(|(_, &speech)| speech) {
        let from = idx.saturating_sub(CONTEXT_CHUNKS);
        let to = (idx + CONTEXT_CHUNKS + 1).min(num_chunks);
        keep[from..to].fill(true);
    }

    let silence_pad: Vec<f32> = vec![0.0; PAD_CHUNKS * CHUNK_SIZE];
    let mut output = Vec::with_capacity(samples.len());
    let mut silent_run = 0usize;
    for (chunk, &kept) in full_chunks.zip(keep.iter()) {
        if !kept {
            silent_run += 1;
            // Only the start of a silent stretch is passed through.
            if silent_run <= MAX_SILENCE_CHUNKS {
                output.extend_from_slice(chunk);
            }
            continue;
        }
        if silent_run > MAX_SILENCE_CHUNKS {
            output.extend_from_slice(&silence_pad);
        }
        silent_run = 0;
        output.extend_from_slice(chunk);
    }
    // Partial chunk at the end is never analysed, just appended.
    output.extend_from_slice(tail);

    let original_secs = samples.len() as f64 / SAMPLE_RATE as f64;
    let stripped_secs = output.len() as f64 / SAMPLE_RATE as f64;
    if stripped_secs < original_secs * 0.95 {
        tracing::info!(
            original_secs = format!("{:.1}", original_secs),
            stripped_secs = format!("{:.1}", stripped_secs),
            removed_pct = format!("{:.0}", (1.0 - stripped_secs / original_secs) * 100.0),
            "VAD stripped silence from audio"
        );
    }
    output
}
/// Locates the whisper model file configured for dictation.
///
/// Looks in `config.transcription.model_path` for `ggml-<model>.bin`,
/// `whisper-<model>.bin` and `<model>.bin` (in that order), where `<model>` is
/// `config.dictation.model`, then falls back to treating the model name itself
/// as a path.
///
/// # Errors
/// `TranscribeError::ModelNotFound` naming the expected `ggml-<model>.bin`
/// file and the model directory when nothing exists.
#[cfg(feature = "whisper")]
pub fn resolve_model_path_for_dictation(config: &Config) -> Result<PathBuf, TranscribeError> {
    let model_name = &config.dictation.model;
    let model_dir = &config.transcription.model_path;

    let file_names = [
        format!("ggml-{}.bin", model_name),
        format!("whisper-{}.bin", model_name),
        format!("{}.bin", model_name),
    ];
    let in_model_dir = file_names
        .iter()
        .map(|file_name| model_dir.join(file_name))
        .find(|candidate| candidate.exists());
    if let Some(found) = in_model_dir {
        return Ok(found);
    }

    // The configured name may itself be a path to a model file.
    let direct = PathBuf::from(model_name);
    if !direct.exists() {
        return Err(TranscribeError::ModelNotFound(format!(
            "Expected model file \"ggml-{}.bin\" in {}",
            model_name,
            model_dir.display(),
        )));
    }
    Ok(direct)
}
/// Locates the whisper model file configured for transcription.
///
/// Looks in `config.transcription.model_path` for `ggml-<model>.bin`,
/// `whisper-<model>.bin` and `<model>.bin` (in that order), where `<model>` is
/// `config.transcription.model`, then falls back to treating the model name
/// itself as a path.
///
/// # Errors
/// `TranscribeError::ModelNotFound` naming the expected `ggml-<model>.bin`
/// file and the model directory when nothing exists.
#[cfg(feature = "whisper")]
fn resolve_model_path(config: &Config) -> Result<PathBuf, TranscribeError> {
    let model_name = &config.transcription.model;
    let model_dir = &config.transcription.model_path;

    let file_names = [
        format!("ggml-{}.bin", model_name),
        format!("whisper-{}.bin", model_name),
        format!("{}.bin", model_name),
    ];
    let in_model_dir = file_names
        .iter()
        .map(|file_name| model_dir.join(file_name))
        .find(|candidate| candidate.exists());
    if let Some(found) = in_model_dir {
        return Ok(found);
    }

    // The configured name may itself be a path to a model file.
    let direct = PathBuf::from(model_name);
    if !direct.exists() {
        return Err(TranscribeError::ModelNotFound(format!(
            "Expected model file \"ggml-{}.bin\" in {}",
            model_name,
            model_dir.display(),
        )));
    }
    Ok(direct)
}
/// Locates the configured VAD model file, if any.
///
/// Returns `None` when `config.transcription.vad_model` is empty. Otherwise
/// checks `config.transcription.model_path` for `ggml-<vad>.bin` and
/// `<vad>.bin`, plus `ggml-silero-vad.bin` when the name starts with
/// `silero`, and finally the name itself as a path. A debug message is logged
/// when nothing is found.
#[cfg(feature = "whisper")]
fn resolve_vad_model_path(config: &Config) -> Option<PathBuf> {
    let vad_model = &config.transcription.vad_model;
    if vad_model.is_empty() {
        return None;
    }
    let model_dir = &config.transcription.model_path;

    let mut file_names = vec![
        format!("ggml-{}.bin", vad_model),
        format!("{}.bin", vad_model),
    ];
    if vad_model.starts_with("silero") {
        file_names.push(String::from("ggml-silero-vad.bin"));
    }

    let located = file_names
        .iter()
        .map(|file_name| model_dir.join(file_name))
        .find(|candidate| candidate.exists())
        .or_else(|| {
            // The configured name may itself be a path to a model file.
            let direct = PathBuf::from(vad_model);
            if direct.exists() {
                Some(direct)
            } else {
                None
            }
        });

    if located.is_none() {
        tracing::debug!(
            vad_model = vad_model,
            "VAD model not found — falling back to energy-based silence stripping"
        );
    }
    located
}
/// Builds the whisper parameter set used for batch transcription.
///
/// Uses greedy sampling with `best_of: 5`, a starting temperature of 0.0 with
/// an increment of 0.2, the threshold values set below, and all console
/// printing turned off. When `vad_model_path` is `Some`, VAD is enabled with
/// that model path and `WhisperVadParams::default()`.
#[cfg(feature = "whisper")]
pub fn default_whisper_params<'a, 'b>(
vad_model_path: Option<&str>,
) -> whisper_rs::FullParams<'a, 'b> {
let mut params =
whisper_rs::FullParams::new(whisper_rs::SamplingStrategy::Greedy { best_of: 5 });
params.set_temperature(0.0);
// NOTE(review): presumably these govern whisper's temperature-fallback
// retries and no-speech detection — confirm against the whisper_rs docs.
params.set_temperature_inc(0.2); params.set_entropy_thold(2.4); params.set_logprob_thold(-1.0); params.set_no_speech_thold(0.6); params.set_suppress_blank(true);
if let Some(path) = vad_model_path {
params.set_vad_model_path(Some(path));
params.enable_vad(true);
params.set_vad_params(whisper_rs::WhisperVadParams::default());
tracing::info!("Silero VAD enabled for transcription");
}
// Turn off every print option the params expose.
params.set_print_special(false);
params.set_print_progress(false);
params.set_print_realtime(false);
params.set_print_timestamps(false);
params
}
/// Builds the whisper parameter set used for streaming transcription.
///
/// Differs from `default_whisper_params` in three ways visible here: greedy
/// sampling uses `best_of: 1`, the temperature increment is 0.0, and no VAD
/// options are set. Thresholds and print flags are the same.
#[cfg(feature = "whisper")]
pub fn streaming_whisper_params<'a, 'b>() -> whisper_rs::FullParams<'a, 'b> {
let mut params =
whisper_rs::FullParams::new(whisper_rs::SamplingStrategy::Greedy { best_of: 1 });
params.set_temperature(0.0);
// NOTE(review): an increment of 0.0 presumably disables temperature-fallback
// retries — confirm against the whisper_rs docs.
params.set_temperature_inc(0.0); params.set_entropy_thold(2.4);
params.set_logprob_thold(-1.0);
params.set_no_speech_thold(0.6);
params.set_suppress_blank(true);
// Turn off every print option the params expose.
params.set_print_special(false);
params.set_print_progress(false);
params.set_print_realtime(false);
params.set_print_timestamps(false);
params
}
/// Thread count handed to whisper: the available parallelism reported by the
/// standard library, capped at 8, or 4 when it cannot be determined.
#[cfg(feature = "whisper")]
fn num_cpus() -> i32 {
    const FALLBACK_THREADS: i32 = 4;
    const MAX_THREADS: i32 = 8;

    let detected = match std::thread::available_parallelism() {
        Ok(count) => count.get() as i32,
        Err(_) => FALLBACK_THREADS,
    };
    detected.min(MAX_THREADS)
}
// Unit tests for resampling, normalization, WAV loading, silence stripping
// and the transcript clean-up passes.
#[cfg(test)]
mod tests {
use super::*;
// One second at 44.1 kHz should come out as roughly 16000 samples.
#[test]
fn resample_preserves_length_proportionally() {
let samples: Vec<f32> = (0..44100).map(|i| (i as f32 / 44100.0).sin()).collect();
let resampled = resample(&samples, 44100, 16000);
let expected = 16000;
assert!(
(resampled.len() as i64 - expected as i64).unsigned_abs() < 10,
"expected ~{} samples, got {}",
expected,
resampled.len()
);
}
// Equal rates must return the input unchanged.
#[test]
fn resample_noop_at_same_rate() {
let samples = vec![1.0f32, 2.0, 3.0, 4.0];
let resampled = resample(&samples, 16000, 16000);
assert_eq!(samples, resampled);
}
// A peak of 0.01 lies between the noise floor and the quiet threshold, so it is boosted.
#[test]
fn normalize_boosts_quiet_audio() {
let samples = vec![0.005f32, -0.008, 0.01, -0.003, 0.007];
let normalized = normalize_audio(samples);
let peak = normalized.iter().map(|s| s.abs()).fold(0.0f32, f32::max);
assert!(peak > 0.4, "expected peak > 0.4, got {}", peak);
assert!(peak <= 0.5, "expected peak <= 0.5, got {}", peak);
}
// A peak at or above the quiet threshold leaves the samples untouched.
#[test]
fn normalize_leaves_loud_audio_untouched() {
let samples = vec![0.3f32, -0.5, 0.2, -0.1];
let normalized = normalize_audio(samples.clone());
assert_eq!(samples, normalized);
}
// A peak below the noise floor must not be amplified.
#[test]
fn normalize_ignores_noise_floor() {
let samples = vec![0.00001f32, -0.00002, 0.00001];
let normalized = normalize_audio(samples.clone());
assert_eq!(samples, normalized);
}
// NOTE(review): the "minutes setup --model tiny" text is not produced by
// `resolve_model_path` in this file; presumably it comes from the Display
// implementation of `TranscribeError::ModelNotFound` — confirm.
#[test]
#[cfg(feature = "whisper")]
fn resolve_model_path_returns_error_for_missing() {
let config = Config {
transcription: crate::config::TranscriptionConfig {
model: "nonexistent".into(),
model_path: PathBuf::from("/tmp/no-such-dir"),
min_words: 10,
language: Some("en".into()),
vad_model: String::new(),
},
..Config::default()
};
let result = resolve_model_path(&config);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(
err.contains("minutes setup --model tiny"),
"error should tell user how to fix it: {}",
err
);
assert!(
err.contains("ggml-nonexistent.bin"),
"error should include expected model filename: {}",
err
);
assert!(
err.contains("/tmp/no-such-dir"),
"error should include the model directory: {}",
err
);
}
// A zero-byte file is not a valid WAV and must be rejected.
#[test]
fn load_wav_rejects_empty_file() {
let dir = tempfile::TempDir::new().unwrap();
let path = dir.path().join("empty.wav");
std::fs::write(&path, "").unwrap();
let result = load_wav(&path);
assert!(result.is_err());
}
// Round trip: write one second of 16-bit mono 16 kHz audio and read it back.
#[test]
fn load_wav_reads_valid_wav() {
let dir = tempfile::TempDir::new().unwrap();
let path = dir.path().join("test.wav");
let spec = hound::WavSpec {
channels: 1,
sample_rate: 16000,
bits_per_sample: 16,
sample_format: hound::SampleFormat::Int,
};
let mut writer = hound::WavWriter::create(&path, spec).unwrap();
for i in 0..16000 {
let sample =
(10000.0 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin()) as i16;
writer.write_sample(sample).unwrap();
}
writer.finalize().unwrap();
let samples = load_wav(&path).unwrap();
assert!(!samples.is_empty());
assert_eq!(samples.len(), 16000);
}
// Unknown extensions are rejected with an error that names the extension.
#[test]
fn load_audio_rejects_unknown_extension() {
let dir = tempfile::TempDir::new().unwrap();
let path = dir.path().join("test.xyz");
std::fs::write(&path, "not audio").unwrap();
let result = load_audio_samples(&path);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("xyz"));
}
// A continuous tone contains no silence, so nothing may be removed.
#[test]
fn strip_silence_preserves_speech() {
let speech: Vec<f32> = (0..16000)
.map(|i| 0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin())
.collect();
let result = strip_silence(&speech);
assert_eq!(result.len(), speech.len());
}
// 1 s tone + 5 s silence + 1 s tone: most of the silence must go, both tones stay.
#[test]
fn strip_silence_trims_long_silence() {
let mut samples = Vec::new();
for i in 0..16000 {
samples.push(0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin());
}
samples.extend(vec![0.0f32; 16000 * 5]);
for i in 0..16000 {
samples.push(0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin());
}
let result = strip_silence(&samples);
let original_secs = samples.len() as f64 / 16000.0;
let result_secs = result.len() as f64 / 16000.0;
assert!(
result_secs < original_secs * 0.7,
"expected significant trimming: {:.1}s → {:.1}s",
original_secs,
result_secs
);
assert!(
result_secs > 2.0,
"should preserve both speech segments: {:.1}s",
result_secs
);
}
// A 0.4 s pause (6400 samples) between tones should be kept almost entirely.
#[test]
fn strip_silence_keeps_short_pauses() {
let mut samples = Vec::new();
for i in 0..16000 {
samples.push(0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin());
}
samples.extend(vec![0.0f32; 6400]);
for i in 0..16000 {
samples.push(0.5 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 16000.0).sin());
}
let result = strip_silence(&samples);
let ratio = result.len() as f64 / samples.len() as f64;
assert!(
ratio > 0.9,
"short pauses should be preserved: ratio {:.2}",
ratio
);
}
// Ten seconds of pure silence should shrink to less than half.
#[test]
fn strip_silence_handles_all_silence() {
let samples = vec![0.0f32; 16000 * 10]; let result = strip_silence(&samples);
assert!(result.len() < samples.len() / 2, "should trim most silence");
}
// A 440 Hz tone is far below the 8 kHz target Nyquist and must keep its amplitude.
#[test]
fn sinc_resample_no_aliasing() {
let n = 44100;
let samples: Vec<f32> = (0..n)
.map(|i| (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 44100.0).sin())
.collect();
let resampled = resample(&samples, 44100, 16000);
let peak = resampled.iter().map(|s| s.abs()).fold(0.0f32, f32::max);
assert!(
peak > 0.8,
"440Hz tone should survive resampling with peak > 0.8, got {}",
peak
);
}
// Distinct lines pass through `dedup_segments` unchanged.
#[test]
fn dedup_no_repetition() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] How are you".into(),
"[0:06] Fine thanks".into(),
];
let result = dedup_segments(lines.clone());
assert_eq!(result, lines);
}
// Four identical lines collapse to the first line plus a marker.
#[test]
fn dedup_collapses_exact_repetition() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] Hello world".into(),
"[0:06] Hello world".into(),
"[0:09] Hello world".into(),
"[0:12] Something different".into(),
];
let result = dedup_segments(lines);
assert_eq!(result.len(), 3); assert!(result[0].contains("Hello world"));
assert!(result[1].contains("repeated audio removed"));
assert!(result[2].contains("Something different"));
}
// Lines that differ only in punctuation still count as a repetition run.
#[test]
fn dedup_collapses_near_identical() {
let lines = vec![
"[0:00] Ok bene le macedi diesel".into(),
"[0:03] Ok, bene le macedi diesel".into(),
"[0:06] Ok bene, le macedi diesel".into(),
"[0:09] Good morning".into(),
];
let result = dedup_segments(lines);
assert_eq!(result.len(), 3); assert!(result[1].contains("repeated audio removed"));
}
// A run of only two similar lines is below the collapse threshold of three.
#[test]
fn dedup_leaves_two_similar_alone() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] Hello world".into(),
"[0:06] Something else".into(),
];
let result = dedup_segments(lines.clone());
assert_eq!(result, lines);
}
// Empty input yields empty output.
#[test]
fn dedup_handles_empty() {
let result = dedup_segments(vec![]);
assert!(result.is_empty());
}
// A single line is returned unchanged.
#[test]
fn dedup_handles_single_line() {
let lines = vec!["[0:00] Hello".into()];
let result = dedup_segments(lines.clone());
assert_eq!(result, lines);
}
// Two back-to-back runs are collapsed independently, each with its own marker.
#[test]
fn dedup_multiple_runs() {
let lines = vec![
"[0:00] First phrase".into(),
"[0:03] First phrase".into(),
"[0:06] First phrase".into(),
"[0:09] Second phrase".into(),
"[0:12] Second phrase".into(),
"[0:15] Second phrase".into(),
"[0:18] Second phrase".into(),
"[0:21] Normal text".into(),
];
let result = dedup_segments(lines);
assert_eq!(result.len(), 5); assert!(result[1].contains("2 identical"));
assert!(result[3].contains("3 identical"));
}
// Phrase and "Okay." alternating 20 times collapse to the phrase plus a marker.
#[test]
fn interleaved_catches_alternating_pattern() {
let mut lines: Vec<String> = Vec::new();
for i in 0..20 {
let ts = i * 2;
if i % 2 == 0 {
lines.push(format!(
"[{}:{:02}] So I'm going to pick his brain as well.",
ts / 60,
ts % 60
));
} else {
lines.push(format!("[{}:{:02}] Okay.", ts / 60, ts % 60));
}
}
lines.push("[0:40] Something completely different".into());
let result = dedup_interleaved(lines);
assert!(
result.len() <= 4,
"expected <=4 lines, got {}: {:?}",
result.len(),
result
);
assert!(result.iter().any(|l| l.contains("pick his brain")));
assert!(result
.iter()
.any(|l| l.contains("hallucinated repetition removed")));
assert!(result
.last()
.unwrap()
.contains("Something completely different"));
}
// Six distinct lines contain no dominant phrase and stay as they are.
#[test]
fn interleaved_leaves_normal_conversation() {
let lines = vec![
"[0:00] Hello how are you".into(),
"[0:05] I'm fine thanks".into(),
"[0:10] Great to hear".into(),
"[0:15] Let's talk about the project".into(),
"[0:20] Sure what's the update".into(),
"[0:25] We shipped the feature".into(),
];
let result = dedup_interleaved(lines.clone());
assert_eq!(result, lines);
}
// Six lines are fewer than the 10-line detection window, so nothing is removed.
#[test]
fn interleaved_ignores_short_repeats() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:02] Okay.".into(),
"[0:04] Hello world".into(),
"[0:06] Okay.".into(),
"[0:08] Hello world".into(),
"[0:10] Something else".into(),
];
let result = dedup_interleaved(lines.clone());
assert_eq!(result, lines);
}
// Twenty trailing "[music]" lines are replaced by a single marker line.
#[test]
fn trim_trailing_music() {
let mut lines: Vec<String> = vec![
"[0:00] Hello world".into(),
"[0:05] Some real content".into(),
];
for i in 0..20 {
lines.push(format!("[{}:00] [music]", i + 1));
}
let result = trim_trailing_noise(lines);
assert_eq!(result.len(), 3); assert!(result[0].contains("Hello world"));
assert!(result[1].contains("real content"));
assert!(result[2].contains("trailing noise removed"));
}
// Three trailing noise lines are below the trim threshold of five.
#[test]
fn trim_keeps_short_trailing_noise() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:05] [music]".into(),
"[0:10] [music]".into(),
"[0:15] [music]".into(),
];
let result = trim_trailing_noise(lines.clone());
assert_eq!(result, lines); }
// Empty input yields empty output.
#[test]
fn trim_handles_empty() {
assert!(trim_trailing_noise(vec![]).is_empty());
}
// A transcript made only of noise lines is reduced to the marker line.
#[test]
fn trim_all_noise() {
let lines: Vec<String> = (0..10).map(|i| format!("[{}:00] [music]", i)).collect();
let result = trim_trailing_noise(lines);
assert_eq!(result.len(), 1);
assert!(result[0].contains("trailing noise removed"));
}
}