use anyhow::{Context, Result};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use whisper_rs::{
FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters, WhisperState,
};
use crate::config::Config;
use crate::transcription::vad;
/// Chooses how many threads Whisper inference should use.
///
/// Takes three quarters of the parallelism reported by the OS (assuming 4
/// when it cannot be queried) and clamps the result to the range `4..=8`.
pub fn inference_thread_count() -> i32 {
    let available = match std::thread::available_parallelism() {
        Ok(count) => count.get() as i32,
        Err(_) => 4,
    };
    let three_quarters = available * 3 / 4;
    three_quarters.clamp(4, 8)
}
/// Minimum clip length, in samples, that `run_inference_with_context` passes
/// to the model; shorter clips produce an empty transcription instead.
// NOTE(review): 4_000 samples is 0.25 s only if the audio is 16 kHz — confirm the sample rate.
const MIN_AUDIO_SAMPLES: usize = 4_000;
/// Phrases treated as hallucinated output: a transcription that consists of
/// nothing but one of these is discarded by the inference methods.
///
/// Entries are lowercase and carry no surrounding punctuation so they can be
/// compared directly with the normalized text built in [`is_hallucination`].
const HALLUCINATED_PHRASES: &[&str] = &[
    "the following",
    "thank you",
    "thanks for watching",
    "thank you for watching",
    "thanks for listening",
    "thank you for listening",
    "like and subscribe",
    "please subscribe",
    "subscribe",
    "goodbye",
    "bye bye",
    "bye",
    "you",
];

/// Returns `true` when `text`, after normalization, is exactly one of
/// [`HALLUCINATED_PHRASES`].
///
/// Normalization strips leading/trailing whitespace and ASCII punctuation and
/// lowercases the rest. Longer sentences that merely contain a listed phrase
/// are not matched. Empty (or punctuation-only) input returns `false`.
fn is_hallucination(text: &str) -> bool {
    // Strip whitespace and punctuation in one pass: doing it in two separate
    // passes leaves stray spaces behind for inputs such as "... thank you" or
    // "Bye !", which then fail the exact comparison below.
    let normalized = text
        .trim_matches(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
        .to_lowercase();
    if normalized.is_empty() {
        return false;
    }
    HALLUCINATED_PHRASES.iter().any(|&phrase| normalized == phrase)
}
/// Optional hints that `build_initial_prompt` turns into Whisper's initial prompt.
///
/// The default value (no vocabulary, no text, no prefix) produces no prompt.
#[derive(Debug, Clone, Default)]
pub struct TranscriptionContext {
/// Terms to include in the prompt; they are joined with ", ".
pub vocabulary: Vec<String>,
/// Existing text supplied by the caller; trimmed and placed last in the prompt.
pub surrounding_text: Option<String>,
/// Free-form lead-in; trimmed and placed first in the prompt.
pub prompt_prefix: Option<String>,
}
/// Upper bound on the initial prompt length. `build_initial_prompt` compares
/// it against `String::len`, so despite the name it is counted in UTF-8 bytes.
const MAX_PROMPT_CHARS: usize = 500;
/// Token limit passed to `WhisperContext::tokenize` for a single vocabulary
/// term; also used as the token count of a term that fails to tokenize.
const MAX_TOKENS_PER_TERM: usize = 32;
/// A term is kept by `filter_novel_terms` when its token count is at least this value.
const NOVELTY_TOKEN_THRESHOLD: usize = 2;
/// A vocabulary term paired with the number of tokens it produced, as built
/// by `rank_vocabulary`.
#[derive(Debug, Clone)]
pub struct RankedTerm {
/// The vocabulary term, copied from the input list.
pub term: String,
/// Number of tokens the term produced, or `MAX_TOKENS_PER_TERM` when
/// tokenization failed.
pub token_count: usize,
}
/// Tokenizes every term with the model's tokenizer and returns the terms
/// ordered from the highest token count to the lowest.
///
/// A term that cannot be tokenized is logged and given the maximum count
/// (`MAX_TOKENS_PER_TERM`). The sort is stable, so terms with equal counts
/// keep their input order.
pub fn rank_vocabulary(ctx: &WhisperContext, terms: &[String]) -> Vec<RankedTerm> {
    let mut ranked = Vec::with_capacity(terms.len());
    for term in terms {
        let token_count = if let Ok(tokens) = ctx.tokenize(term, MAX_TOKENS_PER_TERM) {
            tokens.len()
        } else {
            log::debug!("Failed to tokenize term '{term}', treating as novel");
            MAX_TOKENS_PER_TERM
        };
        ranked.push(RankedTerm {
            term: term.clone(),
            token_count,
        });
    }
    // Descending by token count.
    ranked.sort_by_key(|entry| std::cmp::Reverse(entry.token_count));
    ranked
}
/// Returns references to the terms whose token count reaches
/// `NOVELTY_TOKEN_THRESHOLD`, preserving the order of `ranked`.
pub fn filter_novel_terms(ranked: &[RankedTerm]) -> Vec<&RankedTerm> {
    let mut novel = Vec::new();
    for candidate in ranked {
        if candidate.token_count >= NOVELTY_TOKEN_THRESHOLD {
            novel.push(candidate);
        }
    }
    novel
}
/// Builds the initial prompt handed to Whisper from the supplied context.
///
/// The prompt is `prefix. vocabulary. surrounding text`, where the prefix and
/// surrounding text are trimmed, blank parts are skipped, and the vocabulary
/// is joined with ", ".
///
/// When the prompt is longer than `MAX_PROMPT_CHARS` bytes, only the tail is
/// kept, so the end of the prompt survives. The cut is moved forward to a
/// UTF-8 character boundary and then, for text that is not CJK-heavy, to just
/// after the next space so the prompt does not begin mid-word.
///
/// Returns `None` when there is nothing to put in the prompt.
pub fn build_initial_prompt(ctx: &TranscriptionContext) -> Option<String> {
    if ctx.vocabulary.is_empty() && ctx.surrounding_text.is_none() && ctx.prompt_prefix.is_none() {
        return None;
    }

    let mut parts: Vec<String> = Vec::new();
    if let Some(prefix) = &ctx.prompt_prefix {
        let trimmed = prefix.trim();
        if !trimmed.is_empty() {
            parts.push(trimmed.to_string());
        }
    }
    if !ctx.vocabulary.is_empty() {
        parts.push(ctx.vocabulary.join(", "));
    }
    if let Some(surrounding) = &ctx.surrounding_text {
        let trimmed = surrounding.trim();
        if !trimmed.is_empty() {
            parts.push(trimmed.to_string());
        }
    }

    let prompt = parts.join(". ");
    if prompt.is_empty() {
        return None;
    }
    if prompt.len() <= MAX_PROMPT_CHARS {
        return Some(prompt);
    }

    // Keep the last MAX_PROMPT_CHARS bytes, never splitting a character.
    let start = snap_to_char_boundary(&prompt, prompt.len() - MAX_PROMPT_CHARS);
    let tail = &prompt[start..];

    // CJK text has no spaces to align on, so it is kept as cut. Otherwise skip
    // the partial first word. A separate search for ", " is unnecessary: every
    // ", " contains a space, so the space search already covers it. If the
    // only space is the final byte, skipping past it would leave an empty
    // prompt, so the tail is kept unchanged in that case.
    let skip = if is_cjk_heavy(tail) {
        0
    } else {
        tail.find(' ')
            .map(|space| space + 1)
            .filter(|&next| next < tail.len())
            .unwrap_or(0)
    };
    Some(tail[skip..].to_string())
}
/// Moves `byte_offset` forward to the nearest UTF-8 character boundary of `s`.
///
/// An offset already on a boundary is returned unchanged, as is any offset at
/// or beyond the end of the string.
fn snap_to_char_boundary(s: &str, byte_offset: usize) -> usize {
    let end = s.len();
    if byte_offset >= end {
        return byte_offset;
    }
    (byte_offset..end)
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(end)
}
/// Reports whether more than 30% of the first 100 characters of `text` are
/// CJK characters (as judged by `is_cjk_char`). Empty text is never CJK-heavy.
fn is_cjk_heavy(text: &str) -> bool {
    let (total, cjk) = text
        .chars()
        .take(100)
        .fold((0usize, 0usize), |(seen, hits), ch| {
            (seen + 1, hits + usize::from(is_cjk_char(ch)))
        });
    if total == 0 {
        return false;
    }
    // Integer percentage, rounded down, must strictly exceed 30.
    cjk * 100 / total > 30
}
/// Returns `true` when `c` lies in one of the CJK code-point ranges below.
fn is_cjk_char(c: char) -> bool {
    // Inclusive ranges, lowest first.
    const CJK_RANGES: [(char, char); 6] = [
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
        ('\u{F900}', '\u{FAFF}'), // CJK Compatibility Ideographs
    ];
    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
/// A loaded Whisper model together with the language it transcribes.
pub struct Transcriber {
/// Whisper context created from the model file in `Transcriber::new`.
ctx: WhisperContext,
/// Language passed to Whisper; the value "auto" means no language is set.
language: String,
/// Path the model was loaded from.
model_path: PathBuf,
}
/// Reads every sample of a WAV file as `f32`.
///
/// Integer samples are divided by `2^(bits_per_sample - 1)`; float samples
/// are returned as stored. Samples are returned in file order with no
/// channel or sample-rate handling.
///
/// # Errors
/// Fails when the file cannot be opened or a sample cannot be decoded.
pub fn read_wav_samples(audio_path: &Path) -> Result<Vec<f32>> {
    let reader = hound::WavReader::open(audio_path).context("Failed to open audio file")?;
    let spec = reader.spec();
    match spec.sample_format {
        hound::SampleFormat::Float => {
            let decoded: std::result::Result<Vec<f32>, _> = reader.into_samples::<f32>().collect();
            decoded.context("Failed to decode float WAV samples")
        }
        hound::SampleFormat::Int => {
            // Magnitude of the most negative value at this bit depth.
            let full_scale = (1_i64 << (spec.bits_per_sample - 1)) as f32;
            let decoded: std::result::Result<Vec<i32>, _> = reader.into_samples::<i32>().collect();
            let raw = decoded.context("Failed to decode integer WAV samples")?;
            Ok(raw
                .into_iter()
                .map(|value| value as f32 / full_scale)
                .collect())
        }
    }
}
impl Transcriber {
    /// Loads the Whisper model at `model_path` and stores `language` for
    /// later transcriptions.
    ///
    /// # Errors
    /// Fails when the path is not valid UTF-8 or the model cannot be loaded.
    pub fn new(model_path: &Path, language: &str) -> Result<Self> {
        let ctx = WhisperContext::new_with_params(
            model_path.to_str().context("Invalid model path")?,
            WhisperContextParameters::default(),
        )
        .map_err(|e| anyhow::anyhow!("Failed to load Whisper model: {e}"))?;
        Ok(Self {
            ctx,
            language: language.to_string(),
            model_path: model_path.to_path_buf(),
        })
    }

    /// Path the model was loaded from.
    pub fn model_path(&self) -> &Path {
        &self.model_path
    }

    /// Language string this transcriber was created with.
    pub fn language(&self) -> &str {
        &self.language
    }

    /// Language to hand to Whisper: `None` for "auto", otherwise the
    /// configured language.
    fn language_param(&self) -> Option<&str> {
        if self.language == "auto" {
            None
        } else {
            Some(&self.language)
        }
    }

    /// Transcribes the WAV file at `audio_path` without any prompt context.
    ///
    /// # Errors
    /// Fails when the file cannot be read or inference fails.
    pub fn transcribe(&self, audio_path: &Path, translate: bool) -> Result<String> {
        let samples = read_wav_samples(audio_path)?;
        self.run_inference(&samples, translate)
    }

    /// Transcribes already-decoded samples without any prompt context.
    ///
    /// # Errors
    /// Fails when inference fails.
    pub fn transcribe_samples(&self, samples: &[f32], translate: bool) -> Result<String> {
        self.run_inference(samples, translate)
    }

    /// Transcribes the WAV file at `audio_path`, using `context` to build an
    /// initial prompt.
    ///
    /// # Errors
    /// Fails when the file cannot be read or inference fails.
    pub fn transcribe_with_context(
        &self,
        audio_path: &Path,
        translate: bool,
        context: &TranscriptionContext,
    ) -> Result<String> {
        let samples = read_wav_samples(audio_path)?;
        self.run_inference_with_context(&samples, translate, Some(context))
    }

    /// Transcribes already-decoded samples, using `context` to build an
    /// initial prompt.
    ///
    /// # Errors
    /// Fails when inference fails.
    pub fn transcribe_samples_with_context(
        &self,
        samples: &[f32],
        translate: bool,
        context: &TranscriptionContext,
    ) -> Result<String> {
        self.run_inference_with_context(samples, translate, Some(context))
    }

    /// Creates a Whisper state that can be reused across
    /// [`Transcriber::streaming_transcribe`] calls.
    ///
    /// # Errors
    /// Fails when the state cannot be created.
    pub fn create_streaming_state(&self) -> Result<WhisperState> {
        self.ctx
            .create_state()
            .map_err(|e| anyhow::anyhow!("Failed to create whisper state: {e}"))
    }

    /// Runs one inference pass on `samples` using a caller-owned `state`.
    ///
    /// Unlike the non-streaming path there is no minimum-length or VAD check
    /// and no initial prompt. Setting `abort_flag` asks Whisper to stop; an
    /// aborted run, empty input, or hallucinated output all yield an empty
    /// string.
    ///
    /// # Errors
    /// Fails when Whisper reports an inference error.
    pub fn streaming_transcribe(
        &self,
        state: &mut WhisperState,
        samples: &[f32],
        translate: bool,
        abort_flag: &Arc<AtomicBool>,
    ) -> Result<String> {
        if samples.is_empty() {
            return Ok(String::new());
        }

        let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
        params.set_n_threads(inference_thread_count());
        params.set_language(self.language_param());
        params.set_translate(translate);
        params.set_print_special(false);
        params.set_print_progress(false);
        params.set_print_realtime(false);
        params.set_print_timestamps(false);
        params.set_suppress_nst(true);
        params.set_no_context(true);

        // The callback needs its own handle to the flag because it is moved
        // into the params.
        let flag = Arc::clone(abort_flag);
        params.set_abort_callback_safe(move || flag.load(Ordering::Relaxed));

        state
            .full(params, samples)
            .map_err(|e| anyhow::anyhow!("Transcription failed: {e}"))?;

        // Whatever was decoded before an abort is discarded.
        if abort_flag.load(Ordering::Relaxed) {
            return Ok(String::new());
        }

        let text = self.extract_text(state);
        if is_hallucination(&text) {
            log::debug!("Filtered hallucinated text: '{text}'");
            return Ok(String::new());
        }
        Ok(text)
    }

    /// Runs inference without prompt context.
    fn run_inference(&self, samples: &[f32], translate: bool) -> Result<String> {
        self.run_inference_with_context(samples, translate, None)
    }

    /// Runs a full inference pass on `samples`, optionally seeding Whisper
    /// with an initial prompt built from `context`.
    ///
    /// Returns an empty string, without running the model, when the input is
    /// empty, shorter than `MIN_AUDIO_SAMPLES`, or contains no speech
    /// according to the VAD. Hallucinated output is also replaced by an empty
    /// string.
    ///
    /// # Errors
    /// Fails when the Whisper state cannot be created or inference fails.
    fn run_inference_with_context(
        &self,
        samples: &[f32],
        translate: bool,
        context: Option<&TranscriptionContext>,
    ) -> Result<String> {
        if samples.is_empty() {
            return Ok(String::new());
        }
        if samples.len() < MIN_AUDIO_SAMPLES {
            log::debug!(
                "Audio too short ({} samples, need {}), skipping",
                samples.len(),
                MIN_AUDIO_SAMPLES
            );
            return Ok(String::new());
        }
        if !vad::contains_speech(samples) {
            log::debug!("VAD: no speech detected, skipping transcription");
            return Ok(String::new());
        }

        let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
        params.set_n_threads(inference_thread_count());
        params.set_language(self.language_param());
        params.set_translate(translate);
        params.set_print_special(false);
        params.set_print_progress(false);
        params.set_print_realtime(false);
        params.set_print_timestamps(false);
        params.set_suppress_nst(true);

        // Declared out here so the prompt string stays alive for as long as
        // `params` may refer to it.
        let prompt_string;
        if let Some(ctx) = context {
            let ranked_ctx;
            let effective_ctx = if !ctx.vocabulary.is_empty() {
                // Keep only the terms the tokenizer splits into several
                // tokens.
                // NOTE(review): ranking puts the highest token counts first,
                // while `build_initial_prompt` drops the start of an
                // over-long prompt — confirm that ordering is intended.
                let ranked = rank_vocabulary(&self.ctx, &ctx.vocabulary);
                let novel: Vec<String> = filter_novel_terms(&ranked)
                    .into_iter()
                    .map(|rt| rt.term.clone())
                    .collect();
                let kept = novel.len();
                let dropped = ctx.vocabulary.len() - kept;
                if dropped > 0 {
                    log::debug!(
                        "Vocabulary ranking: {} terms → {} novel (dropped {} known)",
                        ctx.vocabulary.len(),
                        kept,
                        dropped,
                    );
                }
                ranked_ctx = TranscriptionContext {
                    vocabulary: novel,
                    surrounding_text: ctx.surrounding_text.clone(),
                    prompt_prefix: ctx.prompt_prefix.clone(),
                };
                &ranked_ctx
            } else {
                ctx
            };

            if let Some(prompt) = build_initial_prompt(effective_ctx) {
                // Slicing at a fixed byte offset panics when that byte falls
                // inside a multi-byte character (e.g. a CJK prompt), so back
                // up to the nearest character boundary. Offset 0 is always a
                // boundary, which bounds the loop.
                let mut preview_end = prompt.len().min(80);
                while !prompt.is_char_boundary(preview_end) {
                    preview_end -= 1;
                }
                log::debug!(
                    "Using initial_prompt ({} chars): {}...",
                    prompt.len(),
                    &prompt[..preview_end]
                );
                prompt_string = prompt;
                params.set_initial_prompt(&prompt_string);
            }
        }

        let mut state = self
            .ctx
            .create_state()
            .map_err(|e| anyhow::anyhow!("Failed to create whisper state: {e}"))?;
        state
            .full(params, samples)
            .map_err(|e| anyhow::anyhow!("Transcription failed: {e}"))?;

        let text = self.extract_text(&state);
        if is_hallucination(&text) {
            log::debug!("Filtered hallucinated text: '{text}'");
            return Ok(String::new());
        }
        Ok(text)
    }

    /// Concatenates the text of every segment in `state` and trims the
    /// result. Segments whose text cannot be read are skipped.
    fn extract_text(&self, state: &WhisperState) -> String {
        let num_segments = state.full_n_segments();
        if num_segments < 0 {
            return String::new();
        }
        let mut text = String::new();
        for i in 0..num_segments {
            if let Some(segment) = state.get_segment(i) {
                if let Ok(segment_text) = segment.to_str_lossy() {
                    text.push_str(&segment_text);
                }
            }
        }
        text.trim().to_string()
    }
}
/// Reports whether a model file for `model_size` can be located by
/// `find_model`.
pub fn model_exists(model_size: &str) -> bool {
    let located = find_model(model_size);
    located.is_some()
}
/// Looks for `ggml-{model_size}.bin` in the known model locations and returns
/// the first path that exists.
///
/// Search order:
/// 1. `<config dir>/models/`
/// 2. `<data dir>/whisper-cpp/models/`
/// 3. `<home dir>/.cache/whisper/`
/// 4. on macOS only, the two Homebrew share directories
///
/// A location whose base directory cannot be determined is skipped.
pub fn find_model(model_size: &str) -> Option<PathBuf> {
    let model_filename = format!("ggml-{model_size}.bin");

    let mut candidates = vec![Config::dir().join("models").join(&model_filename)];

    // Falling back to an empty base path when the directory is unknown would
    // make the search probe paths relative to the current working directory,
    // so those locations are left out instead.
    candidates.extend(
        dirs::data_dir().map(|dir| dir.join("whisper-cpp").join("models").join(&model_filename)),
    );
    candidates.extend(
        dirs::home_dir().map(|dir| dir.join(".cache").join("whisper").join(&model_filename)),
    );

    #[cfg(target_os = "macos")]
    candidates.extend([
        PathBuf::from(format!(
            "/opt/homebrew/share/whisper-cpp/models/{model_filename}"
        )),
        PathBuf::from(format!(
            "/usr/local/share/whisper-cpp/models/{model_filename}"
        )),
    ]);

    candidates.into_iter().find(|path| path.exists())
}
/// Unit tests for model lookup, WAV decoding, hallucination filtering and
/// prompt construction. Nothing here loads a Whisper model.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Model lookup ---

    #[test]
    fn test_model_exists_nonexistent() {
        assert!(!model_exists("nonexistent_model_that_doesnt_exist_xyz"));
    }

    #[test]
    fn test_find_model_nonexistent() {
        assert!(find_model("nonexistent_model_that_doesnt_exist_xyz").is_none());
    }

    #[test]
    fn test_find_model_builds_correct_filename() {
        let result = find_model("test_does_not_exist");
        assert!(result.is_none());
    }

    #[test]
    fn test_find_model_checks_config_dir() {
        let models_dir = Config::dir().join("models");
        let _ = std::fs::create_dir_all(&models_dir);
        let model_path = models_dir.join("ggml-test_temp_model.bin");
        std::fs::write(&model_path, b"test model content").unwrap();
        let result = find_model("test_temp_model");
        // Remove the fixture before asserting so a failing assertion cannot
        // leave a stray file behind in the real config directory.
        let _ = std::fs::remove_file(&model_path);
        assert_eq!(result, Some(model_path));
    }

    // --- WAV decoding ---

    #[test]
    fn test_read_wav_samples_int16() {
        use hound::{SampleFormat, WavSpec, WavWriter};
        let tmp = tempfile::NamedTempFile::new().unwrap();
        let spec = WavSpec {
            channels: 1,
            sample_rate: 16000,
            bits_per_sample: 16,
            sample_format: SampleFormat::Int,
        };
        let mut writer = WavWriter::create(tmp.path(), spec).unwrap();
        writer.write_sample(0i16).unwrap();
        writer.write_sample(16384i16).unwrap();
        writer.write_sample(-16384i16).unwrap();
        writer.finalize().unwrap();
        let samples = read_wav_samples(tmp.path()).unwrap();
        assert_eq!(samples.len(), 3);
        assert!((samples[0] - 0.0).abs() < 0.01);
        assert!(samples[1] > 0.4 && samples[1] < 0.6);
        assert!(samples[2] < -0.4 && samples[2] > -0.6);
    }

    #[test]
    fn test_read_wav_samples_float32() {
        use hound::{SampleFormat, WavSpec, WavWriter};
        let tmp = tempfile::NamedTempFile::new().unwrap();
        let spec = WavSpec {
            channels: 1,
            sample_rate: 16000,
            bits_per_sample: 32,
            sample_format: SampleFormat::Float,
        };
        let mut writer = WavWriter::create(tmp.path(), spec).unwrap();
        writer.write_sample(0.0f32).unwrap();
        writer.write_sample(0.5f32).unwrap();
        writer.write_sample(-0.5f32).unwrap();
        writer.finalize().unwrap();
        let samples = read_wav_samples(tmp.path()).unwrap();
        assert_eq!(samples.len(), 3);
        assert!((samples[0] - 0.0).abs() < 0.001);
        assert!((samples[1] - 0.5).abs() < 0.001);
        assert!((samples[2] + 0.5).abs() < 0.001);
    }

    #[test]
    fn test_read_wav_samples_empty() {
        use hound::{SampleFormat, WavSpec, WavWriter};
        let tmp = tempfile::NamedTempFile::new().unwrap();
        let spec = WavSpec {
            channels: 1,
            sample_rate: 16000,
            bits_per_sample: 16,
            sample_format: SampleFormat::Int,
        };
        let writer = WavWriter::create(tmp.path(), spec).unwrap();
        writer.finalize().unwrap();
        let samples = read_wav_samples(tmp.path()).unwrap();
        assert!(samples.is_empty());
    }

    #[test]
    fn test_read_wav_samples_nonexistent() {
        let result = read_wav_samples(std::path::Path::new("/nonexistent/file.wav"));
        assert!(result.is_err());
    }

    #[test]
    fn test_read_wav_samples_24bit_int() {
        use hound::{SampleFormat, WavSpec, WavWriter};
        let tmp = tempfile::NamedTempFile::new().unwrap();
        let spec = WavSpec {
            channels: 1,
            sample_rate: 16000,
            bits_per_sample: 24,
            sample_format: SampleFormat::Int,
        };
        let mut writer = WavWriter::create(tmp.path(), spec).unwrap();
        writer.write_sample(0i32).unwrap();
        writer.write_sample(4194304i32).unwrap();
        writer.write_sample(-4194304i32).unwrap();
        writer.finalize().unwrap();
        let samples = read_wav_samples(tmp.path()).unwrap();
        assert_eq!(samples.len(), 3);
        assert!((samples[0] - 0.0).abs() < 0.01);
        assert!(samples[1] > 0.4 && samples[1] < 0.6);
        assert!(samples[2] < -0.4 && samples[2] > -0.6);
    }

    #[test]
    fn test_read_wav_samples_max_values() {
        use hound::{SampleFormat, WavSpec, WavWriter};
        let tmp = tempfile::NamedTempFile::new().unwrap();
        let spec = WavSpec {
            channels: 1,
            sample_rate: 16000,
            bits_per_sample: 16,
            sample_format: SampleFormat::Int,
        };
        let mut writer = WavWriter::create(tmp.path(), spec).unwrap();
        writer.write_sample(i16::MAX).unwrap();
        writer.write_sample(i16::MIN).unwrap();
        writer.finalize().unwrap();
        let samples = read_wav_samples(tmp.path()).unwrap();
        assert_eq!(samples.len(), 2);
        assert!(samples[0] > 0.99);
        assert!(samples[1] < -0.99);
    }

    #[test]
    fn test_read_wav_samples_multi_sample() {
        use hound::{SampleFormat, WavSpec, WavWriter};
        let tmp = tempfile::NamedTempFile::new().unwrap();
        let spec = WavSpec {
            channels: 1,
            sample_rate: 16000,
            bits_per_sample: 32,
            sample_format: SampleFormat::Float,
        };
        let mut writer = WavWriter::create(tmp.path(), spec).unwrap();
        for i in 0..100 {
            writer.write_sample(i as f32 / 100.0).unwrap();
        }
        writer.finalize().unwrap();
        let samples = read_wav_samples(tmp.path()).unwrap();
        assert_eq!(samples.len(), 100);
        assert!((samples[50] - 0.5).abs() < 0.01);
    }

    #[test]
    fn test_find_model_returns_none_for_empty_string() {
        assert!(find_model("").is_none());
    }

    #[test]
    fn test_model_exists_consistent_with_find_model() {
        let size = "nonexistent_test_model_xyz";
        assert_eq!(model_exists(size), find_model(size).is_some());
    }

    // --- Voice activity detection ---

    #[test]
    fn test_vad_silence_has_no_speech() {
        assert!(!vad::contains_speech(&vec![0.0f32; 16000]));
    }

    #[test]
    fn test_vad_low_noise_has_no_speech() {
        assert!(!vad::contains_speech(&vec![0.001f32; 16000]));
    }

    #[test]
    fn test_vad_empty_has_no_speech() {
        assert!(!vad::contains_speech(&[]));
    }

    // --- Hallucination filter ---

    #[test]
    fn test_is_hallucination_known_phrases() {
        assert!(is_hallucination("The following:"));
        assert!(is_hallucination("Thank you."));
        assert!(is_hallucination(" thanks for watching "));
        assert!(is_hallucination("Goodbye."));
        assert!(is_hallucination("you"));
        assert!(is_hallucination("Bye."));
    }

    #[test]
    fn test_is_hallucination_real_speech() {
        assert!(!is_hallucination("Hello, my name is Jacob"));
        assert!(!is_hallucination("The following steps are important"));
        assert!(!is_hallucination("Thank you for your help with the code"));
    }

    #[test]
    fn test_is_hallucination_empty() {
        assert!(!is_hallucination(""));
        assert!(!is_hallucination(" "));
    }

    #[test]
    fn test_constants() {
        const { assert!(MIN_AUDIO_SAMPLES > 0) };
        // The list is a non-empty constant, which is exactly what this checks.
        #[allow(clippy::const_is_empty)]
        {
            assert!(!HALLUCINATED_PHRASES.is_empty());
        }
    }

    // --- Prompt construction ---

    #[test]
    fn test_transcription_context_default() {
        let ctx = TranscriptionContext::default();
        assert!(ctx.vocabulary.is_empty());
        assert!(ctx.surrounding_text.is_none());
        assert!(ctx.prompt_prefix.is_none());
    }

    #[test]
    fn test_build_prompt_empty_context() {
        let ctx = TranscriptionContext::default();
        assert!(build_initial_prompt(&ctx).is_none());
    }

    #[test]
    fn test_build_prompt_vocabulary_only() {
        let ctx = TranscriptionContext {
            vocabulary: vec![
                "useState".to_string(),
                "async".to_string(),
                "impl".to_string(),
            ],
            ..Default::default()
        };
        let prompt = build_initial_prompt(&ctx).unwrap();
        assert!(prompt.contains("useState"));
        assert!(prompt.contains("async"));
        assert!(prompt.contains("impl"));
    }

    #[test]
    fn test_build_prompt_surrounding_text_only() {
        let ctx = TranscriptionContext {
            surrounding_text: Some("The function returns a".to_string()),
            ..Default::default()
        };
        let prompt = build_initial_prompt(&ctx).unwrap();
        assert!(prompt.contains("The function returns a"));
    }

    #[test]
    fn test_build_prompt_combined() {
        let ctx = TranscriptionContext {
            vocabulary: vec!["boolean".to_string()],
            surrounding_text: Some("The function returns a".to_string()),
            prompt_prefix: None,
        };
        let prompt = build_initial_prompt(&ctx).unwrap();
        assert!(prompt.contains("boolean"));
        assert!(prompt.contains("The function returns a"));
    }

    #[test]
    fn test_build_prompt_with_prefix() {
        let ctx = TranscriptionContext {
            vocabulary: vec![],
            surrounding_text: None,
            prompt_prefix: Some("Technical programming discussion.".to_string()),
        };
        let prompt = build_initial_prompt(&ctx).unwrap();
        assert!(prompt.contains("Technical programming discussion"));
    }

    #[test]
    fn test_build_prompt_truncation() {
        let ctx = TranscriptionContext {
            vocabulary: (0..100).map(|i| format!("word{i}")).collect(),
            surrounding_text: Some("important context at the end".to_string()),
            prompt_prefix: None,
        };
        let prompt = build_initial_prompt(&ctx).unwrap();
        assert!(prompt.len() <= MAX_PROMPT_CHARS);
        assert!(prompt.contains("important context at the end"));
    }

    #[test]
    fn test_build_prompt_whitespace_handling() {
        let ctx = TranscriptionContext {
            vocabulary: vec![],
            surrounding_text: Some(" ".to_string()),
            prompt_prefix: Some(" ".to_string()),
        };
        assert!(build_initial_prompt(&ctx).is_none());
    }

    #[test]
    fn test_build_prompt_vocabulary_ordering() {
        let ctx = TranscriptionContext {
            vocabulary: vec!["alpha".to_string(), "beta".to_string(), "gamma".to_string()],
            ..Default::default()
        };
        let prompt = build_initial_prompt(&ctx).unwrap();
        let alpha_pos = prompt.find("alpha").unwrap();
        let beta_pos = prompt.find("beta").unwrap();
        let gamma_pos = prompt.find("gamma").unwrap();
        assert!(alpha_pos < beta_pos);
        assert!(beta_pos < gamma_pos);
    }

    // --- Vocabulary ranking ---

    #[test]
    fn test_ranked_term_struct() {
        let rt = RankedTerm {
            term: "useState".to_string(),
            token_count: 3,
        };
        assert_eq!(rt.term, "useState");
        assert_eq!(rt.token_count, 3);
    }

    #[test]
    fn test_filter_novel_terms_above_threshold() {
        let ranked = vec![
            RankedTerm {
                term: "kAXValueAttribute".to_string(),
                token_count: 5,
            },
            RankedTerm {
                term: "rustfmt".to_string(),
                token_count: 3,
            },
            RankedTerm {
                term: "useState".to_string(),
                token_count: 2,
            },
            RankedTerm {
                term: "function".to_string(),
                token_count: 1,
            },
        ];
        let novel = filter_novel_terms(&ranked);
        assert_eq!(novel.len(), 3);
        assert_eq!(novel[0].term, "kAXValueAttribute");
        assert_eq!(novel[1].term, "rustfmt");
        assert_eq!(novel[2].term, "useState");
    }

    #[test]
    fn test_filter_novel_terms_all_known() {
        let ranked = vec![
            RankedTerm {
                term: "hello".to_string(),
                token_count: 1,
            },
            RankedTerm {
                term: "world".to_string(),
                token_count: 1,
            },
        ];
        let novel = filter_novel_terms(&ranked);
        assert!(novel.is_empty());
    }

    #[test]
    fn test_filter_novel_terms_empty() {
        let novel = filter_novel_terms(&[]);
        assert!(novel.is_empty());
    }

    #[test]
    fn test_novelty_threshold_constant() {
        const { assert!(NOVELTY_TOKEN_THRESHOLD >= 2) };
    }

    #[test]
    fn test_max_prompt_chars_within_whisper_limits() {
        const { assert!(MAX_PROMPT_CHARS <= 1000) };
        const { assert!(MAX_PROMPT_CHARS >= 200) };
    }

    // --- UTF-8 and CJK helpers ---

    #[test]
    fn test_snap_to_char_boundary_ascii() {
        let s = "hello world";
        assert_eq!(snap_to_char_boundary(s, 0), 0);
        assert_eq!(snap_to_char_boundary(s, 5), 5);
    }

    #[test]
    fn test_snap_to_char_boundary_multibyte() {
        let s = "héllo";
        assert!(s.is_char_boundary(0));
        let snapped = snap_to_char_boundary(s, 2);
        assert!(s.is_char_boundary(snapped));
    }

    #[test]
    fn test_snap_to_char_boundary_cjk() {
        let s = "你好世界";
        let snapped = snap_to_char_boundary(s, 1);
        assert!(s.is_char_boundary(snapped));
        assert_eq!(snapped, 3);
    }

    #[test]
    fn test_is_cjk_heavy_chinese() {
        assert!(is_cjk_heavy("你好世界这是一个测试"));
    }

    #[test]
    fn test_is_cjk_heavy_japanese() {
        assert!(is_cjk_heavy("こんにちは世界"));
    }

    #[test]
    fn test_is_cjk_heavy_english() {
        assert!(!is_cjk_heavy("hello world this is a test"));
    }

    #[test]
    fn test_is_cjk_heavy_mixed() {
        assert!(!is_cjk_heavy("hello world 你好 this is a test string"));
    }

    #[test]
    fn test_is_cjk_heavy_empty() {
        assert!(!is_cjk_heavy(""));
    }

    #[test]
    fn test_truncation_preserves_cjk_chars() {
        let cjk_text = "你好".repeat(300);
        let ctx = TranscriptionContext {
            surrounding_text: Some(cjk_text),
            ..Default::default()
        };
        let prompt = build_initial_prompt(&ctx).unwrap();
        assert!(!prompt.is_empty());
        assert!(prompt.len() <= MAX_PROMPT_CHARS);
        // Every surviving character must be one of the originals; checking
        // `len_utf8() > 0` would hold for any string at all.
        assert!(prompt.chars().all(|c| c == '你' || c == '好'));
    }
}