/// Clean-up intensity requested for a transcript.
///
/// `parse_level` maps user-supplied names onto these variants and
/// `rewrite_prompt` selects its rewriting rule from them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    /// Leave the text exactly as given.
    None,
    /// Capitalization, punctuation and leading filler only; also the
    /// fallback `parse_level` uses for unrecognised names.
    Light,
    /// Additionally remove disfluencies and join fragments.
    Medium,
    /// Additionally split into paragraphs and bullet lists.
    High,
}
/// Words that `content_words` ignores when comparing two transcripts.
///
/// The list holds the individual words of phrases such as "you know" and
/// "i mean", so ordinary uses of those words are ignored as well.
const FILLERS: &[&str] = &[
    "um", "uh", "er", "ah", "like", "you", "know", "so", "well", "i", "mean",
];

/// Leading tokens that `strip_leading_fillers` removes from the start of a transcript.
const LEADING_DISFLUENCIES: &[&str] = &[
    "um", "uh", "er", "ah", "mm", "hmm", "uhm", "erm", "hm",
];

/// Returns the lower-cased alphanumeric words of `text`, in order, minus [`FILLERS`].
///
/// Every non-alphanumeric character acts as a separator, so punctuation and
/// apostrophes never appear in the result.
fn content_words(text: &str) -> Vec<String> {
    let lowered = text.to_lowercase();
    let mut words = Vec::new();
    for token in lowered.split(|c: char| !c.is_alphanumeric()) {
        if token.is_empty() || FILLERS.contains(&token) {
            continue;
        }
        words.push(token.to_owned());
    }
    words
}

/// Returns `true` when `output` carries exactly the same content words, in the
/// same order, as `input`.
///
/// Case, punctuation and filler words are not compared.
pub fn guard_accepts(input: &str, output: &str) -> bool {
    let expected = content_words(input);
    let produced = content_words(output);
    expected == produced
}
/// Replaces dictated formatting commands with the text they stand for.
///
/// Recognised commands (matched case-sensitively, and only when separated
/// from their neighbours by single spaces or sitting at either end of `text`):
///
/// * `new paragraph` → `"\n\n"`
/// * `new line` → `"\n"`
/// * `period` → `". "`
/// * `comma` → `", "`
///
/// A command swallows the space on either side of it. Commands that directly
/// follow one another are all applied ("hello comma new line world" becomes
/// "hello,\nworld"); with chained string replacement the shared space between
/// two commands let only one of them match. The result is trimmed of leading
/// and trailing whitespace.
pub fn apply_spoken_commands(text: &str) -> String {
    // Splitting on a single space (not on whitespace runs) keeps empty tokens,
    // so doubled spaces and embedded newlines survive exactly as written.
    let tokens: Vec<&str> = text.split(' ').collect();
    let mut out = String::with_capacity(text.len());
    let mut after_command = false;
    let mut idx = 0;
    while idx < tokens.len() {
        let next = tokens.get(idx + 1).copied();
        let command = match (tokens[idx], next) {
            ("new", Some("paragraph")) => Some(("\n\n", 2)),
            ("new", Some("line")) => Some(("\n", 2)),
            ("period", _) => Some((". ", 1)),
            ("comma", _) => Some((", ", 1)),
            _ => None,
        };
        match command {
            Some((replacement, consumed)) => {
                // The space emitted by a preceding `period`/`comma` is the
                // separator this command consumes.
                if after_command && out.ends_with(' ') {
                    out.pop();
                }
                out.push_str(replacement);
                after_command = true;
                idx += consumed;
            }
            None => {
                // A word is preceded by its separating space unless a command
                // already consumed it.
                if idx > 0 && !after_command {
                    out.push(' ');
                }
                out.push_str(tokens[idx]);
                after_command = false;
                idx += 1;
            }
        }
    }
    out.trim().to_string()
}
/// Returns the byte offset of the first word-bounded occurrence of
/// `needle_lower` in `hay`, or `None` when there is none.
///
/// Only `hay` is case-folded (ASCII), so `needle_lower` must already be lower
/// case. A match is word-bounded when the characters directly before and
/// after it are not alphanumeric; the check is Unicode-aware, so a letter such
/// as `é` next to the match disqualifies it just like `e` would.
fn find_word_bounded(hay: &str, needle_lower: &str) -> Option<usize> {
    let needle = needle_lower.as_bytes();
    if needle.is_empty() {
        return None;
    }
    let is_word_char = |c: Option<char>| c.is_some_and(char::is_alphanumeric);
    hay.as_bytes()
        .windows(needle.len())
        .enumerate()
        .filter(|(_, window)| {
            window
                .iter()
                .zip(needle)
                .all(|(h, n)| h.to_ascii_lowercase() == *n)
        })
        .map(|(start, _)| start)
        .find(|&start| {
            let end = start + needle.len();
            // A byte-wise match of a valid UTF-8 needle starts and ends on
            // char boundaries; `get` keeps this panic-free regardless.
            let before = hay.get(..start).and_then(|s| s.chars().next_back());
            let after = hay.get(end..).and_then(|s| s.chars().next());
            !is_word_char(before) && !is_word_char(after)
        })
}

/// Applies spoken corrections ("scratch that", "actually no") to `text`.
///
/// For every trigger occurrence:
///
/// * when at least three words precede it, the text back to the previous
///   sentence boundary (`.`, `!`, `?` or a newline) is dropped together with
///   the trigger;
/// * otherwise only the trigger itself is removed.
///
/// Punctuation and whitespace directly around the trigger are ignored when
/// locating the boundary, so a punctuated transcript such as
/// "The answer is yes. Scratch that. The answer is no." yields
/// "The answer is no." instead of keeping the retracted sentence.
/// The result has all whitespace runs collapsed to single spaces.
pub fn apply_backtrack(text: &str) -> String {
    const TRIGGERS: &[&str] = &["scratch that", "actually no"];
    const BOUNDARIES: [char; 4] = ['.', '!', '?', '\n'];
    let is_separator =
        |c: char| c.is_whitespace() || matches!(c, '.' | ',' | ';' | ':' | '!' | '?');

    let mut result = text.to_string();
    for trigger in TRIGGERS {
        // Every pass removes the trigger, so the text shrinks and the loop ends.
        while let Some(pos) = find_word_bounded(&result, trigger) {
            let before = result[..pos].trim_end();
            // The retracted clause without the punctuation that closes it;
            // otherwise that punctuation would itself be taken for the boundary.
            let clause = before.trim_end_matches(is_separator);
            let after = result[pos + trigger.len()..].trim_start_matches(is_separator);
            result = if clause.split_whitespace().nth(2).is_some() {
                // Boundary characters are one byte wide, so `i + 1` is a valid cut.
                let cut = clause.rfind(BOUNDARIES).map_or(0, |i| i + 1);
                format!("{} {}", &clause[..cut], after)
            } else {
                format!("{} {}", before, after)
            };
        }
    }
    result.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Words that may safely be lower-cased when a segment continues the previous one.
const CONTINUATIONS: &[&str] = &[
    "and", "but", "so", "or", "the", "a", "an", "it", "that", "this", "these",
    "those", "all", "then", "because", "which", "who",
];

/// Lower-cases the first letter of `text` when it continues an unfinished sentence.
///
/// The change is made only when all of the following hold:
///
/// * `prev_clean` is `Some` and, ignoring trailing whitespace and closing
///   quotes/brackets, is non-empty and does not end in `.`, `!`, `?` or `…`;
/// * the first word of `text`, stripped of surrounding punctuation, is one of
///   [`CONTINUATIONS`] (compared case-insensitively);
/// * that word is capitalised on its first letter only, so all-caps tokens
///   such as "IT" are left untouched.
///
/// The letter is located after any leading whitespace or opening punctuation,
/// so `“And …` becomes `“and …`. In every other case `text` is returned unchanged.
pub fn decapitalize_continuation(text: &str, prev_clean: Option<&str>) -> String {
    let continues = prev_clean.is_some_and(|p| {
        let tail = p.trim_end().trim_end_matches(['"', '\'', ')', ']', '”', '’']);
        !matches!(tail.chars().last(), Some('.' | '!' | '?' | '…') | None)
    });
    if !continues {
        return text.to_string();
    }

    let first = text.split_whitespace().next().unwrap_or("");
    let bare = first.trim_matches(|c: char| !c.is_alphanumeric());
    if !CONTINUATIONS.contains(&bare.to_lowercase().as_str()) {
        return text.to_string();
    }

    let mut letters = bare.chars();
    let Some(head) = letters.next() else {
        return text.to_string();
    };
    // Only "Title" case is rewritten; an upper-case letter later in the word
    // marks an acronym or deliberate emphasis.
    if !head.is_uppercase() || letters.any(char::is_uppercase) {
        return text.to_string();
    }

    // Everything before the first word is whitespace or punctuation, so the
    // first alphanumeric character of `text` is `head`.
    let Some(start) = text.find(char::is_alphanumeric) else {
        return text.to_string();
    };
    let rest = &text[start + head.len_utf8()..];
    format!("{}{}{}", &text[..start], head.to_lowercase(), rest)
}
/// Formats a raw transcript segment without altering its wording.
///
/// Runs, in order: `apply_backtrack`, `apply_spoken_commands`, and
/// `decapitalize_continuation` against `prev_clean`.
pub fn format_revise(whisper: &str, prev_clean: Option<&str>) -> String {
    let backtracked = apply_backtrack(whisper);
    let formatted = apply_spoken_commands(&backtracked);
    decapitalize_continuation(&formatted, prev_clean)
}
/// Rule-based light clean-up of a transcript.
///
/// Trims `text`, then applies in order: `strip_leading_fillers`,
/// `capitalize_sentences`, `capitalize_standalone_i` and `ensure_terminal`.
pub fn deterministic_light(text: &str) -> String {
    let body = strip_leading_fillers(text.trim());
    let sentence_cased = capitalize_sentences(&body);
    let pronoun_fixed = capitalize_standalone_i(&sentence_cased);
    ensure_terminal(&pronoun_fixed)
}
/// Upper-cases every `i` that stands alone, i.e. has no alphanumeric
/// character directly before or after it.
///
/// Apostrophes are not alphanumeric, so "i'm" becomes "I'm" while the `i` in
/// "it" or "bin" is untouched.
fn capitalize_standalone_i(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut prev_is_word = false;
    let mut rest = text.chars().peekable();
    while let Some(ch) = rest.next() {
        let next_is_word = rest.peek().is_some_and(|c| c.is_alphanumeric());
        let standalone = ch == 'i' && !prev_is_word && !next_is_word;
        out.push(if standalone { 'I' } else { ch });
        prev_is_word = ch.is_alphanumeric();
    }
    out
}
/// Removes leading disfluency tokens from `text` and re-joins the remaining
/// words with single spaces.
///
/// A token counts as a disfluency when, stripped of surrounding
/// non-alphanumeric characters and lower-cased, it appears in
/// `LEADING_DISFLUENCIES`. Stripping stops at the first token that does not,
/// so a leading punctuation-only token is kept.
fn strip_leading_fillers(text: &str) -> String {
    // `skip_while` drops the leading run in one pass instead of shifting the
    // whole vector for every removed word.
    text.split_whitespace()
        .skip_while(|word| {
            let bare = word
                .trim_matches(|c: char| !c.is_alphanumeric())
                .to_lowercase();
            LEADING_DISFLUENCIES.contains(&bare.as_str())
        })
        .collect::<Vec<_>>()
        .join(" ")
}
/// Upper-cases the first letter of every sentence in `text`.
///
/// A sentence starts at the beginning of the text and after a `.`, `!` or `?`
/// that is followed by whitespace (closing quotes or brackets may sit in
/// between). A terminator inside a token, as in "3.5" or "example.com", does
/// not start a sentence, and a sentence that opens with a digit leaves the
/// following word untouched ("3 apples" stays "3 apples").
fn capitalize_sentences(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    // True while the next alphanumeric character opens a sentence.
    let mut at_start = true;
    // True after a terminator, until whitespace confirms the sentence ended
    // or an alphanumeric character shows the token simply continues.
    let mut after_terminator = false;
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            if at_start && ch.is_alphabetic() {
                out.extend(ch.to_uppercase());
            } else {
                out.push(ch);
            }
            at_start = false;
            after_terminator = false;
            continue;
        }
        out.push(ch);
        if matches!(ch, '.' | '!' | '?') {
            after_terminator = true;
        } else if after_terminator && ch.is_whitespace() {
            at_start = true;
            after_terminator = false;
        }
    }
    out
}
/// Makes sure `text` ends with sentence-final punctuation.
///
/// Trailing whitespace is removed. If the remainder is empty, or already ends
/// in `.`, `!`, `?` or `…` (optionally followed by closing quotes or
/// brackets, as in `"stop."`), it is returned as is; otherwise a `.` is
/// appended after the whole text.
fn ensure_terminal(text: &str) -> String {
    let t = text.trim_end();
    // Look through closing quotes/brackets so `"stop."` is not turned into `"stop.".`.
    let core = t.trim_end_matches(['"', '\'', ')', ']', '”', '’']);
    let terminated = matches!(core.chars().last(), Some('.' | '!' | '?' | '…'));
    if t.is_empty() || terminated {
        t.to_string()
    } else {
        format!("{t}.")
    }
}
/// Parses a level name, ignoring case and surrounding whitespace.
///
/// "none", "medium" and "high" map to their variants; any other input,
/// including "light", yields `Level::Light`.
pub fn parse_level(s: &str) -> Level {
    const NAMED: [(&str, Level); 3] = [
        ("none", Level::None),
        ("medium", Level::Medium),
        ("high", Level::High),
    ];
    let wanted = s.trim().to_lowercase();
    NAMED
        .iter()
        .find(|(name, _)| *name == wanted)
        .map_or(Level::Light, |&(_, level)| level)
}
/// System and user messages produced by `rewrite_prompt`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RewritePrompt {
    /// Instructions: the fixed restraint text followed by the level-specific rule.
    pub system: String,
    /// The request carrying the transcript to clean.
    pub user: String,
}
/// Builds the prompt pair asking a model to clean `text` at the given `level`.
///
/// The system message is the fixed restraint text, a space, and the rule for
/// `level`; the user message is "Clean this transcript:" followed by a newline
/// and `text` verbatim.
pub fn rewrite_prompt(level: Level, text: &str) -> RewritePrompt {
    const RESTRAINT: &str = "You clean up raw voice transcripts. Return ONLY the cleaned text, nothing else — no preamble, no quotes. NEVER change meaning: never swap a word for a different one, never add words that change meaning, never drop a negation, never reorder clauses. When unsure, leave it as it is.";

    let rule = match level {
        Level::High => "Also break into paragraphs at topic shifts and turn spoken lists into bullets. Keep every meaning-bearing word.",
        Level::Medium => "Also remove disfluencies and false starts and join fragments into sentences. Keep every meaning-bearing word.",
        Level::Light => "Fix only capitalization and punctuation, and drop leading non-lexical filler (um, uh, er, ah). Remove no other words.",
        Level::None => "Return the text exactly as given.",
    };

    let system = [RESTRAINT, rule].join(" ");
    let user = ["Clean this transcript:\n", text].concat();
    RewritePrompt { system, user }
}
// Unit tests for the guard, the deterministic clean-up, the spoken-command and
// backtrack passes, level parsing, prompt construction and continuation casing.
#[cfg(test)]
mod tests {
use super::*;
// --- guard_accepts -----------------------------------------------------------
// Dropping fillers and changing case/punctuation must be accepted.
#[test]
fn accepts_pure_punctuation_and_filler_cleanup() {
assert!(guard_accepts(
"um so the thing is i keep avoiding it",
"The thing is, I keep avoiding it.",
));
}
// Replacing one content word with another must be rejected.
#[test]
fn rejects_a_substituted_meaning_word() {
assert!(!guard_accepts("i love her", "I loathe her."));
}
// Losing a content word (here the negation) must be rejected.
#[test]
fn rejects_a_dropped_content_word() {
assert!(!guard_accepts("i never said that", "I said that."));
}
// Introducing a content word must be rejected.
#[test]
fn rejects_an_added_content_word() {
assert!(!guard_accepts("i am tired", "I am very tired."));
}
// Known limit: words on the filler list ("you", "know", "i", "like") are
// ignored even when they carry meaning, so dropping them passes the guard.
#[test]
fn guard_permits_dropping_filler_homographs_known_limit() {
assert!(guard_accepts("do you know the way", "do the way"));
assert!(guard_accepts("i like it a lot", "it a lot"));
}
// --- deterministic_light -----------------------------------------------------
// Leading "um" is stripped, the sentence is capitalized and terminated.
#[test]
fn deterministic_light_caps_and_terminates() {
assert_eq!(deterministic_light("um the thing is"), "The thing is.");
}
// Leading words that are fillers for the guard but not leading disfluencies
// ("i", "you", "so", "well") must stay in the output.
#[test]
fn does_not_strip_a_leading_content_word() {
assert_eq!(deterministic_light("i sometimes forget the small things"),
"I sometimes forget the small things.");
assert_eq!(deterministic_light("you should go now"), "You should go now.");
assert_eq!(deterministic_light("so i realized the answer"), "So I realized the answer.");
assert_eq!(deterministic_light("well that is the thing"), "Well that is the thing.");
}
// Several leading disfluencies, and one followed by a comma, are all removed.
#[test]
fn still_strips_leading_nonlexical_disfluencies() {
assert_eq!(deterministic_light("um uh the thing is"), "The thing is.");
assert_eq!(deterministic_light("ah i see it now"), "I see it now.");
assert_eq!(deterministic_light("um, the thing is"), "The thing is.");
}
// A token with no alphanumeric characters is not treated as a disfluency.
#[test]
fn a_leading_pure_punctuation_token_survives() {
assert_eq!(deterministic_light("-- the thing is"), "-- The thing is.");
}
// Standalone "i" is capitalized, including before an apostrophe; an "i"
// inside a word is left alone.
#[test]
fn standalone_i_is_capitalized_mid_sentence() {
assert_eq!(
deterministic_light("the thing is i keep avoiding it"),
"The thing is I keep avoiding it."
);
assert_eq!(
deterministic_light("i'm sure i'll try what i've found"),
"I'm sure I'll try what I've found."
);
assert_eq!(deterministic_light("it is in the bin"), "It is in the bin.");
}
// The deterministic pass must always produce output its own guard accepts.
#[test]
fn deterministic_light_is_guard_safe() {
let raw = "um so i keep avoiding the hard conversation";
assert!(guard_accepts(raw, &deterministic_light(raw)));
}
// --- spoken commands and backtracking ----------------------------------------
// "new line" between two words becomes a single newline with no spaces around it.
#[test]
fn spoken_command_becomes_newline() {
assert_eq!(apply_spoken_commands("a new line b"), "a\nb");
}
// Text before "scratch that" is dropped; text after it is kept.
#[test]
fn backtrack_drops_preceding_clause() {
let out = apply_backtrack("the answer is yes scratch that the answer is no");
assert!(!out.contains("yes"));
assert!(out.contains("the answer is no"));
}
// "actually no" inside "actually nobody" is not word-bounded and must not trigger.
#[test]
fn backtrack_does_not_fire_inside_a_word() {
let out = apply_backtrack("well actually nobody knows the truth");
assert!(out.contains("nobody"));
assert!(out.contains("the truth"));
}
// A command at either end of the text leaves only whitespace there, which is trimmed.
#[test]
fn spoken_command_at_phrase_start_and_end() {
assert_eq!(apply_spoken_commands("new line b"), "b");
assert_eq!(apply_spoken_commands("a new line"), "a");
}
// Multi-byte characters around the trigger must not cause a slicing panic.
#[test]
fn backtrack_handles_non_ascii_without_panicking() {
let out = apply_backtrack("aa bb ẞ scratch that ẞ tail");
assert!(out.contains("tail"));
assert!(!out.contains("scratch that"));
}
// --- parse_level and rewrite_prompt ------------------------------------------
// Names are matched case-insensitively; anything unrecognised falls back to Light.
#[test]
fn parse_level_maps_known_and_defaults_to_light() {
assert_eq!(parse_level("none"), Level::None);
assert_eq!(parse_level("Medium"), Level::Medium);
assert_eq!(parse_level("HIGH"), Level::High);
assert_eq!(parse_level("light"), Level::Light);
assert_eq!(parse_level("nonsense"), Level::Light);
}
// Each level's system prompt mentions its characteristic task, and the user
// prompt contains the transcript.
#[test]
fn rewrite_prompt_widens_by_level_and_carries_the_text() {
assert!(rewrite_prompt(Level::Light, "x").system.to_lowercase().contains("capitalization"));
assert!(rewrite_prompt(Level::Medium, "x").system.to_lowercase().contains("disfluencies"));
assert!(rewrite_prompt(Level::High, "x").system.to_lowercase().contains("paragraph"));
assert!(rewrite_prompt(Level::Light, "the raw phrase").user.contains("the raw phrase"));
}
// The "never change meaning" restraint is present at every rewriting level.
#[test]
fn rewrite_prompt_always_states_the_restraint() {
for lvl in [Level::Light, Level::Medium, Level::High] {
assert!(rewrite_prompt(lvl, "x").system.to_lowercase().contains("never change meaning"));
}
}
// --- decapitalize_continuation and format_revise -----------------------------
// An allowlisted first word is lower-cased when the previous text is unterminated.
#[test]
fn decapitalize_lowercases_an_allowlist_continuation_after_unterminated_prior() {
assert_eq!(
decapitalize_continuation("All these edge cases get sorted out.", Some("with their product")),
"all these edge cases get sorted out."
);
}
// After a terminated sentence the capital is kept.
#[test]
fn decapitalize_keeps_capital_after_a_terminated_prior() {
assert_eq!(
decapitalize_continuation("All these edge cases.", Some("That worked.")),
"All these edge cases."
);
}
// Words outside the allowlist are never lower-cased, which protects proper nouns.
#[test]
fn decapitalize_never_lowercases_a_non_allowlist_word_protecting_proper_nouns() {
assert_eq!(
decapitalize_continuation("Whisper does the rest", Some("the tool i use is")),
"Whisper does the rest"
);
}
// format_revise leaves casing as transcribed and applies spoken commands.
#[test]
fn format_revise_trusts_whisper_casing_and_applies_features() {
assert_eq!(format_revise("hello there", None), "hello there");
assert_eq!(format_revise("first line new line second", None), "first line\nsecond");
}
}