/// How aggressively a transcript is cleaned up.
///
/// The levels widen in order: see `rewrite_prompt` for the instruction each
/// one maps to and `shape_entry` for the extra shaping applied at `High`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    /// The text is returned exactly as given.
    None,
    /// Capitalization, punctuation and leading non-lexical filler only.
    /// This is also what `parse_level` falls back to for unknown input.
    Light,
    /// `Light` plus removal of disfluencies and false starts.
    Medium,
    /// `Medium` plus paragraph breaks.
    High,
}
5
/// Lower-case words that `content_words` ignores when the guard compares an
/// input with its cleaned output.
///
/// The first group is the non-lexical sounds; it includes every entry of
/// `LEADING_DISFLUENCIES`, so that a cleanup which only strips one of those
/// sounds (as `deterministic_light` does) is never rejected by the guard.
/// The second group is the individual words of common spoken fillers
/// ("like", "you know", "so", "well", "I mean"); matching is per word, so
/// these are ignored wherever they occur.
const FILLERS: &[&str] = &[
    // non-lexical sounds
    "um", "uh", "er", "ah", "mm", "hmm", "uhm", "erm", "hm",
    // lexical fillers, one word at a time
    "like", "you", "know", "so", "well", "i", "mean",
];
16
/// Lower-case non-lexical sounds that `strip_leading_fillers` removes from
/// the front of a phrase.
const LEADING_DISFLUENCIES: &[&str] = &[
    "um",
    "uh",
    "er",
    "ah",
    "mm",
    "hmm",
    "uhm",
    "erm",
    "hm",
];
24
25fn content_words(text: &str) -> Vec<String> {
26 text.to_lowercase()
27 .split(|c: char| !c.is_alphanumeric())
28 .filter(|w| !w.is_empty())
29 .filter(|w| !FILLERS.contains(w))
30 .map(|w| w.to_string())
31 .collect()
32}
33
34pub fn guard_accepts(input: &str, output: &str) -> bool {
38 content_words(input) == content_words(output)
39}
40
/// Replaces spoken formatting commands with the punctuation they name.
///
/// Recognised commands (lower-case, delimited by single spaces or the ends of
/// the text): "new paragraph" -> blank line, "new line" -> line break,
/// "period" -> ".", "comma" -> ",". The space before a command is always
/// consumed; the space after it is kept for "period"/"comma" and dropped for
/// the two line-break commands. The result is trimmed.
///
/// Commands that directly follow one another share the single space between
/// them. The text is therefore scanned once, and a command leaves its
/// trailing space in place so it can also open the next command — otherwise
/// "period new paragraph" would leave the word "period" in the output.
pub fn apply_spoken_commands(text: &str) -> String {
    // (spoken phrase, replacement, keep the space that follows the phrase)
    const COMMANDS: &[(&str, &str, bool)] = &[
        ("new paragraph", "\n\n", false),
        ("new line", "\n", false),
        ("period", ".", true),
        ("comma", ",", true),
    ];

    // Padding lets a command at either end of the text match like any other.
    let padded = format!(" {text} ");
    let mut out = String::with_capacity(padded.len());
    let mut rest = padded.as_str();
    // Set after a line-break command: the next space, if it does not open
    // another command, is swallowed instead of being copied.
    let mut drop_space = false;

    while let Some(ch) = rest.chars().next() {
        if ch == ' ' {
            let hit = COMMANDS.iter().find(|(phrase, _, _)| {
                rest[1..]
                    .strip_prefix(*phrase)
                    .is_some_and(|after| after.starts_with(' '))
            });
            if let Some(&(phrase, replacement, keep_space)) = hit {
                out.push_str(replacement);
                // Stop on the trailing space so it can start the next command.
                rest = &rest[1 + phrase.len()..];
                drop_space = !keep_space;
                continue;
            }
            if drop_space {
                drop_space = false;
                rest = &rest[1..];
                continue;
            }
        }
        drop_space = false;
        out.push(ch);
        rest = &rest[ch.len_utf8()..];
    }
    out.trim().to_string()
}
54
/// Finds the first ASCII-case-insensitive occurrence of `needle_lower` in
/// `hay` that stands alone as a word, returning its byte offset.
///
/// `needle_lower` must already be lower-case: haystack bytes are lower-cased
/// before comparison, the needle is not. An occurrence counts only when the
/// character before it and the character after it are not alphanumeric (or
/// do not exist). The check is made on whole characters, so a neighbouring
/// non-ASCII letter such as 'ë' or 'é' correctly blocks the match.
///
/// Returns `None` for an empty needle or when nothing qualifies.
fn find_word_bounded(hay: &str, needle_lower: &str) -> Option<usize> {
    let needle = needle_lower.as_bytes();
    if needle.is_empty() {
        // `windows(0)` would panic, and an empty needle matches nothing useful.
        return None;
    }
    hay.as_bytes()
        .windows(needle.len())
        .enumerate()
        .filter(|(_, window)| {
            window
                .iter()
                .zip(needle)
                .all(|(h, n)| h.to_ascii_lowercase() == *n)
        })
        .map(|(start, _)| start)
        .find(|&start| {
            let end = start + needle.len();
            // Slicing off a char boundary would panic; treat it as no match.
            if !hay.is_char_boundary(start) || !hay.is_char_boundary(end) {
                return false;
            }
            let joined_before = hay[..start]
                .chars()
                .next_back()
                .is_some_and(char::is_alphanumeric);
            let joined_after = hay[end..]
                .chars()
                .next()
                .is_some_and(char::is_alphanumeric);
            !joined_before && !joined_after
        })
}
78
79pub fn apply_backtrack(text: &str) -> String {
84 const TRIGGERS: &[&str] = &["scratch that", "actually no"];
85 let mut result = text.to_string();
86 for trigger in TRIGGERS {
87 while let Some(pos) = find_word_bounded(&result, trigger) {
88 let before = result[..pos].trim_end();
89 let after = &result[pos + trigger.len()..];
90 let kept: Vec<&str> = before.split_whitespace().collect();
91 if kept.len() >= 3 {
92 let cut = before.rfind(['.', '\n']).map(|i| i + 1).unwrap_or(0);
94 result = format!("{}{}", &before[..cut], after);
95 } else {
96 result = format!("{} {}", before, after.trim_start());
98 }
99 }
100 }
101 result.split_whitespace().collect::<Vec<_>>().join(" ")
102}
103
/// Lower-case words that may safely lose their capital when a phrase
/// continues the previous one: conjunctions, articles, demonstratives and a
/// few other function words.
const CONTINUATIONS: &[&str] = &[
    "and", "but", "so", "or", "the", "a", "an", "it", "that", "this", "these",
    "those", "all", "then", "because", "which", "who",
];

/// Lower-cases the first letter of `text` when it continues `prev_clean`.
///
/// The text is returned unchanged unless all of the following hold:
/// * `prev_clean` is `Some`, is not blank, and — ignoring trailing
///   whitespace and closing quotes/brackets — does not end in '.', '!', '?'
///   or '…';
/// * the first word of `text`, stripped of surrounding punctuation, is in
///   `CONTINUATIONS`;
/// * that word has no upper-case letter after its first character, so
///   acronyms that merely spell an allow-listed word ("IT", "WHO", "OR") are
///   left alone instead of being turned into "iT", "wHO", "oR";
/// * the very first character of `text` is upper-case.
pub fn decapitalize_continuation(text: &str, prev_clean: Option<&str>) -> String {
    let continues = prev_clean.is_some_and(|p| {
        let tail = p
            .trim_end()
            .trim_end_matches(['"', '\'', ')', ']', '\u{201d}', '\u{2019}']);
        !matches!(tail.chars().last(), Some('.' | '!' | '?' | '\u{2026}') | None)
    });
    if !continues {
        return text.to_string();
    }

    let first = text.split_whitespace().next().unwrap_or("");
    let bare = first.trim_matches(|c: char| !c.is_alphanumeric());
    // Capitals beyond the first letter mark an acronym, not a sentence start.
    let looks_like_acronym = bare.chars().skip(1).any(char::is_uppercase);
    let lowered = bare.to_lowercase();
    if looks_like_acronym || !CONTINUATIONS.contains(&lowered.as_str()) {
        return text.to_string();
    }

    let mut chars = text.chars();
    match chars.next() {
        Some(c) if c.is_uppercase() => c.to_lowercase().collect::<String>() + chars.as_str(),
        _ => text.to_string(),
    }
}
138
139pub fn format_revise(whisper: &str, prev_clean: Option<&str>) -> String {
151 let pre = apply_spoken_commands(&apply_backtrack(whisper));
152 decapitalize_continuation(&pre, prev_clean)
153}
154
155pub fn deterministic_light(text: &str) -> String {
158 let trimmed = text.trim();
159 let without_lead = strip_leading_fillers(trimmed);
160 let capped = capitalize_sentences(&without_lead);
161 ensure_terminal(&capitalize_standalone_i(&capped))
162}
163
/// Upper-cases every lower-case `i` that stands on its own.
///
/// An `i` is standalone when neither neighbouring character is alphanumeric
/// (a missing neighbour counts as non-alphanumeric), so "i", "i'm" and
/// "i'll" are capitalized while the `i` in "it" or "bin" is not.
fn capitalize_standalone_i(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut previous: Option<char> = None;
    let mut upcoming = text.chars().peekable();
    while let Some(current) = upcoming.next() {
        let joined_left = previous.is_some_and(char::is_alphanumeric);
        let joined_right = upcoming.peek().copied().is_some_and(char::is_alphanumeric);
        let standalone = current == 'i' && !joined_left && !joined_right;
        result.push(if standalone { 'I' } else { current });
        previous = Some(current);
    }
    result
}
179
180fn strip_leading_fillers(text: &str) -> String {
181 let mut words: Vec<&str> = text.split_whitespace().collect();
182 while let Some(first) = words.first() {
183 let lw = first.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase();
185 if LEADING_DISFLUENCIES.contains(&lw.as_str()) { words.remove(0); } else { break; }
186 }
187 words.join(" ")
188}
189
/// Upper-cases the first letter of every sentence.
///
/// A sentence starts at the beginning of the text and after a '.', '!' or
/// '?' that is followed by whitespace (other punctuation such as a closing
/// quote may sit in between). Requiring the whitespace keeps dots inside
/// tokens from counting as sentence ends, so "example.com" and "3.5" are
/// left intact.
///
/// Leading punctuation does not use up the sentence start ("-- the" becomes
/// "-- The"), but a digit does: in "2024 was great" the sentence begins with
/// the number and "was" stays lower-case.
fn capitalize_sentences(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    // The next letter begins a sentence.
    let mut at_start = true;
    // A terminator was seen; whitespace is still needed to confirm the break.
    let mut after_terminator = false;

    for ch in text.chars() {
        if at_start && ch.is_alphabetic() {
            out.extend(ch.to_uppercase());
            at_start = false;
            after_terminator = false;
            continue;
        }
        out.push(ch);
        if matches!(ch, '.' | '!' | '?') {
            after_terminator = true;
        } else if ch.is_whitespace() {
            if after_terminator {
                at_start = true;
                after_terminator = false;
            }
        } else if ch.is_alphanumeric() {
            // A digit or a mid-token letter: the sentence is already under
            // way, and any dot just before it was not a sentence end.
            at_start = false;
            after_terminator = false;
        }
    }
    out
}
204
/// Makes sure non-empty text ends with terminal punctuation.
///
/// Trailing whitespace is removed. A '.' is appended unless the text is
/// empty or already ends — looking through any closing quotes or brackets —
/// in '.', '!', '?' or '…'. This is the same notion of "terminated" that
/// `decapitalize_continuation` applies to the previous phrase, so
/// `he said "stop."` is not turned into `he said "stop.".`.
fn ensure_terminal(text: &str) -> String {
    let t = text.trim_end();
    let core = t.trim_end_matches(['"', '\'', ')', ']', '\u{201d}', '\u{2019}']);
    let terminated = matches!(core.chars().last(), Some('.' | '!' | '?' | '\u{2026}'));
    if t.is_empty() || terminated {
        t.to_string()
    } else {
        format!("{t}.")
    }
}
213
/// Lower-case vocabulary of non-speech annotations.
///
/// `is_all_sound_words` checks parenthesized text word by word against this
/// list; `is_event_bracket` checks the whole lower-cased bracket content.
const SOUND_WORDS: &[&str] = &[
    "buzzer", "buzzing",
    "music",
    "applause", "applauding",
    "laughter", "laughs", "laughing",
    "coughs", "coughing", "cough",
    "sighs", "sigh",
    "beep", "beeping",
    "breathing", "breath", "breathes",
    "static", "noise", "silence", "blank_audio",
    "wind", "blowing",
    "clears", "throat",
    "typing", "footsteps",
    "door", "closes",
    "knock", "knocking",
    "indistinct", "inaudible",
    "sniffles", "chuckles",
];
224
225pub fn strip_sound_tags(text: &str) -> String {
230 let mut out = String::with_capacity(text.len());
231 let mut rest = text;
232 while let Some(open) = rest.find(['(', '[']) {
233 let open_ch = rest.as_bytes()[open];
234 let close_ch = if open_ch == b'(' { ')' } else { ']' };
235 let Some(rel) = rest[open + 1..].find(close_ch) else {
236 out.push_str(&rest[..=open]);
237 rest = &rest[open + 1..];
238 continue;
239 };
240 let close = open + 1 + rel;
241 let inner = rest[open + 1..close].trim();
242 let remove = if open_ch == b'(' {
243 is_all_sound_words(inner)
244 } else {
245 is_event_bracket(inner)
246 };
247 out.push_str(&rest[..open]);
248 if !remove {
249 out.push_str(&rest[open..=close]);
250 }
251 rest = &rest[close + 1..];
252 }
253 out.push_str(rest);
254 out.split_whitespace().collect::<Vec<_>>().join(" ")
255}
256
257fn is_all_sound_words(inner: &str) -> bool {
258 let mut any = false;
259 for w in inner.split_whitespace() {
260 any = true;
261 let bare = w.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase();
262 if !SOUND_WORDS.contains(&bare.as_str()) {
263 return false;
264 }
265 }
266 any
267}
268
269fn is_event_bracket(inner: &str) -> bool {
270 if SOUND_WORDS.contains(&inner.to_lowercase().as_str()) {
271 return true;
272 }
273 inner.contains('_')
274 && inner.chars().any(|c| c.is_ascii_uppercase())
275 && inner.chars().all(|c| c.is_ascii_uppercase() || c == '_' || c == ' ')
276}
277
278pub fn parse_level(s: &str) -> Level {
281 match s.trim().to_lowercase().as_str() {
282 "none" => Level::None,
283 "medium" => Level::Medium,
284 "high" => Level::High,
285 _ => Level::Light,
286 }
287}
288
/// Lower-case first words that let `paragraphize_run` start a new paragraph
/// once the current one is long enough.
const PARA_OPENERS: &[&str] = &[
    "anyway", "anyways",
    "so", "but", "now",
    "another", "also",
    "okay", "alright", "well",
    "then", "actually", "honestly", "basically",
];
294
/// A paragraph must hold at least this many sentences before an opener word
/// is allowed to break it.
const MIN_SENTENCES_PER_PARA: usize = 3;

/// A paragraph is always broken once it holds this many sentences.
const MAX_SENTENCES_PER_PARA: usize = 6;
297
298pub fn paragraphize(text: &str) -> String {
305 text.split("\n\n")
306 .map(|block| paragraphize_run(block.trim()))
307 .filter(|b| !b.is_empty())
308 .collect::<Vec<_>>()
309 .join("\n\n")
310}
311
312fn paragraphize_run(run: &str) -> String {
313 let mut paras: Vec<Vec<String>> = vec![Vec::new()];
314 for s in split_sentences(run) {
315 let cur_len = paras.last().unwrap().len();
316 let opens = s.split_whitespace().next()
317 .map(|w| w.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase())
318 .is_some_and(|w| PARA_OPENERS.contains(&w.as_str()));
319 if (opens && cur_len >= MIN_SENTENCES_PER_PARA) || cur_len >= MAX_SENTENCES_PER_PARA {
320 paras.push(Vec::new());
321 }
322 paras.last_mut().unwrap().push(s);
323 }
324 paras.into_iter()
325 .filter(|p| !p.is_empty())
326 .map(|p| p.join(" "))
327 .collect::<Vec<_>>()
328 .join("\n\n")
329}
330
/// Splits `text` into trimmed sentences.
///
/// A sentence ends at '.', '!' or '?' — plus any closing quotes or ')' that
/// directly follow — when the next character is whitespace or the text ends.
/// A terminator followed by anything else (as in "3.5") does not split.
/// Trailing text without a terminator becomes a final sentence if it is not
/// blank.
fn split_sentences(text: &str) -> Vec<String> {
    let mut sentences = Vec::new();
    let mut start = 0;
    let mut cursor = text.char_indices().peekable();
    while let Some((_, ch)) = cursor.next() {
        if !matches!(ch, '.' | '!' | '?') {
            continue;
        }
        // Closing quotes and parentheses stay with the sentence they close.
        while cursor
            .next_if(|&(_, c)| matches!(c, '"' | '\'' | '\u{201d}' | '\u{2019}' | ')'))
            .is_some()
        {}
        let (end, at_boundary) = match cursor.peek() {
            None => (text.len(), true),
            Some(&(idx, next)) => (idx, next.is_whitespace()),
        };
        if at_boundary {
            sentences.push(text[start..end].trim().to_string());
            start = end;
        }
    }
    let rest = text[start..].trim();
    if !rest.is_empty() {
        sentences.push(rest.to_string());
    }
    sentences
}
354
355pub fn shape_entry(level: Level, text: &str) -> String {
358 match level {
359 Level::High => paragraphize(text),
360 _ => text.to_string(),
361 }
362}
363
/// The two parts of a rewrite request, as built by `rewrite_prompt`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RewritePrompt {
    /// The instructions: the fixed restraint text followed by the rule for
    /// the requested level.
    pub system: String,
    /// The request itself, carrying the transcript to clean.
    pub user: String,
}
372
373pub fn rewrite_prompt(level: Level, text: &str) -> RewritePrompt {
378 let restraint = "You clean up raw voice transcripts. Return ONLY the cleaned text, nothing else — no preamble, no quotes. NEVER change meaning: never swap a word for a different one, never add words that change meaning, never drop a negation, never reorder clauses. When unsure, leave it as it is.";
379 let rule = match level {
380 Level::None => "Return the text exactly as given.",
381 Level::Light => "Fix only capitalization and punctuation, and drop leading non-lexical filler (um, uh, er, ah). Remove no other words.",
382 Level::Medium => "Also remove disfluencies and false starts and join fragments into sentences. Keep every meaning-bearing word.",
383 Level::High => "Also break into paragraphs at topic shifts. Keep every meaning-bearing word, in its original order, adding nothing.",
384 };
385 RewritePrompt {
386 system: format!("{restraint} {rule}"),
387 user: format!("Clean this transcript:\n{text}"),
388 }
389}
390
/// Unit tests for the transcript clean-up pipeline.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- meaning guard ----

    #[test]
    fn accepts_pure_punctuation_and_filler_cleanup() {
        assert!(guard_accepts(
            "um so the thing is i keep avoiding it",
            "The thing is, I keep avoiding it.",
        ));
    }

    #[test]
    fn rejects_a_substituted_meaning_word() {
        assert!(!guard_accepts("i love her", "I loathe her."));
    }

    #[test]
    fn rejects_a_dropped_content_word() {
        assert!(!guard_accepts("i never said that", "I said that."));
    }

    #[test]
    fn rejects_an_added_content_word() {
        assert!(!guard_accepts("i am tired", "I am very tired."));
    }

    // Filler matching is per word, so real uses of "you", "know", "i" and
    // "like" are invisible to the guard. This pins that known limit.
    #[test]
    fn guard_permits_dropping_filler_homographs_known_limit() {
        assert!(guard_accepts("do you know the way", "do the way"));
        assert!(guard_accepts("i like it a lot", "it a lot"));
    }

    // ---- deterministic light cleanup ----

    #[test]
    fn deterministic_light_caps_and_terminates() {
        assert_eq!(deterministic_light("um the thing is"), "The thing is.");
    }

    #[test]
    fn does_not_strip_a_leading_content_word() {
        assert_eq!(
            deterministic_light("i sometimes forget the small things"),
            "I sometimes forget the small things."
        );
        assert_eq!(deterministic_light("you should go now"), "You should go now.");
        assert_eq!(
            deterministic_light("so i realized the answer"),
            "So I realized the answer."
        );
        assert_eq!(
            deterministic_light("well that is the thing"),
            "Well that is the thing."
        );
    }

    #[test]
    fn still_strips_leading_nonlexical_disfluencies() {
        assert_eq!(deterministic_light("um uh the thing is"), "The thing is.");
        assert_eq!(deterministic_light("ah i see it now"), "I see it now.");
        assert_eq!(deterministic_light("um, the thing is"), "The thing is.");
    }

    #[test]
    fn a_leading_pure_punctuation_token_survives() {
        assert_eq!(deterministic_light("-- the thing is"), "-- The thing is.");
    }

    #[test]
    fn standalone_i_is_capitalized_mid_sentence() {
        assert_eq!(
            deterministic_light("the thing is i keep avoiding it"),
            "The thing is I keep avoiding it."
        );
        assert_eq!(
            deterministic_light("i'm sure i'll try what i've found"),
            "I'm sure I'll try what I've found."
        );
        assert_eq!(deterministic_light("it is in the bin"), "It is in the bin.");
    }

    #[test]
    fn deterministic_light_is_guard_safe() {
        let raw = "um so i keep avoiding the hard conversation";
        assert!(guard_accepts(raw, &deterministic_light(raw)));
    }

    // ---- spoken commands and backtracking ----

    #[test]
    fn spoken_command_becomes_newline() {
        assert_eq!(apply_spoken_commands("a new line b"), "a\nb");
    }

    #[test]
    fn backtrack_drops_preceding_clause() {
        let out = apply_backtrack("the answer is yes scratch that the answer is no");
        assert!(!out.contains("yes"));
        assert!(out.contains("the answer is no"));
    }

    #[test]
    fn backtrack_does_not_fire_inside_a_word() {
        let out = apply_backtrack("well actually nobody knows the truth");
        assert!(out.contains("nobody"));
        assert!(out.contains("the truth"));
    }

    #[test]
    fn spoken_command_at_phrase_start_and_end() {
        assert_eq!(apply_spoken_commands("new line b"), "b");
        assert_eq!(apply_spoken_commands("a new line"), "a");
    }

    #[test]
    fn backtrack_handles_non_ascii_without_panicking() {
        let out = apply_backtrack("aa bb ẞ scratch that ẞ tail");
        assert!(out.contains("tail"));
        assert!(!out.contains("scratch that"));
    }

    // ---- levels and prompts ----

    #[test]
    fn parse_level_maps_known_and_defaults_to_light() {
        assert_eq!(parse_level("none"), Level::None);
        assert_eq!(parse_level("Medium"), Level::Medium);
        assert_eq!(parse_level("HIGH"), Level::High);
        assert_eq!(parse_level("light"), Level::Light);
        assert_eq!(parse_level("nonsense"), Level::Light);
    }

    #[test]
    fn rewrite_prompt_widens_by_level_and_carries_the_text() {
        assert!(rewrite_prompt(Level::Light, "x").system.to_lowercase().contains("capitalization"));
        assert!(rewrite_prompt(Level::Medium, "x").system.to_lowercase().contains("disfluencies"));
        assert!(rewrite_prompt(Level::High, "x").system.to_lowercase().contains("paragraph"));
        assert!(rewrite_prompt(Level::Light, "the raw phrase").user.contains("the raw phrase"));
    }

    #[test]
    fn rewrite_prompt_always_states_the_restraint() {
        for lvl in [Level::Light, Level::Medium, Level::High] {
            assert!(rewrite_prompt(lvl, "x").system.to_lowercase().contains("never change meaning"));
        }
    }

    // ---- continuation decapitalization ----

    #[test]
    fn decapitalize_lowercases_an_allowlist_continuation_after_unterminated_prior() {
        assert_eq!(
            decapitalize_continuation("All these edge cases get sorted out.", Some("with their product")),
            "all these edge cases get sorted out."
        );
    }

    #[test]
    fn decapitalize_keeps_capital_after_a_terminated_prior() {
        assert_eq!(
            decapitalize_continuation("All these edge cases.", Some("That worked.")),
            "All these edge cases."
        );
    }

    #[test]
    fn decapitalize_never_lowercases_a_non_allowlist_word_protecting_proper_nouns() {
        assert_eq!(
            decapitalize_continuation("Whisper does the rest", Some("the tool i use is")),
            "Whisper does the rest"
        );
    }

    #[test]
    fn format_revise_trusts_whisper_casing_and_applies_features() {
        assert_eq!(format_revise("hello there", None), "hello there");
        assert_eq!(format_revise("first line new line second", None), "first line\nsecond");
    }

    // ---- sound tags ----

    #[test]
    fn strip_sound_tags_removes_known_parenthesized_and_collapses_space() {
        assert_eq!(strip_sound_tags("woke up (buzzer) early"), "woke up early");
        assert_eq!(strip_sound_tags("(wind blowing) i sat down"), "i sat down");
        assert_eq!(strip_sound_tags("then (clears throat) i spoke"), "then i spoke");
    }

    #[test]
    fn strip_sound_tags_removes_bracketed_events_only() {
        assert_eq!(strip_sound_tags("a [BLANK_AUDIO] b"), "a b");
        assert_eq!(strip_sound_tags("a [MUSIC] b"), "a b");
        assert_eq!(strip_sound_tags("see note [7] here"), "see note [7] here");
        assert_eq!(strip_sound_tags("from [Smith] today"), "from [Smith] today");
    }

    #[test]
    fn strip_sound_tags_keeps_real_words_and_asides() {
        assert_eq!(strip_sound_tags("the buzzer rang"), "the buzzer rang");
        assert_eq!(strip_sound_tags("it works (I think) well"), "it works (I think) well");
    }

    #[test]
    fn strip_sound_tags_keeps_user_acronyms_but_strips_whisper_events() {
        assert_eq!(strip_sound_tags("the [FBI] case"), "the [FBI] case");
        assert_eq!(strip_sound_tags("sign the [NDA] today"), "sign the [NDA] today");
        assert_eq!(strip_sound_tags("a [TODO] item"), "a [TODO] item");
        assert_eq!(strip_sound_tags("a [BLANK_AUDIO] b"), "a b");
        assert_eq!(strip_sound_tags("a [MUSIC] b"), "a b");
    }

    #[test]
    fn strip_sound_tags_keeps_an_unmatched_bracket() {
        assert_eq!(strip_sound_tags("hello (world"), "hello (world");
    }

    #[test]
    fn strip_sound_tags_skips_a_lone_opener_and_keeps_stripping() {
        assert_eq!(strip_sound_tags("a [ b (buzzer) c"), "a [ b c");
    }

    #[test]
    fn strip_sound_tags_removes_consecutive_tags() {
        assert_eq!(strip_sound_tags("(cough) (laughs) okay"), "okay");
    }

    // ---- paragraph shaping ----

    #[test]
    fn paragraphize_preserves_explicit_breaks() {
        assert_eq!(
            paragraphize("First thought.\n\nSecond thought."),
            "First thought.\n\nSecond thought."
        );
    }

    #[test]
    fn paragraphize_breaks_at_a_marker_after_enough_sentences() {
        let t = "I woke up early. I made coffee. I read a book. Anyway, then I went for a walk. It was nice. The sun was out.";
        let out = paragraphize(t);
        assert!(out.contains("read a book.\n\nAnyway"), "{out}");
    }

    #[test]
    fn paragraphize_leaves_short_text_in_one_paragraph() {
        assert_eq!(paragraphize("Just one. And two."), "Just one. And two.");
    }

    #[test]
    fn paragraphize_caps_a_long_marker_less_run() {
        assert!(paragraphize("One. Two. Three. Four. Five. Six. Seven.").contains("\n\n"));
    }

    #[test]
    fn shape_entry_only_paragraphizes_at_high() {
        assert_eq!(shape_entry(Level::Medium, "A. B. C. D. E. F. G."), "A. B. C. D. E. F. G.");
        assert!(shape_entry(Level::High, "A. B. C. D. E. F. G.").contains("\n\n"));
    }

    #[test]
    fn paragraphize_never_alters_content_only_whitespace() {
        // Compare with all whitespace removed: only the breaks may differ.
        let strip = |s: &str| s.chars().filter(|c| !c.is_whitespace()).collect::<String>();
        for inp in [
            "Okay. So just testing. I asked Claude. And it works. Anyway that's all.",
            "no punctuation here just a run on stream of words with no breaks",
            "First.\n\nSecond. Third.",
        ] {
            assert_eq!(strip(&paragraphize(inp)), strip(inp), "content changed for {inp:?}");
        }
    }

    #[test]
    fn paragraphize_handles_degenerate_inputs() {
        assert_eq!(paragraphize(""), "");
        assert_eq!(paragraphize("\n\n"), "");
        assert_eq!(
            paragraphize("no terminal punctuation here at all"),
            "no terminal punctuation here at all"
        );
        let out = paragraphize("So I started. Then I paused. And I thought. Anyway I went on. It was fine. The end came.");
        assert!(!out.starts_with("\n\n") && !out.contains("\n\n\n"), "{out}");
    }
}