talk_core/
cleanup.rs

/// How aggressively a transcript is cleaned up.
///
/// Plan 3 feeds this into the LLM rewrite; the deterministic Light pass is the
/// instant layer that is always present.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Level {
    /// Leave the text as given.
    None,
    /// Capitalization, punctuation and leading non-lexical filler only.
    Light,
    /// Additionally remove disfluencies and false starts.
    Medium,
    /// Additionally group the entry into paragraphs.
    High,
}
5
/// Tokens the GUARD ignores (disfluencies plus conversational filler), so a
/// rewrite that deletes them still passes `guard_accepts`.
///
/// KNOWN LIMIT (Plan 3 code review): `content_words` filters this set out of
/// BOTH the input and the output, so the guard also tolerates removing a
/// *content* use of you/know/like/so/well/i/mean anywhere in the phrase
/// (`guard_accepts("do you know the way", "do the way")` is true). The set is a
/// moat for an LLM rewriter, not a list of words to delete.
/// `deterministic_light` deliberately does NOT strip from it — it uses
/// `LEADING_DISFLUENCIES` instead.
const FILLERS: &[&str] = &[
    // Non-lexical vocalizations.
    "um", "uh", "er", "ah",
    // Filler that doubles as real vocabulary.
    "like", "you", "know", "so", "well", "i", "mean",
];
16
/// The complete set of tokens `deterministic_light` may remove from the start of
/// a phrase: non-lexical vocalizations that never carry content.
///
/// The wider `FILLERS` set is intentionally NOT consulted here. A leading
/// `i`/`you`/`so`/`well`/`like` is almost always a genuine sentence opener
/// ("I think…", "You know what…", "So I realized…"), and deleting it silently
/// rewrites what the user said. Restraint wins: a leading real dictionary word
/// is kept.
const LEADING_DISFLUENCIES: &[&str] = &[
    "um", "uh", "er", "ah",
    "mm", "hmm", "uhm", "erm", "hm",
];
24
25fn content_words(text: &str) -> Vec<String> {
26    text.to_lowercase()
27        .split(|c: char| !c.is_alphanumeric())
28        .filter(|w| !w.is_empty())
29        .filter(|w| !FILLERS.contains(w))
30        .map(|w| w.to_string())
31        .collect()
32}
33
34/// The moat: accept a rewrite only if it preserves every content word from the
35/// input, in order, adding/removing nothing but allowed fillers. Guards *harm*
36/// (a substituted/dropped meaning word) rather than edit *volume*.
37pub fn guard_accepts(input: &str, output: &str) -> bool {
38    content_words(input) == content_words(output)
39}
40
/// Turn spoken formatting commands into the characters they name.
///
/// The text is padded with one space on each side so a command at the very
/// start or end still matches the space-delimited patterns; the result is
/// trimmed afterwards. Matching is exact (lowercase, space-delimited).
///
/// Accepted Plan-1 edge case: two identical commands back to back ("new line
/// new line") only convert the first — its trailing space is consumed by the
/// match, so the second occurrence is left as literal words.
pub fn apply_spoken_commands(text: &str) -> String {
    // Applied in this order: "new paragraph" must be tried before "new line".
    const COMMANDS: [(&str, &str); 4] = [
        (" new paragraph ", "\n\n"),
        (" new line ", "\n"),
        (" period ", ". "),
        (" comma ", ", "),
    ];

    let mut padded = String::with_capacity(text.len() + 2);
    padded.push(' ');
    padded.push_str(text);
    padded.push(' ');

    let replaced = COMMANDS
        .iter()
        .fold(padded, |acc, &(spoken, written)| acc.replace(spoken, written));
    replaced.trim().to_owned()
}
54
/// Find `needle_lower` (lowercase ASCII) in `hay` at word boundaries, returning
/// a byte offset valid in `hay`.
///
/// Matching is ASCII-case-insensitive and done on the original bytes rather
/// than a lowercased copy, so the offset never lands mid-codepoint (a prior
/// bug: offsets from a lowercased copy were sliced against the original and
/// panicked on case-shrinking chars).
///
/// A match counts only when the characters on either side are not
/// alphanumeric. Neighbours are inspected as full `char`s, so a trigger glued
/// to a non-ASCII letter ("actually noël") is correctly treated as being
/// inside a word.
///
/// Returns `None` for an empty needle or when no bounded match exists.
fn find_word_bounded(hay: &str, needle_lower: &str) -> Option<usize> {
    let needle = needle_lower.as_bytes();
    if needle.is_empty() {
        // `windows(0)` would panic, and an empty needle has no meaningful match.
        return None;
    }
    hay.as_bytes()
        .windows(needle.len())
        .enumerate()
        .filter(|(_, window)| {
            window
                .iter()
                .zip(needle)
                .all(|(h, n)| h.to_ascii_lowercase() == *n)
        })
        .map(|(start, _)| start)
        .find(|&start| {
            // A byte-wise match of a valid UTF-8 needle begins and ends on char
            // boundaries of `hay`, so both slices below are safe.
            let end = start + needle.len();
            let word_before = hay[..start]
                .chars()
                .next_back()
                .is_some_and(char::is_alphanumeric);
            let word_after = hay[end..].chars().next().is_some_and(char::is_alphanumeric);
            !word_before && !word_after
        })
}
78
79/// Remove a self-correction: when a backtrack trigger appears AS A WHOLE PHRASE,
80/// drop the words immediately preceding it (the spec's >3-word-reduction guard:
81/// only fire when at least 3 words precede the trigger, so we don't nuke a short
82/// true clause). Word-bounded so it never deletes content words it matched inside.
83pub fn apply_backtrack(text: &str) -> String {
84    const TRIGGERS: &[&str] = &["scratch that", "actually no"];
85    let mut result = text.to_string();
86    for trigger in TRIGGERS {
87        while let Some(pos) = find_word_bounded(&result, trigger) {
88            let before = result[..pos].trim_end();
89            let after = &result[pos + trigger.len()..];
90            let kept: Vec<&str> = before.split_whitespace().collect();
91            if kept.len() >= 3 {
92                // Drop everything back to the previous sentence boundary.
93                let cut = before.rfind(['.', '\n']).map(|i| i + 1).unwrap_or(0);
94                result = format!("{}{}", &before[..cut], after);
95            } else {
96                // Too short to be a real correction — just remove the trigger.
97                result = format!("{} {}", before, after.trim_start());
98            }
99        }
100    }
101    result.split_whitespace().collect::<Vec<_>>().join(" ")
102}
103
/// Function words that commonly open a clause *inside* a sentence, so
/// lowercasing them when a sentence spans a pause is safe. Nothing
/// proper-noun-shaped belongs here — an arbitrary capitalized token is never
/// lowercased.
const CONTINUATIONS: &[&str] = &[
    "and", "but", "so", "or",
    "the", "a", "an",
    "it", "that", "this", "these", "those",
    "all", "then", "because", "which", "who",
];
111
112/// Lowercase the first letter of `text` when it CONTINUES the previous block —
113/// the previous block didn't end a sentence (no terminal `.!?`) AND the first word
114/// is an allow-listed continuation word. Whisper cases each segment as a fresh
115/// sentence; this undoes the spurious mid-sentence capital when a sentence spans a
116/// pause. Conservative by construction (only the allow-list; never a proper noun).
117pub fn decapitalize_continuation(text: &str, prev_clean: Option<&str>) -> String {
118    let continues = prev_clean.is_some_and(|p| {
119        // Look past trailing closing quotes/brackets so `."` reads as terminated;
120        // a Unicode ellipsis is a deliberate trail-off, also terminal.
121        let tail = p.trim_end().trim_end_matches(['"', '\'', ')', ']', '”', '’']);
122        !matches!(tail.chars().last(), Some('.' | '!' | '?' | '…') | None)
123    });
124    if !continues {
125        return text.to_string();
126    }
127    let first = text.split_whitespace().next().unwrap_or("");
128    let bare = first.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase();
129    if !CONTINUATIONS.contains(&bare.as_str()) {
130        return text.to_string();
131    }
132    let mut chars = text.chars();
133    match chars.next() {
134        Some(c) if c.is_uppercase() => c.to_lowercase().collect::<String>() + chars.as_str(),
135        _ => text.to_string(),
136    }
137}
138
139/// Format a pass-2 Whisper revise. Whisper already cased + punctuated, so this does
140/// NOT re-capitalize sentence starts or force terminal punctuation (that re-creates
141/// the per-segment mid-sentence capital). It applies only the spoken-command and
142/// `scratch that` backtrack features and the continuation de-capitalizer.
143///
144/// Order is DELIBERATELY the reverse of the commit path's
145/// `apply_backtrack(apply_spoken_commands(raw))` (live.rs/session.rs/format.rs):
146/// `apply_backtrack` ends with `split_whitespace().join(" ")`, which collapses the
147/// `\n` that spoken commands insert. Running backtrack FIRST (on whitespace-only
148/// text) and spoken commands LAST lets a spoken `new line` survive into the output.
149/// Do not "tidy" this back to the commit order — it silently drops newlines.
150pub fn format_revise(whisper: &str, prev_clean: Option<&str>) -> String {
151    let pre = apply_spoken_commands(&apply_backtrack(whisper));
152    decapitalize_continuation(&pre, prev_clean)
153}
154
155/// Deterministic "Light": capitalize sentence starts, ensure terminal
156/// punctuation, strip leading fillers. Always guard-safe by construction.
157pub fn deterministic_light(text: &str) -> String {
158    let trimmed = text.trim();
159    let without_lead = strip_leading_fillers(trimmed);
160    let capped = capitalize_sentences(&without_lead);
161    ensure_terminal(&capitalize_standalone_i(&capped))
162}
163
/// Uppercase every lowercase `i` that stands alone — neither neighbour is
/// alphanumeric. Contractions qualify because the apostrophe is a
/// non-alphanumeric boundary (`i'm` → `I'm`, `i'll` → `I'll`); an `i` inside a
/// word is never touched.
///
/// The Plan-3 T1 spike showed this is the LLM's main visible improvement over
/// the deterministic layer — it is free here, and invisible to the
/// case-insensitive content-word guard.
fn capitalize_standalone_i(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut prev: Option<char> = None;
    let mut stream = text.chars().peekable();
    // `while let` rather than `for`: the body needs to peek at the next char.
    while let Some(ch) = stream.next() {
        let word_on_left = prev.is_some_and(char::is_alphanumeric);
        let word_on_right = stream.peek().is_some_and(|next| next.is_alphanumeric());
        let standalone_i = ch == 'i' && !word_on_left && !word_on_right;
        out.push(if standalone_i { 'I' } else { ch });
        prev = Some(ch);
    }
    out
}
179
180fn strip_leading_fillers(text: &str) -> String {
181    let mut words: Vec<&str> = text.split_whitespace().collect();
182    while let Some(first) = words.first() {
183        // Strip trailing punctuation so "um," / "uh." still match the bare token.
184        let lw = first.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase();
185        if LEADING_DISFLUENCIES.contains(&lw.as_str()) { words.remove(0); } else { break; }
186    }
187    words.join(" ")
188}
189
/// Uppercase the first letter of every sentence in `text`.
///
/// A sentence starts at the beginning of the text and after a terminal `.`,
/// `!` or `?` (optionally trailed by closing quotes or `)`) that is followed by
/// whitespace — the same boundary rule `split_sentences` uses. A `.` inside a
/// token therefore does not open a sentence, so "3.5 million" and
/// "example.com" keep their casing.
///
/// Leading punctuation is skipped ("-- the thing" → "-- The thing"). A
/// sentence that opens with a digit is left alone: the digit consumes the
/// sentence start, so "3 apples" does not become "3 Apples".
fn capitalize_sentences(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    // True while waiting for the first alphanumeric character of a sentence.
    let mut at_start = true;
    // True after a terminal (plus closers) until whitespace confirms the boundary.
    let mut after_terminal = false;
    for ch in text.chars() {
        if after_terminal {
            if ch.is_whitespace() {
                at_start = true;
                after_terminal = false;
            } else if !matches!(
                ch,
                '.' | '!' | '?' | '"' | '\'' | '\u{201d}' | '\u{2019}' | ')'
            ) {
                // Something else follows directly: the terminal was token-internal.
                after_terminal = false;
            }
        }
        if matches!(ch, '.' | '!' | '?') {
            after_terminal = true;
        }
        if at_start && ch.is_alphanumeric() {
            at_start = false;
            if ch.is_alphabetic() {
                // `to_uppercase` may yield more than one char.
                out.extend(ch.to_uppercase());
                continue;
            }
        }
        out.push(ch);
    }
    out
}
204
/// Make sure `text` ends with sentence punctuation, appending `.` when it does
/// not. Trailing whitespace is removed first; empty input stays empty.
///
/// Text counts as already terminated when its last character — looking past
/// trailing closing quotes/brackets — is `.`, `!`, `?` or a Unicode ellipsis,
/// so `"stop."` and `wait…` do not gain a stray extra period.
fn ensure_terminal(text: &str) -> String {
    let trimmed = text.trim_end();
    let core = trimmed.trim_end_matches(['"', '\'', ')', ']', '\u{201d}', '\u{2019}']);
    let terminated = matches!(
        core.chars().next_back(),
        Some('.' | '!' | '?' | '\u{2026}')
    );
    if trimmed.is_empty() || terminated {
        trimmed.to_string()
    } else {
        format!("{trimmed}.")
    }
}
213
/// Vocabulary of the non-speech events Whisper emits inside `(...)`. A
/// parenthesized span is removed only when EVERY word in it is listed here, so
/// multi-word events (`(wind blowing)`) are stripped while a genuine aside
/// (`(I think)`) is kept.
const SOUND_WORDS: &[&str] = &[
    "buzzer", "buzzing", "music", "applause", "applauding",
    "laughter", "laughs", "laughing",
    "coughs", "coughing", "cough",
    "sighs", "sigh",
    "beep", "beeping",
    "breathing", "breath", "breathes",
    "static", "noise", "silence", "blank_audio",
    "wind", "blowing", "clears", "throat",
    "typing", "footsteps", "door", "closes",
    "knock", "knocking",
    "indistinct", "inaudible", "sniffles", "chuckles",
];
224
225/// Remove Whisper's non-speech tags. `[...]` spans go only when a known tag or an
226/// all-caps event shape (`[BLANK_AUDIO]`); `(...)` spans go only when every inner
227/// word is a sound word. Runs in the pre-layer (before the content-word guard), so
228/// nothing it removes ever reaches the guard as a content-word change.
229pub fn strip_sound_tags(text: &str) -> String {
230    let mut out = String::with_capacity(text.len());
231    let mut rest = text;
232    while let Some(open) = rest.find(['(', '[']) {
233        let open_ch = rest.as_bytes()[open];
234        let close_ch = if open_ch == b'(' { ')' } else { ']' };
235        let Some(rel) = rest[open + 1..].find(close_ch) else {
236            out.push_str(&rest[..=open]);
237            rest = &rest[open + 1..];
238            continue;
239        };
240        let close = open + 1 + rel;
241        let inner = rest[open + 1..close].trim();
242        let remove = if open_ch == b'(' {
243            is_all_sound_words(inner)
244        } else {
245            is_event_bracket(inner)
246        };
247        out.push_str(&rest[..open]);
248        if !remove {
249            out.push_str(&rest[open..=close]);
250        }
251        rest = &rest[close + 1..];
252    }
253    out.push_str(rest);
254    out.split_whitespace().collect::<Vec<_>>().join(" ")
255}
256
257fn is_all_sound_words(inner: &str) -> bool {
258    let mut any = false;
259    for w in inner.split_whitespace() {
260        any = true;
261        let bare = w.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase();
262        if !SOUND_WORDS.contains(&bare.as_str()) {
263            return false;
264        }
265    }
266    any
267}
268
269fn is_event_bracket(inner: &str) -> bool {
270    if SOUND_WORDS.contains(&inner.to_lowercase().as_str()) {
271        return true;
272    }
273    inner.contains('_')
274        && inner.chars().any(|c| c.is_ascii_uppercase())
275        && inner.chars().all(|c| c.is_ascii_uppercase() || c == '_' || c == ' ')
276}
277
278/// Parse a config string into a `Level` (defaults to Light — the safe, restrained
279/// default — on anything unrecognized).
280pub fn parse_level(s: &str) -> Level {
281    match s.trim().to_lowercase().as_str() {
282        "none" => Level::None,
283        "medium" => Level::Medium,
284        "high" => Level::High,
285        _ => Level::Light,
286    }
287}
288
/// Discourse markers that often open a new train of thought in a spoken
/// monologue; a sentence starting with one is a candidate paragraph break.
const PARA_OPENERS: &[&str] = &[
    "anyway", "anyways",
    "so", "but", "now",
    "another", "also",
    "okay", "alright", "well",
    "then", "actually", "honestly", "basically",
];
294
/// Sentences a paragraph must already hold before a discourse opener is allowed
/// to start a new one.
const MIN_SENTENCES_PER_PARA: usize = 3;
/// Sentence count at which a paragraph is closed unconditionally, opener or not.
const MAX_SENTENCES_PER_PARA: usize = 6;
297
298/// Group a long monologue into readable paragraphs (the High level). Existing blank
299/// lines (a spoken "new paragraph") are preserved as hard breaks; within each run, a
300/// new paragraph starts at a sentence that opens with a discourse marker once the
301/// current paragraph already holds MIN_SENTENCES_PER_PARA sentences, or unconditionally
302/// once it reaches MAX_SENTENCES_PER_PARA — so breaks fall at thought shifts, never
303/// after every sentence, and no paragraph runs on forever.
304pub fn paragraphize(text: &str) -> String {
305    text.split("\n\n")
306        .map(|block| paragraphize_run(block.trim()))
307        .filter(|b| !b.is_empty())
308        .collect::<Vec<_>>()
309        .join("\n\n")
310}
311
312fn paragraphize_run(run: &str) -> String {
313    let mut paras: Vec<Vec<String>> = vec![Vec::new()];
314    for s in split_sentences(run) {
315        let cur_len = paras.last().unwrap().len();
316        let opens = s.split_whitespace().next()
317            .map(|w| w.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase())
318            .is_some_and(|w| PARA_OPENERS.contains(&w.as_str()));
319        if (opens && cur_len >= MIN_SENTENCES_PER_PARA) || cur_len >= MAX_SENTENCES_PER_PARA {
320            paras.push(Vec::new());
321        }
322        paras.last_mut().unwrap().push(s);
323    }
324    paras.into_iter()
325        .filter(|p| !p.is_empty())
326        .map(|p| p.join(" "))
327        .collect::<Vec<_>>()
328        .join("\n\n")
329}
330
/// Split `text` into trimmed sentences, each keeping its terminal punctuation.
///
/// A boundary is `.`, `!` or `?`, plus any closing quotes or `)` that directly
/// follow, when the next character is whitespace or the text ends. A terminal
/// followed directly by anything else (as in `v1.2`) does not split. Trailing
/// text without a terminal becomes the final sentence; blank input yields an
/// empty list.
fn split_sentences(text: &str) -> Vec<String> {
    let mut sentences = Vec::new();
    // Byte offset where the sentence currently being read begins.
    let mut start = 0;
    let mut cursor = text.char_indices().peekable();

    while let Some((_, ch)) = cursor.next() {
        if !matches!(ch, '.' | '!' | '?') {
            continue;
        }
        // Closing quotes / parens belong to the sentence they close.
        while let Some(&(_, next)) = cursor.peek() {
            if !matches!(next, '"' | '\'' | '\u{201d}' | '\u{2019}' | ')') {
                break;
            }
            cursor.next();
        }
        let (end, at_boundary) = match cursor.peek() {
            None => (text.len(), true),
            Some(&(idx, next)) => (idx, next.is_whitespace()),
        };
        if at_boundary {
            sentences.push(text[start..end].trim().to_string());
            start = end;
        }
    }

    let tail = text[start..].trim();
    if !tail.is_empty() {
        sentences.push(tail.to_string());
    }
    sentences
}
354
355/// Apply whole-entry shaping for a level (called at session end on the joined clean
356/// text). High paragraphizes; lower levels pass through (per-phrase Light already ran).
357pub fn shape_entry(level: Level, text: &str) -> String {
358    match level {
359        Level::High => paragraphize(text),
360        _ => text.to_string(),
361    }
362}
363
/// The constrained-rewrite prompt for the LLM formatter (consumed by the Candle
/// façade in T7). `system` is hard restraint that holds at every level; the
/// per-level rule only *widens* which edits are permitted. Restraint is the
/// wording, so it lives here in the pure core, not in the inference façade.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so prompts can be logged,
/// copied and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RewritePrompt {
    /// System message: the fixed restraint text followed by the per-level rule.
    pub system: String,
    /// User message carrying the transcript to clean.
    pub user: String,
}
372
373/// Build the per-level rewrite prompt for T7's Candle façade. The Light rule keeps
374/// filler removal to LEADING disfluencies only — mid-sentence `you know`/`i mean`
375/// removal is deliberately NOT requested, because the content-word guard would
376/// accept such drops (see the `FILLERS` note). T7 must preserve this restriction.
377pub fn rewrite_prompt(level: Level, text: &str) -> RewritePrompt {
378    let restraint = "You clean up raw voice transcripts. Return ONLY the cleaned text, nothing else — no preamble, no quotes. NEVER change meaning: never swap a word for a different one, never add words that change meaning, never drop a negation, never reorder clauses. When unsure, leave it as it is.";
379    let rule = match level {
380        Level::None => "Return the text exactly as given.",
381        Level::Light => "Fix only capitalization and punctuation, and drop leading non-lexical filler (um, uh, er, ah). Remove no other words.",
382        Level::Medium => "Also remove disfluencies and false starts and join fragments into sentences. Keep every meaning-bearing word.",
383        Level::High => "Also break into paragraphs at topic shifts. Keep every meaning-bearing word, in its original order, adding nothing.",
384    };
385    RewritePrompt {
386        system: format!("{restraint} {rule}"),
387        user: format!("Clean this transcript:\n{text}"),
388    }
389}
390
// Unit tests for the cleanup core. Grouped in file order: the content-word
// guard, deterministic Light, spoken commands and backtrack, level parsing and
// prompts, continuation de-capitalizing, sound-tag stripping, and
// paragraphizing.
#[cfg(test)]
mod tests {
    use super::*;

    // --- content-word guard ---

    #[test]
    fn accepts_pure_punctuation_and_filler_cleanup() {
        assert!(guard_accepts(
            "um so the thing is i keep avoiding it",
            "The thing is, I keep avoiding it.",
        ));
    }

    #[test]
    fn rejects_a_substituted_meaning_word() {
        // "love" -> "loathe": tiny edit distance, catastrophic meaning change.
        assert!(!guard_accepts("i love her", "I loathe her."));
    }

    #[test]
    fn rejects_a_dropped_content_word() {
        assert!(!guard_accepts("i never said that", "I said that."));
    }

    #[test]
    fn rejects_an_added_content_word() {
        assert!(!guard_accepts("i am tired", "I am very tired."));
    }

    #[test]
    fn guard_permits_dropping_filler_homographs_known_limit() {
        // Documents the Plan-3 review limit: filler-set words can be dropped even
        // as content. deterministic_light never does this (leading-only); the T7
        // LLM prompt must not request mid-sentence filler removal.
        assert!(guard_accepts("do you know the way", "do the way"));
        assert!(guard_accepts("i like it a lot", "it a lot"));
    }

    // --- deterministic Light ---

    #[test]
    fn deterministic_light_caps_and_terminates() {
        assert_eq!(deterministic_light("um the thing is"), "The thing is.");
    }

    #[test]
    fn does_not_strip_a_leading_content_word() {
        // The reported "cleaning up too much" bug: a leading subject pronoun or
        // discourse opener is CONTENT, not a disfluency — it must survive.
        assert_eq!(deterministic_light("i sometimes forget the small things"),
            "I sometimes forget the small things.");
        assert_eq!(deterministic_light("you should go now"), "You should go now.");
        assert_eq!(deterministic_light("so i realized the answer"), "So I realized the answer.");
        assert_eq!(deterministic_light("well that is the thing"), "Well that is the thing.");
    }

    #[test]
    fn still_strips_leading_nonlexical_disfluencies() {
        assert_eq!(deterministic_light("um uh the thing is"), "The thing is.");
        assert_eq!(deterministic_light("ah i see it now"), "I see it now.");
        // Trailing punctuation on the disfluency token must not shield it.
        assert_eq!(deterministic_light("um, the thing is"), "The thing is.");
    }

    #[test]
    fn a_leading_pure_punctuation_token_survives() {
        // It trims to "" which is not a disfluency, so the loop stops and the token
        // is kept — no panic, no over-strip. (Capitalization still lands on the
        // first real word.)
        assert_eq!(deterministic_light("-- the thing is"), "-- The thing is.");
    }

    #[test]
    fn standalone_i_is_capitalized_mid_sentence() {
        assert_eq!(
            deterministic_light("the thing is i keep avoiding it"),
            "The thing is I keep avoiding it."
        );
        assert_eq!(
            deterministic_light("i'm sure i'll try what i've found"),
            "I'm sure I'll try what I've found."
        );
        // never inside words
        assert_eq!(deterministic_light("it is in the bin"), "It is in the bin.");
    }

    #[test]
    fn deterministic_light_is_guard_safe() {
        let raw = "um so i keep avoiding the hard conversation";
        assert!(guard_accepts(raw, &deterministic_light(raw)));
    }

    // --- spoken commands and backtrack ---

    #[test]
    fn spoken_command_becomes_newline() {
        assert_eq!(apply_spoken_commands("a new line b"), "a\nb");
    }

    #[test]
    fn backtrack_drops_preceding_clause() {
        let out = apply_backtrack("the answer is yes scratch that the answer is no");
        assert!(!out.contains("yes"));
        assert!(out.contains("the answer is no"));
    }

    #[test]
    fn backtrack_does_not_fire_inside_a_word() {
        // "actually no" must NOT match inside "actually nobody" (word-bounded).
        let out = apply_backtrack("well actually nobody knows the truth");
        assert!(out.contains("nobody"));
        assert!(out.contains("the truth"));
    }

    #[test]
    fn spoken_command_at_phrase_start_and_end() {
        assert_eq!(apply_spoken_commands("new line b"), "b");
        assert_eq!(apply_spoken_commands("a new line"), "a");
    }

    #[test]
    fn backtrack_handles_non_ascii_without_panicking() {
        // 'ẞ' lowercases to fewer bytes; offsets must stay on char boundaries.
        let out = apply_backtrack("aa bb ẞ scratch that ẞ tail");
        assert!(out.contains("tail"));
        assert!(!out.contains("scratch that"));
    }

    // --- level parsing and rewrite prompts ---

    #[test]
    fn parse_level_maps_known_and_defaults_to_light() {
        assert_eq!(parse_level("none"), Level::None);
        assert_eq!(parse_level("Medium"), Level::Medium);
        assert_eq!(parse_level("HIGH"), Level::High);
        assert_eq!(parse_level("light"), Level::Light);
        assert_eq!(parse_level("nonsense"), Level::Light);
    }

    #[test]
    fn rewrite_prompt_widens_by_level_and_carries_the_text() {
        assert!(rewrite_prompt(Level::Light, "x").system.to_lowercase().contains("capitalization"));
        assert!(rewrite_prompt(Level::Medium, "x").system.to_lowercase().contains("disfluencies"));
        assert!(rewrite_prompt(Level::High, "x").system.to_lowercase().contains("paragraph"));
        assert!(rewrite_prompt(Level::Light, "the raw phrase").user.contains("the raw phrase"));
    }

    #[test]
    fn rewrite_prompt_always_states_the_restraint() {
        for lvl in [Level::Light, Level::Medium, Level::High] {
            assert!(rewrite_prompt(lvl, "x").system.to_lowercase().contains("never change meaning"));
        }
    }

    // --- continuation de-capitalizing and pass-2 revise ---

    #[test]
    fn decapitalize_lowercases_an_allowlist_continuation_after_unterminated_prior() {
        assert_eq!(
            decapitalize_continuation("All these edge cases get sorted out.", Some("with their product")),
            "all these edge cases get sorted out."
        );
    }

    #[test]
    fn decapitalize_keeps_capital_after_a_terminated_prior() {
        assert_eq!(
            decapitalize_continuation("All these edge cases.", Some("That worked.")),
            "All these edge cases."
        );
    }

    #[test]
    fn decapitalize_never_lowercases_a_non_allowlist_word_protecting_proper_nouns() {
        assert_eq!(
            decapitalize_continuation("Whisper does the rest", Some("the tool i use is")),
            "Whisper does the rest"
        );
    }

    #[test]
    fn format_revise_trusts_whisper_casing_and_applies_features() {
        assert_eq!(format_revise("hello there", None), "hello there");
        assert_eq!(format_revise("first line new line second", None), "first line\nsecond");
    }

    // --- sound-tag stripping ---

    #[test]
    fn strip_sound_tags_removes_known_parenthesized_and_collapses_space() {
        assert_eq!(strip_sound_tags("woke up (buzzer) early"), "woke up early");
        assert_eq!(strip_sound_tags("(wind blowing) i sat down"), "i sat down");
        assert_eq!(strip_sound_tags("then (clears throat) i spoke"), "then i spoke");
    }

    #[test]
    fn strip_sound_tags_removes_bracketed_events_only() {
        assert_eq!(strip_sound_tags("a [BLANK_AUDIO] b"), "a b");
        assert_eq!(strip_sound_tags("a [MUSIC] b"), "a b");
        // not an event shape → kept
        assert_eq!(strip_sound_tags("see note [7] here"), "see note [7] here");
        assert_eq!(strip_sound_tags("from [Smith] today"), "from [Smith] today");
    }

    #[test]
    fn strip_sound_tags_keeps_real_words_and_asides() {
        assert_eq!(strip_sound_tags("the buzzer rang"), "the buzzer rang"); // bare word kept
        assert_eq!(strip_sound_tags("it works (I think) well"), "it works (I think) well");
    }

    #[test]
    fn strip_sound_tags_keeps_user_acronyms_but_strips_whisper_events() {
        assert_eq!(strip_sound_tags("the [FBI] case"), "the [FBI] case");
        assert_eq!(strip_sound_tags("sign the [NDA] today"), "sign the [NDA] today");
        assert_eq!(strip_sound_tags("a [TODO] item"), "a [TODO] item");
        assert_eq!(strip_sound_tags("a [BLANK_AUDIO] b"), "a b");
        assert_eq!(strip_sound_tags("a [MUSIC] b"), "a b");
    }

    #[test]
    fn strip_sound_tags_keeps_an_unmatched_bracket() {
        assert_eq!(strip_sound_tags("hello (world"), "hello (world"); // unmatched open → kept
    }

    #[test]
    fn strip_sound_tags_skips_a_lone_opener_and_keeps_stripping() {
        assert_eq!(strip_sound_tags("a [ b (buzzer) c"), "a [ b c");
    }

    #[test]
    fn strip_sound_tags_removes_consecutive_tags() {
        assert_eq!(strip_sound_tags("(cough) (laughs) okay"), "okay");
    }

    // --- paragraphizing and entry shaping ---

    #[test]
    fn paragraphize_preserves_explicit_breaks() {
        assert_eq!(paragraphize("First thought.\n\nSecond thought."), "First thought.\n\nSecond thought.");
    }
    #[test]
    fn paragraphize_breaks_at_a_marker_after_enough_sentences() {
        let t = "I woke up early. I made coffee. I read a book. Anyway, then I went for a walk. It was nice. The sun was out.";
        let out = paragraphize(t);
        assert!(out.contains("read a book.\n\nAnyway"), "{out}");
    }
    #[test]
    fn paragraphize_leaves_short_text_in_one_paragraph() {
        assert_eq!(paragraphize("Just one. And two."), "Just one. And two.");
    }
    #[test]
    fn paragraphize_caps_a_long_marker_less_run() {
        assert!(paragraphize("One. Two. Three. Four. Five. Six. Seven.").contains("\n\n"));
    }
    #[test]
    fn shape_entry_only_paragraphizes_at_high() {
        assert_eq!(shape_entry(Level::Medium, "A. B. C. D. E. F. G."), "A. B. C. D. E. F. G.");
        assert!(shape_entry(Level::High, "A. B. C. D. E. F. G.").contains("\n\n"));
    }

    #[test]
    fn paragraphize_never_alters_content_only_whitespace() {
        // The safety property that replaces the removed LLM guard: paragraphize only
        // moves whitespace; the non-whitespace character sequence is identical.
        let strip = |s: &str| s.chars().filter(|c| !c.is_whitespace()).collect::<String>();
        for inp in [
            "Okay. So just testing. I asked Claude. And it works. Anyway that's all.",
            "no punctuation here just a run on stream of words with no breaks",
            "First.\n\nSecond. Third.",
        ] {
            assert_eq!(strip(&paragraphize(inp)), strip(inp), "content changed for {inp:?}");
        }
    }

    #[test]
    fn paragraphize_handles_degenerate_inputs() {
        assert_eq!(paragraphize(""), "");
        assert_eq!(paragraphize("\n\n"), "");
        assert_eq!(paragraphize("no terminal punctuation here at all"), "no terminal punctuation here at all");
        // a marker as the very first sentence must not emit an empty leading paragraph
        let out = paragraphize("So I started. Then I paused. And I thought. Anyway I went on. It was fine. The end came.");
        assert!(!out.starts_with("\n\n") && !out.contains("\n\n\n"), "{out}");
    }

}
talk_core/cleanup.rs

talk_core/
cleanup.rs