talk-core 0.1.0

The pure talk-cli engine: selection, slugs, frontmatter, settle, cleanup.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
/// How aggressively a transcript is cleaned up.
///
/// Plan 3 feeds this into the LLM rewrite prompt; the deterministic Light pass
/// is the instant layer that is always available.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Level {
    /// No cleanup: the rewrite prompt asks for the text back exactly as given.
    None,
    /// Capitalization, punctuation and leading non-lexical filler only.
    Light,
    /// Additionally removes disfluencies/false starts and joins fragments.
    Medium,
    /// Additionally breaks into paragraphs and turns spoken lists into bullets.
    High,
}

/// Words the GUARD treats as droppable (disfluencies + conversational filler), so a
/// rewrite that removes them still passes `guard_accepts`.
///
/// INVARIANT: this set must contain every entry of `LEADING_DISFLUENCIES`.
/// `deterministic_light` strips those tokens from the phrase start, so if one of
/// them were missing here the guard would count it as a dropped content word and
/// reject the deterministic layer's own output (e.g. `"hmm the thing"` ->
/// `"The thing."`).
///
/// KNOWN LIMIT (Plan 3 code review): the guard drops these from BOTH sides of its
/// comparison, so it permits removing a *content* use of you/know/like/so/well/i/
/// mean anywhere (e.g. `guard_accepts("do you know the way", "do the way")` is
/// true). This is a moat for an LLM rewriter, not an instruction to remove them.
/// `deterministic_light` deliberately does NOT strip from this set — see
/// `LEADING_DISFLUENCIES`.
const FILLERS: &[&str] = &[
    // Non-lexical vocalizations (superset of LEADING_DISFLUENCIES).
    "um", "uh", "er", "ah", "mm", "hmm", "uhm", "erm", "hm",
    // Conversational filler that can also be real content (see KNOWN LIMIT).
    "like", "you", "know", "so", "well", "i", "mean",
];

/// The ONLY words `deterministic_light` strips from the phrase start: non-lexical
/// vocalizations that are never content. The broader `FILLERS` set is NOT used
/// here — a leading `i`/`you`/`so`/`well`/`like` is almost always a real sentence
/// opener ("I think…", "You know what…", "So I realized…"), and dropping it
/// silently rewrites the user's meaning. Restraint wins: when a leading word is a
/// real dictionary word, keep it.
///
/// Entries are lowercase bare tokens: `strip_leading_fillers` compares each
/// leading word against this list after trimming non-alphanumeric characters from
/// its edges and lowercasing it, so `Um,` and `uh.` both match.
const LEADING_DISFLUENCIES: &[&str] = &["um", "uh", "er", "ah", "mm", "hmm", "uhm", "erm", "hm"];

/// Reduce `text` to the ordered list of words the guard compares.
///
/// The text is lowercased, cut at every non-alphanumeric character (so
/// punctuation and whitespace never count, and `i'm` yields `i` + `m`), and every
/// token found in `FILLERS` is discarded.
fn content_words(text: &str) -> Vec<String> {
    let lowered = text.to_lowercase();
    let mut kept = Vec::new();
    for token in lowered.split(|c: char| !c.is_alphanumeric()) {
        // Adjacent separators produce empty tokens; fillers are droppable.
        if token.is_empty() || FILLERS.contains(&token) {
            continue;
        }
        kept.push(token.to_string());
    }
    kept
}

/// The moat: a rewrite is accepted only when `input` and `output` reduce to the
/// same sequence of content words (see `content_words`) — same words, same order.
/// Case, punctuation, whitespace and filler words are free to change; a
/// substituted, dropped or added meaning word is not. This guards *harm* rather
/// than edit *volume*.
pub fn guard_accepts(input: &str, output: &str) -> bool {
    let expected = content_words(input);
    let produced = content_words(output);
    expected.iter().eq(produced.iter())
}

/// Turn spoken formatting commands into the characters they name.
///
/// The input is wrapped in one space on each side so a command at the very start
/// or end still matches the space-delimited patterns; the result is trimmed again
/// before returning. Replacements run in table order, each over the whole string.
///
/// Accepted Plan-1 edge case: every match consumes its surrounding spaces, so a
/// command that directly follows another one loses its leading space and is left
/// as literal words (`"a new line new line b"` yields `"a\nnew line b"`).
pub fn apply_spoken_commands(text: &str) -> String {
    // (spoken phrase with its delimiting spaces, replacement), in application order.
    const COMMANDS: [(&str, &str); 4] = [
        (" new paragraph ", "\n\n"),
        (" new line ", "\n"),
        (" period ", ". "),
        (" comma ", ", "),
    ];
    let padded = format!(" {text} ");
    let replaced = COMMANDS
        .iter()
        .fold(padded, |acc, &(spoken, symbol)| acc.replace(spoken, symbol));
    replaced.trim().to_string()
}

/// Locate the first word-bounded occurrence of `needle_lower` in `hay`.
///
/// `needle_lower` is expected to be lowercase ASCII. Each haystack byte is
/// ASCII-lowercased for the comparison, so the search is case-insensitive without
/// building a lowercased copy of `hay` — the returned byte offset is therefore
/// always valid for slicing `hay` itself. (Offsets taken from a lowercased copy
/// once panicked on characters whose lowercase form has a different byte length.)
///
/// A match counts only when the byte before it and the byte after it are not
/// ASCII alphanumerics (or the match touches the start/end of `hay`). Returns
/// `None` for an empty needle or when nothing matches.
fn find_word_bounded(hay: &str, needle_lower: &str) -> Option<usize> {
    let haystack = hay.as_bytes();
    let needle = needle_lower.as_bytes();
    if needle.is_empty() {
        // `windows(0)` would panic, and an empty needle never matches anyway.
        return None;
    }
    haystack
        .windows(needle.len())
        .enumerate()
        .find(|&(start, window)| {
            // ASCII needle bytes can only equal ASCII haystack bytes, so a match
            // never starts or ends inside a multi-byte character.
            let same_text = window
                .iter()
                .zip(needle)
                .all(|(h, n)| h.to_ascii_lowercase() == *n);
            let glued_before = haystack[..start]
                .last()
                .is_some_and(|b| b.is_ascii_alphanumeric());
            let glued_after = haystack[start + needle.len()..]
                .first()
                .is_some_and(|b| b.is_ascii_alphanumeric());
            same_text && !glued_before && !glued_after
        })
        .map(|(start, _)| start)
}

/// Remove spoken self-corrections ("scratch that", "actually no").
///
/// Triggers are handled one after another; each is matched case-insensitively and
/// only as a whole phrase (see `find_word_bounded`), so `actually nobody` never
/// fires. For every occurrence:
///
/// * if at least 3 whitespace-separated words precede the trigger, everything
///   after the last `.` or newline before it (or from the very start, when there
///   is none) is dropped together with the trigger;
/// * otherwise the preceding text is too short to be a real correction, and only
///   the trigger itself is removed.
///
/// The result has every whitespace run collapsed to a single space, which also
/// removes any newlines that were present.
pub fn apply_backtrack(text: &str) -> String {
    const TRIGGERS: &[&str] = &["scratch that", "actually no"];
    let mut current = text.to_string();
    for &phrase in TRIGGERS {
        while let Some(at) = find_word_bounded(&current, phrase) {
            let head = current[..at].trim_end();
            let tail = &current[at + phrase.len()..];
            let rebuilt = if head.split_whitespace().count() >= 3 {
                // Keep only what precedes the previous sentence boundary.
                let keep_to = match head.rfind(['.', '\n']) {
                    Some(boundary) => boundary + 1,
                    None => 0,
                };
                [&head[..keep_to], tail].concat()
            } else {
                // Short lead-in: strip the trigger, keep the words around it.
                [head, " ", tail.trim_start()].concat()
            };
            current = rebuilt;
        }
    }
    let words: Vec<&str> = current.split_whitespace().collect();
    words.join(" ")
}

/// Continuation function-words: common enough as sentence-internal openers that
/// lowercasing them when a sentence spans a pause is safe. Deliberately excludes
/// anything proper-noun-shaped — we never lowercase an arbitrary capitalized token.
const CONTINUATIONS: &[&str] = &[
    "and", "but", "so", "or", "the", "a", "an", "it", "that", "this", "these",
    "those", "all", "then", "because", "which", "who",
];

/// Lowercase the first letter of `text` when it CONTINUES the previous block —
/// the previous block didn't end a sentence (no terminal `.!?` or ellipsis) AND
/// the first word is an allow-listed continuation word. Whisper cases each segment
/// as a fresh sentence; this undoes the spurious mid-sentence capital when a
/// sentence spans a pause. Conservative by construction (only the allow-list;
/// never a proper noun).
///
/// * `text` — the freshly transcribed block.
/// * `prev_clean` — the cleaned text of the preceding block, if any. `None`, an
///   empty/whitespace-only string, or a string that ends a sentence all mean
///   "not a continuation", and `text` is returned unchanged.
///
/// Only the very first character of `text` is ever changed, and only when it is
/// uppercase; text that starts with whitespace or punctuation is returned as is.
pub fn decapitalize_continuation(text: &str, prev_clean: Option<&str>) -> String {
    // Closing quotes/brackets that may trail the real sentence terminator. The
    // curly quotes are written as escapes so they cannot be lost to an encoding
    // mishap: U+201D is the right double quote, U+2019 the right single quote.
    const CLOSERS: [char; 6] = ['"', '\'', ')', ']', '\u{201d}', '\u{2019}'];

    let continues = prev_clean.is_some_and(|prev| {
        // Look past trailing closers so `."` reads as terminated; a Unicode
        // ellipsis (U+2026) is a deliberate trail-off, also terminal.
        let tail = prev.trim_end().trim_end_matches(CLOSERS);
        !matches!(tail.chars().last(), Some('.' | '!' | '?' | '\u{2026}') | None)
    });
    if !continues {
        return text.to_string();
    }

    // Compare the first word without surrounding punctuation and case.
    let first = text.split_whitespace().next().unwrap_or("");
    let bare = first.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase();
    if !CONTINUATIONS.contains(&bare.as_str()) {
        return text.to_string();
    }

    let mut chars = text.chars();
    match chars.next() {
        Some(c) if c.is_uppercase() => c.to_lowercase().collect::<String>() + chars.as_str(),
        _ => text.to_string(),
    }
}

/// Format a pass-2 Whisper revise. Whisper output is already cased and
/// punctuated, so nothing here re-capitalizes sentence starts or forces terminal
/// punctuation (doing so would re-create the per-segment mid-sentence capital).
/// Only three features run: the backtrack triggers, the spoken formatting
/// commands, and the continuation de-capitalizer.
///
/// ORDER MATTERS and is DELIBERATELY the reverse of the commit path's
/// `apply_backtrack(apply_spoken_commands(raw))` (live.rs/session.rs/format.rs).
/// `apply_backtrack` finishes with `split_whitespace().join(" ")`, which would
/// flatten the `\n` a spoken command inserts. Backtrack therefore runs FIRST and
/// spoken commands LAST, so a spoken `new line` survives into the output.
/// Do not "tidy" this back to the commit order — it silently drops newlines.
pub fn format_revise(whisper: &str, prev_clean: Option<&str>) -> String {
    let backtracked = apply_backtrack(whisper);
    let with_commands = apply_spoken_commands(&backtracked);
    decapitalize_continuation(&with_commands, prev_clean)
}

/// Deterministic "Light" cleanup, applied as a fixed pipeline:
///
/// 1. trim, then strip leading disfluency tokens (`LEADING_DISFLUENCIES`);
/// 2. capitalize sentence starts;
/// 3. capitalize the standalone pronoun `i`;
/// 4. make sure the phrase ends in terminal punctuation.
///
/// Meant to be guard-safe by construction: it only changes case, whitespace and
/// punctuation and removes leading disfluency tokens, so `guard_accepts` passes
/// as long as the guard treats every `LEADING_DISFLUENCIES` entry as droppable.
pub fn deterministic_light(text: &str) -> String {
    let stripped = strip_leading_fillers(text.trim());
    let sentence_cased = capitalize_sentences(&stripped);
    let pronoun_cased = capitalize_standalone_i(&sentence_cased);
    ensure_terminal(&pronoun_cased)
}

/// Upper-case every lowercase `i` that stands alone, anywhere in the phrase.
///
/// "Alone" means the characters on both sides are non-alphanumeric (or absent).
/// An apostrophe is such a boundary, so the contractions `i'm` / `i'll` / `i've`
/// qualify, while the `i` inside `it` or `bin` does not. The Plan-3 T1 spike found
/// this to be the LLM's main visible improvement over the deterministic layer —
/// it is free here, and invisible to the case-insensitive content-word guard.
fn capitalize_standalone_i(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut previous: Option<char> = None;
    let mut upcoming = text.chars().peekable();
    while let Some(current) = upcoming.next() {
        let glued_before = previous.is_some_and(char::is_alphanumeric);
        let glued_after = upcoming.peek().copied().is_some_and(char::is_alphanumeric);
        if current == 'i' && !glued_before && !glued_after {
            result.push('I');
        } else {
            result.push(current);
        }
        previous = Some(current);
    }
    result
}

/// Drop disfluency tokens from the front of `text`.
///
/// Words are taken off the start for as long as they match an entry of
/// `LEADING_DISFLUENCIES`; the first non-matching word ends the stripping and
/// everything from there on is kept. The remaining words are re-joined with
/// single spaces, so all whitespace runs in the result are normalized.
fn strip_leading_fillers(text: &str) -> String {
    let remaining: Vec<&str> = text
        .split_whitespace()
        .skip_while(|word| {
            // Ignore punctuation glued to the token so "um," / "uh." still match;
            // a pure-punctuation token trims to "" and is therefore kept.
            let bare = word.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase();
            LEADING_DISFLUENCIES.contains(&bare.as_str())
        })
        .collect();
    remaining.join(" ")
}

/// Upper-case the first letter of every sentence in `text`.
///
/// A sentence starts at the beginning of the text and after a terminator
/// (`.`, `!`, `?`) that is followed by whitespace. Closing quotes or brackets
/// between the terminator and the whitespace are tolerated (`"go." then` still
/// capitalizes `then`).
///
/// Two cases are deliberately NOT treated as sentence starts, because doing so
/// rewrote ordinary content:
///
/// * a terminator glued to the next alphanumeric character — decimals and
///   domains such as `3.5` or `example.com` stay untouched;
/// * a sentence whose first alphanumeric character is a digit — the digit uses up
///   the sentence start, so `2 things` does not become `2 Things`.
///
/// Abbreviations followed by a space (`e.g. this`) cannot be told apart from a
/// sentence end and still capitalize the next word.
fn capitalize_sentences(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    // True while the next alphanumeric character opens a sentence.
    let mut at_start = true;
    // True after a terminator, until whitespace confirms the sentence ended or an
    // alphanumeric character shows the terminator was inside a token.
    let mut after_terminator = false;
    for ch in text.chars() {
        if at_start && ch.is_alphanumeric() {
            if ch.is_alphabetic() {
                out.extend(ch.to_uppercase());
            } else {
                out.push(ch);
            }
            at_start = false;
            after_terminator = false;
            continue;
        }
        out.push(ch);
        if matches!(ch, '.' | '!' | '?') {
            after_terminator = true;
        } else if after_terminator {
            if ch.is_whitespace() {
                at_start = true;
                after_terminator = false;
            } else if ch.is_alphanumeric() {
                after_terminator = false;
            }
        }
    }
    out
}

/// Make sure `text` ends with terminal punctuation, appending `.` when it does not.
///
/// Trailing whitespace is removed first; empty input stays empty. The text counts
/// as already terminated when, looking past any trailing closing quotes or
/// brackets, its last character is `.`, `!`, `?` or a Unicode ellipsis (U+2026).
/// This is the same notion of "terminated" that `decapitalize_continuation` uses,
/// and it avoids output such as `"stop.".` or a period after an ellipsis.
fn ensure_terminal(text: &str) -> String {
    // Closers that may follow the real terminator; U+201D / U+2019 are the curly
    // right double / single quotes.
    const CLOSERS: [char; 6] = ['"', '\'', ')', ']', '\u{201d}', '\u{2019}'];

    let t = text.trim_end();
    let core = t.trim_end_matches(CLOSERS);
    let terminated = matches!(core.chars().last(), Some('.' | '!' | '?' | '\u{2026}'));
    if t.is_empty() || terminated {
        t.to_string()
    } else {
        format!("{t}.")
    }
}

/// Map a config string onto a `Level`.
///
/// Matching ignores surrounding whitespace and letter case. Anything that is not
/// `none`, `medium` or `high` — including `light` itself and unrecognized input —
/// resolves to `Level::Light`, the safe, restrained default.
pub fn parse_level(s: &str) -> Level {
    // Only the non-default levels need an entry; everything else is Light.
    const NAMED: [(&str, Level); 3] = [
        ("none", Level::None),
        ("medium", Level::Medium),
        ("high", Level::High),
    ];
    let wanted = s.trim().to_lowercase();
    NAMED
        .iter()
        .find(|(name, _)| *name == wanted)
        .map_or(Level::Light, |&(_, level)| level)
}

/// The constrained-rewrite prompt for the LLM formatter (consumed by the Candle
/// façade in T7). `system` is hard restraint that holds at every level; the
/// per-level rule only *widens* which edits are permitted. Restraint is the
/// wording, so it lives here in the pure core, not in the inference façade.
///
/// Derives the common value-type traits so callers can log, copy and compare
/// prompts (e.g. in tests) without hand-written helpers.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RewritePrompt {
    /// System message: the fixed restraint text followed by the per-level rule.
    pub system: String,
    /// User message: the instruction line followed by the transcript to clean.
    pub user: String,
}

/// Assemble the rewrite prompt for `level` (consumed by T7's Candle façade).
///
/// `system` is the fixed restraint text plus exactly one per-level rule, joined
/// by a space; `user` is the instruction line followed by `text` on the next line.
///
/// The Light rule limits filler removal to LEADING disfluencies. Mid-sentence
/// `you know`/`i mean` removal is deliberately NOT requested, because the
/// content-word guard would accept such drops (see the `FILLERS` note). T7 must
/// preserve this restriction.
pub fn rewrite_prompt(level: Level, text: &str) -> RewritePrompt {
    const RESTRAINT: &str = "You clean up raw voice transcripts. Return ONLY the cleaned text, nothing else — no preamble, no quotes. NEVER change meaning: never swap a word for a different one, never add words that change meaning, never drop a negation, never reorder clauses. When unsure, leave it as it is.";
    let level_rule = match level {
        Level::None => "Return the text exactly as given.",
        Level::Light => "Fix only capitalization and punctuation, and drop leading non-lexical filler (um, uh, er, ah). Remove no other words.",
        Level::Medium => "Also remove disfluencies and false starts and join fragments into sentences. Keep every meaning-bearing word.",
        Level::High => "Also break into paragraphs at topic shifts and turn spoken lists into bullets. Keep every meaning-bearing word.",
    };
    let mut user = String::from("Clean this transcript:\n");
    user.push_str(text);
    RewritePrompt {
        system: [RESTRAINT, level_rule].join(" "),
        user,
    }
}

#[cfg(test)]
mod tests {
    // Unit tests for the cleanup core, grouped by feature: the content-word guard,
    // the deterministic Light pass, spoken commands, backtrack, level parsing,
    // the rewrite prompt, and the pass-2 revise helpers.
    use super::*;

    // --- guard_accepts -------------------------------------------------------

    #[test]
    fn accepts_pure_punctuation_and_filler_cleanup() {
        assert!(guard_accepts(
            "um so the thing is i keep avoiding it",
            "The thing is, I keep avoiding it.",
        ));
    }

    #[test]
    fn rejects_a_substituted_meaning_word() {
        // "love" -> "loathe": tiny edit distance, catastrophic meaning change.
        assert!(!guard_accepts("i love her", "I loathe her."));
    }

    #[test]
    fn rejects_a_dropped_content_word() {
        assert!(!guard_accepts("i never said that", "I said that."));
    }

    #[test]
    fn rejects_an_added_content_word() {
        assert!(!guard_accepts("i am tired", "I am very tired."));
    }

    #[test]
    fn guard_permits_dropping_filler_homographs_known_limit() {
        // Documents the Plan-3 review limit: filler-set words can be dropped even
        // as content. deterministic_light never does this (leading-only); the T7
        // LLM prompt must not request mid-sentence filler removal.
        assert!(guard_accepts("do you know the way", "do the way"));
        assert!(guard_accepts("i like it a lot", "it a lot"));
    }

    // --- deterministic_light -------------------------------------------------

    #[test]
    fn deterministic_light_caps_and_terminates() {
        assert_eq!(deterministic_light("um the thing is"), "The thing is.");
    }

    #[test]
    fn does_not_strip_a_leading_content_word() {
        // The reported "cleaning up too much" bug: a leading subject pronoun or
        // discourse opener is CONTENT, not a disfluency — it must survive.
        assert_eq!(deterministic_light("i sometimes forget the small things"),
            "I sometimes forget the small things.");
        assert_eq!(deterministic_light("you should go now"), "You should go now.");
        assert_eq!(deterministic_light("so i realized the answer"), "So I realized the answer.");
        assert_eq!(deterministic_light("well that is the thing"), "Well that is the thing.");
    }

    #[test]
    fn still_strips_leading_nonlexical_disfluencies() {
        assert_eq!(deterministic_light("um uh the thing is"), "The thing is.");
        assert_eq!(deterministic_light("ah i see it now"), "I see it now.");
        // Trailing punctuation on the disfluency token must not shield it.
        assert_eq!(deterministic_light("um, the thing is"), "The thing is.");
    }

    #[test]
    fn a_leading_pure_punctuation_token_survives() {
        // It trims to "" which is not a disfluency, so the loop stops and the token
        // is kept — no panic, no over-strip. (Capitalization still lands on the
        // first real word.)
        assert_eq!(deterministic_light("-- the thing is"), "-- The thing is.");
    }

    #[test]
    fn standalone_i_is_capitalized_mid_sentence() {
        assert_eq!(
            deterministic_light("the thing is i keep avoiding it"),
            "The thing is I keep avoiding it."
        );
        assert_eq!(
            deterministic_light("i'm sure i'll try what i've found"),
            "I'm sure I'll try what I've found."
        );
        // never inside words
        assert_eq!(deterministic_light("it is in the bin"), "It is in the bin.");
    }

    #[test]
    fn deterministic_light_is_guard_safe() {
        // The deterministic layer's own output must pass the guard.
        let raw = "um so i keep avoiding the hard conversation";
        assert!(guard_accepts(raw, &deterministic_light(raw)));
    }

    // --- spoken commands and backtrack ---------------------------------------

    #[test]
    fn spoken_command_becomes_newline() {
        assert_eq!(apply_spoken_commands("a new line b"), "a\nb");
    }

    #[test]
    fn backtrack_drops_preceding_clause() {
        let out = apply_backtrack("the answer is yes scratch that the answer is no");
        assert!(!out.contains("yes"));
        assert!(out.contains("the answer is no"));
    }

    #[test]
    fn backtrack_does_not_fire_inside_a_word() {
        // "actually no" must NOT match inside "actually nobody" (word-bounded).
        let out = apply_backtrack("well actually nobody knows the truth");
        assert!(out.contains("nobody"));
        assert!(out.contains("the truth"));
    }

    #[test]
    fn spoken_command_at_phrase_start_and_end() {
        // The inserted newline sits at the edge and is removed by the final trim.
        assert_eq!(apply_spoken_commands("new line b"), "b");
        assert_eq!(apply_spoken_commands("a new line"), "a");
    }

    #[test]
    fn backtrack_handles_non_ascii_without_panicking() {
        // 'ẞ' lowercases to fewer bytes; offsets must stay on char boundaries.
        let out = apply_backtrack("aa bb ẞ scratch that ẞ tail");
        assert!(out.contains("tail"));
        assert!(!out.contains("scratch that"));
    }

    // --- parse_level and rewrite_prompt --------------------------------------

    #[test]
    fn parse_level_maps_known_and_defaults_to_light() {
        assert_eq!(parse_level("none"), Level::None);
        assert_eq!(parse_level("Medium"), Level::Medium);
        assert_eq!(parse_level("HIGH"), Level::High);
        assert_eq!(parse_level("light"), Level::Light);
        assert_eq!(parse_level("nonsense"), Level::Light);
    }

    #[test]
    fn rewrite_prompt_widens_by_level_and_carries_the_text() {
        assert!(rewrite_prompt(Level::Light, "x").system.to_lowercase().contains("capitalization"));
        assert!(rewrite_prompt(Level::Medium, "x").system.to_lowercase().contains("disfluencies"));
        assert!(rewrite_prompt(Level::High, "x").system.to_lowercase().contains("paragraph"));
        assert!(rewrite_prompt(Level::Light, "the raw phrase").user.contains("the raw phrase"));
    }

    #[test]
    fn rewrite_prompt_always_states_the_restraint() {
        for lvl in [Level::Light, Level::Medium, Level::High] {
            assert!(rewrite_prompt(lvl, "x").system.to_lowercase().contains("never change meaning"));
        }
    }

    // --- decapitalize_continuation and format_revise -------------------------

    #[test]
    fn decapitalize_lowercases_an_allowlist_continuation_after_unterminated_prior() {
        assert_eq!(
            decapitalize_continuation("All these edge cases get sorted out.", Some("with their product")),
            "all these edge cases get sorted out."
        );
    }

    #[test]
    fn decapitalize_keeps_capital_after_a_terminated_prior() {
        assert_eq!(
            decapitalize_continuation("All these edge cases.", Some("That worked.")),
            "All these edge cases."
        );
    }

    #[test]
    fn decapitalize_never_lowercases_a_non_allowlist_word_protecting_proper_nouns() {
        assert_eq!(
            decapitalize_continuation("Whisper does the rest", Some("the tool i use is")),
            "Whisper does the rest"
        );
    }

    #[test]
    fn format_revise_trusts_whisper_casing_and_applies_features() {
        // No re-capitalization or forced period; the spoken newline survives.
        assert_eq!(format_revise("hello there", None), "hello there");
        assert_eq!(format_revise("first line new line second", None), "first line\nsecond");
    }
}