lean_ctx/core/web/
distill.rs

1//! Extractive research-compression modes for prose and transcripts.
2//!
3//! These are deterministic, heuristic distillations — no LLM in the loop — so
4//! they are cheap, reproducible, and safe to run inside a synchronous tool
5//! handler. They turn a cleaned article or transcript into the high-signal
6//! subset an agent actually needs:
7//!
8//! * [`facts_scored`] — sentences carrying factual signals (numbers, dates,
9//!   entities), each with a confidence score.
10//! * [`quotes_scored`] — the most central / query-relevant sentences, as
11//!   evidence, each with a confidence score.
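//! * [`transcript_summary`] — de-duplicated, filler-stripped spoken text.
//! * [`squeeze_prose`] — line-structure-preserving de-duplication and length
//!   capping for written prose.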
13
14use std::collections::{HashMap, HashSet};
15
/// Shortest sentence, in characters, that fact/quote ranking will consider.
const MIN_SENTENCE_CHARS: usize = 24;
/// Longest sentence, in characters, that fact/quote ranking will consider.
const MAX_SENTENCE_CHARS: usize = 400;
18
/// Lower-case function words excluded from content-word extraction, so that
/// they do not contribute to term frequencies. Entries are unique.
const STOPWORDS: &[&str] = &[
    "the", "and", "for", "are", "but", "not", "you", "all", "any", "can", "had", "her", "was",
    "one", "our", "out", "day", "get", "has", "him", "his", "how", "man", "new", "now", "old",
    "see", "two", "way", "who", "did", "its", "let", "put", "say", "she", "too", "use", "that",
    "this", "with", "from", "they", "have", "were", "will", "your", "what", "when", "than", "then",
    "them", "into", "more", "some", "such", "only", "also", "been", "very", "just", "over",
];
26
/// Lower-case spoken-language filler tokens removed by `strip_filler`.
/// Matching is done on a token's alphanumeric characters only, so attached
/// punctuation ("Um," / "so...") does not prevent removal.
const FILLER: &[&str] = &[
    "um",
    "uh",
    "erm",
    "hmm",
    "like",
    "basically",
    "actually",
    "literally",
    "honestly",
    "okay",
    "ok",
    "yeah",
    "right",
    "so",
    "well",
    "anyway",
    "anyways",
];
46
47/// Extract sentences carrying factual signals, ranked and de-duplicated. Each
48/// sentence carries a confidence (`[0.0, 1.0]`) so callers can build attributable
49/// [`crate::core::evidence::Claim`]s. Facts use an *absolute* mapping (more
50/// factual signals → higher confidence) rather than min-max, so the score is
51/// meaningful even when the top sentences tie.
52pub fn facts_scored(text: &str, query: Option<&str>, max_items: usize) -> Vec<(String, f32)> {
53    select_top_scored(facts_ranked(text, query), max_items)
54        .into_iter()
55        .map(|(text, raw)| (text, factual_confidence(raw)))
56        .collect()
57}
58
/// Convert a raw factual score (roughly the count of factual signals) into an
/// absolute confidence: a linear ramp starting at 0.55, clamped to `[0.5, 0.97]`.
fn factual_confidence(raw: f32) -> f32 {
    const FLOOR: f32 = 0.5;
    const CEILING: f32 = 0.97;
    let linear = 0.55 + 0.09 * raw;
    linear.clamp(FLOOR, CEILING)
}
63
64fn facts_ranked(text: &str, query: Option<&str>) -> Vec<(f64, usize, String)> {
65    let qterms = query_terms(query);
66    let mut scored = Vec::new();
67    for (idx, sentence) in split_sentences(text).into_iter().enumerate() {
68        let len = sentence.chars().count();
69        if !(MIN_SENTENCE_CHARS..=MAX_SENTENCE_CHARS).contains(&len) {
70            continue;
71        }
72        let base = factual_score(&sentence);
73        if base <= 0.0 {
74            continue;
75        }
76        let score = base + query_boost(&sentence, &qterms);
77        scored.push((score, idx, sentence));
78    }
79    scored
80}
81
82/// Extract the most central (or query-relevant) sentences as quotable evidence.
83/// Each sentence carries a source-relative confidence (`[0.0, 1.0]`).
84pub fn quotes_scored(text: &str, query: Option<&str>, max_items: usize) -> Vec<(String, f32)> {
85    normalize_conf(select_top_scored(quotes_ranked(text, query), max_items))
86}
87
88fn quotes_ranked(text: &str, query: Option<&str>) -> Vec<(f64, usize, String)> {
89    let sentences = split_sentences(text);
90    let freq = term_frequencies(&sentences);
91    let qterms = query_terms(query);
92
93    let mut scored = Vec::new();
94    for (idx, sentence) in sentences.into_iter().enumerate() {
95        let len = sentence.chars().count();
96        if !(MIN_SENTENCE_CHARS..=MAX_SENTENCE_CHARS).contains(&len) {
97            continue;
98        }
99        let centrality = centrality_score(&sentence, &freq);
100        let score = centrality + query_boost(&sentence, &qterms) * 3.0;
101        if score <= 0.0 {
102            continue;
103        }
104        scored.push((score, idx, sentence));
105    }
106    scored
107}
108
109/// Condense a transcript: strip filler, drop near-duplicate runs, cap length.
110pub fn transcript_summary(text: &str, max_chars: usize) -> String {
111    let mut kept: Vec<String> = Vec::new();
112    let mut total = 0usize;
113
114    for sentence in split_sentences(text) {
115        let cleaned = strip_filler(&sentence);
116        let cleaned = cleaned.trim();
117        if cleaned.chars().count() < 8 {
118            continue;
119        }
120        if let Some(last) = kept.last() {
121            if jaccard(last, cleaned) > 0.8 {
122                continue;
123            }
124        }
125        if total + cleaned.len() > max_chars && !kept.is_empty() {
126            break;
127        }
128        total += cleaned.len();
129        kept.push(cleaned.to_string());
130    }
131    kept.join(" ")
132}
133
134/// Line-structure-preserving prose squeeze for the proxy tool-result funnel.
135///
136/// Unlike [`transcript_summary`] (which collapses everything into one paragraph),
137/// this keeps paragraph/heading shape and only:
138/// * collapses runs of blank lines to a single blank,
139/// * drops a line that is a near-duplicate of a recently kept line
140///   (boilerplate / nav repeats common in scraped pages),
141/// * caps total length to `max_chars` with a truncation marker.
142///
143/// Filler-word stripping is intentionally *not* applied here: words like
144/// "so" / "like" / "right" carry meaning in written prose and are only noise in
145/// spoken transcripts.
146pub fn squeeze_prose(text: &str, max_chars: usize) -> String {
147    const RECENT: usize = 12;
148    let mut out: Vec<String> = Vec::new();
149    let mut recent: Vec<String> = Vec::new();
150    let mut total = 0usize;
151    let mut blank_run = 0u32;
152
153    for raw in text.lines() {
154        let line = raw.trim_end();
155        if line.trim().is_empty() {
156            blank_run += 1;
157            if blank_run == 1 && !out.is_empty() {
158                out.push(String::new());
159            }
160            continue;
161        }
162        blank_run = 0;
163
164        let normalized = line.trim();
165        if !is_protected_line(line) && recent.iter().any(|p| jaccard(p, normalized) > 0.9) {
166            continue;
167        }
168
169        if total + line.len() > max_chars && !out.is_empty() {
170            out.push("…[truncated]".to_string());
171            break;
172        }
173        total += line.len();
174        out.push(line.to_string());
175
176        recent.push(normalized.to_string());
177        if recent.len() > RECENT {
178            recent.remove(0);
179        }
180    }
181
182    while out.last().is_some_and(String::is_empty) {
183        out.pop();
184    }
185    out.join("\n")
186}
187
/// Whether `line` must be exempt from near-duplicate removal.
///
/// Citations, bare links, markdown link items, block quotes, headings and
/// rules carry attribution or structure even when they repeat, so they are
/// recognised by their leading marker (after skipping indentation).
fn is_protected_line(line: &str) -> bool {
    const PROTECTED_PREFIXES: &[&str] = &[
        "Source:", "Site:", "http://", "https://", "- [", "> ", "#", "---",
    ];
    let body = line.trim_start();
    PROTECTED_PREFIXES
        .iter()
        .any(|prefix| body.starts_with(*prefix))
}
201
202// ── Sentence splitting ─────────────────────────────────────────────────────
203
/// Break `text` into trimmed, non-empty sentences.
///
/// Every line is handled independently, so a line break always ends a
/// sentence. Within a line, a sentence ends at `.`, `!` or `?` when that
/// character is followed by whitespace or the end of the line; punctuation
/// followed directly by another character (as in `3.14`) does not split.
pub fn split_sentences(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    for row in text.lines().map(str::trim).filter(|row| !row.is_empty()) {
        // Byte offset where the sentence currently being scanned begins.
        let mut sentence_start = 0;
        let mut cursor = row.char_indices().peekable();
        while let Some((offset, ch)) = cursor.next() {
            if !matches!(ch, '.' | '!' | '?') {
                continue;
            }
            let at_boundary = match cursor.peek() {
                Some(&(_, following)) => following.is_whitespace(),
                None => true,
            };
            if at_boundary {
                let end = offset + ch.len_utf8();
                push_trimmed(&mut out, &row[sentence_start..end]);
                sentence_start = end;
            }
        }
        // Whatever follows the last terminator is a sentence of its own.
        push_trimmed(&mut out, &row[sentence_start..]);
    }
    out
}

/// Append the trimmed form of `s` to `acc`, ignoring whitespace-only input.
fn push_trimmed(acc: &mut Vec<String>, s: &str) {
    match s.trim() {
        "" => {}
        body => acc.push(body.to_owned()),
    }
}
235
236// ── Scoring ────────────────────────────────────────────────────────────────
237
238fn factual_score(sentence: &str) -> f64 {
239    let lower = sentence.to_lowercase();
240    let mut score = 0.0;
241
242    if sentence.chars().any(|c| c.is_ascii_digit()) {
243        score += 1.0;
244    }
245    if sentence.contains('%') || sentence.contains('$') || sentence.contains('€') {
246        score += 1.0;
247    }
248    if has_year(sentence) {
249        score += 1.0;
250    }
251    if has_magnitude_word(&lower) {
252        score += 1.0;
253    }
254    if proper_noun_runs(sentence) >= 1 {
255        score += 0.5;
256    }
257    score
258}
259
/// Whether `sentence` contains a plausible calendar year.
///
/// A year is a run of exactly four ASCII digits, not adjacent to any other
/// digit, whose value lies in 1000–2999. Longer digit runs (phone numbers,
/// amounts such as `120000`) and zero-padded codes such as `0042` are not
/// years, so they no longer earn the year signal on top of the digit signal.
fn has_year(sentence: &str) -> bool {
    // Splitting on every non-digit leaves maximal digit runs (plus empties).
    sentence
        .split(|c: char| !c.is_ascii_digit())
        .any(|run| run.len() == 4 && run.starts_with(|c: char| matches!(c, '1' | '2')))
}
275
276fn has_magnitude_word(lower: &str) -> bool {
277    const WORDS: &[&str] = &[
278        "percent",
279        "million",
280        "billion",
281        "trillion",
282        "thousand",
283        "kg",
284        "km",
285        "mph",
286        "gb",
287        "mb",
288        "tb",
289        "ghz",
290        "kwh",
291        "celsius",
292        "fahrenheit",
293        "dollars",
294        "euros",
295    ];
296    WORDS.iter().any(|w| contains_word(lower, w))
297}
298
/// Count runs of two or more consecutive capitalised words in `sentence`.
///
/// The first word is skipped because a sentence-initial capital says nothing
/// about proper nouns. Each run is counted once, when it reaches length two.
fn proper_noun_runs(sentence: &str) -> usize {
    let mut run_count = 0;
    let mut streak = 0usize;
    for token in sentence.split_whitespace().skip(1) {
        let capitalised = token.chars().next().is_some_and(char::is_uppercase);
        streak = if capitalised { streak + 1 } else { 0 };
        if streak == 2 {
            run_count += 1;
        }
    }
    run_count
}
316
317fn term_frequencies(sentences: &[String]) -> HashMap<String, usize> {
318    let mut freq = HashMap::new();
319    for sentence in sentences {
320        for word in content_words(sentence) {
321            *freq.entry(word).or_insert(0) += 1;
322        }
323    }
324    freq
325}
326
327fn centrality_score(sentence: &str, freq: &HashMap<String, usize>) -> f64 {
328    let words = content_words(sentence);
329    if words.is_empty() {
330        return 0.0;
331    }
332    let sum: usize = words.iter().filter_map(|w| freq.get(w)).sum();
333    sum as f64 / (words.len() as f64).sqrt()
334}
335
336fn query_terms(query: Option<&str>) -> HashSet<String> {
337    query
338        .map(|q| {
339            q.split(|c: char| !c.is_alphanumeric())
340                .filter(|w| w.len() >= 3)
341                .map(str::to_lowercase)
342                .collect()
343        })
344        .unwrap_or_default()
345}
346
347fn query_boost(sentence: &str, qterms: &HashSet<String>) -> f64 {
348    if qterms.is_empty() {
349        return 0.0;
350    }
351    let lower = sentence.to_lowercase();
352    qterms.iter().filter(|t| contains_word(&lower, t)).count() as f64
353}
354
355fn select_top_scored(
356    mut scored: Vec<(f64, usize, String)>,
357    max_items: usize,
358) -> Vec<(String, f32)> {
359    scored.sort_by(|a, b| {
360        b.0.partial_cmp(&a.0)
361            .unwrap_or(std::cmp::Ordering::Equal)
362            .then(a.1.cmp(&b.1))
363    });
364
365    let mut seen = HashSet::new();
366    let mut chosen: Vec<(usize, String, f64)> = Vec::new();
367    for (score, idx, sentence) in scored {
368        if seen.insert(norm_key(&sentence)) {
369            chosen.push((idx, sentence, score));
370            if chosen.len() >= max_items {
371                break;
372            }
373        }
374    }
375    chosen.sort_by_key(|(idx, _, _)| *idx);
376    chosen
377        .into_iter()
378        .map(|(_, s, sc)| (s, sc as f32))
379        .collect()
380}
381
/// Rescale raw heuristic scores to a source-relative confidence in
/// `[0.45, 0.95]` by min-max normalisation within `items`.
///
/// When all scores are (nearly) equal — including the single-item case —
/// every item gets `0.8`. Order and text of the items are preserved.
fn normalize_conf(items: Vec<(String, f32)>) -> Vec<(String, f32)> {
    if items.is_empty() {
        return items;
    }

    let mut lowest = f32::MAX;
    let mut highest = f32::MIN;
    for (_, score) in &items {
        lowest = lowest.min(*score);
        highest = highest.max(*score);
    }

    let range = highest - lowest;
    let uniform = range < f32::EPSILON;
    items
        .into_iter()
        .map(|(sentence, score)| {
            let confidence = if uniform {
                0.8
            } else {
                0.45 + 0.5 * (score - lowest) / range
            };
            (sentence, confidence)
        })
        .collect()
}
399
400// ── Word helpers ───────────────────────────────────────────────────────────
401
402fn content_words(sentence: &str) -> Vec<String> {
403    sentence
404        .split(|c: char| !c.is_alphanumeric())
405        .filter(|w| w.len() >= 3)
406        .map(str::to_lowercase)
407        .filter(|w| !STOPWORDS.contains(&w.as_str()))
408        .collect()
409}
410
/// Collect the distinct lower-cased alphanumeric tokens of `s`.
fn word_set(s: &str) -> HashSet<String> {
    let mut words = HashSet::new();
    for token in s.split(|c: char| !c.is_alphanumeric()) {
        if !token.is_empty() {
            words.insert(token.to_lowercase());
        }
    }
    words
}

/// Jaccard similarity of the word sets of `a` and `b`, in `[0.0, 1.0]`.
///
/// Two inputs without any words are considered identical (`1.0`).
fn jaccard(a: &str, b: &str) -> f64 {
    let left = word_set(a);
    let right = word_set(b);
    let shared = left.intersection(&right).count();
    // |A ∪ B| = |A| + |B| − |A ∩ B|; zero only when both sets are empty.
    let combined = left.len() + right.len() - shared;
    if combined == 0 {
        return 1.0;
    }
    shared as f64 / combined as f64
}
432
433fn strip_filler(sentence: &str) -> String {
434    sentence
435        .split_whitespace()
436        .filter(|tok| {
437            let core: String = tok
438                .chars()
439                .filter(|c| c.is_alphanumeric())
440                .collect::<String>()
441                .to_lowercase();
442            !core.is_empty() && !FILLER.contains(&core.as_str())
443        })
444        .collect::<Vec<_>>()
445        .join(" ")
446}
447
/// Whether `word` occurs in `haystack` as a whole word.
///
/// An occurrence counts only when the characters immediately before and after
/// it are absent (start/end of text) or non-alphanumeric. Boundaries are
/// checked on Unicode characters rather than ASCII bytes, so "caf" is not
/// found inside "café". Matching is case-sensitive; callers lower-case both
/// sides. An empty `word` never matches.
fn contains_word(haystack: &str, word: &str) -> bool {
    if word.is_empty() {
        return false;
    }
    // `match_indices` yields byte offsets of matches, which are always char
    // boundaries, so the slices below cannot panic.
    haystack.match_indices(word).any(|(idx, matched)| {
        let before = haystack[..idx]
            .chars()
            .next_back()
            .is_none_or(|c| !c.is_alphanumeric());
        let after = haystack[idx + matched.len()..]
            .chars()
            .next()
            .is_none_or(|c| !c.is_alphanumeric());
        before && after
    })
}
467
/// Build a de-duplication key for `s`: its alphanumeric characters only,
/// lower-cased, so case, spacing and punctuation differences are ignored.
fn norm_key(s: &str) -> String {
    let mut letters = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_alphanumeric() {
            letters.push(ch);
        }
    }
    letters.to_lowercase()
}
474
// Unit tests for the distillation heuristics. Each test feeds a small inline
// text through one entry point and checks the selected sentences or scores.
#[cfg(test)]
mod tests {
    use super::*;

    /// Drop confidence scores so ranking assertions read like prod callers.
    fn names(scored: Vec<(String, f32)>) -> Vec<String> {
        scored.into_iter().map(|(s, _)| s).collect()
    }

    // A newline ends a sentence just like terminal punctuation does.
    #[test]
    fn splits_sentences_across_lines() {
        let text = "First sentence here. Second one follows!\nThird line stands alone?";
        let s = split_sentences(text);
        assert_eq!(s.len(), 3);
        assert_eq!(s[0], "First sentence here.");
        assert_eq!(s[2], "Third line stands alone?");
    }

    // Sentences without any factual signal are not returned as facts.
    #[test]
    fn facts_keep_numeric_and_drop_fluff() {
        let text = "Revenue grew to 12 million dollars in 2023. \
                    I really enjoyed the lovely afternoon weather today.";
        let f = names(facts_scored(text, None, 5));
        assert_eq!(f.len(), 1);
        assert!(f[0].contains("12 million"));
    }

    // With a limit of one, the query-matching fact wins over the others.
    #[test]
    fn facts_respect_query_boost_and_limit() {
        let text = "The rocket reached 400 km altitude. \
                    The budget was 5 billion euros overall. \
                    Apollo Eleven landed in 1969 successfully.";
        let f = names(facts_scored(text, Some("budget"), 1));
        assert_eq!(f.len(), 1);
        assert!(f[0].contains("budget"));
    }

    // The off-topic sentence must not be among the two selected quotes.
    #[test]
    fn quotes_prefer_query_relevant_sentences() {
        let text = "Climate policy shapes future energy markets across regions. \
                    The cat sat quietly on the warm windowsill all day. \
                    Energy markets respond to climate policy and carbon pricing.";
        let q = names(quotes_scored(text, Some("climate energy"), 2));
        assert_eq!(q.len(), 2);
        assert!(q
            .iter()
            .all(|s| s.to_lowercase().contains("energy") || s.to_lowercase().contains("climate")));
    }

    #[test]
    fn transcript_summary_strips_filler_and_dupes() {
        let text = "Um so basically the model is really fast. \
                    Um so basically the model is really fast. \
                    Actually it scales to millions of requests.";
        let summary = transcript_summary(text, 500);
        assert!(!summary.to_lowercase().contains("basically"));
        // Near-duplicate second line is dropped.
        assert_eq!(summary.matches("the model is really fast").count(), 1);
        assert!(summary.contains("scales to millions"));
    }

    // The first sentence is kept even though it alone exceeds the 30-char
    // budget, hence the looser upper bound on the output length.
    #[test]
    fn transcript_summary_respects_budget() {
        let text = "Alpha statement number one here. Beta statement number two here. \
                    Gamma statement number three here.";
        let summary = transcript_summary(text, 30);
        assert!(summary.len() <= 60, "got {} chars", summary.len());
        assert!(summary.contains("Alpha"));
    }

    #[test]
    fn squeeze_prose_dedupes_and_collapses_blanks() {
        let text = "Rust is a systems programming language focused on safety.\n\n\n\
                    Rust is a systems programming language focused on safety.\n\
                    It guarantees memory safety without a garbage collector.";
        let out = squeeze_prose(text, 10_000);
        // Near-duplicate line dropped.
        assert_eq!(out.matches("focused on safety").count(), 1);
        // Blank run collapsed to at most a single blank line.
        assert!(!out.contains("\n\n\n"));
        assert!(out.contains("memory safety"));
    }

    #[test]
    fn squeeze_prose_keeps_protected_lines() {
        let text = "- [Home](https://x.com)\n- [Home](https://x.com)\n\
                    > A quote that repeats.\n> A quote that repeats.";
        let out = squeeze_prose(text, 10_000);
        // Protected (link/quote) lines are never deduped away.
        assert_eq!(out.matches("[Home]").count(), 2);
        assert_eq!(out.matches("A quote that repeats").count(), 2);
    }

    // 500 distinct lines against a 400-char budget must end in the marker.
    #[test]
    fn squeeze_prose_caps_length() {
        let big = "This is a unique sentence number ";
        let text = (0..500)
            .map(|i| format!("{big}{i}."))
            .collect::<Vec<_>>()
            .join("\n");
        let out = squeeze_prose(&text, 400);
        assert!(out.contains("…[truncated]"));
        assert!(out.len() <= 600, "got {} chars", out.len());
    }

    #[test]
    fn contains_word_matches_whole_words_only() {
        assert!(contains_word("the budget is large", "budget"));
        assert!(!contains_word("budgetary spending", "budget"));
    }

    #[test]
    fn facts_scored_assigns_bounded_confidence() {
        let text = "Revenue grew to 12 million dollars in 2023. \
                    Apollo Eleven landed on the Moon in 1969 successfully. \
                    The annual budget was 5 billion euros overall.";
        let scored = facts_scored(text, None, 3);
        assert!(!scored.is_empty(), "expected scored facts");
        for (_, conf) in &scored {
            assert!(
                (0.0..=1.0).contains(conf),
                "confidence out of range: {conf}"
            );
        }
    }

    #[test]
    fn facts_confidence_scales_with_signals() {
        // Rich fact (digits + magnitude + year) should outrank a thin one.
        let rich =
            factual_confidence(factual_score("Revenue grew to 12 million dollars in 2023.") as f32);
        let thin = factual_confidence(factual_score("There were 3 cats.") as f32);
        assert!(rich > thin, "rich={rich} thin={thin}");
        assert!((0.5..=0.97).contains(&rich));
    }

    // A single item has no spread to normalise against and gets 0.8.
    #[test]
    fn quotes_single_item_gets_default_confidence() {
        let scored = normalize_conf(vec![("only one".to_string(), 4.2)]);
        assert_eq!(scored.len(), 1);
        assert!((scored[0].1 - 0.8).abs() < 1e-6);
    }
}
lean_ctx/core/web/distill.rs

lean_ctx/core/web/
distill.rs