1use std::collections::{HashMap, HashSet};
15
/// Shortest sentence, counted in `char`s, that the fact and quote rankers will consider.
const MIN_SENTENCE_CHARS: usize = 24;
/// Longest sentence, counted in `char`s, that the fact and quote rankers will consider.
const MAX_SENTENCE_CHARS: usize = 400;
18
/// Lowercase words that carry little meaning on their own.
///
/// `content_words` drops any token found here, so these words never contribute to term
/// frequencies or centrality. Every entry is listed exactly once.
const STOPWORDS: &[&str] = &[
    "the", "and", "for", "are", "but", "not", "you", "all", "any", "can", "had", "her", "was",
    "one", "our", "out", "day", "get", "has", "him", "his", "how", "man", "new", "now", "old",
    "see", "two", "way", "who", "did", "its", "let", "put", "say", "she", "too", "use", "that",
    "this", "with", "from", "they", "have", "were", "will", "your", "what", "when", "than", "then",
    "them", "into", "more", "some", "such", "only", "also", "been", "very", "just", "over",
];
26
/// Lowercase spoken-language filler tokens.
///
/// `strip_filler` removes any whitespace-separated token whose alphanumeric core,
/// lowercased, equals one of these entries.
const FILLER: &[&str] = &[
    "um",
    "uh",
    "erm",
    "hmm",
    "like",
    "basically",
    "actually",
    "literally",
    "honestly",
    "okay",
    "ok",
    "yeah",
    "right",
    "so",
    "well",
    "anyway",
    "anyways",
];
46
47pub fn facts_scored(text: &str, query: Option<&str>, max_items: usize) -> Vec<(String, f32)> {
53 select_top_scored(facts_ranked(text, query), max_items)
54 .into_iter()
55 .map(|(text, raw)| (text, factual_confidence(raw)))
56 .collect()
57}
58
/// Maps a raw factual score onto a confidence value.
///
/// The mapping is linear (`0.55 + 0.09 * raw`) and then clamped to `0.5..=0.97`.
fn factual_confidence(raw: f32) -> f32 {
    const BASE: f32 = 0.55;
    const PER_POINT: f32 = 0.09;
    const FLOOR: f32 = 0.5;
    const CEILING: f32 = 0.97;

    let linear = BASE + PER_POINT * raw;
    linear.clamp(FLOOR, CEILING)
}
63
64fn facts_ranked(text: &str, query: Option<&str>) -> Vec<(f64, usize, String)> {
65 let qterms = query_terms(query);
66 let mut scored = Vec::new();
67 for (idx, sentence) in split_sentences(text).into_iter().enumerate() {
68 let len = sentence.chars().count();
69 if !(MIN_SENTENCE_CHARS..=MAX_SENTENCE_CHARS).contains(&len) {
70 continue;
71 }
72 let base = factual_score(&sentence);
73 if base <= 0.0 {
74 continue;
75 }
76 let score = base + query_boost(&sentence, &qterms);
77 scored.push((score, idx, sentence));
78 }
79 scored
80}
81
82pub fn quotes_scored(text: &str, query: Option<&str>, max_items: usize) -> Vec<(String, f32)> {
85 normalize_conf(select_top_scored(quotes_ranked(text, query), max_items))
86}
87
88fn quotes_ranked(text: &str, query: Option<&str>) -> Vec<(f64, usize, String)> {
89 let sentences = split_sentences(text);
90 let freq = term_frequencies(&sentences);
91 let qterms = query_terms(query);
92
93 let mut scored = Vec::new();
94 for (idx, sentence) in sentences.into_iter().enumerate() {
95 let len = sentence.chars().count();
96 if !(MIN_SENTENCE_CHARS..=MAX_SENTENCE_CHARS).contains(&len) {
97 continue;
98 }
99 let centrality = centrality_score(&sentence, &freq);
100 let score = centrality + query_boost(&sentence, &qterms) * 3.0;
101 if score <= 0.0 {
102 continue;
103 }
104 scored.push((score, idx, sentence));
105 }
106 scored
107}
108
109pub fn transcript_summary(text: &str, max_chars: usize) -> String {
111 let mut kept: Vec<String> = Vec::new();
112 let mut total = 0usize;
113
114 for sentence in split_sentences(text) {
115 let cleaned = strip_filler(&sentence);
116 let cleaned = cleaned.trim();
117 if cleaned.chars().count() < 8 {
118 continue;
119 }
120 if let Some(last) = kept.last() {
121 if jaccard(last, cleaned) > 0.8 {
122 continue;
123 }
124 }
125 if total + cleaned.len() > max_chars && !kept.is_empty() {
126 break;
127 }
128 total += cleaned.len();
129 kept.push(cleaned.to_string());
130 }
131 kept.join(" ")
132}
133
134pub fn squeeze_prose(text: &str, max_chars: usize) -> String {
147 const RECENT: usize = 12;
148 let mut out: Vec<String> = Vec::new();
149 let mut recent: Vec<String> = Vec::new();
150 let mut total = 0usize;
151 let mut blank_run = 0u32;
152
153 for raw in text.lines() {
154 let line = raw.trim_end();
155 if line.trim().is_empty() {
156 blank_run += 1;
157 if blank_run == 1 && !out.is_empty() {
158 out.push(String::new());
159 }
160 continue;
161 }
162 blank_run = 0;
163
164 let normalized = line.trim();
165 if !is_protected_line(line) && recent.iter().any(|p| jaccard(p, normalized) > 0.9) {
166 continue;
167 }
168
169 if total + line.len() > max_chars && !out.is_empty() {
170 out.push("…[truncated]".to_string());
171 break;
172 }
173 total += line.len();
174 out.push(line.to_string());
175
176 recent.push(normalized.to_string());
177 if recent.len() > RECENT {
178 recent.remove(0);
179 }
180 }
181
182 while out.last().is_some_and(String::is_empty) {
183 out.pop();
184 }
185 out.join("\n")
186}
187
/// Reports whether `line`, ignoring leading whitespace, starts with one of the markers
/// that exempt it from near-duplicate removal in `squeeze_prose`.
fn is_protected_line(line: &str) -> bool {
    const PROTECTED_PREFIXES: [&str; 8] = [
        "Source:", "Site:", "http://", "https://", "- [", "> ", "#", "---",
    ];

    let body = line.trim_start();
    PROTECTED_PREFIXES
        .iter()
        .any(|prefix| body.starts_with(*prefix))
}
201
202pub fn split_sentences(text: &str) -> Vec<String> {
206 let mut sentences = Vec::new();
207 for line in text.lines() {
208 let line = line.trim();
209 if line.is_empty() {
210 continue;
211 }
212 let mut current = String::new();
213 let mut chars = line.chars().peekable();
214 while let Some(c) = chars.next() {
215 current.push(c);
216 if matches!(c, '.' | '!' | '?') {
217 let boundary = chars.peek().is_none_or(|n| n.is_whitespace());
218 if boundary {
219 push_trimmed(&mut sentences, ¤t);
220 current.clear();
221 }
222 }
223 }
224 push_trimmed(&mut sentences, ¤t);
225 }
226 sentences
227}
228
/// Appends `s` to `acc` with surrounding whitespace removed; whitespace-only input is ignored.
fn push_trimmed(acc: &mut Vec<String>, s: &str) {
    match s.trim() {
        "" => {}
        body => acc.push(body.to_owned()),
    }
}
235
236fn factual_score(sentence: &str) -> f64 {
239 let lower = sentence.to_lowercase();
240 let mut score = 0.0;
241
242 if sentence.chars().any(|c| c.is_ascii_digit()) {
243 score += 1.0;
244 }
245 if sentence.contains('%') || sentence.contains('$') || sentence.contains('€') {
246 score += 1.0;
247 }
248 if has_year(sentence) {
249 score += 1.0;
250 }
251 if has_magnitude_word(&lower) {
252 score += 1.0;
253 }
254 if proper_noun_runs(sentence) >= 1 {
255 score += 0.5;
256 }
257 score
258}
259
/// Reports whether `sentence` contains something that looks like a calendar year.
///
/// A year is a run of exactly four ASCII digits, not adjacent to any other digit, whose
/// first digit is `1` or `2` (that is, 1000–2999). Longer digit runs such as order numbers
/// or phone numbers do not qualify.
fn has_year(sentence: &str) -> bool {
    sentence
        // Splitting on every non-digit leaves maximal digit runs (plus empty pieces).
        .split(|c: char| !c.is_ascii_digit())
        .any(|run| run.len() == 4 && matches!(run.as_bytes().first(), Some(b'1' | b'2')))
}
275
/// Reports whether `lower` mentions a magnitude or unit word such as `million` or `km`.
///
/// `lower` is expected to be lowercased already; the comparison is case-sensitive. The
/// text is split on every character that is not an ASCII letter or digit, and each token
/// must equal one of the known words. Leading digits are stripped from a token first, so
/// a unit written directly after its number (`12kg`, `400km`, `16gb`) is recognised as
/// well as the spaced form (`12 kg`).
fn has_magnitude_word(lower: &str) -> bool {
    const WORDS: &[&str] = &[
        "percent",
        "million",
        "billion",
        "trillion",
        "thousand",
        "kg",
        "km",
        "mph",
        "gb",
        "mb",
        "tb",
        "ghz",
        "kwh",
        "celsius",
        "fahrenheit",
        "dollars",
        "euros",
    ];
    lower
        .split(|c: char| !c.is_ascii_alphanumeric())
        .map(|token| token.trim_start_matches(|c: char| c.is_ascii_digit()))
        .any(|unit| WORDS.contains(&unit))
}
298
/// Counts runs of two or more consecutive capitalised words in `sentence`.
///
/// The first word is ignored, since it is capitalised merely for starting the sentence.
/// A run is counted once, when its second word is seen, however long it continues.
fn proper_noun_runs(sentence: &str) -> usize {
    let mut streak = 0usize;
    let mut total = 0usize;
    for token in sentence.split_whitespace().skip(1) {
        if token.starts_with(char::is_uppercase) {
            streak += 1;
            if streak == 2 {
                total += 1;
            }
        } else {
            streak = 0;
        }
    }
    total
}
316
317fn term_frequencies(sentences: &[String]) -> HashMap<String, usize> {
318 let mut freq = HashMap::new();
319 for sentence in sentences {
320 for word in content_words(sentence) {
321 *freq.entry(word).or_insert(0) += 1;
322 }
323 }
324 freq
325}
326
327fn centrality_score(sentence: &str, freq: &HashMap<String, usize>) -> f64 {
328 let words = content_words(sentence);
329 if words.is_empty() {
330 return 0.0;
331 }
332 let sum: usize = words.iter().filter_map(|w| freq.get(w)).sum();
333 sum as f64 / (words.len() as f64).sqrt()
334}
335
/// Extracts the distinct lowercase search terms from an optional query.
///
/// The query is split on every non-alphanumeric character; pieces shorter than three bytes
/// are discarded. `None` yields an empty set.
fn query_terms(query: Option<&str>) -> HashSet<String> {
    let Some(raw) = query else {
        return HashSet::new();
    };

    let mut terms = HashSet::new();
    for piece in raw.split(|c: char| !c.is_alphanumeric()) {
        if piece.len() >= 3 {
            terms.insert(piece.to_lowercase());
        }
    }
    terms
}
346
347fn query_boost(sentence: &str, qterms: &HashSet<String>) -> f64 {
348 if qterms.is_empty() {
349 return 0.0;
350 }
351 let lower = sentence.to_lowercase();
352 qterms.iter().filter(|t| contains_word(&lower, t)).count() as f64
353}
354
355fn select_top_scored(
356 mut scored: Vec<(f64, usize, String)>,
357 max_items: usize,
358) -> Vec<(String, f32)> {
359 scored.sort_by(|a, b| {
360 b.0.partial_cmp(&a.0)
361 .unwrap_or(std::cmp::Ordering::Equal)
362 .then(a.1.cmp(&b.1))
363 });
364
365 let mut seen = HashSet::new();
366 let mut chosen: Vec<(usize, String, f64)> = Vec::new();
367 for (score, idx, sentence) in scored {
368 if seen.insert(norm_key(&sentence)) {
369 chosen.push((idx, sentence, score));
370 if chosen.len() >= max_items {
371 break;
372 }
373 }
374 }
375 chosen.sort_by_key(|(idx, _, _)| *idx);
376 chosen
377 .into_iter()
378 .map(|(_, s, sc)| (s, sc as f32))
379 .collect()
380}
381
/// Rescales raw scores into confidences relative to each other.
///
/// With a meaningful spread, the lowest score maps to `0.45` and the highest to `0.95`,
/// linearly in between. When all scores are effectively equal (spread below
/// `f32::EPSILON`, including the single-item case) every item gets `0.8`. Empty input is
/// returned unchanged.
fn normalize_conf(items: Vec<(String, f32)>) -> Vec<(String, f32)> {
    if items.is_empty() {
        return items;
    }

    let (lowest, highest) = items
        .iter()
        .fold((f32::MAX, f32::MIN), |(lo, hi), (_, score)| {
            (lo.min(*score), hi.max(*score))
        });
    let span = highest - lowest;

    let rescale = |score: f32| {
        if span < f32::EPSILON {
            0.8
        } else {
            0.45 + 0.5 * (score - lowest) / span
        }
    };

    items
        .into_iter()
        .map(|(text, score)| (text, rescale(score)))
        .collect()
}
399
400fn content_words(sentence: &str) -> Vec<String> {
403 sentence
404 .split(|c: char| !c.is_alphanumeric())
405 .filter(|w| w.len() >= 3)
406 .map(str::to_lowercase)
407 .filter(|w| !STOPWORDS.contains(&w.as_str()))
408 .collect()
409}
410
/// Collects the distinct lowercase alphanumeric tokens of `s`.
fn word_set(s: &str) -> HashSet<String> {
    let mut tokens = HashSet::new();
    for piece in s.split(|c: char| !c.is_alphanumeric()) {
        if !piece.is_empty() {
            tokens.insert(piece.to_lowercase());
        }
    }
    tokens
}
417
418fn jaccard(a: &str, b: &str) -> f64 {
419 let sa = word_set(a);
420 let sb = word_set(b);
421 if sa.is_empty() && sb.is_empty() {
422 return 1.0;
423 }
424 let inter = sa.intersection(&sb).count() as f64;
425 let union = sa.union(&sb).count() as f64;
426 if union == 0.0 {
427 0.0
428 } else {
429 inter / union
430 }
431}
432
433fn strip_filler(sentence: &str) -> String {
434 sentence
435 .split_whitespace()
436 .filter(|tok| {
437 let core: String = tok
438 .chars()
439 .filter(|c| c.is_alphanumeric())
440 .collect::<String>()
441 .to_lowercase();
442 !core.is_empty() && !FILLER.contains(&core.as_str())
443 })
444 .collect::<Vec<_>>()
445 .join(" ")
446}
447
/// Reports whether `word` occurs in `haystack` as a whole word.
///
/// An occurrence counts when the characters immediately before and after it are not word
/// characters (or the occurrence touches the start or end of `haystack`). A word character
/// is any alphabetic character, ASCII or not, or an ASCII digit. So `"budget"` is not found
/// in `"budgetary"`, and `"ber"` is not found in `"über"`. Other symbols such as
/// punctuation or a trailing `²` still delimit a word.
///
/// The comparison is case-sensitive; callers lowercase both sides beforehand. An empty
/// `word` never matches.
fn contains_word(haystack: &str, word: &str) -> bool {
    // An empty needle would otherwise match at every position without ever advancing.
    let Some(first) = word.chars().next() else {
        return false;
    };
    let is_word_char = |c: char| c.is_alphabetic() || c.is_ascii_digit();

    let mut start = 0;
    while let Some(pos) = haystack[start..].find(word) {
        let idx = start + pos;
        let end = idx + word.len();
        // Inspect whole neighbouring chars rather than raw bytes, so multi-byte letters
        // are not mistaken for delimiters.
        let open_before = !haystack[..idx].chars().next_back().is_some_and(is_word_char);
        let open_after = !haystack[end..].chars().next().is_some_and(is_word_char);
        if open_before && open_after {
            return true;
        }
        // Step past one char only, so an overlapping later occurrence is still examined.
        start = idx + first.len_utf8();
    }
    false
}
467
/// Builds a deduplication key from `s`: only its alphanumeric characters, lowercased.
fn norm_key(s: &str) -> String {
    let mut key = s.to_owned();
    key.retain(char::is_alphanumeric);
    key.to_lowercase()
}
474
/// Unit tests for the sentence-extraction and prose-squeezing helpers in this file.
#[cfg(test)]
mod tests {
    use super::*;

    /// Drops the confidence values and keeps only the sentence texts.
    fn names(scored: Vec<(String, f32)>) -> Vec<String> {
        scored.into_iter().map(|(s, _)| s).collect()
    }

    /// Sentences are split on terminators within a line and never span a line break.
    #[test]
    fn splits_sentences_across_lines() {
        let text = "First sentence here. Second one follows!\nThird line stands alone?";
        let s = split_sentences(text);
        assert_eq!(s.len(), 3);
        assert_eq!(s[0], "First sentence here.");
        assert_eq!(s[2], "Third line stands alone?");
    }

    /// A sentence with numeric signals is kept; one without any factual signal is dropped.
    #[test]
    fn facts_keep_numeric_and_drop_fluff() {
        let text = "Revenue grew to 12 million dollars in 2023. \
                    I really enjoyed the lovely afternoon weather today.";
        let f = names(facts_scored(text, None, 5));
        assert_eq!(f.len(), 1);
        assert!(f[0].contains("12 million"));
    }

    /// With a limit of one, the sentence matching the query term wins.
    #[test]
    fn facts_respect_query_boost_and_limit() {
        let text = "The rocket reached 400 km altitude. \
                    The budget was 5 billion euros overall. \
                    Apollo Eleven landed in 1969 successfully.";
        let f = names(facts_scored(text, Some("budget"), 1));
        assert_eq!(f.len(), 1);
        assert!(f[0].contains("budget"));
    }

    /// The two sentences sharing query terms outrank the unrelated one.
    #[test]
    fn quotes_prefer_query_relevant_sentences() {
        let text = "Climate policy shapes future energy markets across regions. \
                    The cat sat quietly on the warm windowsill all day. \
                    Energy markets respond to climate policy and carbon pricing.";
        let q = names(quotes_scored(text, Some("climate energy"), 2));
        assert_eq!(q.len(), 2);
        assert!(q
            .iter()
            .all(|s| s.to_lowercase().contains("energy") || s.to_lowercase().contains("climate")));
    }

    /// Filler words disappear and an immediately repeated sentence is emitted only once.
    #[test]
    fn transcript_summary_strips_filler_and_dupes() {
        let text = "Um so basically the model is really fast. \
                    Um so basically the model is really fast. \
                    Actually it scales to millions of requests.";
        let summary = transcript_summary(text, 500);
        assert!(!summary.to_lowercase().contains("basically"));
        assert_eq!(summary.matches("the model is really fast").count(), 1);
        assert!(summary.contains("scales to millions"));
    }

    /// The first sentence is always kept; later ones are cut once the budget is exhausted.
    #[test]
    fn transcript_summary_respects_budget() {
        let text = "Alpha statement number one here. Beta statement number two here. \
                    Gamma statement number three here.";
        let summary = transcript_summary(text, 30);
        assert!(summary.len() <= 60, "got {} chars", summary.len());
        assert!(summary.contains("Alpha"));
    }

    /// A repeated line is dropped and a run of blank lines never yields three newlines in a row.
    #[test]
    fn squeeze_prose_dedupes_and_collapses_blanks() {
        let text = "Rust is a systems programming language focused on safety.\n\n\n\
                    Rust is a systems programming language focused on safety.\n\
                    It guarantees memory safety without a garbage collector.";
        let out = squeeze_prose(text, 10_000);
        assert_eq!(out.matches("focused on safety").count(), 1);
        assert!(!out.contains("\n\n\n"));
        assert!(out.contains("memory safety"));
    }

    /// Link-list items and block quotes are protected from deduplication.
    #[test]
    fn squeeze_prose_keeps_protected_lines() {
        let text = "- [Home](https://x.com)\n- [Home](https://x.com)\n\
                    > A quote that repeats.\n> A quote that repeats.";
        let out = squeeze_prose(text, 10_000);
        assert_eq!(out.matches("[Home]").count(), 2);
        assert_eq!(out.matches("A quote that repeats").count(), 2);
    }

    /// Oversized input is cut off with the truncation marker near the requested budget.
    #[test]
    fn squeeze_prose_caps_length() {
        let big = "This is a unique sentence number ";
        let text = (0..500)
            .map(|i| format!("{big}{i}."))
            .collect::<Vec<_>>()
            .join("\n");
        let out = squeeze_prose(&text, 400);
        assert!(out.contains("…[truncated]"));
        assert!(out.len() <= 600, "got {} chars", out.len());
    }

    /// A term embedded in a longer word must not count as a match.
    #[test]
    fn contains_word_matches_whole_words_only() {
        assert!(contains_word("the budget is large", "budget"));
        assert!(!contains_word("budgetary spending", "budget"));
    }

    /// Every confidence returned by `facts_scored` lies within `0.0..=1.0`.
    #[test]
    fn facts_scored_assigns_bounded_confidence() {
        let text = "Revenue grew to 12 million dollars in 2023. \
                    Apollo Eleven landed on the Moon in 1969 successfully. \
                    The annual budget was 5 billion euros overall.";
        let scored = facts_scored(text, None, 3);
        assert!(!scored.is_empty(), "expected scored facts");
        for (_, conf) in &scored {
            assert!(
                (0.0..=1.0).contains(conf),
                "confidence out of range: {conf}"
            );
        }
    }

    /// More factual signals yield a higher confidence, still inside the clamped range.
    #[test]
    fn facts_confidence_scales_with_signals() {
        let rich =
            factual_confidence(factual_score("Revenue grew to 12 million dollars in 2023.") as f32);
        let thin = factual_confidence(factual_score("There were 3 cats.") as f32);
        assert!(rich > thin, "rich={rich} thin={thin}");
        assert!((0.5..=0.97).contains(&rich));
    }

    /// With a single item there is no spread to normalize, so it gets the default 0.8.
    #[test]
    fn quotes_single_item_gets_default_confidence() {
        let scored = normalize_conf(vec![("only one".to_string(), 4.2)]);
        assert_eq!(scored.len(), 1);
        assert!((scored[0].1 - 0.8).abs() < 1e-6);
    }
}