use std::collections::HashMap;
use async_trait::async_trait;
use unicode_segmentation::UnicodeSegmentation;
use crate::summarizer::backend::{CompactMode, CompactOpts, Style, SummarizerBackend};
use crate::summarizer::error::BackendError;
use crate::tokenizer::{self, Tokenizer};
const PAGERANK_DAMPING: f32 = 0.85;
const PAGERANK_MAX_ITER: usize = 50;
const PAGERANK_TOL: f32 = 1e-4;
const SIMILARITY_FLOOR: f32 = 0.1;
#[derive(Debug, Clone)]
pub(super) struct Sentence {
pub span_start: usize,
pub text: String,
}
pub(super) fn split_sentences(content: &str) -> Vec<Sentence> {
let mut out = Vec::new();
for (offset, s) in content.split_sentence_bound_indices() {
let trimmed = s.trim();
if trimmed.chars().count() < 3 {
continue;
}
if parse_atx_heading(trimmed).is_some() {
continue;
}
out.push(Sentence {
span_start: offset,
text: trimmed.to_string(),
});
}
out
}
#[cfg(test)]
mod sentence_tests {
use super::*;
#[test]
fn split_produces_three_sentences_from_simple_paragraph() {
let text = "Hello world. How are you? Fine!";
let s = split_sentences(text);
assert_eq!(s.len(), 3);
assert_eq!(s[0].text, "Hello world.");
assert_eq!(s[1].text, "How are you?");
assert_eq!(s[2].text, "Fine!");
}
#[test]
fn split_skips_short_fragments() {
let text = "Hi. This is a longer sentence.";
let s = split_sentences(text);
assert_eq!(s.len(), 2);
}
#[test]
fn split_handles_unicode_punctuation() {
let text = "Привет мир. Это тест.";
let s = split_sentences(text);
assert_eq!(s.len(), 2);
}
#[test]
fn split_preserves_byte_offsets() {
let text = "First sentence. Second sentence here.";
let s = split_sentences(text);
assert!(s[0].span_start < s[1].span_start);
}
#[test]
fn split_returns_empty_for_empty_input() {
assert!(split_sentences("").is_empty());
}
}
pub(super) fn tokenize(s: &str) -> Vec<String> {
s.unicode_words().map(str::to_lowercase).collect()
}
pub(super) fn tfidf_vectors(sentences: &[Sentence]) -> Vec<HashMap<usize, f32>> {
if sentences.is_empty() {
return Vec::new();
}
let mut df: HashMap<String, usize> = HashMap::new();
let mut vocab: HashMap<String, usize> = HashMap::new();
let mut vocab_terms: Vec<String> = Vec::new();
let tokens_per_sent: Vec<Vec<String>> = sentences.iter().map(|s| tokenize(&s.text)).collect();
for terms in &tokens_per_sent {
let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
for t in terms {
if seen.insert(t.as_str()) {
*df.entry(t.clone()).or_insert(0) += 1;
}
if !vocab.contains_key(t) {
let id = vocab.len();
vocab.insert(t.clone(), id);
vocab_terms.push(t.clone());
}
}
}
let n = sentences.len() as f32;
let mut vectors = Vec::with_capacity(sentences.len());
for terms in &tokens_per_sent {
let mut tf: HashMap<usize, f32> = HashMap::new();
for t in terms {
let id = vocab[t.as_str()];
*tf.entry(id).or_insert(0.0) += 1.0;
}
let mut tfidf: HashMap<usize, f32> = HashMap::new();
for (id, count) in tf {
let term = vocab_terms[id].as_str();
let dfv = df[term] as f32;
let idf = (n / dfv).ln(); if idf > 0.0 {
tfidf.insert(id, count * idf);
}
}
let norm: f32 = tfidf.values().map(|v| v * v).sum::<f32>().sqrt();
if norm > 0.0 {
for v in tfidf.values_mut() {
*v /= norm;
}
}
vectors.push(tfidf);
}
vectors
}
pub(super) fn cosine(a: &HashMap<usize, f32>, b: &HashMap<usize, f32>) -> f32 {
if a.is_empty() || b.is_empty() {
return 0.0;
}
let (small, large) = if a.len() <= b.len() { (a, b) } else { (b, a) };
let mut sum = 0.0;
for (k, v) in small {
if let Some(w) = large.get(k) {
sum += v * w;
}
}
sum
}
#[cfg(test)]
mod tfidf_tests {
use super::*;
fn sents(strs: &[&str]) -> Vec<Sentence> {
strs.iter()
.enumerate()
.map(|(i, s)| Sentence {
span_start: i * 100,
text: s.to_string(),
})
.collect()
}
#[test]
fn tokenize_lowercases_words() {
let t = tokenize("Hello, World! Don't stop.");
assert!(t.contains(&"hello".to_string()));
assert!(t.contains(&"world".to_string()));
assert!(t.contains(&"don't".to_string()));
assert!(t.contains(&"stop".to_string()));
}
#[test]
fn tfidf_zeroes_out_terms_appearing_everywhere() {
let s = sents(&["the cat sat", "the cat slept", "the cat ran"]);
let vecs = tfidf_vectors(&s);
for v in &vecs {
assert_eq!(v.len(), 1, "sentence kept only the discriminating term");
}
}
#[test]
fn cosine_is_one_for_identical_sentences() {
let s = sents(&["hello world", "hello world", "goodbye moon"]);
let v = tfidf_vectors(&s);
let c = cosine(&v[0], &v[1]);
assert!(
(c - 1.0).abs() < 1e-5,
"cosine on identical non-degenerate sentences was {c}",
);
}
#[test]
fn cosine_is_zero_for_disjoint_sentences() {
let s = sents(&["alpha beta", "gamma delta", "epsilon zeta"]);
let v = tfidf_vectors(&s);
assert!(cosine(&v[0], &v[1]).abs() < 1e-5);
}
#[test]
fn tfidf_returns_empty_for_empty_input() {
assert!(tfidf_vectors(&[]).is_empty());
}
#[test]
fn tfidf_handles_all_universal_corpus() {
let s = sents(&["the cat", "the cat", "the cat"]);
let v = tfidf_vectors(&s);
for vec in &v {
assert!(vec.is_empty(), "expected empty vector, got {vec:?}");
}
let pr = pagerank(&v);
assert_eq!(pr.len(), 3);
for score in &pr {
assert!(
(score - 1.0 / 3.0).abs() < 0.1,
"expected ~0.333 uniform, got {score}",
);
}
}
}
pub(super) fn pagerank(vectors: &[HashMap<usize, f32>]) -> Vec<f32> {
let n = vectors.len();
if n == 0 {
return Vec::new();
}
if n == 1 {
return vec![1.0];
}
let mut weights = vec![0.0_f32; n * n];
let mut row_sums = vec![0.0_f32; n];
for i in 0..n {
for j in (i + 1)..n {
let s = cosine(&vectors[i], &vectors[j]);
if s >= SIMILARITY_FLOOR {
weights[i * n + j] = s;
weights[j * n + i] = s;
row_sums[i] += s;
row_sums[j] += s;
}
}
}
let inv_n = 1.0_f32 / n as f32;
let mut score = vec![inv_n; n];
let teleport = (1.0 - PAGERANK_DAMPING) * inv_n;
for _ in 0..PAGERANK_MAX_ITER {
let mut next = vec![teleport; n];
for j in 0..n {
if row_sums[j] < f32::EPSILON {
let share = PAGERANK_DAMPING * score[j] * inv_n;
for slot in next.iter_mut() {
*slot += share;
}
} else {
let factor = PAGERANK_DAMPING * score[j] / row_sums[j];
for i in 0..n {
let w = weights[i * n + j];
if w > 0.0 {
next[i] += factor * w;
}
}
}
}
let delta: f32 = score
.iter()
.zip(next.iter())
.map(|(a, b)| (a - b).abs())
.sum();
score = next;
if delta < PAGERANK_TOL {
break;
}
}
score
}
#[cfg(test)]
mod pagerank_tests {
use super::*;
fn sents(strs: &[&str]) -> Vec<Sentence> {
strs.iter()
.enumerate()
.map(|(i, s)| Sentence {
span_start: i * 100,
text: s.to_string(),
})
.collect()
}
#[test]
fn empty_returns_empty() {
assert!(pagerank(&[]).is_empty());
}
#[test]
fn single_sentence_gets_full_mass() {
let v = tfidf_vectors(&sents(&["alpha beta gamma"]));
let pr = pagerank(&v);
assert_eq!(pr.len(), 1);
assert!((pr[0] - 1.0).abs() < 1e-5);
}
#[test]
fn central_sentence_outscores_peripheral() {
let v = tfidf_vectors(&sents(&[
"alpha unique_left",
"alpha gamma bridge",
"gamma unique_right",
]));
let pr = pagerank(&v);
assert!(
pr[1] >= pr[0] && pr[1] >= pr[2],
"middle sentence should be highest: {pr:?}",
);
}
#[test]
fn scores_sum_close_to_one() {
let v = tfidf_vectors(&sents(&[
"alpha beta gamma",
"alpha gamma",
"delta epsilon zeta",
"alpha epsilon",
]));
let pr = pagerank(&v);
let total: f32 = pr.iter().sum();
assert!((total - 1.0).abs() < 0.05, "sum was {total}");
}
#[test]
fn fully_disconnected_graph_converges_to_uniform() {
let v = tfidf_vectors(&sents(&[
"alpha unique_one",
"beta unique_two",
"gamma unique_three",
"delta unique_four",
]));
let pr = pagerank(&v);
for score in &pr {
assert!(
(score - 0.25).abs() < 0.05,
"expected ~0.25 uniform, got {score}",
);
}
}
}
fn select_extractive(
sentences: &[Sentence],
scores: &[f32],
target_tokens: Option<usize>,
family: Tokenizer,
) -> Vec<usize> {
if sentences.is_empty() {
return Vec::new();
}
let mut order: Vec<usize> = (0..sentences.len()).collect();
order.sort_by(|a, b| {
scores[*b]
.partial_cmp(&scores[*a])
.unwrap_or(std::cmp::Ordering::Equal)
});
let chosen = match target_tokens {
None => order,
Some(max) => {
let mut chosen = Vec::new();
let mut cumulative: usize = 0;
let mut warned_fallback = false;
for idx in order {
let count = match tokenizer::count(&sentences[idx].text, family) {
Ok(c) => c,
Err(e) => {
if !warned_fallback {
tracing::warn!(
target: "rover::summarizer",
family = ?family,
error = %e,
"tokenizer unavailable; falling back to chars/4 heuristic for target_tokens accounting"
);
warned_fallback = true;
}
sentences[idx].text.chars().count().div_ceil(4)
}
};
if chosen.is_empty() {
chosen.push(idx);
cumulative = count;
if count > max {
tracing::warn!(
target: "rover::summarizer",
sentence_tokens = count,
target_tokens = max,
"top-ranked sentence exceeds target_tokens; emitting anyway",
);
break;
}
} else if cumulative + count <= max {
chosen.push(idx);
cumulative += count;
} else {
}
}
chosen
}
};
let mut by_position = chosen;
by_position.sort_by_key(|&i| sentences[i].span_start);
by_position
}
fn format_selected(sentences: &[Sentence], indices: &[usize], style: Style) -> String {
match style {
Style::Bullet => indices
.iter()
.map(|&i| format!("- {}", sentences[i].text))
.collect::<Vec<_>>()
.join("\n"),
Style::Prose => indices
.iter()
.map(|&i| sentences[i].text.clone())
.collect::<Vec<_>>()
.join(" "),
Style::Executive => {
if indices.is_empty() {
String::new()
} else {
let head = &sentences[indices[0]].text;
if indices.len() == 1 {
head.clone()
} else {
let rest = indices[1..]
.iter()
.map(|&i| sentences[i].text.clone())
.collect::<Vec<_>>()
.join(" ");
format!("{head}\n\nDetails: {rest}")
}
}
}
}
}
#[cfg(test)]
mod extractive_mode_tests {
use super::*;
fn sents(strs: &[&str]) -> Vec<Sentence> {
strs.iter()
.enumerate()
.map(|(i, s)| Sentence {
span_start: i * 100,
text: s.to_string(),
})
.collect()
}
#[test]
fn select_extractive_picks_in_source_order() {
let s = sents(&["Low.", "High Importance Sentence Here.", "Mid."]);
let v = tfidf_vectors(&s);
let pr = pagerank(&v);
let _ = pr;
let chosen = select_extractive(&s, &[0.1, 0.9, 0.5], None, Tokenizer::O200k);
assert_eq!(chosen, vec![0, 1, 2]);
}
#[test]
fn select_extractive_caps_to_target_tokens() {
let s = sents(&[
"first sentence.", "second sentence.", "third sentence.", ]);
let chosen = select_extractive(&s, &[0.5, 0.5, 0.5], Some(5), Tokenizer::O200k);
assert_eq!(chosen.len(), 1);
}
#[test]
fn select_extractive_skips_oversize_and_admits_lower_ranked_that_fits() {
let s = sents(&[
"first sentence here that is reasonably long.", "okay.", "third sentence here that is reasonably long.", ]);
let chosen = select_extractive(&s, &[0.9, 0.1, 0.8], Some(13), Tokenizer::O200k);
assert!(chosen.contains(&0), "expected index 0 in {chosen:?}");
assert!(chosen.contains(&1), "expected index 1 in {chosen:?}");
assert!(
!chosen.contains(&2),
"expected index 2 excluded in {chosen:?}"
);
}
#[test]
fn format_bullet_prefixes_dashes() {
let s = sents(&["a.", "b.", "c."]);
assert_eq!(format_selected(&s, &[0, 2], Style::Bullet), "- a.\n- c.",);
}
#[test]
fn format_executive_with_one_sentence_omits_details() {
let s = sents(&["only one."]);
assert_eq!(format_selected(&s, &[0], Style::Executive), "only one.");
}
}
fn parse_atx_heading(line: &str) -> Option<(usize, &str)> {
let bytes = line.as_bytes();
let mut depth = 0;
while depth < bytes.len() && bytes[depth] == b'#' {
depth += 1;
}
if depth == 0 || depth > 6 {
return None;
}
if depth == bytes.len() {
return None;
}
if bytes[depth] != b' ' {
return None;
}
let text = line[depth + 1..].trim();
if text.is_empty() {
None
} else {
Some((depth, text))
}
}
#[derive(Debug)]
struct HeadingSection {
depth: usize,
heading: String,
sentence_indices: Vec<usize>,
}
fn group_by_headings(content: &str, sentences: &[Sentence]) -> Vec<HeadingSection> {
let mut headings: Vec<HeadingSection> = Vec::new();
let mut heading_offsets: Vec<usize> = Vec::new();
let mut byte_offset = 0;
for line in content.split_inclusive('\n') {
let line_trimmed = line.trim_end_matches('\n');
if let Some((depth, text)) = parse_atx_heading(line_trimmed) {
headings.push(HeadingSection {
depth,
heading: text.to_string(),
sentence_indices: Vec::new(),
});
heading_offsets.push(byte_offset);
}
byte_offset += line.len();
}
if headings.is_empty() {
return Vec::new();
}
for (si, sent) in sentences.iter().enumerate() {
let mut bucket: Option<usize> = None;
for (hi, off) in heading_offsets.iter().enumerate() {
if sent.span_start >= *off {
bucket = Some(hi);
} else {
break;
}
}
if let Some(b) = bucket {
headings[b].sentence_indices.push(si);
}
}
headings
}
fn select_headlines(
content: &str,
sentences: &[Sentence],
scores: &[f32],
target_tokens: Option<usize>,
family: Tokenizer,
) -> String {
let mut groups = group_by_headings(content, sentences);
groups.retain(|g| !g.sentence_indices.is_empty());
if groups.is_empty() {
let chosen = select_extractive(
sentences,
scores,
target_tokens.or(Some(usize::MAX)),
family,
);
let trimmed: Vec<usize> = chosen.into_iter().take(3).collect();
return format_selected(sentences, &trimmed, Style::Bullet);
}
let deepest = groups.iter().map(|g| g.depth).max().unwrap();
let mut emitted = Vec::new();
let mut cumulative_tokens: usize = 0;
let token_budget = target_tokens.unwrap_or(usize::MAX);
let mut warned_fallback = false;
for g in groups.iter().filter(|g| g.depth == deepest) {
let best = g.sentence_indices.iter().copied().max_by(|a, b| {
scores[*a]
.partial_cmp(&scores[*b])
.unwrap_or(std::cmp::Ordering::Equal)
});
if let Some(idx) = best {
let count = match tokenizer::count(&sentences[idx].text, family) {
Ok(c) => c,
Err(e) => {
if !warned_fallback {
tracing::warn!(
target: "rover::summarizer",
family = ?family,
error = %e,
"tokenizer unavailable; falling back to chars/4 heuristic for headlines budget"
);
warned_fallback = true;
}
sentences[idx].text.chars().count().div_ceil(4)
}
};
if !emitted.is_empty() && cumulative_tokens + count > token_budget {
break;
}
let prefix = "#".repeat(g.depth);
emitted.push(format!("{prefix} {}\n\n{}", g.heading, sentences[idx].text));
cumulative_tokens += count;
}
}
emitted.join("\n\n")
}
#[cfg(test)]
mod headlines_tests {
use super::*;
#[test]
fn parse_atx_extracts_depth_and_text() {
assert_eq!(parse_atx_heading("# Hello"), Some((1, "Hello")));
assert_eq!(parse_atx_heading("### Three"), Some((3, "Three")));
assert_eq!(parse_atx_heading("####### Too Deep"), None);
assert_eq!(parse_atx_heading("#NoSpace"), None);
assert_eq!(parse_atx_heading("Not a heading"), None);
}
#[test]
fn group_by_headings_buckets_sentences_correctly() {
let content =
"# A\nfirst sentence here.\nsecond sentence here.\n# B\nthird sentence here.\n";
let s = split_sentences(content);
let groups = group_by_headings(content, &s);
assert_eq!(groups.len(), 2);
assert_eq!(groups[0].heading, "A");
assert_eq!(groups[1].heading, "B");
assert_eq!(groups[0].sentence_indices.len(), 2);
assert_eq!(groups[1].sentence_indices.len(), 1);
}
#[test]
fn select_headlines_emits_one_per_section() {
let content = "# Intro\nThe quick brown fox.\nThe lazy dog.\n## Body\nDetails matter here.\nMore words follow.\n";
let s = split_sentences(content);
let v = tfidf_vectors(&s);
let pr = pagerank(&v);
let out = select_headlines(content, &s, &pr, None, Tokenizer::O200k);
assert!(out.contains("## Body"));
assert!(out.contains("Details") || out.contains("More words"));
}
#[test]
fn select_headlines_falls_back_when_no_headings() {
let content = "First sentence here. Second sentence here. Third one.";
let s = split_sentences(content);
let v = tfidf_vectors(&s);
let pr = pagerank(&v);
let out = select_headlines(content, &s, &pr, None, Tokenizer::O200k);
assert!(out.contains("- "));
}
#[test]
fn select_headlines_picks_deepest_depth_under_mixed_nesting() {
let content = "\
# A\n\
## A1\n\
Sentence under A1 here describing the thing.\n\
## A2\n\
Sentence under A2 here describing the other thing.\n\
# B\n\
## B1\n\
Sentence under B1 here describing a third thing.\n";
let s = split_sentences(content);
let v = tfidf_vectors(&s);
let pr = pagerank(&v);
let out = select_headlines(content, &s, &pr, None, Tokenizer::O200k);
assert!(out.contains("## A1"), "expected ## A1 in {out}");
assert!(out.contains("## A2"), "expected ## A2 in {out}");
assert!(out.contains("## B1"), "expected ## B1 in {out}");
for line in out.lines() {
if line.starts_with("# ") && !line.starts_with("## ") {
panic!("unexpected H1 in headlines output: {line}");
}
}
}
}
#[derive(Debug, Clone)]
pub struct ExtractiveBackend {
name: String,
tokenizer: Tokenizer,
}
impl ExtractiveBackend {
pub fn new(name: impl Into<String>, tokenizer: Tokenizer) -> Self {
Self {
name: name.into(),
tokenizer,
}
}
pub fn run(&self, content: &str, opts: &CompactOpts) -> String {
let sentences = split_sentences(content);
if sentences.is_empty() {
return String::new();
}
let vectors = tfidf_vectors(&sentences);
let scores = pagerank(&vectors);
match opts.mode {
CompactMode::Headlines => select_headlines(
content,
&sentences,
&scores,
opts.target_tokens,
self.tokenizer,
),
CompactMode::Extractive => {
let indices =
select_extractive(&sentences, &scores, opts.target_tokens, self.tokenizer);
format_selected(&sentences, &indices, opts.style)
}
CompactMode::Abstractive => {
let indices =
select_extractive(&sentences, &scores, opts.target_tokens, self.tokenizer);
format_selected(&sentences, &indices, opts.style)
}
}
}
}
#[async_trait]
impl SummarizerBackend for ExtractiveBackend {
async fn compact(&self, content: &str, opts: &CompactOpts) -> Result<String, BackendError> {
if content.trim().is_empty() {
return Err(BackendError::Invalid("empty content".to_string()));
}
Ok(self.run(content, opts))
}
fn name(&self) -> &str {
&self.name
}
}
#[cfg(test)]
mod trait_tests {
use super::*;
use crate::summarizer::backend::{CompactMode, PreserveSection, Style};
fn opts(mode: CompactMode, style: Style) -> CompactOpts {
CompactOpts {
mode,
style,
target_tokens: None,
focus: None,
preserve: vec![],
backend_name: "default".to_string(),
}
}
#[tokio::test]
async fn empty_content_returns_invalid_error() {
let be = ExtractiveBackend::new("default", Tokenizer::O200k);
let r = be
.compact(" ", &opts(CompactMode::Extractive, Style::Prose))
.await;
assert!(matches!(r, Err(BackendError::Invalid(_))));
}
#[tokio::test]
async fn extractive_returns_non_empty_for_real_content() {
let be = ExtractiveBackend::new("default", Tokenizer::O200k);
let content = "The cat sat on the mat. The dog ran away quickly. The bird flew south.";
let out = be
.compact(content, &opts(CompactMode::Extractive, Style::Prose))
.await
.unwrap();
assert!(!out.is_empty());
}
#[tokio::test]
async fn name_round_trips() {
let be = ExtractiveBackend::new("alt-name", Tokenizer::O200k);
assert_eq!(be.name(), "alt-name");
}
#[test]
fn preserve_unused_for_extractive_compiles() {
let _ = PreserveSection::Code;
}
}