use crate::types::{ChatMessage, MessageRole};
use unicode_segmentation::UnicodeSegmentation;
use crate::estimator::TokenEstimator;
/// Extractive summarizer for chat transcripts.
///
/// This is a unit struct with no fields; its functionality is exposed through
/// the associated function [`ExtractiveSummarizer::summarize`], which takes no
/// `self` receiver.
#[derive(Debug)]
pub struct ExtractiveSummarizer;
impl ExtractiveSummarizer {
    /// Builds an extractive summary of `messages` that fits within `max_tokens`.
    ///
    /// Every message is split into sentences, each sentence is scored with
    /// `compute_sentence_score`, and the sentences are then taken in descending
    /// score order until the next one would no longer fit the budget (as
    /// measured by `TokenEstimator::estimate_tokens`). The chosen sentences are
    /// joined with single spaces in score order, not in conversation order.
    ///
    /// Selection stops at the first sentence that does not fit; lower-scored
    /// sentences are not tried afterwards.
    ///
    /// If no sentence fits at all, the first sentence of the most recent user
    /// message is returned, shortened by `truncate_to_tokens`. When there is no
    /// user message (or it contains no sentences) the result is empty.
    ///
    /// Returns an empty string for an empty `messages` slice.
    #[must_use]
    pub fn summarize(messages: &[ChatMessage], max_tokens: u32) -> String {
        if messages.is_empty() {
            return String::new();
        }

        // Score every sentence of every message.
        let mut ranked: Vec<ScoredSentence> = Vec::new();
        for (msg_idx, msg) in messages.iter().enumerate() {
            let sentences = extract_sentences(&msg.content);
            let total_sentences = sentences.len();
            ranked.extend(sentences.into_iter().enumerate().map(|(sent_idx, text)| {
                let score =
                    compute_sentence_score(&text, msg.role, msg_idx, sent_idx, total_sentences);
                ScoredSentence { text, score }
            }));
        }

        // Highest score first. `sort_by` is stable, so equally scored sentences
        // keep their conversation order; incomparable scores are treated as equal.
        ranked.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // Consume the ranked list so selected sentences are moved, not cloned.
        let mut selected: Vec<String> = Vec::new();
        let mut tokens_used: u32 = 0;
        for candidate in ranked {
            let cost = TokenEstimator::estimate_tokens(&candidate.text);
            // `tokens_used <= max_tokens` holds on every iteration, so the
            // subtraction cannot underflow, and comparing against the remaining
            // budget avoids the overflow that `tokens_used + cost` could hit.
            if cost > max_tokens - tokens_used {
                break;
            }
            tokens_used += cost;
            selected.push(candidate.text);
        }

        if selected.is_empty() {
            // Nothing fit the budget: fall back to a truncated lead sentence of
            // the most recent user message.
            return messages
                .iter()
                .rev()
                .find(|m| m.role == MessageRole::User)
                .and_then(|m| extract_sentences(&m.content).into_iter().next())
                .map(|s| truncate_to_tokens(&s, max_tokens))
                .unwrap_or_default();
        }

        selected.join(" ")
    }
}
/// A candidate sentence paired with the relevance score used to rank it
/// inside `ExtractiveSummarizer::summarize`.
struct ScoredSentence {
/// Sentence text as returned by `extract_sentences`.
text: String,
/// Heuristic relevance from `compute_sentence_score`; higher values are
/// selected first.
score: f64,
}
/// Splits `text` into sentences using Unicode sentence segmentation.
///
/// Each segment is trimmed of surrounding whitespace; segments that are empty
/// after trimming are dropped. The remaining sentences are returned as owned
/// strings in their original order.
fn extract_sentences(text: &str) -> Vec<String> {
    text.unicode_sentences()
        .filter_map(|segment| {
            let trimmed = segment.trim();
            if trimmed.is_empty() {
                None
            } else {
                Some(trimmed.to_string())
            }
        })
        .collect()
}
/// Computes the heuristic relevance score of one sentence.
///
/// The score is the sum of the following terms, added in this order:
///
/// * a role weight (user 1.5, assistant 1.2, tool 0.8, system 0.5);
/// * `msg_position * 0.1`, so messages with a larger index score higher;
/// * a position bonus: 0.8 for the first sentence of a message, otherwise 0.4
///   for the last sentence when the message has more than one;
/// * the value of `information_density(sentence)`;
/// * a length penalty based on the sentence's byte length: 0.5 is subtracted
///   below 10 bytes, 0.2 above 200 bytes.
fn compute_sentence_score(
    sentence: &str,
    role: MessageRole,
    msg_position: usize,
    sent_position: usize,
    total_sentences: usize,
) -> f64 {
    let role_weight = match role {
        MessageRole::User => 1.5,
        MessageRole::Assistant => 1.2,
        MessageRole::Tool => 0.8,
        MessageRole::System => 0.5,
    };

    #[allow(clippy::cast_precision_loss)]
    let recency_bonus = (msg_position as f64) * 0.1;

    let is_first = sent_position == 0;
    // The `> 1` guard also keeps `total_sentences - 1` from underflowing.
    let is_last = total_sentences > 1 && sent_position == total_sentences - 1;
    let position_bonus = if is_first {
        0.8
    } else if is_last {
        0.4
    } else {
        0.0
    };

    // `str::len` is a byte count, not a character count.
    let length_penalty = match sentence.len() {
        0..=9 => 0.5,
        10..=200 => 0.0,
        _ => 0.2,
    };

    // Left-to-right evaluation keeps the same floating-point summation order
    // as accumulating the terms one by one.
    role_weight + recency_bonus + position_bonus + information_density(sentence) - length_penalty
}
/// Estimates how information-rich a sentence is, as an average per word.
///
/// For each whitespace-separated word:
///
/// * 1.0 is added if the word contains an ASCII digit;
/// * 0.5 is added if the word is longer than one byte, starts with an
///   uppercase character and has a lowercase character somewhere after it.
///
/// The total is divided by the number of words and capped at 2.0. A sentence
/// with no words yields 0.0.
fn information_density(sentence: &str) -> f64 {
    let mut word_total: usize = 0;
    let mut raw_score = 0.0_f64;

    for token in sentence.split_whitespace() {
        word_total += 1;

        let has_digit = token.chars().any(|ch| ch.is_ascii_digit());
        if has_digit {
            raw_score += 1.0;
        }

        // After pulling the first character, `rest` covers exactly the
        // characters from the second one onwards.
        let mut rest = token.chars();
        let leads_upper = rest.next().map_or(false, char::is_uppercase);
        let capitalized = token.len() > 1 && leads_upper && rest.any(char::is_lowercase);
        if capitalized {
            raw_score += 0.5;
        }
    }

    if word_total == 0 {
        return 0.0;
    }

    #[allow(clippy::cast_precision_loss)]
    let word_total = word_total as f64;
    (raw_score / word_total).min(2.0)
}
/// Shortens `text` so that it holds at most `max_tokens * 4` characters.
///
/// The budget is converted to characters at a fixed 4 characters per token
/// (NOTE(review): presumably mirrors `TokenEstimator`'s ratio — confirm).
///
/// * Text that already fits is returned unchanged.
/// * Longer text is cut on a character boundary and suffixed with `"..."`;
///   the ellipsis counts towards the character budget.
/// * A zero budget yields an empty string.
///
/// Lengths are measured in `char`s rather than bytes, so multi-byte text that
/// fits the budget is not given a spurious ellipsis.
fn truncate_to_tokens(text: &str, max_tokens: u32) -> String {
    const CHARS_PER_TOKEN: usize = 4;
    const ELLIPSIS: &str = "...";

    // Saturate instead of overflowing on targets with a narrow `usize`.
    let max_chars = (max_tokens as usize).saturating_mul(CHARS_PER_TOKEN);

    // No character at index `max_chars` means the text has at most
    // `max_chars` characters; this stops scanning as soon as the budget is hit.
    if text.char_indices().nth(max_chars).is_none() {
        return text.to_string();
    }

    // Budget too small to hold the ellipsis: hard cut without it.
    if max_chars < ELLIPSIS.len() {
        return text.chars().take(max_chars).collect();
    }

    let mut truncated: String = text.chars().take(max_chars - ELLIPSIS.len()).collect();
    truncated.push_str(ELLIPSIS);
    truncated
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty conversation summarizes to an empty string.
    #[test]
    fn empty_messages_produce_empty_summary() {
        let summary = ExtractiveSummarizer::summarize(&[], 100);
        assert!(summary.is_empty());
    }

    /// A single user question survives into the summary.
    #[test]
    fn single_message_extracts_sentence() {
        let conversation = [ChatMessage::user("What is the capital of France?")];
        let summary = ExtractiveSummarizer::summarize(&conversation, 100);
        assert!(!summary.is_empty());
        assert!(summary.contains("France"));
    }

    /// With a budget of 10 tokens the estimated size of the summary stays
    /// at or below 15 tokens (the assertion allows some slack over the budget).
    #[test]
    fn respects_token_budget() {
        let question = ChatMessage::user("Tell me about quantum computing and its applications.");
        let answer = ChatMessage::assistant(
            "Quantum computing uses qubits. It has applications in cryptography. \
             Drug discovery benefits greatly. Machine learning is another area.",
        );
        let conversation = [question, answer];

        let summary = ExtractiveSummarizer::summarize(&conversation, 10);
        let tokens = TokenEstimator::estimate_tokens(&summary);
        assert!(tokens <= 15, "tokens {tokens} exceeds budget");
    }

    /// User or assistant content appears in the summary of a short
    /// system/user/assistant exchange.
    #[test]
    fn prefers_user_messages() {
        let conversation = [
            ChatMessage::system("You are helpful."),
            ChatMessage::user("What is 2 + 2?"),
            ChatMessage::assistant("The answer is 4."),
        ];

        let result = ExtractiveSummarizer::summarize(&conversation, 50);
        let mentions_question = result.contains("2 + 2");
        let mentions_answer = result.contains("answer");
        assert!(
            mentions_question || mentions_answer,
            "Expected user/assistant content in summary, got: {result}"
        );
    }

    /// Sentences containing numbers score a higher density than plain prose.
    #[test]
    fn information_density_scores_numbers_higher() {
        let numeric = information_density("The temperature is 22 degrees on March 15");
        let plain = information_density("This is a simple sentence");
        assert!(numeric > plain);
    }

    /// Empty input produces no sentences.
    #[test]
    fn extract_sentences_handles_empty() {
        let sentences = extract_sentences("");
        assert!(sentences.is_empty());
    }
}