graphrag_core/text/
extractive_summarizer.rs

1//! Real extractive summarization with sentence ranking
2//!
3//! This module implements actual sentence ranking algorithms for extractive
4//! summarization, not mock/placeholder implementations.
5
6use std::collections::{HashMap, HashSet};
7
8/// Extractive summarizer using sentence scoring
9pub struct ExtractiveSummarizer {
10    /// Stop words to ignore in scoring
11    stopwords: HashSet<String>,
12}
13
14impl ExtractiveSummarizer {
15    /// Create a new extractive summarizer
16    pub fn new() -> Self {
17        Self {
18            stopwords: Self::load_stopwords(),
19        }
20    }
21
22    /// Generate a summary of the given text
23    ///
24    /// # Arguments
25    /// * `text` - The input text to summarize
26    /// * `max_length` - Maximum character length of the summary
27    ///
28    /// # Returns
29    /// Extractive summary selecting the most important sentences
30    pub fn summarize(&self, text: &str, max_length: usize) -> crate::Result<String> {
31        // 1. Split into sentences
32        let sentences = self.split_sentences(text);
33
34        if sentences.is_empty() {
35            return Ok(String::new());
36        }
37
38        if sentences.len() == 1 {
39            let sentence = &sentences[0];
40            if sentence.len() <= max_length {
41                return Ok(sentence.clone());
42            } else {
43                return Ok(self.truncate_sentence(sentence, max_length));
44            }
45        }
46
47        // 2. Score each sentence
48        let scored_sentences: Vec<(usize, f32)> = sentences
49            .iter()
50            .enumerate()
51            .map(|(idx, sentence)| {
52                let score = self.score_sentence(sentence, &sentences, idx);
53                (idx, score)
54            })
55            .collect();
56
57        // 3. Select top sentences until we reach max_length
58        let selected_indices = self.select_sentences(scored_sentences, &sentences, max_length);
59
60        // 4. Combine selected sentences in original order
61        let summary = selected_indices
62            .iter()
63            .map(|&idx| sentences[idx].as_str())
64            .collect::<Vec<_>>()
65            .join(" ");
66
67        Ok(summary)
68    }
69
70    /// Split text into sentences using multiple heuristics
71    fn split_sentences(&self, text: &str) -> Vec<String> {
72        let mut sentences = Vec::new();
73        let mut current_sentence = String::new();
74
75        let sentence_endings = ['.', '!', '?'];
76
77        for ch in text.chars() {
78            current_sentence.push(ch);
79
80            if sentence_endings.contains(&ch) {
81                // Check if next character is whitespace or end of text
82                let trimmed = current_sentence.trim().to_string();
83                if !trimmed.is_empty() && trimmed.len() > 5 {
84                    // Filter out very short "sentences" (likely abbreviations)
85                    sentences.push(trimmed);
86                }
87                current_sentence.clear();
88            }
89        }
90
91        // Add any remaining text
92        let trimmed = current_sentence.trim().to_string();
93        if !trimmed.is_empty() && trimmed.len() > 5 {
94            sentences.push(trimmed);
95        }
96
97        sentences
98    }
99
100    /// Score a sentence based on multiple factors
101    ///
102    /// Scoring criteria:
103    /// 1. Position score (beginning and end sentences are important)
104    /// 2. Length score (prefer medium-length sentences)
105    /// 3. Word frequency score (content words that appear multiple times)
106    /// 4. Proper noun score (capitalized words)
107    /// 5. Numeric content score (sentences with numbers often contain facts)
108    fn score_sentence(&self, sentence: &str, all_sentences: &[String], position: usize) -> f32 {
109        let mut total_score = 0.0;
110
111        // 1. Position score: first and last sentences often contain key information
112        let position_score = if position == 0 {
113            2.0 // First sentence is very important
114        } else if position == all_sentences.len() - 1 {
115            1.5 // Last sentence is somewhat important
116        } else {
117            // Middle sentences get decreasing score based on distance from start
118            let distance_from_start = position as f32 / all_sentences.len() as f32;
119            1.0 - (distance_from_start * 0.5) // Gradually decrease from 1.0 to 0.5
120        };
121        total_score += position_score * 0.3;
122
123        // 2. Length score: prefer sentences of medium length
124        let words: Vec<&str> = sentence.split_whitespace().collect();
125        let word_count = words.len();
126
127        let length_score = if word_count < 5 {
128            0.3 // Too short
129        } else if word_count > 40 {
130            0.5 // Too long
131        } else if (10..=25).contains(&word_count) {
132            1.0 // Ideal length
133        } else {
134            0.7 // Acceptable length
135        };
136        total_score += length_score * 0.2;
137
138        // 3. Word frequency score: words that appear across multiple sentences
139        let word_freq_score = self.calculate_word_frequency_score(sentence, all_sentences);
140        total_score += word_freq_score * 0.3;
141
142        // 4. Proper noun score: sentences with capitalized words (names, places)
143        let proper_noun_score = self.calculate_proper_noun_score(sentence);
144        total_score += proper_noun_score * 0.1;
145
146        // 5. Numeric content score: sentences with numbers often state facts
147        let numeric_score = self.calculate_numeric_score(sentence);
148        total_score += numeric_score * 0.1;
149
150        total_score
151    }
152
153    /// Calculate word frequency score
154    fn calculate_word_frequency_score(&self, sentence: &str, all_sentences: &[String]) -> f32 {
155        // Get all words from all sentences
156        let all_words: Vec<String> = all_sentences
157            .iter()
158            .flat_map(|s| s.split_whitespace())
159            .map(|w| w.to_lowercase().trim_matches(|c: char| !c.is_alphanumeric()).to_string())
160            .filter(|w| !w.is_empty() && !self.stopwords.contains(w))
161            .collect();
162
163        // Count word frequencies
164        let mut word_counts: HashMap<String, usize> = HashMap::new();
165        for word in &all_words {
166            *word_counts.entry(word.clone()).or_insert(0) += 1;
167        }
168
169        // Score sentence based on its words' frequencies
170        let sentence_words: Vec<String> = sentence
171            .split_whitespace()
172            .map(|w| w.to_lowercase().trim_matches(|c: char| !c.is_alphanumeric()).to_string())
173            .filter(|w| !w.is_empty() && !self.stopwords.contains(w))
174            .collect();
175
176        if sentence_words.is_empty() {
177            return 0.0;
178        }
179
180        let total_score: usize = sentence_words
181            .iter()
182            .filter_map(|w| word_counts.get(w))
183            .sum();
184
185        let avg_score = total_score as f32 / sentence_words.len() as f32;
186
187        // Normalize (words appearing 2-3 times are good indicators)
188        (avg_score / 3.0).min(1.0)
189    }
190
191    /// Calculate proper noun score
192    fn calculate_proper_noun_score(&self, sentence: &str) -> f32 {
193        let words: Vec<&str> = sentence.split_whitespace().collect();
194        if words.is_empty() {
195            return 0.0;
196        }
197
198        let proper_noun_count = words
199            .iter()
200            .filter(|word| {
201                // Check if word starts with uppercase and is not sentence start
202                word.chars().next().map_or(false, |c| c.is_uppercase())
203                    && word.len() > 2
204                    && !self.stopwords.contains(&word.to_lowercase())
205            })
206            .count();
207
208        // Normalize by sentence length
209        (proper_noun_count as f32 / words.len() as f32).min(1.0)
210    }
211
212    /// Calculate numeric content score
213    fn calculate_numeric_score(&self, sentence: &str) -> f32 {
214        let has_number = sentence.chars().any(|c| c.is_numeric());
215
216        // Count numbers
217        let number_count = sentence
218            .split_whitespace()
219            .filter(|word| word.chars().any(|c| c.is_numeric()))
220            .count();
221
222        if has_number {
223            (number_count as f32 * 0.3).min(1.0)
224        } else {
225            0.0
226        }
227    }
228
229    /// Select sentences to include in summary
230    ///
231    /// Greedy selection: iteratively add highest-scoring sentences until max_length reached
232    fn select_sentences(
233        &self,
234        mut scored_sentences: Vec<(usize, f32)>,
235        sentences: &[String],
236        max_length: usize,
237    ) -> Vec<usize> {
238        // Sort by score (descending)
239        scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
240
241        let mut selected_indices = Vec::new();
242        let mut current_length = 0;
243
244        for &(idx, _score) in &scored_sentences {
245            let sentence_len = sentences[idx].len();
246
247            // Check if adding this sentence would exceed limit
248            if current_length + sentence_len + 1 <= max_length {
249                // +1 for space
250                selected_indices.push(idx);
251                current_length += sentence_len + 1;
252            }
253
254            // Early exit if we're close to the limit
255            if current_length >= max_length * 90 / 100 {
256                // 90% of max
257                break;
258            }
259        }
260
261        // Sort indices to maintain original order
262        selected_indices.sort_unstable();
263
264        // If no sentences fit, take the first (highest-scored) one and truncate
265        if selected_indices.is_empty() && !scored_sentences.is_empty() {
266            selected_indices.push(scored_sentences[0].0);
267        }
268
269        selected_indices
270    }
271
272    /// Truncate a sentence to fit within max_length
273    fn truncate_sentence(&self, sentence: &str, max_length: usize) -> String {
274        if sentence.len() <= max_length {
275            return sentence.to_string();
276        }
277
278        // Find a good breaking point (word boundary)
279        let mut end = max_length.saturating_sub(3); // Leave room for "..."
280
281        // Move back to last word boundary
282        while end > 0 && !sentence.is_char_boundary(end) {
283            end -= 1;
284        }
285
286        while end > 0 && !sentence.chars().nth(end).map_or(false, |c| c.is_whitespace()) {
287            end -= 1;
288        }
289
290        if end == 0 {
291            // Fallback: just cut at character boundary
292            end = max_length.saturating_sub(3);
293            while end > 0 && !sentence.is_char_boundary(end) {
294                end -= 1;
295            }
296        }
297
298        format!("{}...", &sentence[..end].trim())
299    }
300
301    /// Load stopwords
302    fn load_stopwords() -> HashSet<String> {
303        let stopwords_list = vec![
304            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
305            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
306            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
307            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
308            "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know",
309            "take", "people", "into", "year", "your", "good", "some", "could", "them", "see",
310            "other", "than", "then", "now", "look", "only", "come", "its", "over", "think",
311        ];
312
313        stopwords_list.into_iter().map(|s| s.to_string()).collect()
314    }
315
316    /// Summarize with a target number of sentences instead of character limit
317    pub fn summarize_sentences(&self, text: &str, num_sentences: usize) -> crate::Result<String> {
318        let sentences = self.split_sentences(text);
319
320        if sentences.is_empty() {
321            return Ok(String::new());
322        }
323
324        if sentences.len() <= num_sentences {
325            return Ok(sentences.join(" "));
326        }
327
328        // Score sentences
329        let mut scored_sentences: Vec<(usize, f32)> = sentences
330            .iter()
331            .enumerate()
332            .map(|(idx, sentence)| {
333                let score = self.score_sentence(sentence, &sentences, idx);
334                (idx, score)
335            })
336            .collect();
337
338        // Sort by score and take top N
339        scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
340
341        let mut selected_indices: Vec<usize> = scored_sentences
342            .into_iter()
343            .take(num_sentences)
344            .map(|(idx, _)| idx)
345            .collect();
346
347        // Sort indices to maintain original order
348        selected_indices.sort_unstable();
349
350        let summary = selected_indices
351            .iter()
352            .map(|&idx| sentences[idx].as_str())
353            .collect::<Vec<_>>()
354            .join(" ");
355
356        Ok(summary)
357    }
358}
359
360impl Default for ExtractiveSummarizer {
361    fn default() -> Self {
362        Self::new()
363    }
364}
365
366#[cfg(test)]
367mod tests {
368    use super::*;
369
370    #[test]
371    fn test_sentence_splitting() {
372        let summarizer = ExtractiveSummarizer::new();
373        let text = "This is the first sentence. This is the second! Is this the third?";
374        let sentences = summarizer.split_sentences(text);
375
376        assert_eq!(sentences.len(), 3);
377        assert!(sentences[0].contains("first sentence"));
378        assert!(sentences[1].contains("second"));
379        assert!(sentences[2].contains("third"));
380    }
381
382    #[test]
383    fn test_summarization() {
384        let summarizer = ExtractiveSummarizer::new();
385        let text = "Machine learning is a subset of artificial intelligence. \
386                    It focuses on training algorithms to learn from data. \
387                    Deep learning is a specialized branch of machine learning. \
388                    Neural networks are the foundation of deep learning systems.";
389
390        let summary = summarizer.summarize(text, 100).unwrap();
391
392        assert!(!summary.is_empty());
393        assert!(summary.len() <= 100);
394        // Summary should contain content from the original text
395        assert!(summary.contains("machine learning") || summary.contains("artificial intelligence"));
396    }
397
398    #[test]
399    fn test_sentence_selection() {
400        let summarizer = ExtractiveSummarizer::new();
401        let text = "The quick brown fox jumps over the lazy dog. \
402                    This is a simple test sentence. \
403                    Machine learning and artificial intelligence are transforming technology.";
404
405        let summary = summarizer.summarize_sentences(text, 1).unwrap();
406
407        // Should select one sentence (likely the first or third based on content)
408        let sentence_count = summary.matches('.').count() + summary.matches('!').count() + summary.matches('?').count();
409        assert!(sentence_count <= 2); // Allow for edge cases
410    }
411
412    #[test]
413    fn test_truncation() {
414        let summarizer = ExtractiveSummarizer::new();
415        let long_sentence = "This is a very long sentence that needs to be truncated because it exceeds the maximum allowed length for the summary";
416
417        let truncated = summarizer.truncate_sentence(long_sentence, 50);
418
419        assert!(truncated.len() <= 50);
420        assert!(truncated.ends_with("..."));
421    }
422
423    #[test]
424    fn test_empty_text() {
425        let summarizer = ExtractiveSummarizer::new();
426        let summary = summarizer.summarize("", 100).unwrap();
427        assert_eq!(summary, "");
428    }
429
430    #[test]
431    fn test_single_sentence() {
432        let summarizer = ExtractiveSummarizer::new();
433        let text = "This is a single sentence.";
434        let summary = summarizer.summarize(text, 100).unwrap();
435
436        assert_eq!(summary, text);
437    }
438}
graphrag_core/text/extractive_summarizer.rs

graphrag_core/text/
extractive_summarizer.rs