graphrag_core/text/
extractive_summarizer.rs

1//! Real extractive summarization with sentence ranking
2//!
3//! This module implements actual sentence ranking algorithms for extractive
4//! summarization, not mock/placeholder implementations.
5
6use std::collections::{HashMap, HashSet};
7
8/// Extractive summarizer using sentence scoring
9pub struct ExtractiveSummarizer {
10    /// Stop words to ignore in scoring
11    stopwords: HashSet<String>,
12}
13
14impl ExtractiveSummarizer {
15    /// Create a new extractive summarizer
16    pub fn new() -> Self {
17        Self {
18            stopwords: Self::load_stopwords(),
19        }
20    }
21
22    /// Generate a summary of the given text
23    ///
24    /// # Arguments
25    /// * `text` - The input text to summarize
26    /// * `max_length` - Maximum character length of the summary
27    ///
28    /// # Returns
29    /// Extractive summary selecting the most important sentences
30    pub fn summarize(&self, text: &str, max_length: usize) -> crate::Result<String> {
31        // 1. Split into sentences
32        let sentences = self.split_sentences(text);
33
34        if sentences.is_empty() {
35            return Ok(String::new());
36        }
37
38        if sentences.len() == 1 {
39            let sentence = &sentences[0];
40            if sentence.len() <= max_length {
41                return Ok(sentence.clone());
42            } else {
43                return Ok(self.truncate_sentence(sentence, max_length));
44            }
45        }
46
47        // 2. Score each sentence
48        let scored_sentences: Vec<(usize, f32)> = sentences
49            .iter()
50            .enumerate()
51            .map(|(idx, sentence)| {
52                let score = self.score_sentence(sentence, &sentences, idx);
53                (idx, score)
54            })
55            .collect();
56
57        // 3. Select top sentences until we reach max_length
58        let selected_indices = self.select_sentences(scored_sentences, &sentences, max_length);
59
60        // 4. Combine selected sentences in original order
61        let summary = selected_indices
62            .iter()
63            .map(|&idx| sentences[idx].as_str())
64            .collect::<Vec<_>>()
65            .join(" ");
66
67        Ok(summary)
68    }
69
70    /// Split text into sentences using multiple heuristics
71    fn split_sentences(&self, text: &str) -> Vec<String> {
72        let mut sentences = Vec::new();
73        let mut current_sentence = String::new();
74
75        let sentence_endings = ['.', '!', '?'];
76
77        for ch in text.chars() {
78            current_sentence.push(ch);
79
80            if sentence_endings.contains(&ch) {
81                // Check if next character is whitespace or end of text
82                let trimmed = current_sentence.trim().to_string();
83                if !trimmed.is_empty() && trimmed.len() > 5 {
84                    // Filter out very short "sentences" (likely abbreviations)
85                    sentences.push(trimmed);
86                }
87                current_sentence.clear();
88            }
89        }
90
91        // Add any remaining text
92        let trimmed = current_sentence.trim().to_string();
93        if !trimmed.is_empty() && trimmed.len() > 5 {
94            sentences.push(trimmed);
95        }
96
97        sentences
98    }
99
100    /// Score a sentence based on multiple factors
101    ///
102    /// Scoring criteria:
103    /// 1. Position score (beginning and end sentences are important)
104    /// 2. Length score (prefer medium-length sentences)
105    /// 3. Word frequency score (content words that appear multiple times)
106    /// 4. Proper noun score (capitalized words)
107    /// 5. Numeric content score (sentences with numbers often contain facts)
108    fn score_sentence(&self, sentence: &str, all_sentences: &[String], position: usize) -> f32 {
109        let mut total_score = 0.0;
110
111        // 1. Position score: first and last sentences often contain key information
112        let position_score = if position == 0 {
113            2.0 // First sentence is very important
114        } else if position == all_sentences.len() - 1 {
115            1.5 // Last sentence is somewhat important
116        } else {
117            // Middle sentences get decreasing score based on distance from start
118            let distance_from_start = position as f32 / all_sentences.len() as f32;
119            1.0 - (distance_from_start * 0.5) // Gradually decrease from 1.0 to 0.5
120        };
121        total_score += position_score * 0.3;
122
123        // 2. Length score: prefer sentences of medium length
124        let words: Vec<&str> = sentence.split_whitespace().collect();
125        let word_count = words.len();
126
127        let length_score = if word_count < 5 {
128            0.3 // Too short
129        } else if word_count > 40 {
130            0.5 // Too long
131        } else if (10..=25).contains(&word_count) {
132            1.0 // Ideal length
133        } else {
134            0.7 // Acceptable length
135        };
136        total_score += length_score * 0.2;
137
138        // 3. Word frequency score: words that appear across multiple sentences
139        let word_freq_score = self.calculate_word_frequency_score(sentence, all_sentences);
140        total_score += word_freq_score * 0.3;
141
142        // 4. Proper noun score: sentences with capitalized words (names, places)
143        let proper_noun_score = self.calculate_proper_noun_score(sentence);
144        total_score += proper_noun_score * 0.1;
145
146        // 5. Numeric content score: sentences with numbers often state facts
147        let numeric_score = self.calculate_numeric_score(sentence);
148        total_score += numeric_score * 0.1;
149
150        total_score
151    }
152
153    /// Calculate word frequency score
154    fn calculate_word_frequency_score(&self, sentence: &str, all_sentences: &[String]) -> f32 {
155        // Get all words from all sentences
156        let all_words: Vec<String> = all_sentences
157            .iter()
158            .flat_map(|s| s.split_whitespace())
159            .map(|w| {
160                w.to_lowercase()
161                    .trim_matches(|c: char| !c.is_alphanumeric())
162                    .to_string()
163            })
164            .filter(|w| !w.is_empty() && !self.stopwords.contains(w))
165            .collect();
166
167        // Count word frequencies
168        let mut word_counts: HashMap<String, usize> = HashMap::new();
169        for word in &all_words {
170            *word_counts.entry(word.clone()).or_insert(0) += 1;
171        }
172
173        // Score sentence based on its words' frequencies
174        let sentence_words: Vec<String> = sentence
175            .split_whitespace()
176            .map(|w| {
177                w.to_lowercase()
178                    .trim_matches(|c: char| !c.is_alphanumeric())
179                    .to_string()
180            })
181            .filter(|w| !w.is_empty() && !self.stopwords.contains(w))
182            .collect();
183
184        if sentence_words.is_empty() {
185            return 0.0;
186        }
187
188        let total_score: usize = sentence_words
189            .iter()
190            .filter_map(|w| word_counts.get(w))
191            .sum();
192
193        let avg_score = total_score as f32 / sentence_words.len() as f32;
194
195        // Normalize (words appearing 2-3 times are good indicators)
196        (avg_score / 3.0).min(1.0)
197    }
198
199    /// Calculate proper noun score
200    fn calculate_proper_noun_score(&self, sentence: &str) -> f32 {
201        let words: Vec<&str> = sentence.split_whitespace().collect();
202        if words.is_empty() {
203            return 0.0;
204        }
205
206        let proper_noun_count = words
207            .iter()
208            .filter(|word| {
209                // Check if word starts with uppercase and is not sentence start
210                word.chars().next().is_some_and(|c| c.is_uppercase())
211                    && word.len() > 2
212                    && !self.stopwords.contains(&word.to_lowercase())
213            })
214            .count();
215
216        // Normalize by sentence length
217        (proper_noun_count as f32 / words.len() as f32).min(1.0)
218    }
219
220    /// Calculate numeric content score
221    fn calculate_numeric_score(&self, sentence: &str) -> f32 {
222        let has_number = sentence.chars().any(|c| c.is_numeric());
223
224        // Count numbers
225        let number_count = sentence
226            .split_whitespace()
227            .filter(|word| word.chars().any(|c| c.is_numeric()))
228            .count();
229
230        if has_number {
231            (number_count as f32 * 0.3).min(1.0)
232        } else {
233            0.0
234        }
235    }
236
237    /// Select sentences to include in summary
238    ///
239    /// Greedy selection: iteratively add highest-scoring sentences until max_length reached
240    fn select_sentences(
241        &self,
242        mut scored_sentences: Vec<(usize, f32)>,
243        sentences: &[String],
244        max_length: usize,
245    ) -> Vec<usize> {
246        // Sort by score (descending)
247        scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
248
249        let mut selected_indices = Vec::new();
250        let mut current_length = 0;
251
252        for &(idx, _score) in &scored_sentences {
253            let sentence_len = sentences[idx].len();
254
255            // Check if adding this sentence would exceed limit
256            if current_length + sentence_len < max_length {
257                // +1 for space
258                selected_indices.push(idx);
259                current_length += sentence_len + 1;
260            }
261
262            // Early exit if we're close to the limit
263            if current_length >= max_length * 90 / 100 {
264                // 90% of max
265                break;
266            }
267        }
268
269        // Sort indices to maintain original order
270        selected_indices.sort_unstable();
271
272        // If no sentences fit, take the first (highest-scored) one and truncate
273        if selected_indices.is_empty() && !scored_sentences.is_empty() {
274            selected_indices.push(scored_sentences[0].0);
275        }
276
277        selected_indices
278    }
279
280    /// Truncate a sentence to fit within max_length
281    fn truncate_sentence(&self, sentence: &str, max_length: usize) -> String {
282        if sentence.len() <= max_length {
283            return sentence.to_string();
284        }
285
286        // Find a good breaking point (word boundary)
287        let mut end = max_length.saturating_sub(3); // Leave room for "..."
288
289        // Move back to last word boundary
290        while end > 0 && !sentence.is_char_boundary(end) {
291            end -= 1;
292        }
293
294        while end > 0 && !sentence.chars().nth(end).is_some_and(|c| c.is_whitespace()) {
295            end -= 1;
296        }
297
298        if end == 0 {
299            // Fallback: just cut at character boundary
300            end = max_length.saturating_sub(3);
301            while end > 0 && !sentence.is_char_boundary(end) {
302                end -= 1;
303            }
304        }
305
306        format!("{}...", &sentence[..end].trim())
307    }
308
309    /// Load stopwords
310    fn load_stopwords() -> HashSet<String> {
311        let stopwords_list = vec![
312            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
313            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
314            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
315            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
316            "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
317            "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
318            "than", "then", "now", "look", "only", "come", "its", "over", "think",
319        ];
320
321        stopwords_list.into_iter().map(|s| s.to_string()).collect()
322    }
323
324    /// Summarize with a target number of sentences instead of character limit
325    pub fn summarize_sentences(&self, text: &str, num_sentences: usize) -> crate::Result<String> {
326        let sentences = self.split_sentences(text);
327
328        if sentences.is_empty() {
329            return Ok(String::new());
330        }
331
332        if sentences.len() <= num_sentences {
333            return Ok(sentences.join(" "));
334        }
335
336        // Score sentences
337        let mut scored_sentences: Vec<(usize, f32)> = sentences
338            .iter()
339            .enumerate()
340            .map(|(idx, sentence)| {
341                let score = self.score_sentence(sentence, &sentences, idx);
342                (idx, score)
343            })
344            .collect();
345
346        // Sort by score and take top N
347        scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
348
349        let mut selected_indices: Vec<usize> = scored_sentences
350            .into_iter()
351            .take(num_sentences)
352            .map(|(idx, _)| idx)
353            .collect();
354
355        // Sort indices to maintain original order
356        selected_indices.sort_unstable();
357
358        let summary = selected_indices
359            .iter()
360            .map(|&idx| sentences[idx].as_str())
361            .collect::<Vec<_>>()
362            .join(" ");
363
364        Ok(summary)
365    }
366}
367
368impl Default for ExtractiveSummarizer {
369    fn default() -> Self {
370        Self::new()
371    }
372}
373
374#[cfg(test)]
375mod tests {
376    use super::*;
377
378    #[test]
379    fn test_sentence_splitting() {
380        let summarizer = ExtractiveSummarizer::new();
381        let text = "This is the first sentence. This is the second! Is this the third?";
382        let sentences = summarizer.split_sentences(text);
383
384        assert_eq!(sentences.len(), 3);
385        assert!(sentences[0].contains("first sentence"));
386        assert!(sentences[1].contains("second"));
387        assert!(sentences[2].contains("third"));
388    }
389
390    #[test]
391    fn test_summarization() {
392        let summarizer = ExtractiveSummarizer::new();
393        let text = "Machine learning is a subset of artificial intelligence. \
394                    It focuses on training algorithms to learn from data. \
395                    Deep learning is a specialized branch of machine learning. \
396                    Neural networks are the foundation of deep learning systems.";
397
398        let summary = summarizer.summarize(text, 100).unwrap();
399
400        assert!(!summary.is_empty());
401        assert!(summary.len() <= 100);
402        // Summary should contain content from the original text
403        assert!(
404            summary.contains("machine learning") || summary.contains("artificial intelligence")
405        );
406    }
407
408    #[test]
409    fn test_sentence_selection() {
410        let summarizer = ExtractiveSummarizer::new();
411        let text = "The quick brown fox jumps over the lazy dog. \
412                    This is a simple test sentence. \
413                    Machine learning and artificial intelligence are transforming technology.";
414
415        let summary = summarizer.summarize_sentences(text, 1).unwrap();
416
417        // Should select one sentence (likely the first or third based on content)
418        let sentence_count = summary.matches('.').count()
419            + summary.matches('!').count()
420            + summary.matches('?').count();
421        assert!(sentence_count <= 2); // Allow for edge cases
422    }
423
424    #[test]
425    fn test_truncation() {
426        let summarizer = ExtractiveSummarizer::new();
427        let long_sentence = "This is a very long sentence that needs to be truncated because it exceeds the maximum allowed length for the summary";
428
429        let truncated = summarizer.truncate_sentence(long_sentence, 50);
430
431        assert!(truncated.len() <= 50);
432        assert!(truncated.ends_with("..."));
433    }
434
435    #[test]
436    fn test_empty_text() {
437        let summarizer = ExtractiveSummarizer::new();
438        let summary = summarizer.summarize("", 100).unwrap();
439        assert_eq!(summary, "");
440    }
441
442    #[test]
443    fn test_single_sentence() {
444        let summarizer = ExtractiveSummarizer::new();
445        let text = "This is a single sentence.";
446        let summary = summarizer.summarize(text, 100).unwrap();
447
448        assert_eq!(summary, text);
449    }
450}
graphrag_core/text/extractive_summarizer.rs

graphrag_core/text/
extractive_summarizer.rs