1use std::collections::{HashMap, HashSet};
7
/// Extractive text summarizer.
///
/// Scores each sentence with a blend of heuristics (position, length,
/// document-wide word frequency, proper-noun density, numeric content)
/// and selects the highest-scoring sentences up to a length or
/// sentence-count budget. No model or external data is required.
pub struct ExtractiveSummarizer {
    // Lowercased English function words excluded from the frequency and
    // proper-noun heuristics; built once by `load_stopwords`.
    stopwords: HashSet<String>,
}
13
14impl ExtractiveSummarizer {
15 pub fn new() -> Self {
17 Self {
18 stopwords: Self::load_stopwords(),
19 }
20 }
21
22 pub fn summarize(&self, text: &str, max_length: usize) -> crate::Result<String> {
31 let sentences = self.split_sentences(text);
33
34 if sentences.is_empty() {
35 return Ok(String::new());
36 }
37
38 if sentences.len() == 1 {
39 let sentence = &sentences[0];
40 if sentence.len() <= max_length {
41 return Ok(sentence.clone());
42 } else {
43 return Ok(self.truncate_sentence(sentence, max_length));
44 }
45 }
46
47 let scored_sentences: Vec<(usize, f32)> = sentences
49 .iter()
50 .enumerate()
51 .map(|(idx, sentence)| {
52 let score = self.score_sentence(sentence, &sentences, idx);
53 (idx, score)
54 })
55 .collect();
56
57 let selected_indices = self.select_sentences(scored_sentences, &sentences, max_length);
59
60 let summary = selected_indices
62 .iter()
63 .map(|&idx| sentences[idx].as_str())
64 .collect::<Vec<_>>()
65 .join(" ");
66
67 Ok(summary)
68 }
69
70 fn split_sentences(&self, text: &str) -> Vec<String> {
72 let mut sentences = Vec::new();
73 let mut current_sentence = String::new();
74
75 let sentence_endings = ['.', '!', '?'];
76
77 for ch in text.chars() {
78 current_sentence.push(ch);
79
80 if sentence_endings.contains(&ch) {
81 let trimmed = current_sentence.trim().to_string();
83 if !trimmed.is_empty() && trimmed.len() > 5 {
84 sentences.push(trimmed);
86 }
87 current_sentence.clear();
88 }
89 }
90
91 let trimmed = current_sentence.trim().to_string();
93 if !trimmed.is_empty() && trimmed.len() > 5 {
94 sentences.push(trimmed);
95 }
96
97 sentences
98 }
99
100 fn score_sentence(&self, sentence: &str, all_sentences: &[String], position: usize) -> f32 {
109 let mut total_score = 0.0;
110
111 let position_score = if position == 0 {
113 2.0 } else if position == all_sentences.len() - 1 {
115 1.5 } else {
117 let distance_from_start = position as f32 / all_sentences.len() as f32;
119 1.0 - (distance_from_start * 0.5) };
121 total_score += position_score * 0.3;
122
123 let words: Vec<&str> = sentence.split_whitespace().collect();
125 let word_count = words.len();
126
127 let length_score = if word_count < 5 {
128 0.3 } else if word_count > 40 {
130 0.5 } else if (10..=25).contains(&word_count) {
132 1.0 } else {
134 0.7 };
136 total_score += length_score * 0.2;
137
138 let word_freq_score = self.calculate_word_frequency_score(sentence, all_sentences);
140 total_score += word_freq_score * 0.3;
141
142 let proper_noun_score = self.calculate_proper_noun_score(sentence);
144 total_score += proper_noun_score * 0.1;
145
146 let numeric_score = self.calculate_numeric_score(sentence);
148 total_score += numeric_score * 0.1;
149
150 total_score
151 }
152
153 fn calculate_word_frequency_score(&self, sentence: &str, all_sentences: &[String]) -> f32 {
155 let all_words: Vec<String> = all_sentences
157 .iter()
158 .flat_map(|s| s.split_whitespace())
159 .map(|w| {
160 w.to_lowercase()
161 .trim_matches(|c: char| !c.is_alphanumeric())
162 .to_string()
163 })
164 .filter(|w| !w.is_empty() && !self.stopwords.contains(w))
165 .collect();
166
167 let mut word_counts: HashMap<String, usize> = HashMap::new();
169 for word in &all_words {
170 *word_counts.entry(word.clone()).or_insert(0) += 1;
171 }
172
173 let sentence_words: Vec<String> = sentence
175 .split_whitespace()
176 .map(|w| {
177 w.to_lowercase()
178 .trim_matches(|c: char| !c.is_alphanumeric())
179 .to_string()
180 })
181 .filter(|w| !w.is_empty() && !self.stopwords.contains(w))
182 .collect();
183
184 if sentence_words.is_empty() {
185 return 0.0;
186 }
187
188 let total_score: usize = sentence_words
189 .iter()
190 .filter_map(|w| word_counts.get(w))
191 .sum();
192
193 let avg_score = total_score as f32 / sentence_words.len() as f32;
194
195 (avg_score / 3.0).min(1.0)
197 }
198
199 fn calculate_proper_noun_score(&self, sentence: &str) -> f32 {
201 let words: Vec<&str> = sentence.split_whitespace().collect();
202 if words.is_empty() {
203 return 0.0;
204 }
205
206 let proper_noun_count = words
207 .iter()
208 .filter(|word| {
209 word.chars().next().is_some_and(|c| c.is_uppercase())
211 && word.len() > 2
212 && !self.stopwords.contains(&word.to_lowercase())
213 })
214 .count();
215
216 (proper_noun_count as f32 / words.len() as f32).min(1.0)
218 }
219
220 fn calculate_numeric_score(&self, sentence: &str) -> f32 {
222 let has_number = sentence.chars().any(|c| c.is_numeric());
223
224 let number_count = sentence
226 .split_whitespace()
227 .filter(|word| word.chars().any(|c| c.is_numeric()))
228 .count();
229
230 if has_number {
231 (number_count as f32 * 0.3).min(1.0)
232 } else {
233 0.0
234 }
235 }
236
237 fn select_sentences(
241 &self,
242 mut scored_sentences: Vec<(usize, f32)>,
243 sentences: &[String],
244 max_length: usize,
245 ) -> Vec<usize> {
246 scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
248
249 let mut selected_indices = Vec::new();
250 let mut current_length = 0;
251
252 for &(idx, _score) in &scored_sentences {
253 let sentence_len = sentences[idx].len();
254
255 if current_length + sentence_len < max_length {
257 selected_indices.push(idx);
259 current_length += sentence_len + 1;
260 }
261
262 if current_length >= max_length * 90 / 100 {
264 break;
266 }
267 }
268
269 selected_indices.sort_unstable();
271
272 if selected_indices.is_empty() && !scored_sentences.is_empty() {
274 selected_indices.push(scored_sentences[0].0);
275 }
276
277 selected_indices
278 }
279
280 fn truncate_sentence(&self, sentence: &str, max_length: usize) -> String {
282 if sentence.len() <= max_length {
283 return sentence.to_string();
284 }
285
286 let mut end = max_length.saturating_sub(3); while end > 0 && !sentence.is_char_boundary(end) {
291 end -= 1;
292 }
293
294 while end > 0 && !sentence.chars().nth(end).is_some_and(|c| c.is_whitespace()) {
295 end -= 1;
296 }
297
298 if end == 0 {
299 end = max_length.saturating_sub(3);
301 while end > 0 && !sentence.is_char_boundary(end) {
302 end -= 1;
303 }
304 }
305
306 format!("{}...", &sentence[..end].trim())
307 }
308
309 fn load_stopwords() -> HashSet<String> {
311 let stopwords_list = vec![
312 "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
313 "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
314 "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
315 "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
316 "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
317 "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
318 "than", "then", "now", "look", "only", "come", "its", "over", "think",
319 ];
320
321 stopwords_list.into_iter().map(|s| s.to_string()).collect()
322 }
323
324 pub fn summarize_sentences(&self, text: &str, num_sentences: usize) -> crate::Result<String> {
326 let sentences = self.split_sentences(text);
327
328 if sentences.is_empty() {
329 return Ok(String::new());
330 }
331
332 if sentences.len() <= num_sentences {
333 return Ok(sentences.join(" "));
334 }
335
336 let mut scored_sentences: Vec<(usize, f32)> = sentences
338 .iter()
339 .enumerate()
340 .map(|(idx, sentence)| {
341 let score = self.score_sentence(sentence, &sentences, idx);
342 (idx, score)
343 })
344 .collect();
345
346 scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
348
349 let mut selected_indices: Vec<usize> = scored_sentences
350 .into_iter()
351 .take(num_sentences)
352 .map(|(idx, _)| idx)
353 .collect();
354
355 selected_indices.sort_unstable();
357
358 let summary = selected_indices
359 .iter()
360 .map(|&idx| sentences[idx].as_str())
361 .collect::<Vec<_>>()
362 .join(" ");
363
364 Ok(summary)
365 }
366}
367
/// Equivalent to [`ExtractiveSummarizer::new`].
impl Default for ExtractiveSummarizer {
    fn default() -> Self {
        Self::new()
    }
}
373
#[cfg(test)]
mod tests {
    use super::*;

    // Splitting recognizes all three terminators ('.', '!', '?') and
    // produces one sentence per terminator.
    #[test]
    fn test_sentence_splitting() {
        let summarizer = ExtractiveSummarizer::new();
        let text = "This is the first sentence. This is the second! Is this the third?";
        let sentences = summarizer.split_sentences(text);

        assert_eq!(sentences.len(), 3);
        assert!(sentences[0].contains("first sentence"));
        assert!(sentences[1].contains("second"));
        assert!(sentences[2].contains("third"));
    }

    // The length-budgeted summary is non-empty, respects the byte cap,
    // and retains topically central phrasing from the input.
    #[test]
    fn test_summarization() {
        let summarizer = ExtractiveSummarizer::new();
        let text = "Machine learning is a subset of artificial intelligence. \
            It focuses on training algorithms to learn from data. \
            Deep learning is a specialized branch of machine learning. \
            Neural networks are the foundation of deep learning systems.";

        let summary = summarizer.summarize(text, 100).unwrap();

        assert!(!summary.is_empty());
        assert!(summary.len() <= 100);
        assert!(
            summary.contains("machine learning") || summary.contains("artificial intelligence")
        );
    }

    // Requesting one sentence should yield a summary with at most one
    // terminator (counted loosely, allowing embedded punctuation).
    #[test]
    fn test_sentence_selection() {
        let summarizer = ExtractiveSummarizer::new();
        let text = "The quick brown fox jumps over the lazy dog. \
            This is a simple test sentence. \
            Machine learning and artificial intelligence are transforming technology.";

        let summary = summarizer.summarize_sentences(text, 1).unwrap();

        let sentence_count = summary.matches('.').count()
            + summary.matches('!').count()
            + summary.matches('?').count();
        assert!(sentence_count <= 2);
    }

    // Truncation stays within the byte budget and marks the cut with
    // a trailing ellipsis.
    #[test]
    fn test_truncation() {
        let summarizer = ExtractiveSummarizer::new();
        let long_sentence = "This is a very long sentence that needs to be truncated because it exceeds the maximum allowed length for the summary";

        let truncated = summarizer.truncate_sentence(long_sentence, 50);

        assert!(truncated.len() <= 50);
        assert!(truncated.ends_with("..."));
    }

    // Empty input produces an empty summary rather than an error.
    #[test]
    fn test_empty_text() {
        let summarizer = ExtractiveSummarizer::new();
        let summary = summarizer.summarize("", 100).unwrap();
        assert_eq!(summary, "");
    }

    // A single sentence that fits the budget is returned verbatim.
    #[test]
    fn test_single_sentence() {
        let summarizer = ExtractiveSummarizer::new();
        let text = "This is a single sentence.";
        let summary = summarizer.summarize(text, 100).unwrap();

        assert_eq!(summary, text);
    }
}