1use std::collections::{HashMap, HashSet};
7
/// Extractive text summarizer: scores the sentences of the input and
/// re-emits the highest-scoring ones, rather than generating new text.
pub struct ExtractiveSummarizer {
    // Lowercased common English words that are ignored by the word-frequency
    // and proper-noun heuristics because they carry no topical signal.
    stopwords: HashSet<String>,
}
13
14impl ExtractiveSummarizer {
15 pub fn new() -> Self {
17 Self {
18 stopwords: Self::load_stopwords(),
19 }
20 }
21
22 pub fn summarize(&self, text: &str, max_length: usize) -> crate::Result<String> {
31 let sentences = self.split_sentences(text);
33
34 if sentences.is_empty() {
35 return Ok(String::new());
36 }
37
38 if sentences.len() == 1 {
39 let sentence = &sentences[0];
40 if sentence.len() <= max_length {
41 return Ok(sentence.clone());
42 } else {
43 return Ok(self.truncate_sentence(sentence, max_length));
44 }
45 }
46
47 let scored_sentences: Vec<(usize, f32)> = sentences
49 .iter()
50 .enumerate()
51 .map(|(idx, sentence)| {
52 let score = self.score_sentence(sentence, &sentences, idx);
53 (idx, score)
54 })
55 .collect();
56
57 let selected_indices = self.select_sentences(scored_sentences, &sentences, max_length);
59
60 let summary = selected_indices
62 .iter()
63 .map(|&idx| sentences[idx].as_str())
64 .collect::<Vec<_>>()
65 .join(" ");
66
67 Ok(summary)
68 }
69
70 fn split_sentences(&self, text: &str) -> Vec<String> {
72 let mut sentences = Vec::new();
73 let mut current_sentence = String::new();
74
75 let sentence_endings = ['.', '!', '?'];
76
77 for ch in text.chars() {
78 current_sentence.push(ch);
79
80 if sentence_endings.contains(&ch) {
81 let trimmed = current_sentence.trim().to_string();
83 if !trimmed.is_empty() && trimmed.len() > 5 {
84 sentences.push(trimmed);
86 }
87 current_sentence.clear();
88 }
89 }
90
91 let trimmed = current_sentence.trim().to_string();
93 if !trimmed.is_empty() && trimmed.len() > 5 {
94 sentences.push(trimmed);
95 }
96
97 sentences
98 }
99
100 fn score_sentence(&self, sentence: &str, all_sentences: &[String], position: usize) -> f32 {
109 let mut total_score = 0.0;
110
111 let position_score = if position == 0 {
113 2.0 } else if position == all_sentences.len() - 1 {
115 1.5 } else {
117 let distance_from_start = position as f32 / all_sentences.len() as f32;
119 1.0 - (distance_from_start * 0.5) };
121 total_score += position_score * 0.3;
122
123 let words: Vec<&str> = sentence.split_whitespace().collect();
125 let word_count = words.len();
126
127 let length_score = if word_count < 5 {
128 0.3 } else if word_count > 40 {
130 0.5 } else if (10..=25).contains(&word_count) {
132 1.0 } else {
134 0.7 };
136 total_score += length_score * 0.2;
137
138 let word_freq_score = self.calculate_word_frequency_score(sentence, all_sentences);
140 total_score += word_freq_score * 0.3;
141
142 let proper_noun_score = self.calculate_proper_noun_score(sentence);
144 total_score += proper_noun_score * 0.1;
145
146 let numeric_score = self.calculate_numeric_score(sentence);
148 total_score += numeric_score * 0.1;
149
150 total_score
151 }
152
153 fn calculate_word_frequency_score(&self, sentence: &str, all_sentences: &[String]) -> f32 {
155 let all_words: Vec<String> = all_sentences
157 .iter()
158 .flat_map(|s| s.split_whitespace())
159 .map(|w| w.to_lowercase().trim_matches(|c: char| !c.is_alphanumeric()).to_string())
160 .filter(|w| !w.is_empty() && !self.stopwords.contains(w))
161 .collect();
162
163 let mut word_counts: HashMap<String, usize> = HashMap::new();
165 for word in &all_words {
166 *word_counts.entry(word.clone()).or_insert(0) += 1;
167 }
168
169 let sentence_words: Vec<String> = sentence
171 .split_whitespace()
172 .map(|w| w.to_lowercase().trim_matches(|c: char| !c.is_alphanumeric()).to_string())
173 .filter(|w| !w.is_empty() && !self.stopwords.contains(w))
174 .collect();
175
176 if sentence_words.is_empty() {
177 return 0.0;
178 }
179
180 let total_score: usize = sentence_words
181 .iter()
182 .filter_map(|w| word_counts.get(w))
183 .sum();
184
185 let avg_score = total_score as f32 / sentence_words.len() as f32;
186
187 (avg_score / 3.0).min(1.0)
189 }
190
191 fn calculate_proper_noun_score(&self, sentence: &str) -> f32 {
193 let words: Vec<&str> = sentence.split_whitespace().collect();
194 if words.is_empty() {
195 return 0.0;
196 }
197
198 let proper_noun_count = words
199 .iter()
200 .filter(|word| {
201 word.chars().next().map_or(false, |c| c.is_uppercase())
203 && word.len() > 2
204 && !self.stopwords.contains(&word.to_lowercase())
205 })
206 .count();
207
208 (proper_noun_count as f32 / words.len() as f32).min(1.0)
210 }
211
212 fn calculate_numeric_score(&self, sentence: &str) -> f32 {
214 let has_number = sentence.chars().any(|c| c.is_numeric());
215
216 let number_count = sentence
218 .split_whitespace()
219 .filter(|word| word.chars().any(|c| c.is_numeric()))
220 .count();
221
222 if has_number {
223 (number_count as f32 * 0.3).min(1.0)
224 } else {
225 0.0
226 }
227 }
228
229 fn select_sentences(
233 &self,
234 mut scored_sentences: Vec<(usize, f32)>,
235 sentences: &[String],
236 max_length: usize,
237 ) -> Vec<usize> {
238 scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
240
241 let mut selected_indices = Vec::new();
242 let mut current_length = 0;
243
244 for &(idx, _score) in &scored_sentences {
245 let sentence_len = sentences[idx].len();
246
247 if current_length + sentence_len + 1 <= max_length {
249 selected_indices.push(idx);
251 current_length += sentence_len + 1;
252 }
253
254 if current_length >= max_length * 90 / 100 {
256 break;
258 }
259 }
260
261 selected_indices.sort_unstable();
263
264 if selected_indices.is_empty() && !scored_sentences.is_empty() {
266 selected_indices.push(scored_sentences[0].0);
267 }
268
269 selected_indices
270 }
271
272 fn truncate_sentence(&self, sentence: &str, max_length: usize) -> String {
274 if sentence.len() <= max_length {
275 return sentence.to_string();
276 }
277
278 let mut end = max_length.saturating_sub(3); while end > 0 && !sentence.is_char_boundary(end) {
283 end -= 1;
284 }
285
286 while end > 0 && !sentence.chars().nth(end).map_or(false, |c| c.is_whitespace()) {
287 end -= 1;
288 }
289
290 if end == 0 {
291 end = max_length.saturating_sub(3);
293 while end > 0 && !sentence.is_char_boundary(end) {
294 end -= 1;
295 }
296 }
297
298 format!("{}...", &sentence[..end].trim())
299 }
300
301 fn load_stopwords() -> HashSet<String> {
303 let stopwords_list = vec![
304 "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
305 "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
306 "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
307 "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
308 "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know",
309 "take", "people", "into", "year", "your", "good", "some", "could", "them", "see",
310 "other", "than", "then", "now", "look", "only", "come", "its", "over", "think",
311 ];
312
313 stopwords_list.into_iter().map(|s| s.to_string()).collect()
314 }
315
316 pub fn summarize_sentences(&self, text: &str, num_sentences: usize) -> crate::Result<String> {
318 let sentences = self.split_sentences(text);
319
320 if sentences.is_empty() {
321 return Ok(String::new());
322 }
323
324 if sentences.len() <= num_sentences {
325 return Ok(sentences.join(" "));
326 }
327
328 let mut scored_sentences: Vec<(usize, f32)> = sentences
330 .iter()
331 .enumerate()
332 .map(|(idx, sentence)| {
333 let score = self.score_sentence(sentence, &sentences, idx);
334 (idx, score)
335 })
336 .collect();
337
338 scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
340
341 let mut selected_indices: Vec<usize> = scored_sentences
342 .into_iter()
343 .take(num_sentences)
344 .map(|(idx, _)| idx)
345 .collect();
346
347 selected_indices.sort_unstable();
349
350 let summary = selected_indices
351 .iter()
352 .map(|&idx| sentences[idx].as_str())
353 .collect::<Vec<_>>()
354 .join(" ");
355
356 Ok(summary)
357 }
358}
359
360impl Default for ExtractiveSummarizer {
361 fn default() -> Self {
362 Self::new()
363 }
364}
365
#[cfg(test)]
mod tests {
    use super::*;

    // All three terminators ('.', '!', '?') should split into separate sentences.
    #[test]
    fn test_sentence_splitting() {
        let summarizer = ExtractiveSummarizer::new();
        let text = "This is the first sentence. This is the second! Is this the third?";
        let sentences = summarizer.split_sentences(text);

        assert_eq!(sentences.len(), 3);
        assert!(sentences[0].contains("first sentence"));
        assert!(sentences[1].contains("second"));
        assert!(sentences[2].contains("third"));
    }

    // A multi-sentence document must compress to within the byte budget
    // while keeping topical keywords from the source text.
    #[test]
    fn test_summarization() {
        let summarizer = ExtractiveSummarizer::new();
        let text = "Machine learning is a subset of artificial intelligence. \
            It focuses on training algorithms to learn from data. \
            Deep learning is a specialized branch of machine learning. \
            Neural networks are the foundation of deep learning systems.";

        let summary = summarizer.summarize(text, 100).unwrap();

        assert!(!summary.is_empty());
        assert!(summary.len() <= 100);
        // Lowercase matches rely on these phrases appearing mid-sentence in the input.
        assert!(summary.contains("machine learning") || summary.contains("artificial intelligence"));
    }

    // Asking for one sentence should yield at most one sentence; the bound is
    // <= 2 because terminator counting is an approximation of sentence count.
    #[test]
    fn test_sentence_selection() {
        let summarizer = ExtractiveSummarizer::new();
        let text = "The quick brown fox jumps over the lazy dog. \
            This is a simple test sentence. \
            Machine learning and artificial intelligence are transforming technology.";

        let summary = summarizer.summarize_sentences(text, 1).unwrap();

        let sentence_count = summary.matches('.').count() + summary.matches('!').count() + summary.matches('?').count();
        assert!(sentence_count <= 2);
    }

    // Over-long sentences are cut to the budget and end with an ellipsis.
    #[test]
    fn test_truncation() {
        let summarizer = ExtractiveSummarizer::new();
        let long_sentence = "This is a very long sentence that needs to be truncated because it exceeds the maximum allowed length for the summary";

        let truncated = summarizer.truncate_sentence(long_sentence, 50);

        assert!(truncated.len() <= 50);
        assert!(truncated.ends_with("..."));
    }

    // Empty input must produce an empty summary, not an error.
    #[test]
    fn test_empty_text() {
        let summarizer = ExtractiveSummarizer::new();
        let summary = summarizer.summarize("", 100).unwrap();
        assert_eq!(summary, "");
    }

    // A single sentence that fits the budget is returned verbatim.
    #[test]
    fn test_single_sentence() {
        let summarizer = ExtractiveSummarizer::new();
        let text = "This is a single sentence.";
        let summary = summarizer.summarize(text, 100).unwrap();

        assert_eq!(summary, text);
    }
}