simple_summarize/
summarize.rs

1use crate::prelude::*;
2
/// Characters removed from text before it is split into words.
///
/// Passed as a `&[char]` pattern to `str::replace`, so every occurrence of any
/// listed character is deleted (not replaced by a space).
///
/// Includes the curly quotes `“ ” ‘ ’` so that words closed by a typographic
/// quote (e.g. `„слово“`) are cleaned the same way as words wrapped in the
/// already-listed `„`, `"` and `'`.
const PUNCTUATIONS: &[char] = &[
    // sentence punctuation:
    '.', ',', '!', '?', ';', ':', '…',
    // quotes (straight, angled, low and curly):
    '„', '«', '»', '`', '\'', '"', '“', '”', '‘', '’',
    // brackets:
    '(', ')', '[', ']', '{', '}', '<', '>',
    // dashes, slashes and separators:
    '-', '–', '—', '/', '\\', '|',
    // symbols:
    '@', '#', '%', '&', '*', '+', '=', '^', '~',
    // currency and typographic signs:
    '$', '€', '£', '¢', '§', '°',
];
8
/// Lowercase words and abbreviations that `parse_keywords` never counts as
/// keywords (English first, then Russian).
///
/// NOTE(review): `parse_keywords` deletes every `PUNCTUATIONS` character,
/// including `.`, before comparing words against this list, so the dotted
/// abbreviations below can only ever match through the fuzzy comparison in
/// `lev_compare`, never exactly.
const STOP_WORDS: &[&str] = &[
    // English abbreviations:
    "etc.", "i.e", "e.g", "dr.", "mr.", "mrs.", "u.s.", "rep.",
    "sen.", "st.", "jan.", "feb.", "mar.", "apr.", "may.", "jun.",
    "jul.", "aug.", "sep.", "oct.", "nov.", "dec.", "lt.", "gov.",
    "a.m.", "p.m.",

    // English words:
    "a", "an", "the", "and", "or", "but", "not", "be", "have", "do", "can", "will",
    "shall", "may", "must", "should", "could", "would", "i", "you", "he", "she",
    "it", "we", "they", "me", "him", "her", "us", "them", "my", "your", "his",
    "its", "our", "their", "this", "that", "these", "those", "who", "what",
    "which", "where", "when", "why", "how", "if", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "up", "about", "into", "through", "during",
    "before", "after", "above", "below", "between", "under", "over", "all",
    "any", "each", "some", "no", "nor", "only", "other", "such", "same", "so",
    "than", "too", "very", "just", "now", "here", "there",

    // Russian abbreviations:
    "т.д.", "т.п.", "т.е.", "напр.", "см.", "стр.", "гл.", "р.", "г.", "ул.",
    "д.", "кв.", "тел.", "моб.",

    // Russian words:
    "а", "и", "в", "на", "с", "у", "к", "по", "для", "из", "не", "но", "что",
    "это", "как", "кто", "где", "когда", "зачем", "почему", "быть", "иметь",
    "делать", "мочь", "хотеть", "нужно", "можно", "я", "ты", "он", "она", "оно",
    "мы", "вы", "они", "мой", "твой", "его", "ее", "их", "этот", "тот", "все",
    "каждый", "другой", "такой", "столько", "если", "или", "даже", "только",
    "всего", "уже", "еще", "очень", "совсем", "сам", "самый", "например",
];
38
/// Minimum similarity ratio at which `lev_compare` treats two strings as the
/// same word. The ratio is `1 - levenshtein_distance / longer_length`, so 0.8
/// tolerates roughly one edit per five characters.
const DIST_COOF: f64 = 0.8;
40
41
42/// Summarizes input text and returns it with keywords (Summarized, Keywords)
43pub fn summarize_text(input_text: &str, compress_coof: f64) -> Result<(String, HashMap<String, usize>)> {
44    let keywords = parse_keywords(input_text, 2)?;
45    
46    // sorting text sentences by keywords & building output:
47    let mut output_sentences = vec![];
48    for sentence in input_text.split(['.', '!', '?', ';']) {
49        let sentence = sentence.trim();
50        let clear_sentence = sentence.to_lowercase().replace(PUNCTUATIONS, "");
51        if clear_sentence.is_empty() { continue }
52
53        let mut score = 0.0;
54        for (i, word) in clear_sentence.split_whitespace().enumerate() {
55            if keywords.get(word).is_some() {
56                let weight = match i {
57                    0..=2 => 1.5,    // first 3 words: +50%
58                    3..=6 => 1.2,    // from 4 to 6: +20%  
59                    _ => 1.0,        // other: +0%
60                };
61                score += weight;
62            }
63        }
64
65        if score >= compress_coof {
66            output_sentences.push(fmt!("{}.", sentence.replace("\n", " ").replace("  ", " ")));
67        }
68    }
69    
70    Ok((output_sentences.join("\n"), keywords))
71}
72
73/// Parses text into keywords
74pub fn parse_keywords(input_text: &str, min_count: usize) -> Result<HashMap<String, usize>> {
75    let clear_text = input_text.trim().to_lowercase().replace(PUNCTUATIONS, "");
76    let mut keywords: HashMap<String, usize> = map![];
77
78    // removing stop words & analysing keywords:
79    'f1: for word in clear_text.split_whitespace() {
80        // skip stop words:
81        for stop_word in STOP_WORDS.iter() {
82            if lev_compare(word, stop_word) {
83                continue 'f1;
84            }
85        }
86
87        // plus same keyword:
88        for (keyword, count) in keywords.iter_mut() {
89            if lev_compare(keyword, word) {
90                *count += 1;
91                continue 'f1;
92            }
93        }
94        // or insert new keyword:
95        keywords.insert(word.to_owned(), 1);
96    }
97    
98    Ok(
99        keywords
100            .into_iter()
101            .filter(|(_, count)| *count >= min_count)
102            .collect()
103    )
104}
105
106/// Compares two texts by levenshtein distance alghoritm
107fn lev_compare(s1: &str, s2: &str) -> bool {
108    let dist = distance::levenshtein(s1, s2);
109    let max_len = s1.chars().count().max(s2.chars().count()).max(1);
110    let coof = 1.0 - (dist as f64 / max_len as f64);
111    
112    coof >= DIST_COOF
113}