1use crate::prelude::*;
2
/// Characters removed from text before it is split into words.
///
/// Used as a `char`-slice pattern for `str::replace`, so every listed character is deleted
/// (not replaced by a space). Covers ASCII punctuation, dashes, straight and typographic
/// quotes, and a handful of currency / typographic symbols.
const PUNCTUATIONS: &[char] = &[
    // Sentence punctuation.
    '.', ',', '!', '?', ';', ':', '…',
    // Quotes: low-9, guillemets, backtick, straight, and curly (opening *and* closing, so a
    // word wrapped in „…“ or “…” is stripped on both sides).
    '„', '“', '”', '‘', '’', '«', '»', '`', '\'', '"',
    // Brackets.
    '(', ')', '[', ']', '{', '}', '<', '>',
    // Dashes and separators.
    '-', '–', '—', '/', '\\', '|',
    // Symbols.
    '@', '#', '%', '&', '*', '+', '=', '^', '~', '$', '€', '£', '¢', '§', '°',
];
8
/// Lower-case English and Russian words and abbreviations that never count as keywords.
///
/// Abbreviations are stored in their written form, with dots.
const STOP_WORDS: &[&str] = &[
    // English abbreviations.
    "etc.", "i.e.", "e.g.", "dr.", "mr.", "mrs.", "u.s.", "rep.", "sen.", "st.",
    "jan.", "feb.", "mar.", "apr.", "may.", "jun.", "jul.", "aug.", "sep.",
    "oct.", "nov.", "dec.", "lt.", "gov.", "a.m.", "p.m.",

    // English function words.
    "a", "an", "the", "and", "or", "but", "not", "be", "have", "do",
    "can", "will", "shall", "may", "must", "should", "could", "would",
    "i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
    "us", "them", "my", "your", "his", "its", "our", "their",
    "this", "that", "these", "those", "who", "what", "which", "where",
    "when", "why", "how", "if", "in", "on", "at", "to", "for", "of",
    "with", "by", "from", "up", "about", "into", "through", "during",
    "before", "after", "above", "below", "between", "under", "over",
    "all", "any", "each", "some", "no", "nor", "only", "other", "such",
    "same", "so", "than", "too", "very", "just", "now", "here", "there",

    // Russian abbreviations.
    "т.д.", "т.п.", "т.е.", "напр.", "см.", "стр.", "гл.",
    "р.", "г.", "ул.", "д.", "кв.", "тел.", "моб.",

    // Russian function words.
    "а", "и", "в", "на", "с", "у", "к", "по", "для", "из", "не", "но",
    "что", "это", "как", "кто", "где", "когда", "зачем", "почему",
    "быть", "иметь", "делать", "мочь", "хотеть", "нужно", "можно",
    "я", "ты", "он", "она", "оно", "мы", "вы", "они", "мой", "твой",
    "его", "ее", "их", "этот", "тот", "все", "каждый", "другой",
    "такой", "столько", "если", "или", "даже", "только", "всего",
    "уже", "еще", "очень", "совсем", "сам", "самый", "например",
];
38
/// Minimum normalised similarity, `1 - edit_distance / longer_length`, at which
/// `lev_compare` treats two words as the same word.
const DIST_COOF: f64 = 0.8;
40
41
42pub fn summarize_text(input_text: &str, compress_coof: f64) -> Result<(String, HashMap<String, usize>)> {
44 let keywords = parse_keywords(input_text, 2)?;
45
46 let mut output_sentences = vec![];
48 for sentence in input_text.split(['.', '!', '?', ';']) {
49 let sentence = sentence.trim();
50 let clear_sentence = sentence.to_lowercase().replace(PUNCTUATIONS, "");
51 if clear_sentence.is_empty() { continue }
52
53 let mut score = 0.0;
54 for (i, word) in clear_sentence.split_whitespace().enumerate() {
55 if keywords.get(word).is_some() {
56 let weight = match i {
57 0..=2 => 1.5, 3..=6 => 1.2, _ => 1.0, };
61 score += weight;
62 }
63 }
64
65 if score >= compress_coof {
66 output_sentences.push(fmt!("{}.", sentence.replace("\n", " ").replace(" ", " ")));
67 }
68 }
69
70 Ok((output_sentences.join("\n"), keywords))
71}
72
73pub fn parse_keywords(input_text: &str, min_count: usize) -> Result<HashMap<String, usize>> {
75 let clear_text = input_text.trim().to_lowercase().replace(PUNCTUATIONS, "");
76 let mut keywords: HashMap<String, usize> = map![];
77
78 'f1: for word in clear_text.split_whitespace() {
80 for stop_word in STOP_WORDS.iter() {
82 if lev_compare(word, stop_word) {
83 continue 'f1;
84 }
85 }
86
87 for (keyword, count) in keywords.iter_mut() {
89 if lev_compare(keyword, word) {
90 *count += 1;
91 continue 'f1;
92 }
93 }
94 keywords.insert(word.to_owned(), 1);
96 }
97
98 Ok(
99 keywords
100 .into_iter()
101 .filter(|(_, count)| *count >= min_count)
102 .collect()
103 )
104}
105
106fn lev_compare(s1: &str, s2: &str) -> bool {
108 let dist = distance::levenshtein(s1, s2);
109 let max_len = s1.chars().count().max(s2.chars().count()).max(1);
110 let coof = 1.0 - (dist as f64 / max_len as f64);
111
112 coof >= DIST_COOF
113}