1use wasm_bindgen::prelude::*;
2
3mod katana;
4
/// Decodes HTML character references in `html_string` and returns the result.
///
/// Two kinds of reference are recognised:
///
/// * the named entities listed in `NAMED_ENTITIES` (`&amp;`, `&lt;`, `&eacute;`, ...);
/// * numeric references in decimal (`&#39;`) or hexadecimal (`&#x27;`) form.
///
/// The input is scanned once from left to right and decoded text is never
/// re-examined, so `&amp;lt;` becomes the literal text `&lt;` instead of being
/// decoded twice into `<`. Anything that is not a recognised reference
/// (`AT&T;`, `&unknown;`, a lone `&`) is copied through unchanged.
fn unescape_html(html_string: &str) -> String {
    // Entity names without the surrounding `&` and `;`, paired with their replacement.
    const NAMED_ENTITIES: &[(&str, &str)] = &[
        // A non-breaking space is flattened to a plain space.
        ("nbsp", " "),
        ("amp", "&"),
        ("lt", "<"),
        ("gt", ">"),
        ("quot", "\""),
        ("apos", "'"),
        ("cent", "¢"),
        ("pound", "£"),
        ("yen", "¥"),
        ("euro", "€"),
        ("copy", "©"),
        ("reg", "®"),
        ("sect", "§"),
        ("uml", "¨"),
        ("ordf", "ª"),
        ("laquo", "«"),
        ("not", "¬"),
        // Soft hyphens are invisible layout hints; drop them entirely.
        ("shy", ""),
        ("macr", "¯"),
        ("deg", "°"),
        ("plusmn", "±"),
        ("sup2", "²"),
        ("sup3", "³"),
        ("acute", "´"),
        ("micro", "µ"),
        ("para", "¶"),
        ("middot", "·"),
        ("cedil", "¸"),
        ("sup1", "¹"),
        ("ordm", "º"),
        ("raquo", "»"),
        ("frac14", "¼"),
        ("frac12", "½"),
        ("frac34", "¾"),
        ("iquest", "¿"),
        ("Agrave", "À"),
        ("Aacute", "Á"),
        ("Acirc", "Â"),
        ("Atilde", "Ã"),
        ("Auml", "Ä"),
        ("Aring", "Å"),
        ("AElig", "Æ"),
        ("Ccedil", "Ç"),
        ("Egrave", "È"),
        ("Eacute", "É"),
        ("Ecirc", "Ê"),
        ("Euml", "Ë"),
        ("Igrave", "Ì"),
        ("Iacute", "Í"),
        ("Icirc", "Î"),
        ("Iuml", "Ï"),
        ("ETH", "Ð"),
        ("Ntilde", "Ñ"),
        ("Ograve", "Ò"),
        ("Oacute", "Ó"),
        ("Ocirc", "Ô"),
        ("Otilde", "Õ"),
        ("Ouml", "Ö"),
        ("times", "×"),
        ("Oslash", "Ø"),
        ("Ugrave", "Ù"),
        ("Uacute", "Ú"),
        ("Ucirc", "Û"),
        ("Uuml", "Ü"),
        ("Yacute", "Ý"),
        ("THORN", "Þ"),
        ("szlig", "ß"),
        ("agrave", "à"),
        ("aacute", "á"),
        ("acirc", "â"),
        ("atilde", "ã"),
        ("auml", "ä"),
        ("aring", "å"),
        ("aelig", "æ"),
        ("ccedil", "ç"),
        ("egrave", "è"),
        ("eacute", "é"),
        ("ecirc", "ê"),
        ("euml", "ë"),
        ("igrave", "ì"),
        ("iacute", "í"),
        ("icirc", "î"),
        ("iuml", "ï"),
        ("eth", "ð"),
        ("ntilde", "ñ"),
        ("ograve", "ò"),
        ("oacute", "ó"),
        ("ocirc", "ô"),
        ("otilde", "õ"),
        ("ouml", "ö"),
        ("divide", "÷"),
        ("oslash", "ø"),
        ("ugrave", "ù"),
        ("uacute", "ú"),
        ("ucirc", "û"),
        ("uuml", "ü"),
        ("yacute", "ý"),
        ("thorn", "þ"),
        ("yuml", "ÿ"),
    ];
    // Longest reference body (the part between `&` and `;`) worth inspecting.
    // Bounding the look-ahead keeps the scan linear on text full of stray `&`.
    const MAX_REFERENCE_LEN: usize = 10;

    let mut decoded = String::with_capacity(html_string.len());
    let mut rest = html_string;
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let candidate = &rest[amp + 1..];
        // `;` is ASCII, so its byte offset is always a valid `str` boundary.
        let reference = candidate
            .bytes()
            .take(MAX_REFERENCE_LEN + 1)
            .position(|byte| byte == b';')
            .map(|end| &candidate[..end]);
        let named = reference.and_then(|name| {
            NAMED_ENTITIES
                .iter()
                .find(|&&(entity, _)| entity == name)
                .map(|&(_, replacement)| replacement)
        });
        let numeric = reference.and_then(decode_numeric_reference);
        match (reference, named, numeric) {
            (Some(name), Some(replacement), _) => {
                decoded.push_str(replacement);
                // Skip the reference body plus its terminating `;`.
                rest = &candidate[name.len() + 1..];
            }
            (Some(name), None, Some(ch)) => {
                decoded.push(ch);
                rest = &candidate[name.len() + 1..];
            }
            _ => {
                // Not a reference we know: keep the ampersand and carry on after it.
                decoded.push('&');
                rest = candidate;
            }
        }
    }
    decoded.push_str(rest);
    decoded
}

/// Decodes the body of a numeric character reference (`#39` or `#x27`, i.e. the
/// text between `&` and `;`) into the character it denotes.
///
/// Returns `None` when `reference` is not a well-formed numeric reference or
/// names NUL or a value that is not a Unicode scalar value.
fn decode_numeric_reference(reference: &str) -> Option<char> {
    let number = reference.strip_prefix('#')?;
    let (digits, radix) = match number
        .strip_prefix('x')
        .or_else(|| number.strip_prefix('X'))
    {
        Some(hex) => (hex, 16),
        None => (number, 10),
    };
    // `from_str_radix` would accept a leading `+`, which is not valid in a reference.
    if digits.is_empty() || !digits.chars().all(|digit| digit.is_digit(radix)) {
        return None;
    }
    let code_point = u32::from_str_radix(digits, radix).ok()?;
    char::from_u32(code_point).filter(|&ch| ch != '\0')
}
114
/// Rewrites common dotted abbreviations in `text` to their dot-free form
/// (`e.g.` -> `eg`, `p.m.` -> `pm`, ...), removing periods that do not end a
/// sentence.
///
/// An abbreviation is only rewritten when it is not glued to a preceding
/// letter, so the end of an ordinary word keeps its period: `best.` is left
/// alone even though it ends in `st.`. Matching is case-sensitive and limited
/// to the lower-case spellings in the table; `Mr.` is not touched.
fn replace_abbreviations(text: &str) -> String {
    // (abbreviation as written, replacement without periods)
    const ABBREVIATIONS: &[(&str, &str)] = &[
        ("i.e.", "ie"),
        ("e.g.", "eg"),
        ("etc.", "etc"),
        ("mr.", "mr"),
        ("mrs.", "mrs"),
        ("vs.", "vs"),
        ("dr.", "dr"),
        ("prof.", "prof"),
        ("sr.", "sr"),
        ("jr.", "jr"),
        ("st.", "st"),
        ("jan.", "jan"),
        ("feb.", "feb"),
        ("mar.", "mar"),
        ("apr.", "apr"),
        ("jun.", "jun"),
        ("jul.", "jul"),
        ("aug.", "aug"),
        ("sept.", "sept"),
        ("oct.", "oct"),
        ("nov.", "nov"),
        ("dec.", "dec"),
        ("a.m.", "am"),
        ("p.m.", "pm"),
        ("u.s.", "us"),
        ("u.k.", "uk"),
    ];

    let mut output = String::with_capacity(text.len());
    let mut rest = text;
    // Character of the input immediately before `rest`, if any.
    let mut previous: Option<char> = None;

    'scan: while let Some(current) = rest.chars().next() {
        // Digits count as a boundary so that `5p.m.` is still recognised.
        let at_word_start = !previous.map_or(false, char::is_alphabetic);
        if at_word_start {
            for &(abbreviation, replacement) in ABBREVIATIONS {
                if let Some(tail) = rest.strip_prefix(abbreviation) {
                    output.push_str(replacement);
                    rest = tail;
                    // Every abbreviation in the table ends with a period.
                    previous = Some('.');
                    continue 'scan;
                }
            }
        }
        output.push(current);
        previous = Some(current);
        rest = &rest[current.len_utf8()..];
    }
    output
}
153
154fn remove_html_tags(html_string: &str) -> String {
155 let text = regex::Regex::new(r"(?s)<!--(.*?)-->")
156 .unwrap()
157 .replace_all(html_string, "")
158 .into_owned();
159
160 let text = regex::Regex::new(r"(?s)<h[1-6]>(.*?)</h[1-6]>")
161 .unwrap()
162 .replace_all(&text, "$1\n\n")
163 .into_owned();
164
165 let text = unescape_html(&text);
166 let text = regex::Regex::new(r"<(.*?)>")
167 .unwrap()
168 .replace_all(&text, " ")
169 .into_owned();
170 let text = regex::Regex::new(r" ")
171 .unwrap()
172 .replace_all(&text, " ")
173 .into_owned();
174 let text = replace_abbreviations(&text);
175 let text = regex::Regex::new(r"\n\s*?\n")
176 .unwrap()
177 .replace_all(&text, "\n\n")
178 .into_owned();
179 let text = regex::Regex::new(r"\s?\[[0-9]+\]\s?")
180 .unwrap()
181 .replace_all(&text, "")
182 .into_owned();
183 let text = text
184 .split("\n")
185 .map(|line| line.trim())
186 .filter(|line| !line.starts_with("^ "))
187 .collect::<Vec<&str>>()
188 .join("\n");
189 let text = regex::Regex::new(r"\n{3,}")
191 .unwrap()
192 .replace_all(&text, "\n\n")
193 .into_owned();
194 text
195}
196
197#[wasm_bindgen]
198pub fn prepare_text(text: &str) -> String {
199 let text = text
200 .split("\n")
201 .map(|line| line.trim())
202 .filter(|line| !line.is_empty())
203 .collect::<Vec<&str>>()
204 .join(" ");
205
206 let text = html2md::parse_html(&text);
207
208 let text = remove_html_tags(&text);
209
210 let paragraphs = katana::cut(&text);
211
212 paragraphs
213 .iter()
214 .map(|p| p.as_slice().join(" "))
215 .collect::<Vec<String>>()
216 .join("\n\n")
217}