readability_text_cleanup/
lib.rs

1use wasm_bindgen::prelude::*;
2
3mod katana;
4
/// Decodes the named HTML entities listed in `ENTITIES` (plus `&#39;`) into
/// the characters they stand for.
///
/// The input is scanned once from left to right and decoded text is never
/// rescanned, so an escaped entity such as `&amp;lt;` becomes the literal text
/// `&lt;` rather than being decoded a second time into `<`.
///
/// Matching is case-sensitive (`&Agrave;` and `&agrave;` are different
/// entities). An `&` that does not start a known entity is copied through
/// unchanged.
fn unescape_html(html_string: &str) -> String {
    // Every entity ends with `;`, so no entry is a prefix of another and the
    // first table hit at a given `&` is the only possible one.
    const ENTITIES: &[(&str, &str)] = &[
        ("&nbsp;", " "),
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&apos;", "'"),
        ("&cent;", "¢"),
        ("&pound;", "£"),
        ("&yen;", "¥"),
        ("&euro;", "€"),
        ("&copy;", "©"),
        ("&reg;", "®"),
        ("&sect;", "§"),
        ("&uml;", "¨"),
        ("&ordf;", "ª"),
        ("&laquo;", "«"),
        ("&not;", "¬"),
        // Soft hyphen (U+00AD); written as an escape because it is invisible.
        ("&shy;", "\u{AD}"),
        ("&macr;", "¯"),
        ("&deg;", "°"),
        ("&plusmn;", "±"),
        ("&sup2;", "²"),
        ("&sup3;", "³"),
        ("&acute;", "´"),
        ("&micro;", "µ"),
        ("&para;", "¶"),
        ("&middot;", "·"),
        ("&cedil;", "¸"),
        ("&sup1;", "¹"),
        ("&ordm;", "º"),
        ("&raquo;", "»"),
        ("&frac14;", "¼"),
        ("&frac12;", "½"),
        ("&frac34;", "¾"),
        ("&iquest;", "¿"),
        ("&Agrave;", "À"),
        ("&Aacute;", "Á"),
        ("&Acirc;", "Â"),
        ("&Atilde;", "Ã"),
        ("&Auml;", "Ä"),
        ("&Aring;", "Å"),
        ("&AElig;", "Æ"),
        ("&Ccedil;", "Ç"),
        ("&Egrave;", "È"),
        ("&Eacute;", "É"),
        ("&Ecirc;", "Ê"),
        ("&Euml;", "Ë"),
        ("&Igrave;", "Ì"),
        ("&Iacute;", "Í"),
        ("&Icirc;", "Î"),
        ("&Iuml;", "Ï"),
        ("&ETH;", "Ð"),
        ("&Ntilde;", "Ñ"),
        ("&Ograve;", "Ò"),
        ("&Oacute;", "Ó"),
        ("&Ocirc;", "Ô"),
        ("&Otilde;", "Õ"),
        ("&Ouml;", "Ö"),
        ("&times;", "×"),
        ("&Oslash;", "Ø"),
        ("&Ugrave;", "Ù"),
        ("&Uacute;", "Ú"),
        ("&Ucirc;", "Û"),
        ("&Uuml;", "Ü"),
        ("&Yacute;", "Ý"),
        ("&THORN;", "Þ"),
        ("&szlig;", "ß"),
        ("&agrave;", "à"),
        ("&aacute;", "á"),
        ("&acirc;", "â"),
        ("&atilde;", "ã"),
        ("&auml;", "ä"),
        ("&aring;", "å"),
        ("&aelig;", "æ"),
        ("&ccedil;", "ç"),
        ("&egrave;", "è"),
        ("&eacute;", "é"),
        ("&ecirc;", "ê"),
        ("&euml;", "ë"),
        ("&igrave;", "ì"),
        ("&iacute;", "í"),
        ("&icirc;", "î"),
        ("&iuml;", "ï"),
        ("&eth;", "ð"),
        ("&ntilde;", "ñ"),
        ("&ograve;", "ò"),
        ("&oacute;", "ó"),
        ("&ocirc;", "ô"),
        ("&otilde;", "õ"),
        ("&ouml;", "ö"),
        ("&divide;", "÷"),
        ("&oslash;", "ø"),
        ("&ugrave;", "ù"),
        ("&uacute;", "ú"),
        ("&ucirc;", "û"),
        ("&uuml;", "ü"),
        ("&yacute;", "ý"),
        ("&thorn;", "þ"),
        ("&yuml;", "ÿ"),
    ];

    let mut decoded = String::with_capacity(html_string.len());
    let mut rest = html_string;

    // Jump from one `&` to the next; everything in between is copied verbatim.
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let tail = &rest[amp..];

        match ENTITIES.iter().find(|&&(entity, _)| tail.starts_with(entity)) {
            Some(&(entity, replacement)) => {
                decoded.push_str(replacement);
                rest = &tail[entity.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand and move past it.
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
114
/// Rewrites the dotted abbreviations listed in `ABBREVIATIONS` to their
/// dot-free form (for example `e.g.` becomes `eg`).
///
/// An abbreviation is only rewritten where it starts a word, i.e. where the
/// character before it is not a letter. This keeps the period of ordinary
/// words that merely end in the same letters ("first.", "grammar.") intact.
/// A preceding digit still counts as a word start, so `10a.m.` becomes `10am`.
///
/// Matching is case-sensitive: only the lowercase spellings in the table are
/// rewritten. The text is processed in a single pass and rewritten output is
/// not rescanned.
fn replace_abbreviations(text: &str) -> String {
    // Every entry ends with `.` and none is a prefix of another, so at most
    // one entry can match at any given position.
    const ABBREVIATIONS: &[(&str, &str)] = &[
        ("i.e.", "ie"),
        ("e.g.", "eg"),
        ("etc.", "etc"),
        ("mr.", "mr"),
        ("mrs.", "mrs"),
        ("vs.", "vs"),
        ("dr.", "dr"),
        ("prof.", "prof"),
        ("sr.", "sr"),
        ("jr.", "jr"),
        ("st.", "st"),
        ("jan.", "jan"),
        ("feb.", "feb"),
        ("mar.", "mar"),
        ("apr.", "apr"),
        ("jun.", "jun"),
        ("jul.", "jul"),
        ("aug.", "aug"),
        ("sept.", "sept"),
        ("oct.", "oct"),
        ("nov.", "nov"),
        ("dec.", "dec"),
        ("a.m.", "am"),
        ("p.m.", "pm"),
        ("u.s.", "us"),
        ("u.k.", "uk"),
    ];

    let mut output = String::with_capacity(text.len());
    let mut previous: Option<char> = None;
    let mut rest = text;

    while let Some(current) = rest.chars().next() {
        // Start of input, or anything other than a letter, opens a new word.
        let at_word_start = previous.map_or(true, |c| !c.is_alphabetic());
        let hit = if at_word_start {
            ABBREVIATIONS
                .iter()
                .find(|&&(dotted, _)| rest.starts_with(dotted))
        } else {
            None
        };

        match hit {
            Some(&(dotted, plain)) => {
                output.push_str(plain);
                // Track the last *input* character so the boundary test for
                // the following position is based on the original text.
                previous = dotted.chars().next_back();
                rest = &rest[dotted.len()..];
            }
            None => {
                output.push(current);
                previous = Some(current);
                rest = &rest[current.len_utf8()..];
            }
        }
    }

    output
}
153
154fn remove_html_tags(html_string: &str) -> String {
155    let text = regex::Regex::new(r"(?s)<!--(.*?)-->")
156        .unwrap()
157        .replace_all(html_string, "")
158        .into_owned();
159
160    let text = regex::Regex::new(r"(?s)<h[1-6]>(.*?)</h[1-6]>")
161        .unwrap()
162        .replace_all(&text, "$1\n\n")
163        .into_owned();
164
165    let text = unescape_html(&text);
166    let text = regex::Regex::new(r"<(.*?)>")
167        .unwrap()
168        .replace_all(&text, " ")
169        .into_owned();
170    let text = regex::Regex::new(r"  ")
171        .unwrap()
172        .replace_all(&text, " ")
173        .into_owned();
174    let text = replace_abbreviations(&text);
175    let text = regex::Regex::new(r"\n\s*?\n")
176        .unwrap()
177        .replace_all(&text, "\n\n")
178        .into_owned();
179    let text = regex::Regex::new(r"\s?\[[0-9]+\]\s?")
180        .unwrap()
181        .replace_all(&text, "")
182        .into_owned();
183    let text = text
184        .split("\n")
185        .map(|line| line.trim())
186        .filter(|line| !line.starts_with("^  "))
187        .collect::<Vec<&str>>()
188        .join("\n");
189    // remove all sequences of 3 or more newlines with two newlines
190    let text = regex::Regex::new(r"\n{3,}")
191        .unwrap()
192        .replace_all(&text, "\n\n")
193        .into_owned();
194    text
195}
196
197#[wasm_bindgen]
198pub fn prepare_text(text: &str) -> String {
199    let text = text
200        .split("\n")
201        .map(|line| line.trim())
202        .filter(|line| !line.is_empty())
203        .collect::<Vec<&str>>()
204        .join(" ");
205
206    let text = html2md::parse_html(&text);
207
208    let text = remove_html_tags(&text);
209
210    let paragraphs = katana::cut(&text);
211
212    paragraphs
213        .iter()
214        .map(|p| p.as_slice().join(" "))
215        .collect::<Vec<String>>()
216        .join("\n\n")
217}