readability_rust/
regexps.rs1use regex::Regex;
4use std::sync::OnceLock;
5
6pub struct ReadabilityRegexps {
8 pub unlikely_candidates: Regex,
9 pub ok_maybe_its_candidate: Regex,
10 pub positive: Regex,
11 pub negative: Regex,
12 pub extraneous: Regex,
13 pub byline: Regex,
14 pub replace_fonts: Regex,
15 pub normalize: Regex,
16 pub videos: Regex,
17 pub share_elements: Regex,
18 pub next_link: Regex,
19 pub prev_link: Regex,
20 pub tokenize: Regex,
21 pub whitespace: Regex,
22 pub has_content: Regex,
23 pub hash_url: Regex,
24 pub b64_data_url: Regex,
25 pub commas: Regex,
26 pub json_ld_article_types: Regex,
27 pub ad_words: Regex,
28 pub loading_words: Regex,
29}
30
31impl ReadabilityRegexps {
32 pub fn new() -> Self {
33 Self {
34 unlikely_candidates: Regex::new(
35 r"(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote"
36 ).unwrap(),
37
38 ok_maybe_its_candidate: Regex::new(
39 r"(?i)and|article|body|column|content|main|mathjax|shadow"
40 ).unwrap(),
41
42 positive: Regex::new(
43 r"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"
44 ).unwrap(),
45
46 negative: Regex::new(
47 r"(?i)-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget"
48 ).unwrap(),
49
50 extraneous: Regex::new(
51 r"(?i)print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility"
52 ).unwrap(),
53
54 byline: Regex::new(
55 r"(?i)byline|author|dateline|writtenby|written\s+by|p-author"
56 ).unwrap(),
57
58 replace_fonts: Regex::new(
59 r"<(\/?)font[^>]*>"
60 ).unwrap(),
61
62 normalize: Regex::new(
63 r"\s{2,}"
64 ).unwrap(),
65
66 videos: Regex::new(
67 r"\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq|bilibili|live.bilibili)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)"
68 ).unwrap(),
69
70 share_elements: Regex::new(
71 r"(\b|_)(share|sharedaddy)(\b|_)"
72 ).unwrap(),
73
74 next_link: Regex::new(
75 r"(?i)(next|weiter|continue|>([^\|]|$)|»([^\|]|$))"
76 ).unwrap(),
77
78 prev_link: Regex::new(
79 r"(?i)(prev|earl|old|new|<|«)"
80 ).unwrap(),
81
82 tokenize: Regex::new(
83 r"\W+"
84 ).unwrap(),
85
86 whitespace: Regex::new(
87 r"^\s*$"
88 ).unwrap(),
89
90 has_content: Regex::new(
91 r"\S$"
92 ).unwrap(),
93
94 hash_url: Regex::new(
95 r"^#.+"
96 ).unwrap(),
97
98 b64_data_url: Regex::new(
99 r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*,"
100 ).unwrap(),
101
102 commas: Regex::new(
103 r"\u{002C}|\u{060C}|\u{FE50}|\u{FE10}|\u{FE11}|\u{2E41}|\u{2E34}|\u{2E32}|\u{FF0C}"
104 ).unwrap(),
105
106 json_ld_article_types: Regex::new(
107 r"^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$"
108 ).unwrap(),
109
110 ad_words: Regex::new(
111 r"(?i)^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$"
112 ).unwrap(),
113
114 loading_words: Regex::new(
115 r"(?i)^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$"
116 ).unwrap(),
117 }
118 }
119}
120
121static REGEXPS: OnceLock<ReadabilityRegexps> = OnceLock::new();
123
124pub fn get_regexps() -> &'static ReadabilityRegexps {
126 REGEXPS.get_or_init(ReadabilityRegexps::new)
127}
128
129pub fn is_unlikely_candidate(text: &str) -> bool {
131 let regexps = get_regexps();
132 regexps.unlikely_candidates.is_match(text) && !regexps.ok_maybe_its_candidate.is_match(text)
133}
134
135pub fn has_positive_indicators(text: &str) -> bool {
137 get_regexps().positive.is_match(text)
138}
139
140pub fn has_negative_indicators(text: &str) -> bool {
142 get_regexps().negative.is_match(text)
143}
144
145pub fn is_byline(text: &str) -> bool {
147 get_regexps().byline.is_match(text)
148}
149
150pub fn is_video_url(url: &str) -> bool {
152 get_regexps().videos.is_match(url)
153}
154
155
156
157pub fn is_whitespace(text: &str) -> bool {
159 get_regexps().whitespace.is_match(text)
160}
161
162pub fn has_content(text: &str) -> bool {
164 get_regexps().has_content.is_match(text)
165}
166
167pub fn contains_ad_words(text: &str) -> bool {
169 get_regexps().ad_words.is_match(text)
170}
171
172pub fn contains_loading_words(text: &str) -> bool {
174 get_regexps().loading_words.is_match(text)
175}
176
177pub fn is_extraneous_content(text: &str) -> bool {
179 get_regexps().extraneous.is_match(text)
180}
181
182pub fn is_share_element(text: &str) -> bool {
184 get_regexps().share_elements.is_match(text)
185}
186
187pub fn is_next_link(text: &str) -> bool {
189 get_regexps().next_link.is_match(text)
190}
191
192pub fn is_prev_link(text: &str) -> bool {
194 get_regexps().prev_link.is_match(text)
195}
196
197pub fn is_hash_url(url: &str) -> bool {
199 get_regexps().hash_url.is_match(url)
200}
201
202pub fn is_b64_data_url(url: &str) -> bool {
204 get_regexps().b64_data_url.is_match(url)
205}
206
207pub fn is_json_ld_article_type(text: &str) -> bool {
209 get_regexps().json_ld_article_types.is_match(text)
210}
211
212pub fn replace_font_tags(html: &str) -> String {
214 get_regexps().replace_fonts.replace_all(html, "<$1span>").to_string()
215}
216
217pub fn normalize_whitespace(text: &str) -> String {
219 get_regexps().normalize.replace_all(text, " ").to_string()
220}
221
222pub fn tokenize_text(text: &str) -> Vec<&str> {
224 get_regexps().tokenize.split(text).filter(|s| !s.is_empty()).collect()
225}
226
227pub fn count_commas(text: &str) -> usize {
229 get_regexps().commas.find_iter(text).count()
230}
231
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `predicate` yields `expected` for every string in `inputs`.
    fn assert_all(predicate: fn(&str) -> bool, inputs: &[&str], expected: bool) {
        for input in inputs {
            assert_eq!(
                predicate(input),
                expected,
                "unexpected result for {input:?}"
            );
        }
    }

    #[test]
    fn test_unlikely_candidates() {
        assert_all(
            is_unlikely_candidate,
            &["sidebar-ad navigation", "comment-section"],
            true,
        );
        assert_all(
            is_unlikely_candidate,
            &["main-content", "article-body"],
            false,
        );
    }

    #[test]
    fn test_positive_indicators() {
        assert_all(
            has_positive_indicators,
            &["article-content", "main-body"],
            true,
        );
        assert_all(has_positive_indicators, &["sidebar"], false);
    }

    #[test]
    fn test_video_urls() {
        assert_all(
            is_video_url,
            &[
                "https://www.youtube.com/watch?v=test",
                "https://player.vimeo.com/video/123",
            ],
            true,
        );
        assert_all(is_video_url, &["https://example.com/image.jpg"], false);
    }

    #[test]
    fn test_whitespace() {
        let blank = " \n\t ";
        let filled = "some text";

        assert_all(is_whitespace, &[blank], true);
        assert_all(is_whitespace, &[filled], false);

        assert_all(has_content, &[filled], true);
        assert_all(has_content, &[blank], false);
    }

    #[test]
    fn test_byline() {
        assert_all(is_byline, &["by author", "written by John Doe"], true);
        assert_all(is_byline, &["random text"], false);
    }
}