readability_rust/
regexps.rs

1//! Regular expressions used throughout the Readability parser
2
3use regex::Regex;
4use std::sync::OnceLock;
5
6/// Regular expressions for identifying content patterns
7pub struct ReadabilityRegexps {
8    pub unlikely_candidates: Regex,
9    pub ok_maybe_its_candidate: Regex,
10    pub positive: Regex,
11    pub negative: Regex,
12    pub extraneous: Regex,
13    pub byline: Regex,
14    pub replace_fonts: Regex,
15    pub normalize: Regex,
16    pub videos: Regex,
17    pub share_elements: Regex,
18    pub next_link: Regex,
19    pub prev_link: Regex,
20    pub tokenize: Regex,
21    pub whitespace: Regex,
22    pub has_content: Regex,
23    pub hash_url: Regex,
24    pub b64_data_url: Regex,
25    pub commas: Regex,
26    pub json_ld_article_types: Regex,
27    pub ad_words: Regex,
28    pub loading_words: Regex,
29}
30
31impl ReadabilityRegexps {
32    pub fn new() -> Self {
33        Self {
34            unlikely_candidates: Regex::new(
35                r"(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote"
36            ).unwrap(),
37            
38            ok_maybe_its_candidate: Regex::new(
39                r"(?i)and|article|body|column|content|main|mathjax|shadow"
40            ).unwrap(),
41            
42            positive: Regex::new(
43                r"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"
44            ).unwrap(),
45            
46            negative: Regex::new(
47                r"(?i)-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget"
48            ).unwrap(),
49            
50            extraneous: Regex::new(
51                r"(?i)print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility"
52            ).unwrap(),
53            
54            byline: Regex::new(
55                r"(?i)byline|author|dateline|writtenby|written\s+by|p-author"
56            ).unwrap(),
57            
58            replace_fonts: Regex::new(
59                r"<(\/?)font[^>]*>"
60            ).unwrap(),
61            
62            normalize: Regex::new(
63                r"\s{2,}"
64            ).unwrap(),
65            
66            videos: Regex::new(
67                r"\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq|bilibili|live.bilibili)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)"
68            ).unwrap(),
69            
70            share_elements: Regex::new(
71                r"(\b|_)(share|sharedaddy)(\b|_)"
72            ).unwrap(),
73            
74            next_link: Regex::new(
75                r"(?i)(next|weiter|continue|>([^\|]|$)|»([^\|]|$))"
76            ).unwrap(),
77            
78            prev_link: Regex::new(
79                r"(?i)(prev|earl|old|new|<|«)"
80            ).unwrap(),
81            
82            tokenize: Regex::new(
83                r"\W+"
84            ).unwrap(),
85            
86            whitespace: Regex::new(
87                r"^\s*$"
88            ).unwrap(),
89            
90            has_content: Regex::new(
91                r"\S$"
92            ).unwrap(),
93            
94            hash_url: Regex::new(
95                r"^#.+"
96            ).unwrap(),
97            
98            b64_data_url: Regex::new(
99                r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*,"
100            ).unwrap(),
101            
102            commas: Regex::new(
103                r"\u{002C}|\u{060C}|\u{FE50}|\u{FE10}|\u{FE11}|\u{2E41}|\u{2E34}|\u{2E32}|\u{FF0C}"
104            ).unwrap(),
105            
106            json_ld_article_types: Regex::new(
107                r"^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$"
108            ).unwrap(),
109            
110            ad_words: Regex::new(
111                r"(?i)^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$"
112            ).unwrap(),
113            
114            loading_words: Regex::new(
115                r"(?i)^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$"
116            ).unwrap(),
117        }
118    }
119}
120
121/// Global instance of readability regexps
122static REGEXPS: OnceLock<ReadabilityRegexps> = OnceLock::new();
123
124/// Get the global regexps instance
125pub fn get_regexps() -> &'static ReadabilityRegexps {
126    REGEXPS.get_or_init(ReadabilityRegexps::new)
127}
128
129/// Check if a string matches the unlikely candidates pattern
130pub fn is_unlikely_candidate(text: &str) -> bool {
131    let regexps = get_regexps();
132    regexps.unlikely_candidates.is_match(text) && !regexps.ok_maybe_its_candidate.is_match(text)
133}
134
135/// Check if a string has positive content indicators
136pub fn has_positive_indicators(text: &str) -> bool {
137    get_regexps().positive.is_match(text)
138}
139
140/// Check if a string has negative content indicators
141pub fn has_negative_indicators(text: &str) -> bool {
142    get_regexps().negative.is_match(text)
143}
144
145/// Check if a string contains byline indicators
146pub fn is_byline(text: &str) -> bool {
147    get_regexps().byline.is_match(text)
148}
149
150/// Check if a URL is a video URL
151pub fn is_video_url(url: &str) -> bool {
152    get_regexps().videos.is_match(url)
153}
154
155
156
157/// Check if text is only whitespace
158pub fn is_whitespace(text: &str) -> bool {
159    get_regexps().whitespace.is_match(text)
160}
161
162/// Check if text has content (non-whitespace)
163pub fn has_content(text: &str) -> bool {
164    get_regexps().has_content.is_match(text)
165}
166
167/// Check if a string contains ad-related words
168pub fn contains_ad_words(text: &str) -> bool {
169    get_regexps().ad_words.is_match(text)
170}
171
172/// Check if a string contains loading words
173pub fn contains_loading_words(text: &str) -> bool {
174    get_regexps().loading_words.is_match(text)
175}
176
177/// Check if a string matches extraneous content patterns
178pub fn is_extraneous_content(text: &str) -> bool {
179    get_regexps().extraneous.is_match(text)
180}
181
182/// Check if a string matches share element patterns
183pub fn is_share_element(text: &str) -> bool {
184    get_regexps().share_elements.is_match(text)
185}
186
187/// Check if a string is a next link
188pub fn is_next_link(text: &str) -> bool {
189    get_regexps().next_link.is_match(text)
190}
191
192/// Check if a string is a previous link
193pub fn is_prev_link(text: &str) -> bool {
194    get_regexps().prev_link.is_match(text)
195}
196
197/// Check if a URL is a hash URL
198pub fn is_hash_url(url: &str) -> bool {
199    get_regexps().hash_url.is_match(url)
200}
201
202/// Check if a URL is a base64 data URL
203pub fn is_b64_data_url(url: &str) -> bool {
204    get_regexps().b64_data_url.is_match(url)
205}
206
207/// Check if text matches JSON-LD article types
208pub fn is_json_ld_article_type(text: &str) -> bool {
209    get_regexps().json_ld_article_types.is_match(text)
210}
211
212/// Replace font tags in HTML
213pub fn replace_font_tags(html: &str) -> String {
214    get_regexps().replace_fonts.replace_all(html, "<$1span>").to_string()
215}
216
217/// Normalize whitespace in text
218pub fn normalize_whitespace(text: &str) -> String {
219    get_regexps().normalize.replace_all(text, " ").to_string()
220}
221
222/// Tokenize text
223pub fn tokenize_text(text: &str) -> Vec<&str> {
224    get_regexps().tokenize.split(text).filter(|s| !s.is_empty()).collect()
225}
226
227/// Count commas in text
228pub fn count_commas(text: &str) -> usize {
229    get_regexps().commas.find_iter(text).count()
230}
231
#[cfg(test)]
mod tests {
    use super::*;

    /// Unlikely keywords are flagged unless a "maybe" keyword is also present.
    #[test]
    fn test_unlikely_candidates() {
        for flagged in ["sidebar-ad navigation", "comment-section"] {
            assert!(is_unlikely_candidate(flagged));
        }
        for kept in ["main-content", "article-body"] {
            assert!(!is_unlikely_candidate(kept));
        }
    }

    /// Positive keywords are detected; unrelated text is not.
    #[test]
    fn test_positive_indicators() {
        for positive in ["article-content", "main-body"] {
            assert!(has_positive_indicators(positive));
        }
        assert!(!has_positive_indicators("sidebar"));
    }

    /// Known video hosts are recognised; other hosts are not.
    #[test]
    fn test_video_urls() {
        let video_urls = [
            "https://www.youtube.com/watch?v=test",
            "https://player.vimeo.com/video/123",
        ];
        for url in video_urls {
            assert!(is_video_url(url));
        }
        assert!(!is_video_url("https://example.com/image.jpg"));
    }

    /// `is_whitespace` and `has_content` give opposite answers for these inputs.
    #[test]
    fn test_whitespace() {
        let blank = "   \n\t  ";
        let filled = "some text";

        assert!(is_whitespace(blank));
        assert!(!is_whitespace(filled));

        assert!(has_content(filled));
        assert!(!has_content(blank));
    }

    /// Byline markers are detected; unrelated text is not.
    #[test]
    fn test_byline() {
        for byline in ["by author", "written by John Doe"] {
            assert!(is_byline(byline));
        }
        assert!(!is_byline("random text"));
    }
}