mozilla_readability/
regexes.rs

//! Regular expressions frequently used by moz_readability.
//!
//! Every regex that is only used to test whether a `&str` matches is wrapped in
//! a function whose name starts with `is_match_`. All other regexes are exposed
//! as public statics.
4use regex::Regex;
5use lazy_static::lazy_static;
6
7pub fn is_match_byline(match_str: &str) -> bool {
8    lazy_static! {
9        static ref BYLINE_REGEX: Regex =
10            Regex::new(r"(?i)byline|author|dateline|writtenby|p-author").unwrap();
11    }
12    BYLINE_REGEX.is_match(match_str)
13}
14
15pub fn is_match_positive(match_str: &str) -> bool {
16    lazy_static! {
17        static ref POSITIVE_REGEX: Regex = Regex::new(r"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story").unwrap();
18    }
19    POSITIVE_REGEX.is_match(match_str)
20}
21
22pub fn is_match_negative(match_str: &str) -> bool {
23    lazy_static! {
24        static ref NEGATIVE_REGEX: Regex = Regex::new(r"(?i)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget").unwrap();
25    }
26    NEGATIVE_REGEX.is_match(match_str)
27}
28
29pub fn is_match_videos(match_str: &str) -> bool {
30    lazy_static! {
31        static ref VIDEOS_REGEX: Regex = Regex::new(r"(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)").unwrap();
32    }
33    VIDEOS_REGEX.is_match(match_str)
34}
35
36pub fn is_match_unlikely(match_str: &str) -> bool {
37    lazy_static! {
38        static ref UNLIKELY_REGEX: Regex = Regex::new(r"(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote").unwrap();
39    }
40    UNLIKELY_REGEX.is_match(match_str)
41}
42
43pub fn is_match_ok_maybe(match_str: &str) -> bool {
44    lazy_static! {
45        static ref OK_MAYBE_REGEX: Regex =
46            Regex::new(r"(?i)and|article|body|column|content|main|shadow").unwrap();
47    }
48    OK_MAYBE_REGEX.is_match(match_str)
49}
50
51pub fn is_match_node_content(match_str: &str) -> bool {
52    lazy_static! {
53        static ref NODE_CONTENT_REGEX: Regex = Regex::new(r"\.( |$)").unwrap();
54    }
55    NODE_CONTENT_REGEX.is_match(match_str)
56}
57
58pub fn is_match_share_elems(match_str: &str) -> bool {
59    lazy_static! {
60        static ref SHARE_ELEMS_REGEX: Regex =
61            Regex::new(r"(?i)(\b|_)(share|sharedaddy)(\b|_)").unwrap();
62    }
63    SHARE_ELEMS_REGEX.is_match(match_str)
64}
65
66pub fn is_match_has_content(match_str: &str) -> bool {
67    lazy_static! {
68        static ref HAS_CONTENT_REGEX: Regex = Regex::new(r"\S$").unwrap();
69    }
70    HAS_CONTENT_REGEX.is_match(match_str)
71}
72
73pub fn is_match_img_ext(match_str: &str) -> bool {
74    lazy_static! {
75        static ref IMG_EXT_REGEX: Regex = Regex::new(r"(?i)\.(jpg|jpeg|png|webp)").unwrap();
76    }
77    IMG_EXT_REGEX.is_match(match_str)
78}
79
80pub fn is_match_srcset(match_str: &str) -> bool {
81    lazy_static! {
82        static ref SRCSET_REGEX: Regex = Regex::new(r"\.(jpg|jpeg|png|webp)\s+\d").unwrap();
83    }
84    SRCSET_REGEX.is_match(match_str)
85}
86
87pub fn is_match_src_regex(match_str: &str) -> bool {
88    lazy_static! {
89        static ref SRC_REGEX: Regex = Regex::new(r"^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$").unwrap();
90    }
91    SRC_REGEX.is_match(match_str)
92}
93
94pub fn is_match_name_pattern(match_str: &str) -> bool {
95    lazy_static! {
96        static ref NAME_PATTERN_REGEX: Regex = Regex::new(r"(?i)\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$").unwrap();
97    }
98    NAME_PATTERN_REGEX.is_match(match_str)
99}
100
101pub fn is_match_title_separator(match_str: &str) -> bool {
102    lazy_static! {
103        static ref TITLE_SEPARATOR_REGEX: Regex = Regex::new(r" [\|\-\\/>»] ").unwrap();
104    }
105    TITLE_SEPARATOR_REGEX.is_match(match_str)
106}
107
108pub fn is_match_has_title_separator(match_str: &str) -> bool {
109    lazy_static! {
110        static ref HAS_TITLE_SEPARATOR_REGEX: Regex = Regex::new(r" [\\/>»] ").unwrap();
111    }
112    HAS_TITLE_SEPARATOR_REGEX.is_match(match_str)
113}
114
115lazy_static! {
116    pub static ref NORMALIZE_REGEX: Regex = Regex::new(r"\s{2,}").unwrap();
117    pub static ref B64_DATA_URL_REGEX: Regex =
118        Regex::new(r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*").unwrap();
119    pub static ref BASE64_REGEX: Regex = Regex::new(r"(?i)base64\s*").unwrap();
120    pub static ref PROPERTY_REGEX: Regex = Regex::new(
121        r"(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*"
122    )
123    .unwrap();
124    pub static ref SRCSET_CAPTURE_REGEX: Regex =
125        Regex::new(r"(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))").unwrap();
126    pub static ref REPLACE_WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
127    pub static ref REPLACE_DOT_REGEX: Regex = Regex::new(r"\.").unwrap();
128    pub static ref REPLACE_HTML_ESCAPE_REGEX: Regex =
129        Regex::new("&(quot|amp|apos|lt|gt);").unwrap();
130    pub static ref REPLACE_HEX_REGEX: Regex =
131        Regex::new(r"(?i)&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));").unwrap();
132    pub static ref REPLACE_START_SEPARATOR_REGEX: Regex =
133        Regex::new(r"(?i)(?P<start>.*)[\|\-\\/>»] .*").unwrap();
134    pub static ref REPLACE_END_SEPARATOR_REGEX: Regex =
135        Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap();
136    pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap();
137}