use regex::Regex;
use std::sync::LazyLock;
/// Case-insensitive pattern for text that *ends* with a share/print label
/// (e.g. `Facebook`, `Drucken`, `PDF`) or a "read more" teaser
/// (`Mehr zum Thema`, `More on this` plus at most 8 further characters).
///
/// Any run of non-word characters may precede the label. The pattern is only
/// anchored at the end, so with `is_match` a longer text still matches as long
/// as it finishes with one of the listed labels.
pub(crate) static TEXT_FILTER: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{0,8})$";
    // The pattern is a compile-time constant; failing to parse it is a programming error.
    Regex::new(PATTERN).unwrap()
});
/// Case-insensitive pattern matching two consecutive letters in the range
/// `a`–`z`, e.g. the `en` in `en-US`.
///
/// The pattern is unanchored, so it can match anywhere in the haystack.
pub(crate) static HTML_LANG: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)[a-z]{2}";
    Regex::new(PATTERN).unwrap()
});
/// Matches a single `,` or `;` together with any whitespace directly around it.
pub(crate) static COMMA_SEPARATOR: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"\s*[,;]\s*";
    Regex::new(PATTERN).unwrap()
});
/// Splits a whole title around a separator symbol (dash, bullet, pipe, colon,
/// angle quote, ...) that has whitespace on both sides.
///
/// Capture group 1 (optional) holds the text before the separator and group 2
/// the text after it. Group 1 is greedy, so when several separators are
/// present the last one is used as the split point.
pub(crate) static TITLE_CLEANER: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)^(.+)?\s+[–•·—|⁄*⋆~‹«<›»>:-]\s+(.+)$";
    Regex::new(PATTERN).unwrap()
});
/// Matches a single `{`, `}` or backslash character.
pub(crate) static JSON_SYMBOL: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r#"[{\\}]"#;
    Regex::new(PATTERN).unwrap()
});
/// Case-insensitive pattern for a JSON-style `"name": "value"` pair.
///
/// A backslash is tolerated before the key's closing quote and before the
/// value's opening quote (escaped JSON). Capture group 1 is the value, read up
/// to the next `"` or backslash.
pub(crate) static NAME_JSON: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r#"(?i)"name\\?":\s*\\?"([^"\\]+)"#;
    Regex::new(PATTERN).unwrap()
});
/// Case-insensitive pattern matching an `http://` or `https://` scheme
/// anywhere in the haystack.
pub(crate) static URL_CHECK: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)https?://";
    Regex::new(PATTERN).unwrap()
});
/// Extracts the host part of an `http(s)` URL.
///
/// A leading `www.` or `w<digits>.` label is skipped; capture group 1 is
/// everything after it up to the next `/`
/// (e.g. `example.com` for `https://www.example.com/path`).
pub(crate) static SITENAME_FINDER: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)https?://(?:www\.|w[0-9]+\.)?([^/]+)";
    Regex::new(PATTERN).unwrap()
});
/// Matches either an HTML comment (`<!-- ... -->`, shortest match, with `.`
/// not crossing newlines) or any `<...>` tag. Capture group 1 is the whole match.
pub(crate) static HTML_STRIP_TAG: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)(<!--.*?-->|<[^>]*>)";
    Regex::new(PATTERN).unwrap()
});
/// Case-insensitive pattern matching a `/category/` or `/categories/` path segment.
pub(crate) static CATEGORY_HREF: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)/categor(?:y|ies)/";
    Regex::new(PATTERN).unwrap()
});
/// Case-insensitive pattern matching a `/tag/` or `/tags/` path segment.
pub(crate) static TAG_HREF: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)/tags?/";
    Regex::new(PATTERN).unwrap()
});
/// Pulls a Creative Commons license out of a URL-like path.
///
/// Capture group 1 is the license code (`by`, `by-sa`, `by-nc-nd`, `zero`, ...)
/// and group 2 the version in `d.d` form, as in `/by-sa/4.0`.
pub(crate) static CC_LICENSE: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)/(by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero)/([1-9]\.[0-9])";
    Regex::new(PATTERN).unwrap()
});
/// Recognises a Creative Commons license written out in running text.
///
/// Capture group 1 is the `cc` / `creative commons` prefix, group 2 the license
/// code and group 3 the optional version in `d.d` form.
pub(crate) static CC_LICENSE_TEXT: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str =
        r"(?i)(cc|creative commons) (by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero) ?([1-9]\.[0-9])?";
    Regex::new(PATTERN).unwrap()
});
/// Matches a byline introducer at the very start of an author string.
///
/// The match consists of an optional word ending in `ed` or `t` (as in
/// `posted`), an optional space, one of `written by`, `words by`, `words`,
/// `by`, `von`, `from`, and a mandatory trailing space.
pub(crate) static AUTHOR_PREFIX: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str =
        r"(?i)^([a-zäöüß]+(ed|t))? ?(written by|words by|words|by|von|from) ";
    Regex::new(PATTERN).unwrap()
});
/// Matches a numeric character followed by at least one more character,
/// running up to the end of the text.
pub(crate) static AUTHOR_DIGITS: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)\p{N}.+?$";
    Regex::new(PATTERN).unwrap()
});
/// Matches an `@` followed by a run of non-whitespace characters
/// (the shape of a social-media handle).
pub(crate) static AUTHOR_SOCIAL_MEDIA: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)@\S+";
    Regex::new(PATTERN).unwrap()
});
/// Matches a single `.`, `_` or `+` character.
// NOTE(review): the name suggests callers replace these with spaces — confirm at the call site.
pub(crate) static AUTHOR_SPACE_CHARS: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)[._+]";
    Regex::new(PATTERN).unwrap()
});
/// Matches a quoted or bracketed insertion inside an author string, such as
/// `"Bob"`, `(Bob)`, `[Bob]`, `{Bob}`, `'Bob'` or `‘Bob’`.
///
/// The match starts at an opening quote/bracket, spans the shortest run of
/// characters that contains no `"`, and ends at a closing quote/bracket.
/// Opening and closing delimiters are not required to be of the same kind.
///
/// Both character classes include the typographic single quotes `‘` and `’`
/// next to the ASCII apostrophe, so nicknames wrapped in curly quotes are
/// recognised as well.
pub(crate) static AUTHOR_NICKNAME: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r#"(?i)["‘({\[’'][^"]+?[‘’"'\)\]}]"#;
    // The pattern is a compile-time constant; failing to parse it is a programming error.
    Regex::new(PATTERN).unwrap()
});
/// Matches either a trailing run of characters that are not letters, marks,
/// numbers or `_`, or any single one of the punctuation characters
/// `: ( ) ? * $ # ! % / < > { } ~ ¿` anywhere in the text.
pub(crate) static AUTHOR_SPECIAL_CHARS: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)[^\p{L}\p{M}\p{N}_]+$|[:()?*$#!%/<>{}~¿]";
    Regex::new(PATTERN).unwrap()
});
/// Matches a whitespace-delimited preposition (`am`, `on`, `for`, `at`, `in`,
/// `to`, `from`, `of`, `via`, `with`) or dash (`—`, `-`, `–`) that follows a
/// word boundary, together with everything after it.
///
/// Capture group 1 is the preposition/dash, group 2 the remaining text.
pub(crate) static AUTHOR_PREPOSITION: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)\b\s+(am|on|for|at|in|to|from|of|via|with|—|-|–)\s+(.*)";
    Regex::new(PATTERN).unwrap()
});
/// Matches an e-mail-like token: a local part, `@`, a domain and a final
/// label of at least two letters, delimited by word boundaries.
pub(crate) static AUTHOR_EMAIL: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b";
    Regex::new(PATTERN).unwrap()
});
/// Matches a delimiter between several author names: one of `/`, `;`, `,`,
/// `|`, `&`, or the word `and` / `und`.
///
/// The word form must be bounded on each side by the start/end of the text or
/// by a character that is not a letter, mark, number or `_`; those bounding
/// characters are part of the match.
pub(crate) static AUTHOR_SEPARATOR: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str =
        r"(?i)/|;|,|\||&|(?:^|[^\p{L}\p{M}\p{N}_])[ua]nd(?:$|[^\p{L}\p{M}\p{N}_])";
    Regex::new(PATTERN).unwrap()
});
/// Matches an HTML-like tag: `<`, at least one character other than `>`, then `>`.
pub(crate) static AUTHOR_HTML: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)<[^>]+>";
    Regex::new(PATTERN).unwrap()
});
#[cfg(test)]
mod tests {
    use super::*;

    /// Texts that end with a share label or a "read more" teaser are matched.
    #[test]
    fn test_text_filter_matches() {
        for sample in ["Facebook", " Twitter"] {
            assert!(TEXT_FILTER.is_match(sample));
        }
        // "Print" appears here, but the text does not end with a listed label.
        assert!(!TEXT_FILTER.is_match("Print this article"));
        for sample in ["Drucken", "More on this topic", "Mehr zum Thema:"] {
            assert!(TEXT_FILTER.is_match(sample));
        }
    }

    /// Ordinary sentences must not be matched.
    #[test]
    fn test_text_filter_no_match() {
        let ordinary = [
            "This is regular text",
            "The article discusses Linkedin policies",
        ];
        for sample in ordinary {
            assert!(!TEXT_FILTER.is_match(sample));
        }
    }

    /// The two-letter language code is found inside a locale string.
    #[test]
    fn test_html_lang() {
        let found_en = HTML_LANG.find_iter("en-US").any(|m| m.as_str() == "en");
        assert!(found_en);
    }

    /// Byline introducers are removed from the front of the author string.
    #[test]
    fn test_author_prefix() {
        let cases = [
            ("by John Doe", "John Doe"),
            ("written by Jane Smith", "Jane Smith"),
        ];
        for (raw, expected) in cases {
            let stripped = AUTHOR_PREFIX.replace(raw, "");
            assert_eq!(stripped.trim(), expected);
        }
    }

    /// The host is captured without a `www.` / `w<digits>.` prefix.
    #[test]
    fn test_sitename_finder() {
        let cases = [
            ("https://www.example.com/path", "example.com"),
            ("https://w3.blog.org/article", "blog.org"),
        ];
        for (url, host) in cases {
            let caps = SITENAME_FINDER.captures(url).unwrap();
            assert_eq!(&caps[1], host);
        }
    }

    /// License code and version are captured from a Creative Commons URL.
    #[test]
    fn test_cc_license() {
        let caps = CC_LICENSE
            .captures("https://creativecommons.org/licenses/by-sa/4.0/")
            .unwrap();
        assert_eq!(caps.get(1).map(|m| m.as_str()), Some("by-sa"));
        assert_eq!(caps.get(2).map(|m| m.as_str()), Some("4.0"));
    }

    /// Singular and plural category segments match; tag segments do not.
    #[test]
    fn test_category_href() {
        for href in ["/category/tech/", "/categories/news/"] {
            assert!(CATEGORY_HREF.is_match(href));
        }
        assert!(!CATEGORY_HREF.is_match("/tag/rust/"));
    }

    /// Singular and plural tag segments match; category segments do not.
    #[test]
    fn test_tag_href() {
        for href in ["/tags/rust/", "/tag/programming/"] {
            assert!(TAG_HREF.is_match(href));
        }
        assert!(!TAG_HREF.is_match("/category/news/"));
    }

    /// Dereferencing each static forces its pattern to compile; an invalid
    /// pattern would panic here.
    #[test]
    fn test_all_patterns_compile() {
        let _compiled = [
            &*TEXT_FILTER,
            &*HTML_LANG,
            &*COMMA_SEPARATOR,
            &*TITLE_CLEANER,
            &*JSON_SYMBOL,
            &*NAME_JSON,
            &*URL_CHECK,
            &*SITENAME_FINDER,
            &*HTML_STRIP_TAG,
            &*CATEGORY_HREF,
            &*TAG_HREF,
            &*CC_LICENSE,
            &*CC_LICENSE_TEXT,
            &*AUTHOR_PREFIX,
            &*AUTHOR_DIGITS,
            &*AUTHOR_SOCIAL_MEDIA,
            &*AUTHOR_SPACE_CHARS,
            &*AUTHOR_NICKNAME,
            &*AUTHOR_SPECIAL_CHARS,
            &*AUTHOR_PREPOSITION,
            &*AUTHOR_EMAIL,
            &*AUTHOR_SEPARATOR,
            &*AUTHOR_HTML,
        ];
    }
}