use regex::Regex;
use crate::matcher::span::{MatchSpan, Property};
use std::sync::LazyLock;
/// Final domain labels that `has_known_tld` accepts.
///
/// Entries are lowercase; the lookup itself ignores ASCII case.
const KNOWN_TLDS: &[&str] = &[
    "com", "org", "net", "info", "tv", "io", "ru", "cc",
    "me", "to", "be", "de", "fr", "es", "it", "nl",
    "se", "pl", "cz", "at", "ch", "co", "uk", "us",
    "ca", "au", "nz", "jp", "kr", "cn", "tw", "br",
    "mx", "in", "za", "ua", "hu", "ro", "bg", "hr",
    "si", "sk", "lt", "lv", "ee", "fi", "dk", "no",
    "pt", "gr", "tr", "na",
];
/// Matches a host name wrapped in square brackets, e.g. `[tvu.org.ru]` or
/// `[.www.site.com.]`.
///
/// Runs of dots and spaces directly inside either bracket are skipped; the
/// named group `site` captures the host name itself (with an optional leading
/// `www.`). Every label starts and ends with an ASCII alphanumeric and may
/// contain hyphens in between, and the name must end in a suffix of two or
/// more ASCII letters.
///
/// The pattern requires at least one additional `.label` between the first
/// label and that final suffix, so the captured name always contains at least
/// two dots; a two-part name such as `[site.com]` is not matched here.
static WEBSITE_BRACKET: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"\[[. ]*(?P<site>(?:www\.)?[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\.[a-zA-Z]{2,})[. ]*\]";
    // The pattern is a fixed literal, so a compile failure would be a programming error.
    Regex::new(PATTERN).unwrap()
});
/// Matches a host name introduced by the word `from`, e.g. `from [site.com]`
/// or `From www.site.org`.
///
/// The whole pattern is case-insensitive. After `from` it allows optional
/// whitespace, an optional `[`, and more optional whitespace before the named
/// group `site`; optional whitespace and an optional `]` may follow it.
///
/// `site` is one or more dot-separated runs of ASCII alphanumerics and hyphens
/// followed by a suffix of two or more letters, so it contains at least one
/// dot. No word boundary is required in front of `from`.
static WEBSITE_FROM: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?i)from\s*\[?\s*(?P<site>(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,})\s*\]?";
    // The pattern is a fixed literal, so a compile failure would be a programming error.
    Regex::new(PATTERN).unwrap()
});
/// Matches a bare `name.tld` (optionally prefixed with `www.`) anywhere in the
/// text, where the suffix is one of `com`, `org`, `net`, `info`, `tv`, `io`,
/// `ru` or `cc`.
///
/// Only a single label precedes the suffix; it starts and ends with an ASCII
/// alphanumeric and may contain hyphens in between. The trailing `\b` keeps
/// the suffix from matching as the beginning of a longer word (for example
/// `.rus` or `.community`). Unlike `WEBSITE_FROM`, this pattern is
/// case-sensitive. The named group `site` spans the whole match.
static WEBSITE_INLINE: LazyLock<Regex> = LazyLock::new(|| {
    const PATTERN: &str = r"(?P<site>(?:www\.)?[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.(?:com|org|net|info|tv|io|ru|cc))\b";
    // The pattern is a fixed literal, so a compile failure would be a programming error.
    Regex::new(PATTERN).unwrap()
});
/// Scans `input` for website names and returns one [`MatchSpan`] per hit.
///
/// Detection runs in up to three passes:
///
/// 1. **Bracketed** - every `site` capture of `WEBSITE_BRACKET` whose final
///    label passes `has_known_tld` is recorded with `priority::KEYWORD`.
/// 2. **`from`-prefixed** - every `site` capture of `WEBSITE_FROM` that does
///    not overlap a span collected so far is recorded with
///    `priority::VOCABULARY`. No TLD check is applied in this pass.
/// 3. **Inline fallback** - only when the first two passes produced nothing,
///    every `site` capture of `WEBSITE_INLINE` longer than 7 bytes is recorded
///    with `priority::DEFAULT`.
///
/// Each span covers just the captured site name (not the surrounding brackets
/// or the `from` keyword), is tagged `Property::Website`, and carries the
/// captured text as its value. The result is empty when nothing matches.
pub fn find_matches(input: &str) -> Vec<MatchSpan> {
    let mut matches: Vec<MatchSpan> = Vec::new();

    // Pass 1: bracketed names, accepted only with a recognised final label.
    for cap in WEBSITE_BRACKET.captures_iter(input) {
        let Some(site) = cap.name("site") else {
            continue;
        };
        let val = site.as_str();
        if has_known_tld(val) {
            matches.push(
                MatchSpan::new(site.start(), site.end(), Property::Website, val)
                    .with_priority(crate::priority::KEYWORD),
            );
        }
    }

    // Pass 2: names introduced by "from", skipping anything already covered.
    for cap in WEBSITE_FROM.captures_iter(input) {
        let Some(site) = cap.name("site") else {
            continue;
        };
        // Build the throwaway probe once per capture instead of once per
        // already-collected span; only its range matters for the overlap test.
        let probe = MatchSpan::new(site.start(), site.end(), Property::Website, "");
        if matches.iter().any(|m| m.overlaps(&probe)) {
            continue;
        }
        matches.push(
            MatchSpan::new(site.start(), site.end(), Property::Website, site.as_str())
                .with_priority(crate::priority::VOCABULARY),
        );
    }

    // Pass 3: the loose inline pattern is only consulted as a last resort.
    if matches.is_empty() {
        for cap in WEBSITE_INLINE.captures_iter(input) {
            let Some(site) = cap.name("site") else {
                continue;
            };
            let val = site.as_str();
            // Very short hits are dropped; the threshold is a byte length.
            if val.len() > 7 {
                matches.push(
                    MatchSpan::new(site.start(), site.end(), Property::Website, val)
                        .with_priority(crate::priority::DEFAULT),
                );
            }
        }
    }

    matches
}
/// Reports whether the last dot-separated label of `domain` is listed in
/// `KNOWN_TLDS`.
///
/// The label is the text after the final `.`; when `domain` contains no dot,
/// the whole string is used. The comparison ignores ASCII case.
fn has_known_tld(domain: &str) -> bool {
    let suffix = domain.rsplit_once('.').map_or(domain, |(_, tail)| tail);
    KNOWN_TLDS
        .iter()
        .any(|known| known.eq_ignore_ascii_case(suffix))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when scanning `input` yields a span whose value equals
    /// `expected`.
    fn detects(input: &str, expected: &str) -> bool {
        find_matches(input)
            .iter()
            .any(|span| span.value == expected)
    }

    // A two-label name inside brackets must still be reported.
    #[test]
    fn test_bracket_website() {
        assert!(detects(
            "Movie.720p-GROUP.[sharethefiles.com].mkv",
            "sharethefiles.com"
        ));
    }

    // Three-label bracketed name ending in a known suffix.
    #[test]
    fn test_bracket_multipart_tld() {
        assert!(detects("Movie.[tvu.org.ru].avi", "tvu.org.ru"));
    }

    // No brackets and no "from": only the inline fallback can find this one.
    #[test]
    fn test_inline_website() {
        assert!(detects("Movie.720p.MkvCage.com", "MkvCage.com"));
    }

    // Padding dots just inside the brackets are not part of the reported name.
    #[test]
    fn test_dotted_brackets() {
        assert!(detects("[.www.site.com.].-.Movie.mkv", "www.site.com"));
    }

    // A bracketed dotted title whose last label is not a known suffix.
    #[test]
    fn test_ready_player_one_not_website() {
        let input = "[DBD-Raws][4K_HDR][ready.player.one][2160P][BDRip][HEVC-10bit][FLAC].mkv";
        assert!(
            !detects(input, "ready.player.one"),
            "ready.player.one should NOT be detected as a website"
        );
    }

    #[test]
    fn test_has_known_tld() {
        // Names whose final label is listed.
        assert!(has_known_tld("sharethefiles.com"));
        assert!(has_known_tld("tvu.org.ru"));
        assert!(has_known_tld("www.nimp.na"));
        assert!(has_known_tld("wawa.co.uk"));

        // Names whose final label is not listed.
        assert!(!has_known_tld("ready.player.one"));
        assert!(!has_known_tld("some.thing.movie"));
    }

    #[test]
    fn test_no_false_positive_on_language_abbrev_rus() {
        let found = find_matches("Community.s02e20.rus.eng.720p.Kybik.v.Kybe");
        assert!(
            found.is_empty(),
            "`.ru` inside `.rus` (Russian abbrev) must not match as a website, got: {:?}",
            found.iter().map(|x| &x.value).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_genuine_ru_domain_still_matches() {
        let found = find_matches("Movie.tracker.ru.x264.mkv");
        assert!(
            found.iter().any(|x| x.value == "tracker.ru"),
            "genuine `.ru` domain should still match, got: {:?}",
            found.iter().map(|x| &x.value).collect::<Vec<_>>()
        );
    }

    // `.com`, `.net` and `.org` appearing as the start of a longer word.
    #[test]
    fn test_no_false_positive_on_extended_tld_suffixes() {
        let samples = [
            "Show.s01e01.community.center.720p",
            "Show.s01e01.networking.event.720p",
            "Show.s01e01.organic.farm.720p",
        ];
        for sample in samples {
            let found = find_matches(sample);
            assert!(
                found.is_empty(),
                "TLD-as-prefix-of-word must not match for {sample:?}, got: {:?}",
                found.iter().map(|x| &x.value).collect::<Vec<_>>()
            );
        }
    }
}