use crate::PageType;
#[must_use]
pub fn classify_url(url: &str) -> PageType {
if url.is_empty() {
return PageType::Article;
}
let url_lower = url.to_ascii_lowercase();
let (domain, path) = extract_domain_path(&url_lower);
if contains_any(domain, FORUM_DOMAINS)
|| contains_any(path, FORUM_PATHS)
|| contains_any(&url_lower, FORUM_URL_PATTERNS)
{
return PageType::Forum;
}
if contains_any(domain, DOCS_DOMAINS) || contains_any(path, DOCS_PATHS) {
return PageType::Documentation;
}
if contains_any(path, PRODUCT_PATHS) || contains_any(domain, PRODUCT_DOMAINS) {
return PageType::Product;
}
if contains_any(path, CATEGORY_PATHS) {
return PageType::Collection;
}
if contains_any(path, SERVICE_PATHS) || contains_any(&url_lower, SERVICE_SLUG_PATTERNS) {
return PageType::Service;
}
{
let path_trimmed = path.trim_end_matches('/');
if LISTING_PATH_ENDINGS.iter().any(|p| path_trimmed.ends_with(p))
|| contains_any(path, LISTING_PATH_CONTAINS)
{
return PageType::Listing;
}
}
if contains_any(path, ARTICLE_PATHS) || contains_any(&url_lower, BLOG_SLUG_PATTERNS) {
return PageType::Article;
}
PageType::Article
}
/// Splits a URL into its host part and the remainder.
///
/// A leading `scheme://` (any syntactically valid scheme, not only
/// `http`/`https`) or a protocol-relative `//` is dropped first. The host part
/// then runs up to the first `/`, `?` or `#`, so a query string or fragment
/// that directly follows the host is not mistaken for part of it.
///
/// The second element is everything from that delimiter onwards (query string
/// and fragment included), or `"/"` when nothing follows the host.
///
/// The host part is returned verbatim: user info and port, if present, are
/// kept. Both returned slices borrow from `url`; nothing is allocated.
fn extract_domain_path(url: &str) -> (&str, &str) {
    const SCHEME_SEPARATOR: &str = "://";

    // Only treat the text before "://" as a scheme when it really looks like
    // one; otherwise a URL embedded in a query string (`/r?u=https://x`) would
    // be mistaken for the scheme of a scheme-less URL.
    let after_scheme = match url.find(SCHEME_SEPARATOR) {
        Some(sep) if is_url_scheme(&url[..sep]) => &url[sep + SCHEME_SEPARATOR.len()..],
        _ => url.strip_prefix("//").unwrap_or(url),
    };

    match after_scheme.find(|c: char| matches!(c, '/' | '?' | '#')) {
        Some(host_end) => after_scheme.split_at(host_end),
        None => (after_scheme, "/"),
    }
}

/// Returns `true` if `candidate` has the shape of a URI scheme: an ASCII
/// letter followed by ASCII letters, digits, `+`, `-` or `.`.
fn is_url_scheme(candidate: &str) -> bool {
    let mut chars = candidate.chars();
    matches!(chars.next(), Some(first) if first.is_ascii_alphabetic())
        && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
}
/// Returns `true` if `haystack` contains at least one of `needles` as a
/// substring.
///
/// An empty `needles` slice yields `false`; an empty needle matches any
/// haystack.
fn contains_any(haystack: &str, needles: &[&str]) -> bool {
    for &needle in needles {
        if haystack.contains(needle) {
            return true;
        }
    }
    false
}
/// Host substrings that mark a URL as a forum or Q&A page.
///
/// `classify_url` checks these against the host part of the URL only, so
/// entries must not contain a `/`: such an entry could never match. Entries
/// ending in `.` are meant as subdomain labels (`forum.example.com`); the rest
/// are whole site names.
const FORUM_DOMAINS: &[&str] = &[
    "forum.",
    "forums.",
    "board.",
    "boards.",
    "community.",
    "discuss.",
    "discourse.",
    "stackexchange.com",
    "stackoverflow.com",
    "reddit.com",
    "news.ycombinator.com",
    "lobste.rs",
    "quora.com",
    "slashdot.org",
    "disqus.com",
    "mumsnet.com",
    "thestudentroom.co.uk",
];
/// Path substrings that mark a URL as a forum page.
///
/// `classify_url` checks these against the path part of the URL. Every entry
/// is a full segment delimited by `/` on both sides.
const FORUM_PATHS: &[&str] = &[
    "/forum/",
    "/forums/",
    "/thread/",
    "/threads/",
    "/topic/",
    "/topics/",
    "/discussion/",
    "/discussions/",
    "/community/",
    "/t/",
    "/questions/",
    "/question/",
    "/comments/",
    "/talk/",
];
/// Substrings that mark a URL as a forum page when found anywhere in it.
///
/// Unlike the host and path tables, `classify_url` checks these against the
/// whole lowercased URL.
const FORUM_URL_PATTERNS: &[&str] = &[
    "/viewtopic.php",
    "/showthread.php",
    "/item?id=",
];
/// Host substrings that mark a URL as documentation.
///
/// `classify_url` checks these against the host part of the URL. Entries
/// ending in `.` are meant as subdomain labels; the rest are whole site names.
const DOCS_DOMAINS: &[&str] = &[
    "docs.",
    "doc.",
    "wiki.",
    "devdocs.",
    "man7.org",
    "readthedocs.io",
    "readthedocs.org",
    "developer.hashicorp.com",
    "developer.mozilla.org",
];
/// Path substrings that mark a URL as documentation.
///
/// `classify_url` checks these against the path part of the URL. Most entries
/// are full `/segment/` matches; `/quickstart` and `/getting-started` have no
/// trailing slash and therefore also match at the very end of a path.
const DOCS_PATHS: &[&str] = &[
    "/docs/",
    "/doc/",
    "/documentation/",
    "/reference/",
    "/api/",
    "/guide/",
    "/tutorial/",
    "/tutorials/",
    "/manual/",
    "/handbook/",
    "/wiki/",
    "/man-pages/",
    "/man/",
    "/concepts/",
    "/userguide/",
    "/quickstart",
    "/getting-started",
    "/book/",
    "/glossary/",
    "/tech_notes/",
];
/// Path substrings that mark a URL as a product page.
///
/// `classify_url` checks these against the path part of the URL.
/// NOTE(review): `/dp/` and `/ip/` look like marketplace-specific product
/// segments (the tests use an Amazon `/dp/` URL) — confirm the intended sites.
const PRODUCT_PATHS: &[&str] = &[
    "/products/",
    "/product/",
    "/shop/",
    "/dp/",
    "/ip/",
];
/// Host substrings that mark a URL as a product page.
///
/// `classify_url` checks these against the host part of the URL; both entries
/// are meant as subdomain labels such as `shop.example.com`.
const PRODUCT_DOMAINS: &[&str] = &[
    "shop.",
    "store.",
];
/// Path substrings that mark a URL as a collection (category) page.
///
/// `classify_url` checks these against the path part of the URL and maps a
/// hit to `PageType::Collection`.
const CATEGORY_PATHS: &[&str] = &[
    "/collections/",
    "/collection/",
    "/categories/",
    "/category/",
    "/browse/",
    "/cat/",
    "/subcategory/",
];
/// Path substrings that mark a URL as a service page.
///
/// `classify_url` checks these against the path part of the URL.
/// `/what-we-do` has no trailing slash, so it also matches at the end of a
/// path.
const SERVICE_PATHS: &[&str] = &[
    "/services/",
    "/service/",
    "/services.html",
    "/solutions/",
    "/solution/",
    "/offerings/",
    "/what-we-do",
];
/// Slug fragments that mark a URL as a service page.
///
/// `classify_url` checks these against the whole lowercased URL, so a slug
/// such as `web-development-services` is recognised wherever it appears.
const SERVICE_SLUG_PATTERNS: &[&str] = &[
    "-consulting-services",
    "-development-services",
    "-management-services",
    "-support-services",
    "-outsourcing-services",
    "-integration-services",
    "-development-company",
    "-consulting-company",
    "-ai-consulting",
    "-ai-development",
    "-ai-solutions",
];
/// Path suffixes that mark a URL as a listing page.
///
/// `classify_url` strips trailing `/` characters from the path and then tests
/// whether it ends with one of these entries.
const LISTING_PATH_ENDINGS: &[&str] = &[
    "/news",
    "/testimonials",
    "/coupons",
    "/issues",
    "/reviews",
    "/rankings",
    "-courses",
];
/// Path substrings that mark a URL as a listing page wherever they occur.
///
/// `classify_url` checks these against the path part of the URL, in addition
/// to the suffix check done with `LISTING_PATH_ENDINGS`.
const LISTING_PATH_CONTAINS: &[&str] = &[
    "/awards/",
    "/trending/",
    "/list/",
];
/// Path substrings that explicitly mark a URL as an article.
///
/// `classify_url` checks these against the path part of the URL. A hit yields
/// `PageType::Article`, which is also the fallback result. Note that `/blog`
/// (no trailing slash) already covers everything `/blog/` matches.
const ARTICLE_PATHS: &[&str] = &[
    "/blog/",
    "/blog",
    "/news/",
    "/article/",
    "/articles/",
    "/post/",
    "/posts/",
    "/insight/",
    "/insights/",
    "/resource/",
    "/resources/",
    "/stories/",
    "/magazine/",
    "/journal/",
    "/press/",
    "/editorial/",
    "/opinion/",
    "/review/",
    "/column/",
];
/// Slug fragments typical of blog-post titles.
///
/// `classify_url` checks these against the whole lowercased URL. A hit yields
/// `PageType::Article`, which is also the fallback result.
const BLOG_SLUG_PATTERNS: &[&str] = &[
    "-ways-to-",
    "-tips-",
    "-reasons-",
    "-steps-to-",
    "-things-to-",
    "-best-",
    "-top-",
    "-essential-",
    "beginners-guide",
    "complete-guide",
    "ultimate-guide",
    "how-to-",
    "what-is-",
    "why-",
    "when-to-",
    "-vs-",
    "-versus-",
    "-comparison",
    "-checklist",
    "-trends-",
    "-strategies-",
    "-challenges-",
    "-benefits-",
    "-advantages-",
];
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every URL in `urls` is classified as `expected`.
    fn assert_classified(expected: PageType, urls: &[&str]) {
        for url in urls {
            assert_eq!(
                classify_url(url),
                expected,
                "unexpected page type for {:?}",
                url
            );
        }
    }

    #[test]
    fn test_forum_detection() {
        assert_classified(
            PageType::Forum,
            &[
                "https://forum.example.com/thread/1",
                "https://stackoverflow.com/questions/123",
                "https://example.com/viewtopic.php?t=1",
            ],
        );
    }

    #[test]
    fn test_docs_detection() {
        assert_classified(
            PageType::Documentation,
            &[
                "https://docs.example.com/api",
                "https://example.com/docs/getting-started",
            ],
        );
    }

    #[test]
    fn test_product_detection() {
        assert_classified(
            PageType::Product,
            &[
                "https://example.com/products/widget",
                "https://amazon.com/dp/B01234",
                "https://shop.example.com/item",
            ],
        );
    }

    #[test]
    fn test_collection_detection() {
        assert_classified(
            PageType::Collection,
            &[
                "https://example.com/collections/shoes",
                "https://example.com/category/electronics",
            ],
        );
    }

    #[test]
    fn test_service_detection() {
        assert_classified(
            PageType::Service,
            &[
                "https://example.com/services/consulting",
                "https://example.com/web-development-services",
            ],
        );
    }

    #[test]
    fn test_listing_detection() {
        assert_classified(
            PageType::Listing,
            &[
                "https://example.com/news",
                "https://example.com/awards/2024",
            ],
        );
    }

    #[test]
    fn test_article_detection() {
        assert_classified(
            PageType::Article,
            &[
                "https://example.com/blog/my-post",
                "https://example.com/how-to-cook",
            ],
        );
    }

    #[test]
    fn test_default_is_article() {
        // Both an unmatched URL and the empty string fall back to Article.
        assert_classified(PageType::Article, &["https://example.com/some-page", ""]);
    }
}