web-page-classifier 0.1.0

Fast web page type classification using XGBoost with compact binary model
Documentation
//! URL-based page type classification using pattern matching.
//!
//! Fast heuristic classifier that examines URL domain and path patterns.
//! This is stage 1 of classification — for ambiguous URLs (returning Article),
//! ML classification can provide a more accurate result.

use crate::PageType;

/// Classify a URL into a page type using heuristic pattern matching.
///
/// Returns `PageType::Article` when no patterns match (most common type).
///
/// # Example
///
/// ```
/// use web_page_classifier::{classify_url, PageType};
///
/// assert_eq!(classify_url("https://forum.example.com/thread/123"), PageType::Forum);
/// assert_eq!(classify_url("https://docs.example.com/api"), PageType::Documentation);
/// assert_eq!(classify_url("https://example.com/blog/my-post"), PageType::Article);
/// ```
#[must_use]
pub fn classify_url(url: &str) -> PageType {
    if url.is_empty() {
        return PageType::Article;
    }

    let url_lower = url.to_ascii_lowercase();
    let (domain, path) = extract_domain_path(&url_lower);

    // 1. Forum — distinctive domains and path patterns
    if contains_any(domain, FORUM_DOMAINS)
        || contains_any(path, FORUM_PATHS)
        || contains_any(&url_lower, FORUM_URL_PATTERNS)
    {
        return PageType::Forum;
    }

    // 2. Documentation — before article (e.g. /docs/guide/ is docs, not article)
    if contains_any(domain, DOCS_DOMAINS) || contains_any(path, DOCS_PATHS) {
        return PageType::Documentation;
    }

    // 3. Product — before category (/products/slug is a product, not a listing)
    if contains_any(path, PRODUCT_PATHS) || contains_any(domain, PRODUCT_DOMAINS) {
        return PageType::Product;
    }

    // 4. Category / collection
    if contains_any(path, CATEGORY_PATHS) {
        return PageType::Collection;
    }

    // 5. Service page
    if contains_any(path, SERVICE_PATHS) || contains_any(&url_lower, SERVICE_SLUG_PATTERNS) {
        return PageType::Service;
    }

    // 6. Listing / content index — path ends with pattern (no further segments)
    {
        let path_trimmed = path.trim_end_matches('/');
        if LISTING_PATH_ENDINGS.iter().any(|p| path_trimmed.ends_with(p))
            || contains_any(path, LISTING_PATH_CONTAINS)
        {
            return PageType::Listing;
        }
    }

    // 7. Article / blog
    if contains_any(path, ARTICLE_PATHS) || contains_any(&url_lower, BLOG_SLUG_PATTERNS) {
        return PageType::Article;
    }

    // Default — article is the most common type
    PageType::Article
}

// ===== Helper functions =====

/// Split a URL into its authority ("domain") and path components.
///
/// A leading `https://`, `http://` or `//` is removed first. The domain is
/// everything up to the first `/`, `?` or `#`; the path is what follows, with
/// any query string (`?…`) and fragment (`#…`) removed so that they cannot
/// trigger path patterns. When the URL has no path, `"/"` is returned so
/// callers always receive a rooted path.
///
/// Userinfo and port, if present, stay in the domain part. Scheme prefixes are
/// compared case-sensitively, so callers should pass a lowercased URL.
fn extract_domain_path(url: &str) -> (&str, &str) {
    let without_scheme = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .or_else(|| url.strip_prefix("//"))
        .unwrap_or(url);

    // The authority ends at the first '/', '?' or '#', whichever comes first.
    // Looking only for '/' would let "host?next=/forum/x" leak the query into
    // both the domain and the path.
    let authority_end = without_scheme
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(without_scheme.len());
    let (domain, rest) = without_scheme.split_at(authority_end);

    // Keep only the path proper: stop at the query string or fragment.
    let path_end = rest
        .find(|c: char| matches!(c, '?' | '#'))
        .unwrap_or(rest.len());
    let path = &rest[..path_end];

    if path.is_empty() {
        (domain, "/")
    } else {
        (domain, path)
    }
}

/// Report whether `haystack` contains at least one of `needles` as a substring.
///
/// Returns `false` for an empty `needles` slice. Matching is case-sensitive.
fn contains_any(haystack: &str, needles: &[&str]) -> bool {
    for &needle in needles {
        if haystack.contains(needle) {
            return true;
        }
    }
    false
}

// ===== Pattern constants =====

/// Substrings that mark a URL as a forum / Q&A / discussion page when found in
/// its domain part.
///
/// `classify_url` matches these by plain substring against the domain returned
/// by `extract_domain_path`. That domain never contains `/`, so every entry
/// here must be slash-free; path-based rules belong in `FORUM_PATHS`.
///
/// NOTE(review): because matching is by substring, short entries also hit
/// inside longer labels (e.g. `board.` is contained in `dashboard.`).
const FORUM_DOMAINS: &[&str] = &[
    "forum.", "forums.", "board.", "boards.", "community.",
    "discuss.", "discourse.",
    "stackexchange.com", "stackoverflow.com",
    "reddit.com", "news.ycombinator.com",
    "lobste.rs", "quora.com", "slashdot.org",
    "disqus.com", "mumsnet.com",
    "thestudentroom.co.uk",
];

/// Path substrings that make `classify_url` return `PageType::Forum`.
///
/// Matched by substring against the path part from `extract_domain_path`.
/// Every entry ends in `/`, so a segment only matches when something (or at
/// least a trailing slash) follows it: `/thread/123` matches, a bare `/thread`
/// does not.
const FORUM_PATHS: &[&str] = &[
    "/forum/", "/forums/", "/thread/", "/threads/",
    "/topic/", "/topics/", "/discussion/", "/discussions/",
    "/community/", "/t/", "/questions/", "/question/",
    "/comments/", "/talk/",
];

/// Forum markers matched by substring against the whole lowercased URL rather
/// than only the path (one entry includes the start of a query string).
const FORUM_URL_PATTERNS: &[&str] = &[
    "/viewtopic.php", "/showthread.php", "/item?id=",
];

/// Domain substrings that make `classify_url` return
/// `PageType::Documentation`.
///
/// Matched by substring against the domain part, so entries ending in `.`
/// also hit inside longer labels that end the same way.
const DOCS_DOMAINS: &[&str] = &[
    "docs.", "doc.", "wiki.", "devdocs.",
    "man7.org", "readthedocs.io", "readthedocs.org",
    "developer.hashicorp.com", "developer.mozilla.org",
];

/// Path substrings that make `classify_url` return
/// `PageType::Documentation`.
///
/// Most entries are slash-delimited segments. `/quickstart` and
/// `/getting-started` have no trailing slash, so they also match as the final
/// segment or as the prefix of a longer one.
const DOCS_PATHS: &[&str] = &[
    "/docs/", "/doc/", "/documentation/", "/reference/",
    "/api/", "/guide/", "/tutorial/", "/tutorials/",
    "/manual/", "/handbook/", "/wiki/", "/man-pages/",
    "/man/", "/concepts/", "/userguide/", "/quickstart",
    "/getting-started", "/book/", "/glossary/", "/tech_notes/",
];

/// Path substrings that make `classify_url` return `PageType::Product`.
///
/// Checked before `CATEGORY_PATHS`, so a URL matching both is a product.
/// NOTE(review): `/dp/` and `/ip/` are presumably marketplace product-URL
/// segments (the tests use an Amazon-style `/dp/` URL) — confirm.
const PRODUCT_PATHS: &[&str] = &[
    "/products/", "/product/", "/shop/", "/dp/", "/ip/",
];

/// Domain substrings that make `classify_url` return `PageType::Product`.
const PRODUCT_DOMAINS: &[&str] = &["shop.", "store."];

/// Path substrings that make `classify_url` return `PageType::Collection`
/// (category / collection pages). Only consulted when no forum, documentation
/// or product rule matched first.
const CATEGORY_PATHS: &[&str] = &[
    "/collections/", "/collection/", "/categories/", "/category/",
    "/browse/", "/cat/", "/subcategory/",
];

/// Path substrings that make `classify_url` return `PageType::Service`.
///
/// `/what-we-do` and `/services.html` carry no trailing slash, so they match
/// as a final segment as well as a prefix.
const SERVICE_PATHS: &[&str] = &[
    "/services/", "/service/", "/services.html",
    "/solutions/", "/solution/", "/offerings/", "/what-we-do",
];

/// Slug fragments that make `classify_url` return `PageType::Service`.
///
/// Unlike `SERVICE_PATHS`, these are matched by substring against the whole
/// lowercased URL, not just the path. Each starts with `-`, so it only matches
/// when preceded by another word in the slug (e.g. `web-development-services`).
const SERVICE_SLUG_PATTERNS: &[&str] = &[
    "-consulting-services", "-development-services", "-management-services",
    "-support-services", "-outsourcing-services", "-integration-services",
    "-development-company", "-consulting-company",
    "-ai-consulting", "-ai-development", "-ai-solutions",
];

/// Suffixes that make `classify_url` return `PageType::Listing` when the path
/// *ends* with one of them.
///
/// The comparison uses `ends_with` on the path after trailing `/` characters
/// are trimmed, so `/news` and `/news/` match but `/news/some-story` does not.
const LISTING_PATH_ENDINGS: &[&str] = &[
    "/news", "/testimonials", "/coupons", "/issues",
    "/reviews", "/rankings", "-courses",
];

/// Path substrings that make `classify_url` return `PageType::Listing`
/// wherever they occur in the path.
const LISTING_PATH_CONTAINS: &[&str] = &[
    "/awards/", "/trending/", "/list/",
];

/// Path substrings that explicitly signal an article / blog page.
///
/// A match makes `classify_url` return `PageType::Article`, which is also its
/// fallback result, so this list currently documents intent rather than
/// changing the outcome. `/blog` (no trailing slash) already covers `/blog/`.
const ARTICLE_PATHS: &[&str] = &[
    "/blog/", "/blog", "/news/", "/article/", "/articles/",
    "/post/", "/posts/", "/insight/", "/insights/",
    "/resource/", "/resources/", "/stories/", "/magazine/",
    "/journal/", "/press/", "/editorial/", "/opinion/",
    "/review/", "/column/",
];

/// Slug fragments typical of blog-post titles ("how-to-…", "…-vs-…", listicles).
///
/// Matched by substring against the whole lowercased URL. As with
/// `ARTICLE_PATHS`, a match yields `PageType::Article`, the same value as the
/// fallback.
const BLOG_SLUG_PATTERNS: &[&str] = &[
    "-ways-to-", "-tips-", "-reasons-", "-steps-to-",
    "-things-to-", "-best-", "-top-", "-essential-",
    "beginners-guide", "complete-guide", "ultimate-guide",
    "how-to-", "what-is-", "why-", "when-to-",
    "-vs-", "-versus-", "-comparison", "-checklist",
    "-trends-", "-strategies-", "-challenges-",
    "-benefits-", "-advantages-",
];

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `url` classifies as `expected`. `#[track_caller]` makes a
    /// failure point at the calling line instead of this helper.
    #[track_caller]
    fn expect_type(url: &str, expected: PageType) {
        assert_eq!(classify_url(url), expected);
    }

    // Forum: by domain, by well-known site + path, and by script name.
    #[test]
    fn test_forum_detection() {
        expect_type("https://forum.example.com/thread/1", PageType::Forum);
        expect_type("https://stackoverflow.com/questions/123", PageType::Forum);
        expect_type("https://example.com/viewtopic.php?t=1", PageType::Forum);
    }

    // Documentation: by subdomain and by path segment.
    #[test]
    fn test_docs_detection() {
        expect_type("https://docs.example.com/api", PageType::Documentation);
        expect_type("https://example.com/docs/getting-started", PageType::Documentation);
    }

    // Product: by path segment and by shop subdomain.
    #[test]
    fn test_product_detection() {
        expect_type("https://example.com/products/widget", PageType::Product);
        expect_type("https://amazon.com/dp/B01234", PageType::Product);
        expect_type("https://shop.example.com/item", PageType::Product);
    }

    // Collection: category-style path segments.
    #[test]
    fn test_collection_detection() {
        expect_type("https://example.com/collections/shoes", PageType::Collection);
        expect_type("https://example.com/category/electronics", PageType::Collection);
    }

    // Service: by path segment and by slug suffix.
    #[test]
    fn test_service_detection() {
        expect_type("https://example.com/services/consulting", PageType::Service);
        expect_type("https://example.com/web-development-services", PageType::Service);
    }

    // Listing: by path ending and by contained segment.
    #[test]
    fn test_listing_detection() {
        expect_type("https://example.com/news", PageType::Listing);
        expect_type("https://example.com/awards/2024", PageType::Listing);
    }

    // Article: by path segment and by blog-style slug.
    #[test]
    fn test_article_detection() {
        expect_type("https://example.com/blog/my-post", PageType::Article);
        expect_type("https://example.com/how-to-cook", PageType::Article);
    }

    // Anything unmatched, including the empty string, falls back to Article.
    #[test]
    fn test_default_is_article() {
        expect_type("https://example.com/some-page", PageType::Article);
        expect_type("", PageType::Article);
    }
}