// reasonkit-web 0.1.7

//! Source Tier Classification
//!
//! Implements the three-tier source quality system per CONS-006.
//!
//! # Tier System
//!
//! | Tier | Weight | Description | Examples |
//! |------|--------|-------------|----------|
//! | **Tier 1** | 1.0 | Primary/authoritative | Official docs, gov sites, peer-reviewed |
//! | **Tier 2** | 0.7 | Reputable secondary | Wikipedia, major news, established blogs |
//! | **Tier 3** | 0.4 | Other/unverified | Forums, social media, unknown sources |
//! | **Unknown** | 0.2 | Could not be classified | Unparseable URLs, URLs without a host |

use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use tracing::debug;

/// Source quality tier (1 = highest, 3 = lowest)
///
/// Variants are declared from best to worst, so the derived `Ord` yields
/// `Tier1 < Tier2 < Tier3 < Unknown`; sorting ascending therefore puts the
/// highest-quality sources first. `Unknown` is the `Default` value.
///
/// With `rename_all = "snake_case"` the variants (de)serialize as
/// lowercase names (e.g. `Tier1` as `"tier1"`, `Unknown` as `"unknown"`).
#[derive(
    Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize,
)]
#[serde(rename_all = "snake_case")]
pub enum SourceTier {
    /// Primary/authoritative sources (official docs, gov sites, peer-reviewed)
    Tier1,
    /// Reputable secondary sources (Wikipedia, major news, established blogs)
    Tier2,
    /// Other/unverified sources (forums, social media, unknown)
    Tier3,
    /// Unable to classify (also the default value)
    #[default]
    Unknown,
}

impl SourceTier {
    /// Get the confidence weight for this tier
    pub fn weight(&self) -> f64 {
        match self {
            SourceTier::Tier1 => 1.0,
            SourceTier::Tier2 => 0.7,
            SourceTier::Tier3 => 0.4,
            SourceTier::Unknown => 0.2,
        }
    }

    /// Check if this tier meets a minimum requirement
    pub fn meets_minimum(&self, minimum: SourceTier) -> bool {
        matches!(
            (self, minimum),
            (SourceTier::Tier1, _)
                | (
                    SourceTier::Tier2,
                    SourceTier::Tier2 | SourceTier::Tier3 | SourceTier::Unknown
                )
                | (SourceTier::Tier3, SourceTier::Tier3 | SourceTier::Unknown)
                | (SourceTier::Unknown, SourceTier::Unknown)
        )
    }
}

/// Quality indicators for a source
///
/// This is the result type of `TierClassifier::classify`: the assigned
/// tier plus the evidence (`reasons`, flags, DOI) that led to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceQuality {
    /// Assigned tier
    pub tier: SourceTier,
    /// Confidence in classification (0.0 - 1.0)
    pub confidence: f64,
    /// Domain of the source (the classifier stores the lowercased URL host
    /// here; empty when the URL could not be parsed or has no host)
    pub domain: String,
    /// Reasons for classification (human-readable, in the order they were
    /// recorded)
    pub reasons: Vec<String>,
    /// Has HTTPS (true when the URL scheme is `https`)
    pub has_https: bool,
    /// Domain age (if known, in days); the classifier in this file never
    /// fills this in and always leaves it `None`
    pub domain_age_days: Option<u32>,
    /// Is the source known to be authoritative
    pub is_authoritative: bool,
    /// Is the source known to be unreliable
    pub is_unreliable: bool,
    /// Contains a DOI (Digital Object Identifier) - strong academic indicator
    pub has_doi: bool,
    /// The DOI if found (e.g., "10.1000/xyz123")
    pub doi: Option<String>,
}

impl Default for SourceQuality {
    fn default() -> Self {
        Self {
            tier: SourceTier::Unknown,
            confidence: 0.0,
            domain: String::new(),
            reasons: Vec::new(),
            has_https: false,
            domain_age_days: None,
            is_authoritative: false,
            is_unreliable: false,
            has_doi: false,
            doi: None,
        }
    }
}

/// Classifies URLs into source tiers
///
/// Holds the lookup tables used by `classify`. `new` fills them with the
/// built-in defaults; the `add_*_domain` methods extend them at runtime.
/// Entries are plain host-name strings (custom entries are lowercased on
/// insertion).
pub struct TierClassifier {
    /// Known Tier 1 domains (authoritative)
    tier1_domains: HashSet<String>,
    /// Known Tier 2 domains (reputable)
    tier2_domains: HashSet<String>,
    /// Known unreliable domains (automatically Tier 3 or worse)
    unreliable_domains: HashSet<String>,
    /// Tier 1 TLDs (government, educational); stored with a leading dot
    /// (e.g. ".gov") and compared against the end of the host name
    tier1_tlds: HashSet<String>,
}

impl TierClassifier {
    /// Create a new classifier with default domain lists
    pub fn new() -> Self {
        let mut classifier = Self {
            tier1_domains: HashSet::new(),
            tier2_domains: HashSet::new(),
            unreliable_domains: HashSet::new(),
            tier1_tlds: HashSet::new(),
        };
        classifier.load_defaults();
        classifier
    }

    /// Fill the lookup tables with the built-in classifications.
    ///
    /// Inserts the default Tier 1 domains, Tier 2 domains, known-unreliable
    /// domains and Tier 1 TLD suffixes. Existing entries are kept; the
    /// defaults are added on top of them.
    fn load_defaults(&mut self) {
        // Tier 1: authoritative sources.
        const TIER1_DOMAINS: &[&str] = &[
            // Official documentation
            "docs.rs",
            "doc.rust-lang.org",
            "docs.python.org",
            "docs.microsoft.com",
            "learn.microsoft.com",
            "developer.mozilla.org",
            "developer.apple.com",
            "developers.google.com",
            "cloud.google.com",
            "aws.amazon.com",
            "docs.aws.amazon.com",
            "azure.microsoft.com",
            // Academic/research
            "arxiv.org",
            "scholar.google.com",
            "pubmed.ncbi.nlm.nih.gov",
            "nature.com",
            "science.org",
            "acm.org",
            "dl.acm.org",
            "ieee.org",
            "ieeexplore.ieee.org",
            // Standards bodies
            "w3.org",
            "ietf.org",
            "rfc-editor.org",
            "ecma-international.org",
            "iso.org",
            // Code hosting and package registries
            "github.com",
            "gitlab.com",
            "crates.io",
            "pypi.org",
            "npmjs.com",
            "packagist.org",
            "rubygems.org",
            // Government
            "whitehouse.gov",
            "usa.gov",
            "europa.eu",
            "gov.uk",
            // Major encyclopedias (primary sources only)
            "britannica.com",
        ];

        // Tier 2: reputable secondary sources.
        const TIER2_DOMAINS: &[&str] = &[
            // Wikipedia and wikis
            "wikipedia.org",
            "en.wikipedia.org",
            "wikidata.org",
            // Major news
            "bbc.com",
            "bbc.co.uk",
            "reuters.com",
            "apnews.com",
            "nytimes.com",
            "washingtonpost.com",
            "theguardian.com",
            "economist.com",
            // Tech news/blogs
            "techcrunch.com",
            "arstechnica.com",
            "wired.com",
            "theverge.com",
            "hackernews.com",
            "news.ycombinator.com",
            // Q&A platforms
            "stackoverflow.com",
            "stackexchange.com",
            "serverfault.com",
            "superuser.com",
            // Established tech blogs
            "martinfowler.com",
            "blog.rust-lang.org",
            "blog.python.org",
            "engineering.fb.com",
            "netflixtechblog.com",
            // Reference
            "investopedia.com",
            "healthline.com",
            "mayoclinic.org",
            "webmd.com",
        ];

        // Known unreliable sources.
        const UNRELIABLE_DOMAINS: &[&str] = &[
            // Content farms
            "ehow.com",
            "demand.media",
            // Known misinformation
            "infowars.com",
            "naturalnews.com",
            // Satire (often mistaken as real)
            "theonion.com",
            "babylonbee.com",
        ];

        // Tier 1 TLD suffixes (government, educational, military); each
        // keeps its leading dot so it can be matched against the host's end.
        const TIER1_TLDS: &[&str] = &[
            ".gov", ".gov.uk", ".gov.au", ".gov.ca", ".gov.nz", ".edu", ".edu.au", ".ac.uk",
            ".edu.cn", ".mil",
        ];

        self.tier1_domains
            .extend(TIER1_DOMAINS.iter().map(|domain| domain.to_string()));
        self.tier2_domains
            .extend(TIER2_DOMAINS.iter().map(|domain| domain.to_string()));
        self.unreliable_domains
            .extend(UNRELIABLE_DOMAINS.iter().map(|domain| domain.to_string()));
        self.tier1_tlds
            .extend(TIER1_TLDS.iter().map(|tld| tld.to_string()));
    }

    /// Extract DOI (Digital Object Identifier) from URL
    ///
    /// DOIs follow the pattern: 10.XXXX/... where XXXX is a registrant code (4-9 digits).
    /// Common locations:
    /// - doi.org/10.1000/xyz123
    /// - dx.doi.org/10.1000/xyz123
    /// - URL path containing /10.XXXX/
    /// - Query params like ?doi=10.1000/xyz123
    ///
    /// Per DOI Handbook: suffix can contain any printable character from Unicode.
    /// Common patterns include alphanumeric, dots, dashes, underscores, parentheses.
    fn extract_doi(url: &str) -> (bool, Option<String>) {
        // DOI pattern per DOI Handbook (doi.org/doi_handbook/2_Numbering.html):
        // - Prefix: 10.XXXX where XXXX is 4-9 digit registrant code
        // - Suffix: alphanumeric + .-_()/:;%#@ and other printable chars
        // We're conservative here to avoid false positives
        let doi_pattern = regex::Regex::new(r#"10\.\d{4,9}/[a-zA-Z0-9.\-_()/:;%#@]+"#).ok();

        if let Some(re) = doi_pattern {
            if let Some(captures) = re.find(url) {
                let doi = captures.as_str();
                // Clean up trailing punctuation that might have been captured
                let doi = doi.trim_end_matches(|c: char| {
                    c == '.' || c == ',' || c == ';' || c == ':' || c == ')' || c == '/'
                });
                // Validate minimum suffix length (must have something after the /)
                if let Some(slash_pos) = doi.find('/') {
                    if doi.len() > slash_pos + 1 {
                        return (true, Some(doi.to_string()));
                    }
                }
            }
        }

        // Check for DOI resolver domains as fallback
        for resolver in &["doi.org/", "dx.doi.org/", "hdl.handle.net/"] {
            if let Some(pos) = url.find(resolver) {
                let after_domain = &url[pos + resolver.len()..];
                if after_domain.starts_with("10.") {
                    let doi = after_domain
                        .split(|c: char| {
                            c.is_whitespace() || c == '"' || c == '\'' || c == '>' || c == '<'
                        })
                        .next()
                        .unwrap_or(after_domain)
                        .trim_end_matches(['.', ',', ';', ')']);
                    // Validate it has a suffix
                    if doi.contains('/') && doi.len() > 8 {
                        return (true, Some(doi.to_string()));
                    }
                }
            }
        }

        (false, None)
    }

    /// Check a DOI's format and whether its registrant is a known publisher.
    ///
    /// Returns `(is_valid, is_likely_quality)`:
    /// - `is_valid`: the DOI has the form `10.<registrant>/<suffix>` where
    ///   the registrant is 4-9 ASCII digits and the suffix is non-empty.
    /// - `is_likely_quality`: the registrant is on the short allow-list of
    ///   well-known publishers below. Always `false` when `is_valid` is
    ///   `false`.
    fn validate_doi(doi: &str) -> (bool, bool) {
        const INVALID: (bool, bool) = (false, false);

        // Known quality registrants (major publishers, standards bodies).
        // This is NOT exhaustive - just common high-quality ones.
        const QUALITY_REGISTRANTS: &[&str] = &[
            "1000", // DOI Foundation examples
            "1001", // American Medical Association (JAMA Network)
            "1002", // Wiley
            "1006", // Academic Press (Elsevier)
            "1007", // Springer
            "1016", // Elsevier
            "1017", // Cambridge University Press
            "1021", // ACS Publications
            "1038", // Nature Publishing Group
            "1073", // PNAS
            "1093", // Oxford Academic
            "1126", // Science/AAAS
            "1145", // ACM
            "1109", // IEEE
            "1257", // American Economic Association
            "1371", // PLOS
            "3389", // Frontiers
            "1186", // BioMed Central
        ];

        // Split at the first '/' into "10.<registrant>" and the suffix, then
        // peel the mandatory "10." directory indicator off the prefix.
        let pieces = doi.split_once('/').and_then(|(prefix, suffix)| {
            prefix
                .strip_prefix("10.")
                .map(|registrant| (registrant, suffix))
        });
        let (registrant, suffix) = match pieces {
            Some(found) => found,
            None => return INVALID,
        };

        let registrant_ok = (4..=9).contains(&registrant.len())
            && registrant.bytes().all(|b| b.is_ascii_digit());
        if !registrant_ok || suffix.is_empty() {
            return INVALID;
        }

        (true, QUALITY_REGISTRANTS.contains(&registrant))
    }

    /// Classify a URL's source tier
    pub fn classify(&self, url: &str) -> SourceQuality {
        let parsed = match url::Url::parse(url) {
            Ok(u) => u,
            Err(_) => {
                return SourceQuality {
                    tier: SourceTier::Unknown,
                    confidence: 0.0,
                    domain: String::new(),
                    reasons: vec!["Invalid URL".to_string()],
                    ..Default::default()
                };
            }
        };

        let host = match parsed.host_str() {
            Some(h) => h.to_lowercase(),
            None => {
                return SourceQuality {
                    tier: SourceTier::Unknown,
                    confidence: 0.0,
                    domain: String::new(),
                    reasons: vec!["No host in URL".to_string()],
                    ..Default::default()
                };
            }
        };

        let has_https = parsed.scheme() == "https";
        let mut reasons = Vec::new();
        let mut tier = SourceTier::Unknown;
        let mut confidence = 0.5;
        let mut is_authoritative = false;
        let mut is_unreliable = false;

        // Check for DOI (Digital Object Identifier) - strong academic indicator
        let (has_doi, doi) = Self::extract_doi(url);

        // Extract domain parts for matching
        let domain_parts: Vec<&str> = host.split('.').collect();
        let base_domain = if domain_parts.len() >= 2 {
            format!(
                "{}.{}",
                domain_parts[domain_parts.len() - 2],
                domain_parts[domain_parts.len() - 1]
            )
        } else {
            host.clone()
        };

        // DOI presence indicates formal publication, but NOT necessarily quality
        // Predatory journals have DOIs too - validate the registrant
        if has_doi {
            if let Some(ref doi_str) = doi {
                let (is_valid, is_quality_registrant) = Self::validate_doi(doi_str);
                if is_valid {
                    if is_quality_registrant {
                        // Known quality publisher - Tier 1
                        tier = SourceTier::Tier1;
                        confidence = 0.9;
                        is_authoritative = true;
                        reasons.push(format!("DOI from quality publisher: {}", doi_str));
                    } else {
                        // Unknown registrant - Tier 2 (could be predatory journal)
                        tier = SourceTier::Tier2;
                        confidence = 0.7;
                        reasons.push(format!(
                            "DOI from unknown registrant: {} (verify journal reputation)",
                            doi_str
                        ));
                    }
                } else {
                    // Invalid DOI format - don't boost
                    reasons.push(format!("Invalid DOI format detected: {}", doi_str));
                }
            }
        }

        // Check for Tier 1 TLDs (high priority, but DOI takes precedence)
        if tier == SourceTier::Unknown {
            for tld in &self.tier1_tlds {
                if host.ends_with(tld) {
                    tier = SourceTier::Tier1;
                    confidence = 0.95;
                    is_authoritative = true;
                    reasons.push(format!("Government/educational TLD: {}", tld));
                    break;
                }
            }
        }

        // Check known domain lists
        if tier == SourceTier::Unknown {
            // Check unreliable first
            if self.unreliable_domains.contains(&host)
                || self.unreliable_domains.contains(&base_domain)
            {
                tier = SourceTier::Tier3;
                confidence = 0.9;
                is_unreliable = true;
                reasons.push("Known unreliable source".to_string());
            } else if self.tier1_domains.contains(&host)
                || self.tier1_domains.contains(&base_domain)
            {
                tier = SourceTier::Tier1;
                confidence = 0.9;
                is_authoritative = true;
                reasons.push("Known authoritative domain".to_string());
            } else if self.tier2_domains.contains(&host)
                || self.tier2_domains.contains(&base_domain)
            {
                tier = SourceTier::Tier2;
                confidence = 0.85;
                reasons.push("Known reputable domain".to_string());
            }
        }

        // Apply heuristics for unknown domains
        if tier == SourceTier::Unknown {
            // Check for common patterns
            if host.contains("blog.") || host.contains(".blog") {
                tier = SourceTier::Tier2;
                confidence = 0.6;
                reasons.push("Blog subdomain/domain".to_string());
            } else if host.contains("forum.")
                || host.contains(".forum")
                || host.contains("community.")
            {
                tier = SourceTier::Tier3;
                confidence = 0.6;
                reasons.push("Forum/community site".to_string());
            } else if host.contains("social.") || host.contains(".social") {
                tier = SourceTier::Tier3;
                confidence = 0.6;
                reasons.push("Social media".to_string());
            } else {
                // Default to Tier 3 for unknown
                tier = SourceTier::Tier3;
                confidence = 0.4;
                reasons.push("Unknown domain - defaulting to Tier 3".to_string());
            }
        }

        // Adjust confidence based on HTTPS
        if !has_https {
            confidence *= 0.8;
            reasons.push("No HTTPS (reduced confidence)".to_string());
        }

        debug!(
            url = %url,
            tier = ?tier,
            confidence = %confidence,
            "Classified source"
        );

        SourceQuality {
            tier,
            confidence,
            domain: host,
            reasons,
            has_https,
            domain_age_days: None,
            is_authoritative,
            is_unreliable,
            has_doi,
            doi,
        }
    }

    /// Classify every URL and return the results ranked best-first.
    ///
    /// Each entry pairs the original URL string with its `SourceQuality`.
    /// Results are ordered by tier (Tier 1 first, `Unknown` last) and,
    /// within a tier, by descending confidence. The sort is stable, so
    /// entries that tie keep their input order.
    pub fn classify_multiple(&self, urls: &[String]) -> Vec<(String, SourceQuality)> {
        let mut ranked = Vec::with_capacity(urls.len());
        for url in urls {
            ranked.push((url.clone(), self.classify(url)));
        }

        ranked.sort_by(|(_, left), (_, right)| {
            left.tier.cmp(&right.tier).then_with(|| {
                // Operands are swapped so higher confidence sorts first;
                // incomparable values are treated as equal.
                right
                    .confidence
                    .partial_cmp(&left.confidence)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        });

        ranked
    }

    /// Decide whether enough sources reach a minimum tier.
    ///
    /// Counts the entries of `sources` whose tier satisfies `min_tier` (via
    /// `SourceTier::meets_minimum`) and compares that count with
    /// `min_sources`.
    ///
    /// Returns `(satisfied, message)`, where `message` is a human-readable
    /// summary of the outcome.
    pub fn meets_triangulation_requirement(
        &self,
        sources: &[SourceQuality],
        min_sources: usize,
        min_tier: SourceTier,
    ) -> (bool, String) {
        // Label used in both messages.
        let tier_label = match min_tier {
            SourceTier::Tier1 => "1",
            SourceTier::Tier2 => "2",
            SourceTier::Tier3 => "3",
            SourceTier::Unknown => "Unknown",
        };

        let qualifying = sources
            .iter()
            .filter(|source| source.tier.meets_minimum(min_tier))
            .count();

        if qualifying < min_sources {
            return (
                false,
                format!(
                    "Triangulation NOT satisfied: only {} of required {} sources meet Tier {} or better",
                    qualifying, min_sources, tier_label
                ),
            );
        }

        (
            true,
            format!(
                "Triangulation satisfied: {} of {} sources meet Tier {} or better",
                qualifying,
                sources.len(),
                tier_label
            ),
        )
    }

    /// Register an extra Tier 1 (authoritative) domain.
    ///
    /// The domain is stored lowercased.
    pub fn add_tier1_domain(&mut self, domain: &str) {
        let normalized = domain.to_lowercase();
        self.tier1_domains.insert(normalized);
    }

    /// Register an extra Tier 2 (reputable) domain.
    ///
    /// The domain is stored lowercased.
    pub fn add_tier2_domain(&mut self, domain: &str) {
        let normalized = domain.to_lowercase();
        self.tier2_domains.insert(normalized);
    }

    /// Register an extra known-unreliable domain.
    ///
    /// The domain is stored lowercased.
    pub fn add_unreliable_domain(&mut self, domain: &str) {
        let normalized = domain.to_lowercase();
        self.unreliable_domains.insert(normalized);
    }
}

impl Default for TierClassifier {
    fn default() -> Self {
        Self::new()
    }
}

// Unit tests for tier weights, minimum-tier checks, URL classification
// (TLDs, known domains, unreliable and unknown hosts), batch ranking,
// triangulation, and DOI extraction/validation.
#[cfg(test)]
mod tests {
    use super::*;

    // Each tier maps to its fixed confidence weight.
    #[test]
    fn test_tier_weight() {
        assert_eq!(SourceTier::Tier1.weight(), 1.0);
        assert_eq!(SourceTier::Tier2.weight(), 0.7);
        assert_eq!(SourceTier::Tier3.weight(), 0.4);
        assert_eq!(SourceTier::Unknown.weight(), 0.2);
    }

    // A tier satisfies itself and every lower tier, never a higher one.
    #[test]
    fn test_tier_meets_minimum() {
        assert!(SourceTier::Tier1.meets_minimum(SourceTier::Tier1));
        assert!(SourceTier::Tier1.meets_minimum(SourceTier::Tier2));
        assert!(SourceTier::Tier1.meets_minimum(SourceTier::Tier3));

        assert!(!SourceTier::Tier2.meets_minimum(SourceTier::Tier1));
        assert!(SourceTier::Tier2.meets_minimum(SourceTier::Tier2));
        assert!(SourceTier::Tier2.meets_minimum(SourceTier::Tier3));

        assert!(!SourceTier::Tier3.meets_minimum(SourceTier::Tier1));
        assert!(!SourceTier::Tier3.meets_minimum(SourceTier::Tier2));
        assert!(SourceTier::Tier3.meets_minimum(SourceTier::Tier3));
    }

    // Hosts ending in a government/educational TLD are Tier 1.
    #[test]
    fn test_classify_tier1_tld() {
        let classifier = TierClassifier::new();

        let gov = classifier.classify("https://www.whitehouse.gov/briefing-room");
        assert_eq!(gov.tier, SourceTier::Tier1);
        assert!(gov.is_authoritative);

        let edu = classifier.classify("https://cs.stanford.edu/research");
        assert_eq!(edu.tier, SourceTier::Tier1);
    }

    // Hosts on the built-in authoritative list are Tier 1.
    #[test]
    fn test_classify_tier1_domain() {
        let classifier = TierClassifier::new();

        let github = classifier.classify("https://github.com/rust-lang/rust");
        assert_eq!(github.tier, SourceTier::Tier1);

        let docs = classifier.classify("https://docs.rs/tokio/latest/tokio/");
        assert_eq!(docs.tier, SourceTier::Tier1);

        let mdn = classifier.classify("https://developer.mozilla.org/en-US/docs/Web");
        assert_eq!(mdn.tier, SourceTier::Tier1);
    }

    // Hosts on the built-in reputable list are Tier 2.
    #[test]
    fn test_classify_tier2_domain() {
        let classifier = TierClassifier::new();

        let wiki = classifier.classify("https://en.wikipedia.org/wiki/Rust");
        assert_eq!(wiki.tier, SourceTier::Tier2);

        let so = classifier.classify("https://stackoverflow.com/questions/123");
        assert_eq!(so.tier, SourceTier::Tier2);
    }

    // Hosts on the unreliable list are Tier 3 and flagged as unreliable.
    #[test]
    fn test_classify_unreliable() {
        let classifier = TierClassifier::new();

        let unreliable = classifier.classify("https://infowars.com/article");
        assert_eq!(unreliable.tier, SourceTier::Tier3);
        assert!(unreliable.is_unreliable);
    }

    // A host matching no list or heuristic falls back to Tier 3 with an
    // "Unknown" reason.
    #[test]
    fn test_classify_unknown_domain() {
        let classifier = TierClassifier::new();

        let unknown = classifier.classify("https://randomsite12345.xyz/page");
        assert_eq!(unknown.tier, SourceTier::Tier3);
        assert!(unknown.reasons.iter().any(|r| r.contains("Unknown")));
    }

    // Triangulation counts sources at or above the requested tier.
    #[test]
    fn test_triangulation_requirement() {
        let classifier = TierClassifier::new();

        let sources = vec![
            SourceQuality {
                tier: SourceTier::Tier1,
                confidence: 0.9,
                ..Default::default()
            },
            SourceQuality {
                tier: SourceTier::Tier2,
                confidence: 0.8,
                ..Default::default()
            },
            SourceQuality {
                tier: SourceTier::Tier2,
                confidence: 0.7,
                ..Default::default()
            },
        ];

        // Should pass with 3 sources at Tier 2 or better
        let (met, _) = classifier.meets_triangulation_requirement(&sources, 3, SourceTier::Tier2);
        assert!(met);

        // Should fail with 3 sources at Tier 1
        let (met, _) = classifier.meets_triangulation_requirement(&sources, 3, SourceTier::Tier1);
        assert!(!met);
    }

    // Batch classification returns results ordered best tier first.
    #[test]
    fn test_classify_multiple() {
        let classifier = TierClassifier::new();

        let urls = vec![
            "https://randomsite.com/page".to_string(),
            "https://docs.rs/tokio".to_string(),
            "https://en.wikipedia.org/wiki/Rust".to_string(),
        ];

        let results = classifier.classify_multiple(&urls);

        // Should be sorted: Tier1 first, then Tier2, then Tier3
        assert_eq!(results[0].1.tier, SourceTier::Tier1); // docs.rs
        assert_eq!(results[1].1.tier, SourceTier::Tier2); // wikipedia
        assert_eq!(results[2].1.tier, SourceTier::Tier3); // randomsite
    }

    // DOIs are detected on resolver hosts and when embedded in a URL path;
    // URLs without a DOI report none.
    #[test]
    fn test_doi_detection() {
        let classifier = TierClassifier::new();

        // Standard DOI resolver URL from quality publisher (Nature)
        let doi_url = classifier.classify("https://doi.org/10.1038/nature12373");
        assert!(doi_url.has_doi, "Should detect DOI in doi.org URL");
        assert_eq!(
            doi_url.tier,
            SourceTier::Tier1,
            "Nature DOI should be Tier 1"
        );
        assert!(doi_url.is_authoritative);
        assert_eq!(doi_url.doi, Some("10.1038/nature12373".to_string()));

        // dx.doi.org resolver with ACM DOI
        let dx_doi = classifier.classify("https://dx.doi.org/10.1145/1234567.1234568");
        assert!(dx_doi.has_doi, "Should detect DOI in dx.doi.org URL");
        assert_eq!(dx_doi.tier, SourceTier::Tier1, "ACM DOI should be Tier 1");

        // DOI in URL path with unknown registrant (not in quality list)
        let embedded_doi = classifier.classify("https://somejournal.com/article/10.9999/xyz123");
        assert!(
            embedded_doi.has_doi,
            "Should detect DOI embedded in URL path"
        );
        assert_eq!(
            embedded_doi.tier,
            SourceTier::Tier2,
            "Unknown registrant DOI should be Tier 2"
        );

        // URL without DOI
        let no_doi = classifier.classify("https://example.com/article/123456");
        assert!(!no_doi.has_doi, "Should not detect DOI in regular URL");
        assert!(no_doi.doi.is_none());
    }

    // DOIs whose registrant is on the quality list yield Tier 1.
    #[test]
    fn test_doi_quality_registrant() {
        let classifier = TierClassifier::new();

        // Quality registrants should be Tier 1
        let nature = classifier.classify("https://doi.org/10.1038/s41586-021-03819-2");
        assert_eq!(
            nature.tier,
            SourceTier::Tier1,
            "Nature (10.1038) should be Tier 1"
        );
        assert!(nature.is_authoritative);

        let ieee = classifier.classify("https://ieeexplore.ieee.org/10.1109/5.771073");
        assert_eq!(
            ieee.tier,
            SourceTier::Tier1,
            "IEEE (10.1109) should be Tier 1"
        );

        let springer = classifier.classify("https://link.springer.com/10.1007/s00000-000-0000-0");
        assert_eq!(
            springer.tier,
            SourceTier::Tier1,
            "Springer (10.1007) should be Tier 1"
        );
    }

    // A well-formed DOI from an unlisted registrant lifts an otherwise
    // Tier 3 host to Tier 2, but never marks it authoritative.
    #[test]
    fn test_doi_unknown_registrant() {
        let classifier = TierClassifier::new();

        // An unknown domain without DOI would be Tier 3
        let without_doi = classifier.classify("https://obscurejournal.xyz/article/123");
        assert_eq!(without_doi.tier, SourceTier::Tier3);

        // Unknown registrant DOI should be Tier 2 (not Tier 1)
        // This protects against predatory journals which have DOIs
        let with_unknown_doi =
            classifier.classify("https://obscurejournal.xyz/article/10.9876/test");
        assert_eq!(
            with_unknown_doi.tier,
            SourceTier::Tier2,
            "Unknown registrant should be Tier 2"
        );
        assert!(with_unknown_doi.has_doi);
        assert!(
            !with_unknown_doi.is_authoritative,
            "Unknown registrant should not be authoritative"
        );
    }

    // Format validation: "10." prefix, 4-9 digit registrant, non-empty suffix.
    #[test]
    fn test_validate_doi() {
        // Valid DOIs
        let (valid, quality) = TierClassifier::validate_doi("10.1038/nature12373");
        assert!(valid);
        assert!(quality, "Nature is a quality registrant");

        let (valid, quality) = TierClassifier::validate_doi("10.1145/1234567.1234568");
        assert!(valid);
        assert!(quality, "ACM is a quality registrant");

        let (valid, quality) = TierClassifier::validate_doi("10.9999/xyz123");
        assert!(valid);
        assert!(!quality, "Unknown registrant");

        // Invalid DOIs
        let (valid, _) = TierClassifier::validate_doi("11.1234/test"); // Wrong prefix
        assert!(!valid);

        let (valid, _) = TierClassifier::validate_doi("10.123/test"); // Registrant too short
        assert!(!valid);

        let (valid, _) = TierClassifier::validate_doi("10.1234"); // No suffix
        assert!(!valid);
    }
}