// reasonkit_web/research/sources.rs
//! Source Tier Classification
//!
//! Implements the three-tier source quality system per CONS-006.
//!
//! # Tier System
//!
//! | Tier | Weight | Description | Examples |
//! |------|--------|-------------|----------|
//! | **Tier 1** | 1.0 | Primary/authoritative | Official docs, gov sites, peer-reviewed |
//! | **Tier 2** | 0.7 | Reputable secondary | Wikipedia, major news, established blogs |
//! | **Tier 3** | 0.4 | Other/unverified | Forums, social media, unknown sources |

use std::collections::HashSet;
use std::sync::OnceLock;

use serde::{Deserialize, Serialize};
use tracing::debug;
16
/// Source quality tier (1 = highest, 3 = lowest)
///
/// NOTE: variant declaration order matters. The derived `Ord` ranks
/// `Tier1` as the smallest (best) value, and `classify_multiple` relies
/// on that ordering to sort Tier 1 results first. Do not reorder
/// variants without auditing those call sites.
#[derive(
    Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize,
)]
#[serde(rename_all = "snake_case")]
pub enum SourceTier {
    /// Primary/authoritative sources (official docs, gov sites, peer-reviewed)
    Tier1,
    /// Reputable secondary sources (Wikipedia, major news, established blogs)
    Tier2,
    /// Other/unverified sources (forums, social media, unknown)
    Tier3,
    /// Unable to classify (serde/`Default` fallback)
    #[default]
    Unknown,
}
33
34impl SourceTier {
35    /// Get the confidence weight for this tier
36    pub fn weight(&self) -> f64 {
37        match self {
38            SourceTier::Tier1 => 1.0,
39            SourceTier::Tier2 => 0.7,
40            SourceTier::Tier3 => 0.4,
41            SourceTier::Unknown => 0.2,
42        }
43    }
44
45    /// Check if this tier meets a minimum requirement
46    pub fn meets_minimum(&self, minimum: SourceTier) -> bool {
47        matches!(
48            (self, minimum),
49            (SourceTier::Tier1, _)
50                | (
51                    SourceTier::Tier2,
52                    SourceTier::Tier2 | SourceTier::Tier3 | SourceTier::Unknown
53                )
54                | (SourceTier::Tier3, SourceTier::Tier3 | SourceTier::Unknown)
55                | (SourceTier::Unknown, SourceTier::Unknown)
56        )
57    }
58}
59
60/// Quality indicators for a source
/// Quality indicators for a source
///
/// Produced by [`TierClassifier::classify`]; `reasons` carries the
/// human-readable audit trail for how the tier was chosen.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceQuality {
    /// Assigned tier
    pub tier: SourceTier,
    /// Confidence in classification (0.0 - 1.0)
    pub confidence: f64,
    /// Domain of the source (lowercased host, empty if the URL had none)
    pub domain: String,
    /// Reasons for classification
    pub reasons: Vec<String>,
    /// Has HTTPS
    pub has_https: bool,
    /// Domain age (if known, in days) — never populated by `classify` itself
    pub domain_age_days: Option<u32>,
    /// Is the source known to be authoritative
    pub is_authoritative: bool,
    /// Is the source known to be unreliable
    pub is_unreliable: bool,
    /// Contains a DOI (Digital Object Identifier) - strong academic indicator
    pub has_doi: bool,
    /// The DOI if found (e.g., "10.1000/xyz123")
    pub doi: Option<String>,
}
84
85impl Default for SourceQuality {
86    fn default() -> Self {
87        Self {
88            tier: SourceTier::Unknown,
89            confidence: 0.0,
90            domain: String::new(),
91            reasons: Vec::new(),
92            has_https: false,
93            domain_age_days: None,
94            is_authoritative: false,
95            is_unreliable: false,
96            has_doi: false,
97            doi: None,
98        }
99    }
100}
101
102/// Classifies URLs into source tiers
/// Classifies URLs into source tiers
///
/// Domain sets are matched against both the exact host and the naive
/// "last two labels" base domain; TLD matching is a suffix check on the
/// host.
pub struct TierClassifier {
    /// Known Tier 1 domains (authoritative)
    tier1_domains: HashSet<String>,
    /// Known Tier 2 domains (reputable)
    tier2_domains: HashSet<String>,
    /// Known unreliable domains (automatically Tier 3 or worse)
    unreliable_domains: HashSet<String>,
    /// Tier 1 TLDs (government, educational)
    tier1_tlds: HashSet<String>,
}
113
114impl TierClassifier {
115    /// Create a new classifier with default domain lists
116    pub fn new() -> Self {
117        let mut classifier = Self {
118            tier1_domains: HashSet::new(),
119            tier2_domains: HashSet::new(),
120            unreliable_domains: HashSet::new(),
121            tier1_tlds: HashSet::new(),
122        };
123        classifier.load_defaults();
124        classifier
125    }
126
127    /// Load default domain classifications
128    fn load_defaults(&mut self) {
129        // Tier 1: Authoritative sources
130        let tier1 = [
131            // Official documentation
132            "docs.rs",
133            "doc.rust-lang.org",
134            "docs.python.org",
135            "docs.microsoft.com",
136            "learn.microsoft.com",
137            "developer.mozilla.org",
138            "developer.apple.com",
139            "developers.google.com",
140            "cloud.google.com",
141            "aws.amazon.com",
142            "docs.aws.amazon.com",
143            "azure.microsoft.com",
144            // Academic/research
145            "arxiv.org",
146            "scholar.google.com",
147            "pubmed.ncbi.nlm.nih.gov",
148            "nature.com",
149            "science.org",
150            "acm.org",
151            "dl.acm.org",
152            "ieee.org",
153            "ieeexplore.ieee.org",
154            // Standards bodies
155            "w3.org",
156            "ietf.org",
157            "rfc-editor.org",
158            "ecma-international.org",
159            "iso.org",
160            // Major tech companies (official)
161            "github.com",
162            "gitlab.com",
163            "crates.io",
164            "pypi.org",
165            "npmjs.com",
166            "packagist.org",
167            "rubygems.org",
168            // Government
169            "whitehouse.gov",
170            "usa.gov",
171            "europa.eu",
172            "gov.uk",
173            // Major encyclopedias (primary sources only)
174            "britannica.com",
175        ];
176
177        for domain in tier1 {
178            self.tier1_domains.insert(domain.to_string());
179        }
180
181        // Tier 2: Reputable secondary sources
182        let tier2 = [
183            // Wikipedia and wikis
184            "wikipedia.org",
185            "en.wikipedia.org",
186            "wikidata.org",
187            // Major news
188            "bbc.com",
189            "bbc.co.uk",
190            "reuters.com",
191            "apnews.com",
192            "nytimes.com",
193            "washingtonpost.com",
194            "theguardian.com",
195            "economist.com",
196            // Tech news/blogs
197            "techcrunch.com",
198            "arstechnica.com",
199            "wired.com",
200            "theverge.com",
201            "hackernews.com",
202            "news.ycombinator.com",
203            // Q&A platforms
204            "stackoverflow.com",
205            "stackexchange.com",
206            "serverfault.com",
207            "superuser.com",
208            // Established tech blogs
209            "martinfowler.com",
210            "blog.rust-lang.org",
211            "blog.python.org",
212            "engineering.fb.com",
213            "netflixtechblog.com",
214            // Reference
215            "investopedia.com",
216            "healthline.com",
217            "mayoclinic.org",
218            "webmd.com",
219        ];
220
221        for domain in tier2 {
222            self.tier2_domains.insert(domain.to_string());
223        }
224
225        // Known unreliable sources
226        let unreliable = [
227            // Content farms
228            "ehow.com",
229            "demand.media",
230            // Known misinformation
231            "infowars.com",
232            "naturalnews.com",
233            // Satire (often mistaken as real)
234            "theonion.com",
235            "babylonbee.com",
236        ];
237
238        for domain in unreliable {
239            self.unreliable_domains.insert(domain.to_string());
240        }
241
242        // Tier 1 TLDs (always authoritative)
243        let tier1_tlds = [
244            ".gov", ".gov.uk", ".gov.au", ".gov.ca", ".gov.nz", ".edu", ".edu.au", ".ac.uk",
245            ".edu.cn", ".mil",
246        ];
247
248        for tld in tier1_tlds {
249            self.tier1_tlds.insert(tld.to_string());
250        }
251    }
252
253    /// Extract DOI (Digital Object Identifier) from URL
254    ///
255    /// DOIs follow the pattern: 10.XXXX/... where XXXX is a registrant code (4-9 digits).
256    /// Common locations:
257    /// - doi.org/10.1000/xyz123
258    /// - dx.doi.org/10.1000/xyz123
259    /// - URL path containing /10.XXXX/
260    /// - Query params like ?doi=10.1000/xyz123
261    ///
262    /// Per DOI Handbook: suffix can contain any printable character from Unicode.
263    /// Common patterns include alphanumeric, dots, dashes, underscores, parentheses.
264    fn extract_doi(url: &str) -> (bool, Option<String>) {
265        // DOI pattern per DOI Handbook (doi.org/doi_handbook/2_Numbering.html):
266        // - Prefix: 10.XXXX where XXXX is 4-9 digit registrant code
267        // - Suffix: alphanumeric + .-_()/:;%#@ and other printable chars
268        // We're conservative here to avoid false positives
269        let doi_pattern = regex::Regex::new(r#"10\.\d{4,9}/[a-zA-Z0-9.\-_()/:;%#@]+"#).ok();
270
271        if let Some(re) = doi_pattern {
272            if let Some(captures) = re.find(url) {
273                let doi = captures.as_str();
274                // Clean up trailing punctuation that might have been captured
275                let doi = doi.trim_end_matches(|c: char| {
276                    c == '.' || c == ',' || c == ';' || c == ':' || c == ')' || c == '/'
277                });
278                // Validate minimum suffix length (must have something after the /)
279                if let Some(slash_pos) = doi.find('/') {
280                    if doi.len() > slash_pos + 1 {
281                        return (true, Some(doi.to_string()));
282                    }
283                }
284            }
285        }
286
287        // Check for DOI resolver domains as fallback
288        for resolver in &["doi.org/", "dx.doi.org/", "hdl.handle.net/"] {
289            if let Some(pos) = url.find(resolver) {
290                let after_domain = &url[pos + resolver.len()..];
291                if after_domain.starts_with("10.") {
292                    let doi = after_domain
293                        .split(|c: char| {
294                            c.is_whitespace() || c == '"' || c == '\'' || c == '>' || c == '<'
295                        })
296                        .next()
297                        .unwrap_or(after_domain)
298                        .trim_end_matches(['.', ',', ';', ')']);
299                    // Validate it has a suffix
300                    if doi.contains('/') && doi.len() > 8 {
301                        return (true, Some(doi.to_string()));
302                    }
303                }
304            }
305        }
306
307        (false, None)
308    }
309
310    /// Validate DOI against known patterns and check for indicators of quality
311    ///
312    /// Returns (is_valid, is_likely_quality) where:
313    /// - is_valid: DOI follows correct format
314    /// - is_likely_quality: DOI appears to be from a reputable registrant
315    fn validate_doi(doi: &str) -> (bool, bool) {
316        // Must start with 10. and have a suffix
317        if !doi.starts_with("10.") || !doi.contains('/') {
318            return (false, false);
319        }
320
321        // Extract registrant code
322        let parts: Vec<&str> = doi.splitn(2, '/').collect();
323        if parts.len() != 2 {
324            return (false, false);
325        }
326
327        let prefix = parts[0]; // "10.XXXX"
328        let suffix = parts[1];
329
330        // Validate prefix format
331        if !prefix.starts_with("10.") {
332            return (false, false);
333        }
334        let registrant = &prefix[3..];
335        if registrant.len() < 4
336            || registrant.len() > 9
337            || !registrant.chars().all(|c| c.is_ascii_digit())
338        {
339            return (false, false);
340        }
341
342        // Suffix must be non-empty
343        if suffix.is_empty() {
344            return (false, false);
345        }
346
347        // Check for known quality registrants (major publishers, standards bodies)
348        // This is NOT exhaustive - just common high-quality ones
349        let quality_registrants = [
350            "1000", // DOI Foundation examples
351            "1001", // Wiley
352            "1002", // Blackwell
353            "1006", // Oxford University Press
354            "1007", // Springer
355            "1016", // Elsevier
356            "1017", // Cambridge University Press
357            "1021", // ACS Publications
358            "1038", // Nature Publishing Group
359            "1073", // PNAS
360            "1093", // Oxford Academic
361            "1126", // Science/AAAS
362            "1145", // ACM
363            "1109", // IEEE
364            "1257", // JSTOR
365            "1371", // PLOS
366            "3389", // Frontiers
367            "1186", // BioMed Central
368        ];
369
370        let is_quality = quality_registrants.contains(&registrant);
371
372        (true, is_quality)
373    }
374
375    /// Classify a URL's source tier
376    pub fn classify(&self, url: &str) -> SourceQuality {
377        let parsed = match url::Url::parse(url) {
378            Ok(u) => u,
379            Err(_) => {
380                return SourceQuality {
381                    tier: SourceTier::Unknown,
382                    confidence: 0.0,
383                    domain: String::new(),
384                    reasons: vec!["Invalid URL".to_string()],
385                    ..Default::default()
386                };
387            }
388        };
389
390        let host = match parsed.host_str() {
391            Some(h) => h.to_lowercase(),
392            None => {
393                return SourceQuality {
394                    tier: SourceTier::Unknown,
395                    confidence: 0.0,
396                    domain: String::new(),
397                    reasons: vec!["No host in URL".to_string()],
398                    ..Default::default()
399                };
400            }
401        };
402
403        let has_https = parsed.scheme() == "https";
404        let mut reasons = Vec::new();
405        let mut tier = SourceTier::Unknown;
406        let mut confidence = 0.5;
407        let mut is_authoritative = false;
408        let mut is_unreliable = false;
409
410        // Check for DOI (Digital Object Identifier) - strong academic indicator
411        let (has_doi, doi) = Self::extract_doi(url);
412
413        // Extract domain parts for matching
414        let domain_parts: Vec<&str> = host.split('.').collect();
415        let base_domain = if domain_parts.len() >= 2 {
416            format!(
417                "{}.{}",
418                domain_parts[domain_parts.len() - 2],
419                domain_parts[domain_parts.len() - 1]
420            )
421        } else {
422            host.clone()
423        };
424
425        // DOI presence indicates formal publication, but NOT necessarily quality
426        // Predatory journals have DOIs too - validate the registrant
427        if has_doi {
428            if let Some(ref doi_str) = doi {
429                let (is_valid, is_quality_registrant) = Self::validate_doi(doi_str);
430                if is_valid {
431                    if is_quality_registrant {
432                        // Known quality publisher - Tier 1
433                        tier = SourceTier::Tier1;
434                        confidence = 0.9;
435                        is_authoritative = true;
436                        reasons.push(format!("DOI from quality publisher: {}", doi_str));
437                    } else {
438                        // Unknown registrant - Tier 2 (could be predatory journal)
439                        tier = SourceTier::Tier2;
440                        confidence = 0.7;
441                        reasons.push(format!(
442                            "DOI from unknown registrant: {} (verify journal reputation)",
443                            doi_str
444                        ));
445                    }
446                } else {
447                    // Invalid DOI format - don't boost
448                    reasons.push(format!("Invalid DOI format detected: {}", doi_str));
449                }
450            }
451        }
452
453        // Check for Tier 1 TLDs (high priority, but DOI takes precedence)
454        if tier == SourceTier::Unknown {
455            for tld in &self.tier1_tlds {
456                if host.ends_with(tld) {
457                    tier = SourceTier::Tier1;
458                    confidence = 0.95;
459                    is_authoritative = true;
460                    reasons.push(format!("Government/educational TLD: {}", tld));
461                    break;
462                }
463            }
464        }
465
466        // Check known domain lists
467        if tier == SourceTier::Unknown {
468            // Check unreliable first
469            if self.unreliable_domains.contains(&host)
470                || self.unreliable_domains.contains(&base_domain)
471            {
472                tier = SourceTier::Tier3;
473                confidence = 0.9;
474                is_unreliable = true;
475                reasons.push("Known unreliable source".to_string());
476            } else if self.tier1_domains.contains(&host)
477                || self.tier1_domains.contains(&base_domain)
478            {
479                tier = SourceTier::Tier1;
480                confidence = 0.9;
481                is_authoritative = true;
482                reasons.push("Known authoritative domain".to_string());
483            } else if self.tier2_domains.contains(&host)
484                || self.tier2_domains.contains(&base_domain)
485            {
486                tier = SourceTier::Tier2;
487                confidence = 0.85;
488                reasons.push("Known reputable domain".to_string());
489            }
490        }
491
492        // Apply heuristics for unknown domains
493        if tier == SourceTier::Unknown {
494            // Check for common patterns
495            if host.contains("blog.") || host.contains(".blog") {
496                tier = SourceTier::Tier2;
497                confidence = 0.6;
498                reasons.push("Blog subdomain/domain".to_string());
499            } else if host.contains("forum.")
500                || host.contains(".forum")
501                || host.contains("community.")
502            {
503                tier = SourceTier::Tier3;
504                confidence = 0.6;
505                reasons.push("Forum/community site".to_string());
506            } else if host.contains("social.") || host.contains(".social") {
507                tier = SourceTier::Tier3;
508                confidence = 0.6;
509                reasons.push("Social media".to_string());
510            } else {
511                // Default to Tier 3 for unknown
512                tier = SourceTier::Tier3;
513                confidence = 0.4;
514                reasons.push("Unknown domain - defaulting to Tier 3".to_string());
515            }
516        }
517
518        // Adjust confidence based on HTTPS
519        if !has_https {
520            confidence *= 0.8;
521            reasons.push("No HTTPS (reduced confidence)".to_string());
522        }
523
524        debug!(
525            url = %url,
526            tier = ?tier,
527            confidence = %confidence,
528            "Classified source"
529        );
530
531        SourceQuality {
532            tier,
533            confidence,
534            domain: host,
535            reasons,
536            has_https,
537            domain_age_days: None,
538            is_authoritative,
539            is_unreliable,
540            has_doi,
541            doi,
542        }
543    }
544
545    /// Classify multiple URLs and return sorted by quality
546    pub fn classify_multiple(&self, urls: &[String]) -> Vec<(String, SourceQuality)> {
547        let mut results: Vec<(String, SourceQuality)> = urls
548            .iter()
549            .map(|url| (url.clone(), self.classify(url)))
550            .collect();
551
552        // Sort by tier (Tier1 first) then by confidence (descending)
553        results.sort_by(|a, b| {
554            match a.1.tier.cmp(&b.1.tier) {
555                std::cmp::Ordering::Equal => {
556                    // Higher confidence first
557                    b.1.confidence
558                        .partial_cmp(&a.1.confidence)
559                        .unwrap_or(std::cmp::Ordering::Equal)
560                }
561                other => other,
562            }
563        });
564
565        results
566    }
567
568    /// Check if a set of sources meets the triangulation requirement
569    pub fn meets_triangulation_requirement(
570        &self,
571        sources: &[SourceQuality],
572        min_sources: usize,
573        min_tier: SourceTier,
574    ) -> (bool, String) {
575        let qualifying: Vec<&SourceQuality> = sources
576            .iter()
577            .filter(|s| s.tier.meets_minimum(min_tier))
578            .collect();
579
580        if qualifying.len() >= min_sources {
581            (
582                true,
583                format!(
584                    "Triangulation satisfied: {} of {} sources meet Tier {} or better",
585                    qualifying.len(),
586                    sources.len(),
587                    match min_tier {
588                        SourceTier::Tier1 => "1",
589                        SourceTier::Tier2 => "2",
590                        SourceTier::Tier3 => "3",
591                        SourceTier::Unknown => "Unknown",
592                    }
593                ),
594            )
595        } else {
596            (
597                false,
598                format!(
599                    "Triangulation NOT satisfied: only {} of required {} sources meet Tier {} or better",
600                    qualifying.len(),
601                    min_sources,
602                    match min_tier {
603                        SourceTier::Tier1 => "1",
604                        SourceTier::Tier2 => "2",
605                        SourceTier::Tier3 => "3",
606                        SourceTier::Unknown => "Unknown",
607                    }
608                ),
609            )
610        }
611    }
612
613    /// Add a custom Tier 1 domain
614    pub fn add_tier1_domain(&mut self, domain: &str) {
615        self.tier1_domains.insert(domain.to_lowercase());
616    }
617
618    /// Add a custom Tier 2 domain
619    pub fn add_tier2_domain(&mut self, domain: &str) {
620        self.tier2_domains.insert(domain.to_lowercase());
621    }
622
623    /// Add a custom unreliable domain
624    pub fn add_unreliable_domain(&mut self, domain: &str) {
625        self.unreliable_domains.insert(domain.to_lowercase());
626    }
627}
628
impl Default for TierClassifier {
    /// Same as [`TierClassifier::new`]: a classifier pre-loaded with the
    /// built-in domain lists.
    fn default() -> Self {
        Self::new()
    }
}
634
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tier_weight() {
        assert_eq!(SourceTier::Tier1.weight(), 1.0);
        assert_eq!(SourceTier::Tier2.weight(), 0.7);
        assert_eq!(SourceTier::Tier3.weight(), 0.4);
        assert_eq!(SourceTier::Unknown.weight(), 0.2);
    }

    #[test]
    fn test_tier_meets_minimum() {
        // Tier1 satisfies every minimum.
        assert!(SourceTier::Tier1.meets_minimum(SourceTier::Tier1));
        assert!(SourceTier::Tier1.meets_minimum(SourceTier::Tier2));
        assert!(SourceTier::Tier1.meets_minimum(SourceTier::Tier3));

        assert!(!SourceTier::Tier2.meets_minimum(SourceTier::Tier1));
        assert!(SourceTier::Tier2.meets_minimum(SourceTier::Tier2));
        assert!(SourceTier::Tier2.meets_minimum(SourceTier::Tier3));

        assert!(!SourceTier::Tier3.meets_minimum(SourceTier::Tier1));
        assert!(!SourceTier::Tier3.meets_minimum(SourceTier::Tier2));
        assert!(SourceTier::Tier3.meets_minimum(SourceTier::Tier3));
    }

    #[test]
    fn test_classify_tier1_tld() {
        let classifier = TierClassifier::new();

        let gov = classifier.classify("https://www.whitehouse.gov/briefing-room");
        assert_eq!(gov.tier, SourceTier::Tier1);
        assert!(gov.is_authoritative);

        let edu = classifier.classify("https://cs.stanford.edu/research");
        assert_eq!(edu.tier, SourceTier::Tier1);
    }

    #[test]
    fn test_classify_tier1_domain() {
        let classifier = TierClassifier::new();

        let github = classifier.classify("https://github.com/rust-lang/rust");
        assert_eq!(github.tier, SourceTier::Tier1);

        let docs = classifier.classify("https://docs.rs/tokio/latest/tokio/");
        assert_eq!(docs.tier, SourceTier::Tier1);

        let mdn = classifier.classify("https://developer.mozilla.org/en-US/docs/Web");
        assert_eq!(mdn.tier, SourceTier::Tier1);
    }

    #[test]
    fn test_classify_tier2_domain() {
        let classifier = TierClassifier::new();

        let wiki = classifier.classify("https://en.wikipedia.org/wiki/Rust");
        assert_eq!(wiki.tier, SourceTier::Tier2);

        let so = classifier.classify("https://stackoverflow.com/questions/123");
        assert_eq!(so.tier, SourceTier::Tier2);
    }

    #[test]
    fn test_classify_unreliable() {
        let classifier = TierClassifier::new();

        let unreliable = classifier.classify("https://infowars.com/article");
        assert_eq!(unreliable.tier, SourceTier::Tier3);
        assert!(unreliable.is_unreliable);
    }

    #[test]
    fn test_classify_unknown_domain() {
        let classifier = TierClassifier::new();

        let unknown = classifier.classify("https://randomsite12345.xyz/page");
        assert_eq!(unknown.tier, SourceTier::Tier3);
        assert!(unknown.reasons.iter().any(|r| r.contains("Unknown")));
    }

    #[test]
    fn test_triangulation_requirement() {
        let classifier = TierClassifier::new();

        let sources = vec![
            SourceQuality {
                tier: SourceTier::Tier1,
                confidence: 0.9,
                ..Default::default()
            },
            SourceQuality {
                tier: SourceTier::Tier2,
                confidence: 0.8,
                ..Default::default()
            },
            SourceQuality {
                tier: SourceTier::Tier2,
                confidence: 0.7,
                ..Default::default()
            },
        ];

        // Should pass with 3 sources at Tier 2 or better
        let (met, _) = classifier.meets_triangulation_requirement(&sources, 3, SourceTier::Tier2);
        assert!(met);

        // Should fail with 3 sources at Tier 1
        let (met, _) = classifier.meets_triangulation_requirement(&sources, 3, SourceTier::Tier1);
        assert!(!met);
    }

    #[test]
    fn test_classify_multiple() {
        let classifier = TierClassifier::new();

        let urls = vec![
            "https://randomsite.com/page".to_string(),
            "https://docs.rs/tokio".to_string(),
            "https://en.wikipedia.org/wiki/Rust".to_string(),
        ];

        let results = classifier.classify_multiple(&urls);

        // Should be sorted: Tier1 first, then Tier2, then Tier3
        assert_eq!(results[0].1.tier, SourceTier::Tier1); // docs.rs
        assert_eq!(results[1].1.tier, SourceTier::Tier2); // wikipedia
        assert_eq!(results[2].1.tier, SourceTier::Tier3); // randomsite
    }

    #[test]
    fn test_doi_detection() {
        let classifier = TierClassifier::new();

        // Standard DOI resolver URL from quality publisher (Nature)
        let doi_url = classifier.classify("https://doi.org/10.1038/nature12373");
        assert!(doi_url.has_doi, "Should detect DOI in doi.org URL");
        assert_eq!(
            doi_url.tier,
            SourceTier::Tier1,
            "Nature DOI should be Tier 1"
        );
        assert!(doi_url.is_authoritative);
        assert_eq!(doi_url.doi, Some("10.1038/nature12373".to_string()));

        // dx.doi.org resolver with ACM DOI
        let dx_doi = classifier.classify("https://dx.doi.org/10.1145/1234567.1234568");
        assert!(dx_doi.has_doi, "Should detect DOI in dx.doi.org URL");
        assert_eq!(dx_doi.tier, SourceTier::Tier1, "ACM DOI should be Tier 1");

        // DOI in URL path with unknown registrant (not in quality list)
        let embedded_doi = classifier.classify("https://somejournal.com/article/10.9999/xyz123");
        assert!(
            embedded_doi.has_doi,
            "Should detect DOI embedded in URL path"
        );
        assert_eq!(
            embedded_doi.tier,
            SourceTier::Tier2,
            "Unknown registrant DOI should be Tier 2"
        );

        // URL without DOI
        let no_doi = classifier.classify("https://example.com/article/123456");
        assert!(!no_doi.has_doi, "Should not detect DOI in regular URL");
        assert!(no_doi.doi.is_none());
    }

    #[test]
    fn test_doi_quality_registrant() {
        let classifier = TierClassifier::new();

        // Quality registrants should be Tier 1
        let nature = classifier.classify("https://doi.org/10.1038/s41586-021-03819-2");
        assert_eq!(
            nature.tier,
            SourceTier::Tier1,
            "Nature (10.1038) should be Tier 1"
        );
        assert!(nature.is_authoritative);

        let ieee = classifier.classify("https://ieeexplore.ieee.org/10.1109/5.771073");
        assert_eq!(
            ieee.tier,
            SourceTier::Tier1,
            "IEEE (10.1109) should be Tier 1"
        );

        let springer = classifier.classify("https://link.springer.com/10.1007/s00000-000-0000-0");
        assert_eq!(
            springer.tier,
            SourceTier::Tier1,
            "Springer (10.1007) should be Tier 1"
        );
    }

    #[test]
    fn test_doi_unknown_registrant() {
        let classifier = TierClassifier::new();

        // An unknown domain without DOI would be Tier 3
        let without_doi = classifier.classify("https://obscurejournal.xyz/article/123");
        assert_eq!(without_doi.tier, SourceTier::Tier3);

        // Unknown registrant DOI should be Tier 2 (not Tier 1)
        // This protects against predatory journals which have DOIs
        let with_unknown_doi =
            classifier.classify("https://obscurejournal.xyz/article/10.9876/test");
        assert_eq!(
            with_unknown_doi.tier,
            SourceTier::Tier2,
            "Unknown registrant should be Tier 2"
        );
        assert!(with_unknown_doi.has_doi);
        assert!(
            !with_unknown_doi.is_authoritative,
            "Unknown registrant should not be authoritative"
        );
    }

    #[test]
    fn test_validate_doi() {
        // Valid DOIs
        let (valid, quality) = TierClassifier::validate_doi("10.1038/nature12373");
        assert!(valid);
        assert!(quality, "Nature is a quality registrant");

        let (valid, quality) = TierClassifier::validate_doi("10.1145/1234567.1234568");
        assert!(valid);
        assert!(quality, "ACM is a quality registrant");

        let (valid, quality) = TierClassifier::validate_doi("10.9999/xyz123");
        assert!(valid);
        assert!(!quality, "Unknown registrant");

        // Invalid DOIs
        let (valid, _) = TierClassifier::validate_doi("11.1234/test"); // Wrong prefix
        assert!(!valid);

        let (valid, _) = TierClassifier::validate_doi("10.123/test"); // Registrant too short
        assert!(!valid);

        let (valid, _) = TierClassifier::validate_doi("10.1234"); // No suffix
        assert!(!valid);
    }
}