1use serde::{Deserialize, Serialize};
14use std::collections::HashSet;
15use tracing::debug;
16
17#[derive(
19 Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize,
20)]
21#[serde(rename_all = "snake_case")]
22pub enum SourceTier {
23 Tier1,
25 Tier2,
27 Tier3,
29 #[default]
31 Unknown,
32}
33
34impl SourceTier {
35 pub fn weight(&self) -> f64 {
37 match self {
38 SourceTier::Tier1 => 1.0,
39 SourceTier::Tier2 => 0.7,
40 SourceTier::Tier3 => 0.4,
41 SourceTier::Unknown => 0.2,
42 }
43 }
44
45 pub fn meets_minimum(&self, minimum: SourceTier) -> bool {
47 matches!(
48 (self, minimum),
49 (SourceTier::Tier1, _)
50 | (
51 SourceTier::Tier2,
52 SourceTier::Tier2 | SourceTier::Tier3 | SourceTier::Unknown
53 )
54 | (SourceTier::Tier3, SourceTier::Tier3 | SourceTier::Unknown)
55 | (SourceTier::Unknown, SourceTier::Unknown)
56 )
57 }
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct SourceQuality {
63 pub tier: SourceTier,
65 pub confidence: f64,
67 pub domain: String,
69 pub reasons: Vec<String>,
71 pub has_https: bool,
73 pub domain_age_days: Option<u32>,
75 pub is_authoritative: bool,
77 pub is_unreliable: bool,
79 pub has_doi: bool,
81 pub doi: Option<String>,
83}
84
85impl Default for SourceQuality {
86 fn default() -> Self {
87 Self {
88 tier: SourceTier::Unknown,
89 confidence: 0.0,
90 domain: String::new(),
91 reasons: Vec::new(),
92 has_https: false,
93 domain_age_days: None,
94 is_authoritative: false,
95 is_unreliable: false,
96 has_doi: false,
97 doi: None,
98 }
99 }
100}
101
/// Classifies source URLs into [`SourceTier`]s using domain and TLD lists.
///
/// Derives `Debug` and `Clone` so a configured classifier can be logged and
/// duplicated, matching the other public types in this file.
#[derive(Debug, Clone)]
pub struct TierClassifier {
    /// Domains treated as Tier 1 (authoritative).
    tier1_domains: HashSet<String>,
    /// Domains treated as Tier 2 (reputable).
    tier2_domains: HashSet<String>,
    /// Domains treated as known-unreliable.
    unreliable_domains: HashSet<String>,
    /// Host suffixes (such as `.gov` or `.edu`) treated as Tier 1.
    tier1_tlds: HashSet<String>,
}
113
114impl TierClassifier {
115 pub fn new() -> Self {
117 let mut classifier = Self {
118 tier1_domains: HashSet::new(),
119 tier2_domains: HashSet::new(),
120 unreliable_domains: HashSet::new(),
121 tier1_tlds: HashSet::new(),
122 };
123 classifier.load_defaults();
124 classifier
125 }
126
127 fn load_defaults(&mut self) {
129 let tier1 = [
131 "docs.rs",
133 "doc.rust-lang.org",
134 "docs.python.org",
135 "docs.microsoft.com",
136 "learn.microsoft.com",
137 "developer.mozilla.org",
138 "developer.apple.com",
139 "developers.google.com",
140 "cloud.google.com",
141 "aws.amazon.com",
142 "docs.aws.amazon.com",
143 "azure.microsoft.com",
144 "arxiv.org",
146 "scholar.google.com",
147 "pubmed.ncbi.nlm.nih.gov",
148 "nature.com",
149 "science.org",
150 "acm.org",
151 "dl.acm.org",
152 "ieee.org",
153 "ieeexplore.ieee.org",
154 "w3.org",
156 "ietf.org",
157 "rfc-editor.org",
158 "ecma-international.org",
159 "iso.org",
160 "github.com",
162 "gitlab.com",
163 "crates.io",
164 "pypi.org",
165 "npmjs.com",
166 "packagist.org",
167 "rubygems.org",
168 "whitehouse.gov",
170 "usa.gov",
171 "europa.eu",
172 "gov.uk",
173 "britannica.com",
175 ];
176
177 for domain in tier1 {
178 self.tier1_domains.insert(domain.to_string());
179 }
180
181 let tier2 = [
183 "wikipedia.org",
185 "en.wikipedia.org",
186 "wikidata.org",
187 "bbc.com",
189 "bbc.co.uk",
190 "reuters.com",
191 "apnews.com",
192 "nytimes.com",
193 "washingtonpost.com",
194 "theguardian.com",
195 "economist.com",
196 "techcrunch.com",
198 "arstechnica.com",
199 "wired.com",
200 "theverge.com",
201 "hackernews.com",
202 "news.ycombinator.com",
203 "stackoverflow.com",
205 "stackexchange.com",
206 "serverfault.com",
207 "superuser.com",
208 "martinfowler.com",
210 "blog.rust-lang.org",
211 "blog.python.org",
212 "engineering.fb.com",
213 "netflixtechblog.com",
214 "investopedia.com",
216 "healthline.com",
217 "mayoclinic.org",
218 "webmd.com",
219 ];
220
221 for domain in tier2 {
222 self.tier2_domains.insert(domain.to_string());
223 }
224
225 let unreliable = [
227 "ehow.com",
229 "demand.media",
230 "infowars.com",
232 "naturalnews.com",
233 "theonion.com",
235 "babylonbee.com",
236 ];
237
238 for domain in unreliable {
239 self.unreliable_domains.insert(domain.to_string());
240 }
241
242 let tier1_tlds = [
244 ".gov", ".gov.uk", ".gov.au", ".gov.ca", ".gov.nz", ".edu", ".edu.au", ".ac.uk",
245 ".edu.cn", ".mil",
246 ];
247
248 for tld in tier1_tlds {
249 self.tier1_tlds.insert(tld.to_string());
250 }
251 }
252
253 fn extract_doi(url: &str) -> (bool, Option<String>) {
265 let doi_pattern = regex::Regex::new(r#"10\.\d{4,9}/[a-zA-Z0-9.\-_()/:;%#@]+"#).ok();
270
271 if let Some(re) = doi_pattern {
272 if let Some(captures) = re.find(url) {
273 let doi = captures.as_str();
274 let doi = doi.trim_end_matches(|c: char| {
276 c == '.' || c == ',' || c == ';' || c == ':' || c == ')' || c == '/'
277 });
278 if let Some(slash_pos) = doi.find('/') {
280 if doi.len() > slash_pos + 1 {
281 return (true, Some(doi.to_string()));
282 }
283 }
284 }
285 }
286
287 for resolver in &["doi.org/", "dx.doi.org/", "hdl.handle.net/"] {
289 if let Some(pos) = url.find(resolver) {
290 let after_domain = &url[pos + resolver.len()..];
291 if after_domain.starts_with("10.") {
292 let doi = after_domain
293 .split(|c: char| {
294 c.is_whitespace() || c == '"' || c == '\'' || c == '>' || c == '<'
295 })
296 .next()
297 .unwrap_or(after_domain)
298 .trim_end_matches(['.', ',', ';', ')']);
299 if doi.contains('/') && doi.len() > 8 {
301 return (true, Some(doi.to_string()));
302 }
303 }
304 }
305 }
306
307 (false, None)
308 }
309
/// Checks the structure of `doi` and whether its registrant is on the
/// built-in quality list.
///
/// Returns `(is_valid, is_quality_registrant)`. A DOI is valid when it has the
/// form `10.<registrant>/<suffix>`, where `<registrant>` is 4 to 9 ASCII digits
/// and `<suffix>` (everything after the first `/`) is non-empty. The second
/// flag is only ever `true` for a valid DOI.
fn validate_doi(doi: &str) -> (bool, bool) {
    // Registrant codes treated as quality publishers. The tests in this file
    // identify 1038 as Nature, 1145 as ACM, 1109 as IEEE and 1007 as Springer.
    const QUALITY_REGISTRANTS: &[&str] = &[
        "1000", "1001", "1002", "1006", "1007", "1016", "1017", "1021", "1038", "1073", "1093",
        "1126", "1145", "1109", "1257", "1371", "3389", "1186",
    ];

    // Strip the directory indicator, then split registrant from suffix at the
    // first '/'. Either step failing means the string is not DOI-shaped.
    let (registrant, suffix) = match doi
        .strip_prefix("10.")
        .and_then(|rest| rest.split_once('/'))
    {
        Some(parts) => parts,
        None => return (false, false),
    };

    let registrant_ok = (4..=9).contains(&registrant.len())
        && registrant.bytes().all(|b| b.is_ascii_digit());
    if !registrant_ok || suffix.is_empty() {
        return (false, false);
    }

    (true, QUALITY_REGISTRANTS.contains(&registrant))
}
374
375 pub fn classify(&self, url: &str) -> SourceQuality {
377 let parsed = match url::Url::parse(url) {
378 Ok(u) => u,
379 Err(_) => {
380 return SourceQuality {
381 tier: SourceTier::Unknown,
382 confidence: 0.0,
383 domain: String::new(),
384 reasons: vec!["Invalid URL".to_string()],
385 ..Default::default()
386 };
387 }
388 };
389
390 let host = match parsed.host_str() {
391 Some(h) => h.to_lowercase(),
392 None => {
393 return SourceQuality {
394 tier: SourceTier::Unknown,
395 confidence: 0.0,
396 domain: String::new(),
397 reasons: vec!["No host in URL".to_string()],
398 ..Default::default()
399 };
400 }
401 };
402
403 let has_https = parsed.scheme() == "https";
404 let mut reasons = Vec::new();
405 let mut tier = SourceTier::Unknown;
406 let mut confidence = 0.5;
407 let mut is_authoritative = false;
408 let mut is_unreliable = false;
409
410 let (has_doi, doi) = Self::extract_doi(url);
412
413 let domain_parts: Vec<&str> = host.split('.').collect();
415 let base_domain = if domain_parts.len() >= 2 {
416 format!(
417 "{}.{}",
418 domain_parts[domain_parts.len() - 2],
419 domain_parts[domain_parts.len() - 1]
420 )
421 } else {
422 host.clone()
423 };
424
425 if has_doi {
428 if let Some(ref doi_str) = doi {
429 let (is_valid, is_quality_registrant) = Self::validate_doi(doi_str);
430 if is_valid {
431 if is_quality_registrant {
432 tier = SourceTier::Tier1;
434 confidence = 0.9;
435 is_authoritative = true;
436 reasons.push(format!("DOI from quality publisher: {}", doi_str));
437 } else {
438 tier = SourceTier::Tier2;
440 confidence = 0.7;
441 reasons.push(format!(
442 "DOI from unknown registrant: {} (verify journal reputation)",
443 doi_str
444 ));
445 }
446 } else {
447 reasons.push(format!("Invalid DOI format detected: {}", doi_str));
449 }
450 }
451 }
452
453 if tier == SourceTier::Unknown {
455 for tld in &self.tier1_tlds {
456 if host.ends_with(tld) {
457 tier = SourceTier::Tier1;
458 confidence = 0.95;
459 is_authoritative = true;
460 reasons.push(format!("Government/educational TLD: {}", tld));
461 break;
462 }
463 }
464 }
465
466 if tier == SourceTier::Unknown {
468 if self.unreliable_domains.contains(&host)
470 || self.unreliable_domains.contains(&base_domain)
471 {
472 tier = SourceTier::Tier3;
473 confidence = 0.9;
474 is_unreliable = true;
475 reasons.push("Known unreliable source".to_string());
476 } else if self.tier1_domains.contains(&host)
477 || self.tier1_domains.contains(&base_domain)
478 {
479 tier = SourceTier::Tier1;
480 confidence = 0.9;
481 is_authoritative = true;
482 reasons.push("Known authoritative domain".to_string());
483 } else if self.tier2_domains.contains(&host)
484 || self.tier2_domains.contains(&base_domain)
485 {
486 tier = SourceTier::Tier2;
487 confidence = 0.85;
488 reasons.push("Known reputable domain".to_string());
489 }
490 }
491
492 if tier == SourceTier::Unknown {
494 if host.contains("blog.") || host.contains(".blog") {
496 tier = SourceTier::Tier2;
497 confidence = 0.6;
498 reasons.push("Blog subdomain/domain".to_string());
499 } else if host.contains("forum.")
500 || host.contains(".forum")
501 || host.contains("community.")
502 {
503 tier = SourceTier::Tier3;
504 confidence = 0.6;
505 reasons.push("Forum/community site".to_string());
506 } else if host.contains("social.") || host.contains(".social") {
507 tier = SourceTier::Tier3;
508 confidence = 0.6;
509 reasons.push("Social media".to_string());
510 } else {
511 tier = SourceTier::Tier3;
513 confidence = 0.4;
514 reasons.push("Unknown domain - defaulting to Tier 3".to_string());
515 }
516 }
517
518 if !has_https {
520 confidence *= 0.8;
521 reasons.push("No HTTPS (reduced confidence)".to_string());
522 }
523
524 debug!(
525 url = %url,
526 tier = ?tier,
527 confidence = %confidence,
528 "Classified source"
529 );
530
531 SourceQuality {
532 tier,
533 confidence,
534 domain: host,
535 reasons,
536 has_https,
537 domain_age_days: None,
538 is_authoritative,
539 is_unreliable,
540 has_doi,
541 doi,
542 }
543 }
544
545 pub fn classify_multiple(&self, urls: &[String]) -> Vec<(String, SourceQuality)> {
547 let mut results: Vec<(String, SourceQuality)> = urls
548 .iter()
549 .map(|url| (url.clone(), self.classify(url)))
550 .collect();
551
552 results.sort_by(|a, b| {
554 match a.1.tier.cmp(&b.1.tier) {
555 std::cmp::Ordering::Equal => {
556 b.1.confidence
558 .partial_cmp(&a.1.confidence)
559 .unwrap_or(std::cmp::Ordering::Equal)
560 }
561 other => other,
562 }
563 });
564
565 results
566 }
567
568 pub fn meets_triangulation_requirement(
570 &self,
571 sources: &[SourceQuality],
572 min_sources: usize,
573 min_tier: SourceTier,
574 ) -> (bool, String) {
575 let qualifying: Vec<&SourceQuality> = sources
576 .iter()
577 .filter(|s| s.tier.meets_minimum(min_tier))
578 .collect();
579
580 if qualifying.len() >= min_sources {
581 (
582 true,
583 format!(
584 "Triangulation satisfied: {} of {} sources meet Tier {} or better",
585 qualifying.len(),
586 sources.len(),
587 match min_tier {
588 SourceTier::Tier1 => "1",
589 SourceTier::Tier2 => "2",
590 SourceTier::Tier3 => "3",
591 SourceTier::Unknown => "Unknown",
592 }
593 ),
594 )
595 } else {
596 (
597 false,
598 format!(
599 "Triangulation NOT satisfied: only {} of required {} sources meet Tier {} or better",
600 qualifying.len(),
601 min_sources,
602 match min_tier {
603 SourceTier::Tier1 => "1",
604 SourceTier::Tier2 => "2",
605 SourceTier::Tier3 => "3",
606 SourceTier::Unknown => "Unknown",
607 }
608 ),
609 )
610 }
611 }
612
613 pub fn add_tier1_domain(&mut self, domain: &str) {
615 self.tier1_domains.insert(domain.to_lowercase());
616 }
617
618 pub fn add_tier2_domain(&mut self, domain: &str) {
620 self.tier2_domains.insert(domain.to_lowercase());
621 }
622
623 pub fn add_unreliable_domain(&mut self, domain: &str) {
625 self.unreliable_domains.insert(domain.to_lowercase());
626 }
627}
628
629impl Default for TierClassifier {
630 fn default() -> Self {
631 Self::new()
632 }
633}
634
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies `url` with a freshly built default classifier.
    fn classify(url: &str) -> SourceQuality {
        TierClassifier::new().classify(url)
    }

    #[test]
    fn test_tier_weight() {
        let expected = [
            (SourceTier::Tier1, 1.0),
            (SourceTier::Tier2, 0.7),
            (SourceTier::Tier3, 0.4),
            (SourceTier::Unknown, 0.2),
        ];
        for (tier, weight) in expected {
            assert_eq!(tier.weight(), weight);
        }
    }

    #[test]
    fn test_tier_meets_minimum() {
        use SourceTier::{Tier1, Tier2, Tier3};

        // (candidate, minimum, expected outcome)
        let cases = [
            (Tier1, Tier1, true),
            (Tier1, Tier2, true),
            (Tier1, Tier3, true),
            (Tier2, Tier1, false),
            (Tier2, Tier2, true),
            (Tier2, Tier3, true),
            (Tier3, Tier1, false),
            (Tier3, Tier2, false),
            (Tier3, Tier3, true),
        ];
        for (candidate, minimum, expected) in cases {
            assert_eq!(candidate.meets_minimum(minimum), expected);
        }
    }

    #[test]
    fn test_classify_tier1_tld() {
        let gov = classify("https://www.whitehouse.gov/briefing-room");
        assert_eq!(gov.tier, SourceTier::Tier1);
        assert!(gov.is_authoritative);

        let edu = classify("https://cs.stanford.edu/research");
        assert_eq!(edu.tier, SourceTier::Tier1);
    }

    #[test]
    fn test_classify_tier1_domain() {
        let urls = [
            "https://github.com/rust-lang/rust",
            "https://docs.rs/tokio/latest/tokio/",
            "https://developer.mozilla.org/en-US/docs/Web",
        ];
        for url in urls {
            assert_eq!(classify(url).tier, SourceTier::Tier1);
        }
    }

    #[test]
    fn test_classify_tier2_domain() {
        let urls = [
            "https://en.wikipedia.org/wiki/Rust",
            "https://stackoverflow.com/questions/123",
        ];
        for url in urls {
            assert_eq!(classify(url).tier, SourceTier::Tier2);
        }
    }

    #[test]
    fn test_classify_unreliable() {
        let quality = classify("https://infowars.com/article");
        assert_eq!(quality.tier, SourceTier::Tier3);
        assert!(quality.is_unreliable);
    }

    #[test]
    fn test_classify_unknown_domain() {
        let quality = classify("https://randomsite12345.xyz/page");
        assert_eq!(quality.tier, SourceTier::Tier3);
        assert!(quality.reasons.iter().any(|r| r.contains("Unknown")));
    }

    #[test]
    fn test_triangulation_requirement() {
        let classifier = TierClassifier::new();

        // One Tier 1 source and two Tier 2 sources.
        let sources: Vec<SourceQuality> = vec![
            (SourceTier::Tier1, 0.9),
            (SourceTier::Tier2, 0.8),
            (SourceTier::Tier2, 0.7),
        ]
        .into_iter()
        .map(|(tier, confidence)| SourceQuality {
            tier,
            confidence,
            ..Default::default()
        })
        .collect();

        let (met, _) = classifier.meets_triangulation_requirement(&sources, 3, SourceTier::Tier2);
        assert!(met);

        let (met, _) = classifier.meets_triangulation_requirement(&sources, 3, SourceTier::Tier1);
        assert!(!met);
    }

    #[test]
    fn test_classify_multiple() {
        let urls: Vec<String> = [
            "https://randomsite.com/page",
            "https://docs.rs/tokio",
            "https://en.wikipedia.org/wiki/Rust",
        ]
        .iter()
        .map(|url| url.to_string())
        .collect();

        let results = TierClassifier::new().classify_multiple(&urls);

        // Results come back ordered from most to least trusted tier.
        let expected_order = [SourceTier::Tier1, SourceTier::Tier2, SourceTier::Tier3];
        for (index, expected) in expected_order.iter().enumerate() {
            assert_eq!(results[index].1.tier, *expected);
        }
    }

    #[test]
    fn test_doi_detection() {
        let doi_url = classify("https://doi.org/10.1038/nature12373");
        assert!(doi_url.has_doi, "Should detect DOI in doi.org URL");
        assert_eq!(
            doi_url.tier,
            SourceTier::Tier1,
            "Nature DOI should be Tier 1"
        );
        assert!(doi_url.is_authoritative);
        assert_eq!(doi_url.doi, Some("10.1038/nature12373".to_string()));

        let dx_doi = classify("https://dx.doi.org/10.1145/1234567.1234568");
        assert!(dx_doi.has_doi, "Should detect DOI in dx.doi.org URL");
        assert_eq!(dx_doi.tier, SourceTier::Tier1, "ACM DOI should be Tier 1");

        let embedded_doi = classify("https://somejournal.com/article/10.9999/xyz123");
        assert!(
            embedded_doi.has_doi,
            "Should detect DOI embedded in URL path"
        );
        assert_eq!(
            embedded_doi.tier,
            SourceTier::Tier2,
            "Unknown registrant DOI should be Tier 2"
        );

        let no_doi = classify("https://example.com/article/123456");
        assert!(!no_doi.has_doi, "Should not detect DOI in regular URL");
        assert!(no_doi.doi.is_none());
    }

    #[test]
    fn test_doi_quality_registrant() {
        let nature = classify("https://doi.org/10.1038/s41586-021-03819-2");
        assert_eq!(
            nature.tier,
            SourceTier::Tier1,
            "Nature (10.1038) should be Tier 1"
        );
        assert!(nature.is_authoritative);

        // (url, failure message)
        let publisher_cases = [
            (
                "https://ieeexplore.ieee.org/10.1109/5.771073",
                "IEEE (10.1109) should be Tier 1",
            ),
            (
                "https://link.springer.com/10.1007/s00000-000-0000-0",
                "Springer (10.1007) should be Tier 1",
            ),
        ];
        for (url, message) in publisher_cases {
            assert_eq!(classify(url).tier, SourceTier::Tier1, "{}", message);
        }
    }

    #[test]
    fn test_doi_unknown_registrant() {
        let without_doi = classify("https://obscurejournal.xyz/article/123");
        assert_eq!(without_doi.tier, SourceTier::Tier3);

        let with_unknown_doi = classify("https://obscurejournal.xyz/article/10.9876/test");
        assert_eq!(
            with_unknown_doi.tier,
            SourceTier::Tier2,
            "Unknown registrant should be Tier 2"
        );
        assert!(with_unknown_doi.has_doi);
        assert!(
            !with_unknown_doi.is_authoritative,
            "Unknown registrant should not be authoritative"
        );
    }

    #[test]
    fn test_validate_doi() {
        let (valid, quality) = TierClassifier::validate_doi("10.1038/nature12373");
        assert!(valid);
        assert!(quality, "Nature is a quality registrant");

        let (valid, quality) = TierClassifier::validate_doi("10.1145/1234567.1234568");
        assert!(valid);
        assert!(quality, "ACM is a quality registrant");

        let (valid, quality) = TierClassifier::validate_doi("10.9999/xyz123");
        assert!(valid);
        assert!(!quality, "Unknown registrant");

        // Wrong directory indicator, registrant too short, and missing suffix.
        for malformed in ["11.1234/test", "10.123/test", "10.1234"] {
            let (valid, _) = TierClassifier::validate_doi(malformed);
            assert!(!valid);
        }
    }
}