1use std::collections::{BTreeMap, HashMap, HashSet};
8use std::path::Path;
9
10use serde::Deserialize;
11
12use crate::error::{Error, Result};
13use crate::site::{Engine, Site};
14
15const EMBEDDED_REGISTRY: &str = include_str!("../data/sites.json");
16
17const EMBEDDED_WMN_REGISTRY: &str = include_str!("../data/sites_wmn.json");
24
25#[derive(Debug, Clone, Deserialize)]
34pub struct Registry {
35 #[serde(default)]
36 engines: BTreeMap<String, Engine>,
37 sites: Vec<Site>,
38}
39
/// Selection criteria used to pick a subset of registry sites.
///
/// All text comparisons are case-insensitive. The default value selects
/// every enabled site that is not tagged `nsfw`.
#[derive(Clone, Debug, Default)]
pub struct SiteFilter {
    /// Name substrings to keep. A site passes when its name contains at
    /// least one of them; an empty list keeps every name.
    pub include: Vec<String>,
    /// Name substrings to drop. A site is rejected when its name contains
    /// any of them.
    pub exclude: Vec<String>,
    /// Tags to keep. A site passes when it carries at least one of them; an
    /// empty list places no tag requirement.
    pub tags: Vec<String>,
    /// Tags to drop. A site is rejected when it carries any of them.
    pub exclude_tags: Vec<String>,
    /// When `false`, sites tagged `nsfw` are dropped unless `tags` asks for
    /// `nsfw` explicitly.
    pub include_nsfw: bool,
    /// When `Some(n)`, only sites whose popularity value is present and at
    /// most `n` are kept, ordered by ascending popularity.
    pub top: Option<u32>,
}
66
67impl SiteFilter {
68 pub fn apply(&self, sites: &[Site]) -> Vec<Site> {
71 self.apply_inner(sites, DisabledMode::Exclude)
72 }
73
74 pub fn apply_including_disabled(&self, sites: &[Site]) -> Vec<Site> {
78 self.apply_inner(sites, DisabledMode::Include)
79 }
80
81 fn apply_inner(&self, sites: &[Site], disabled_mode: DisabledMode) -> Vec<Site> {
82 let include: Vec<String> = self.include.iter().map(|s| s.to_lowercase()).collect();
83 let exclude: Vec<String> = self.exclude.iter().map(|s| s.to_lowercase()).collect();
84 let want_tags: Vec<String> = self.tags.iter().map(|s| s.to_lowercase()).collect();
85 let mut drop_tags: Vec<String> =
86 self.exclude_tags.iter().map(|s| s.to_lowercase()).collect();
87
88 let nsfw_tag = "nsfw".to_owned();
91 let asking_for_nsfw = want_tags.contains(&nsfw_tag);
92 if !self.include_nsfw && !asking_for_nsfw && !drop_tags.contains(&nsfw_tag) {
93 drop_tags.push(nsfw_tag);
94 }
95
96 let mut filtered: Vec<Site> = sites
97 .iter()
98 .filter(|site| {
99 match disabled_mode {
100 DisabledMode::Exclude if site.disabled => {
101 return false;
106 }
107 DisabledMode::Only if !site.disabled => return false,
108 DisabledMode::Exclude | DisabledMode::Only | DisabledMode::Include => {}
109 }
110 let name = site.name.to_lowercase();
111 let included = include.is_empty() || include.iter().any(|i| name.contains(i));
112 let excluded = exclude.iter().any(|x| name.contains(x));
113 let lower_tags: Vec<String> = site.tags.iter().map(|t| t.to_lowercase()).collect();
114 let tagged =
115 want_tags.is_empty() || lower_tags.iter().any(|t| want_tags.contains(t));
116 let tag_excluded = lower_tags.iter().any(|t| drop_tags.contains(t));
117 included && !excluded && tagged && !tag_excluded
118 })
119 .cloned()
120 .collect();
121
122 if let Some(n) = self.top {
123 filtered.retain(|s| s.popularity.is_some_and(|p| p <= n));
124 filtered.sort_by_key(|s| s.popularity.unwrap_or(u32::MAX));
125 }
126 filtered
127 }
128}
129
/// How `SiteFilter::apply_inner` treats a site's `disabled` flag.
#[derive(Clone, Copy, Debug)]
enum DisabledMode {
    /// Reject sites whose `disabled` flag is set.
    Exclude,
    /// Reject sites whose `disabled` flag is not set.
    Only,
    /// Ignore the `disabled` flag entirely.
    Include,
}
136
137impl Registry {
138 pub fn default_embedded() -> Result<Self> {
140 Self::from_json_str(EMBEDDED_REGISTRY)
141 }
142
143 pub fn default_embedded_with_wmn() -> Result<Self> {
158 let mut base = Self::default_embedded()?;
159 let wmn: Self = serde_json::from_str(EMBEDDED_WMN_REGISTRY)?;
160 let existing_names: HashSet<String> =
161 base.sites.iter().map(|s| s.name.to_lowercase()).collect();
162 let claimed_urls: HashSet<String> = base
167 .sites
168 .iter()
169 .filter(|s| !s.disabled)
170 .map(|s| s.url.as_str().to_owned())
171 .collect();
172 for (name, engine) in wmn.engines {
173 base.engines.entry(name).or_insert(engine);
174 }
175 for site in wmn.sites {
176 if existing_names.contains(&site.name.to_lowercase()) {
177 continue;
178 }
179 if !site.disabled && claimed_urls.contains(site.url.as_str()) {
180 continue;
186 }
187 base.sites.push(site);
188 }
189 base.resolve_engines()?;
190 base.validate()?;
191 Ok(base)
192 }
193
194 pub fn from_json_str(json: &str) -> Result<Self> {
199 let mut registry: Self = serde_json::from_str(json)?;
200 registry.resolve_engines()?;
201 registry.apply_tag_derived_policy();
202 registry.validate()?;
203 Ok(registry)
204 }
205
206 pub fn engines(&self) -> &BTreeMap<String, Engine> {
211 &self.engines
212 }
213
214 fn apply_tag_derived_policy(&mut self) {
233 for site in &mut self.sites {
234 if !site.access.geo.is_empty() {
235 continue;
236 }
237 for tag in &site.tags {
238 let Some(rest) = tag.strip_prefix("region:") else {
239 continue;
240 };
241 let Some(cc) = crate::access::CountryCode::new(rest) else {
242 continue;
243 };
244 if !site.access.prefer_geo.contains(&cc) {
245 site.access.prefer_geo.push(cc);
246 }
247 }
248 }
249 }
250
251 fn resolve_engines(&mut self) -> Result<()> {
260 for (name, engine) in &self.engines {
261 engine.validate(name)?;
262 }
263 for site in &mut self.sites {
264 let Some(name) = &site.engine else {
265 continue;
266 };
267 let Some(engine) = self.engines.get(name) else {
268 return Err(Error::InvalidSite {
269 reason: format!(
270 "site {:?}: references engine {name:?} which is not defined",
271 site.name
272 ),
273 });
274 };
275 engine.merge_into(site);
276 }
277 Ok(())
278 }
279
280 pub fn load_from_path(path: impl AsRef<Path>) -> Result<Self> {
282 let bytes = std::fs::read(path)?;
283 let json = std::str::from_utf8(&bytes).map_err(|e| Error::InvalidSite {
284 reason: format!("registry file is not valid UTF-8: {e}"),
285 })?;
286 Self::from_json_str(json)
287 }
288
289 pub fn sites(&self) -> &[Site] {
291 &self.sites
292 }
293
294 pub fn len(&self) -> usize {
296 self.sites.len()
297 }
298
299 pub fn is_empty(&self) -> bool {
302 self.sites.is_empty()
303 }
304
305 pub fn filter(
324 &self,
325 include: &[String],
326 exclude: &[String],
327 tags: &[String],
328 exclude_tags: &[String],
329 include_nsfw: bool,
330 ) -> Vec<Site> {
331 self.filter_with(&SiteFilter {
332 include: include.to_vec(),
333 exclude: exclude.to_vec(),
334 tags: tags.to_vec(),
335 exclude_tags: exclude_tags.to_vec(),
336 include_nsfw,
337 top: None,
338 })
339 }
340
341 pub fn filter_with(&self, filter: &SiteFilter) -> Vec<Site> {
343 filter.apply(&self.sites)
344 }
345
346 pub fn matches_with(&self, filter: &SiteFilter) -> Vec<Site> {
351 filter.apply_including_disabled(&self.sites)
352 }
353
354 pub fn disabled_matches_with(&self, filter: &SiteFilter) -> Vec<Site> {
359 filter.apply_inner(&self.sites, DisabledMode::Only)
360 }
361
362 pub fn tag_counts(&self) -> Vec<(String, usize)> {
365 let mut counts: std::collections::BTreeMap<String, usize> =
366 std::collections::BTreeMap::new();
367 for site in &self.sites {
368 for tag in &site.tags {
369 *counts.entry(tag.clone()).or_insert(0) += 1;
370 }
371 }
372 counts.into_iter().collect()
373 }
374
375 fn validate(&self) -> Result<()> {
376 if self.sites.is_empty() {
377 return Err(Error::InvalidSite {
378 reason: "registry has no sites".into(),
379 });
380 }
381 for site in &self.sites {
382 site.validate()?;
383 }
384 let mut seen: HashSet<String> = HashSet::new();
385 for site in &self.sites {
386 let key = site.name.to_lowercase();
387 if !seen.insert(key) {
388 return Err(Error::InvalidSite {
389 reason: format!("duplicate site name: {:?}", site.name),
390 });
391 }
392 }
393 let mut seen_url_sig: HashMap<(String, String), &str> = HashMap::new();
409 for site in &self.sites {
410 if site.disabled {
411 continue;
412 }
413 let sigs_key = serde_json::to_string(&site.signals)
420 .expect("Signal derives Serialize and contains no Map<_, _> with non-string keys");
421 let key = (site.url.as_str().to_owned(), sigs_key);
422 if let Some(prev) = seen_url_sig.insert(key, site.name.as_str()) {
423 return Err(Error::InvalidSite {
424 reason: format!(
425 "duplicate (URL, signals) among enabled sites: {:?} and {:?} both back \
426 {:?} with identical signals. Mark one `disabled: true` with \
427 `disabled_reason: \"duplicate of {prev}\"` (or, if the two entries are \
428 supposed to disambiguate via different markers, give each a distinct \
429 signal set).",
430 prev,
431 site.name,
432 site.url.as_str(),
433 ),
434 });
435 }
436 }
437 Ok(())
438 }
439}
440
#[cfg(test)]
mod tests {
    use super::*;
    use crate::access::CountryCode;
    use crate::site::ProtectionKind;

    /// Site names in slice order, for compact equality assertions.
    fn names_of(sites: &[Site]) -> Vec<&str> {
        sites.iter().map(|site| site.name.as_str()).collect()
    }

    /// Owned copies of `items`, in the shape `Registry::filter` expects.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| (*item).to_owned()).collect()
    }

    /// Every entry of `registry` whose name is exactly `name`.
    fn entries_named<'a>(registry: &'a Registry, name: &str) -> Vec<&'a Site> {
        registry
            .sites()
            .iter()
            .filter(|site| site.name == name)
            .collect()
    }

    /// `prefer_geo` codes of the first site, as strings.
    fn prefer_geo_codes(registry: &Registry) -> Vec<&str> {
        registry.sites()[0]
            .access
            .prefer_geo
            .iter()
            .map(CountryCode::as_str)
            .collect()
    }

    #[test]
    fn embedded_registry_loads_and_validates() {
        let registry = Registry::default_embedded().expect("embedded registry must load");
        assert!(
            registry.len() >= 100,
            "imported registry should have ≥100 sites, got {}",
            registry.len()
        );
        let names = names_of(registry.sites());
        assert!(names.contains(&"GitHub"));
        assert!(names.contains(&"Reddit"));
        assert!(names.contains(&"Telegram"));
    }

    #[test]
    fn wmn_embedded_registry_loads_and_supersets_default() {
        let base = Registry::default_embedded().unwrap();
        let merged = Registry::default_embedded_with_wmn().expect("WMN-merged registry must load");
        assert!(
            merged.len() > base.len(),
            "WMN merge must add sites: base={} merged={}",
            base.len(),
            merged.len()
        );

        let merged_names: HashSet<String> = merged
            .sites()
            .iter()
            .map(|site| site.name.to_lowercase())
            .collect();
        for site in base.sites() {
            assert!(
                merged_names.contains(&site.name.to_lowercase()),
                "merge dropped base-tranche site {:?}",
                site.name
            );
        }

        let has_wmn_tag = merged
            .sites()
            .iter()
            .flat_map(|site| site.tags.iter())
            .any(|tag| tag == "source:wmn");
        assert!(has_wmn_tag, "no site carries the source:wmn tag");
    }

    #[test]
    fn rejects_empty_registry() {
        let err = Registry::from_json_str(r#"{ "sites": [] }"#).unwrap_err();
        assert!(matches!(err, Error::InvalidSite { .. }));
    }

    #[test]
    fn rejects_duplicate_site_names() {
        let json = r#"{
            "sites": [
                { "name": "GitHub", "url": "https://github.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "github", "url": "https://github.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] }
            ]
        }"#;
        let err = Registry::from_json_str(json).unwrap_err();
        assert!(matches!(err, Error::InvalidSite { .. }));
        assert!(err.to_string().contains("duplicate"));
    }

    #[test]
    fn rejects_duplicate_enabled_urls() {
        let json = r#"{
            "sites": [
                { "name": "Hub Code", "url": "https://example.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "HubCode", "url": "https://example.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] }
            ]
        }"#;
        let err = Registry::from_json_str(json).unwrap_err();
        assert!(matches!(err, Error::InvalidSite { .. }));
        let msg = err.to_string();
        assert!(msg.contains("duplicate (URL, signals)"), "msg: {msg}");
        assert!(msg.contains("Hub Code"), "msg: {msg}");
        assert!(msg.contains("HubCode"), "msg: {msg}");
    }

    #[test]
    fn allows_duplicate_urls_with_distinct_signals() {
        let json = r#"{
            "sites": [
                { "name": "Site Public", "url": "https://example.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "Site Private", "url": "https://example.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [403] }] }
            ]
        }"#;
        let registry = Registry::from_json_str(json).expect("distinct-signal alias must validate");
        assert_eq!(registry.len(), 2);
    }

    #[test]
    fn allows_duplicate_urls_when_one_side_is_disabled() {
        let json = r#"{
            "sites": [
                { "name": "Hub Code", "url": "https://example.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "HubCode", "url": "https://example.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "disabled": true,
                  "disabled_reason": "duplicate of Hub Code" }
            ]
        }"#;
        let registry = Registry::from_json_str(json).expect("dedup pattern must validate");
        assert_eq!(registry.len(), 2);
    }

    #[test]
    fn rejects_invalid_site_definition() {
        let json = r#"{
            "sites": [
                { "name": "Bad", "url": "https://example.com/",
                  "signals": [{ "kind": "status_found", "codes": [200] }] }
            ]
        }"#;
        assert!(Registry::from_json_str(json).is_err());
    }

    #[test]
    fn rejects_malformed_json() {
        let err = Registry::from_json_str("{").unwrap_err();
        assert!(matches!(err, Error::Json(_)));
    }

    #[test]
    fn filter_include_is_case_insensitive_substring() {
        let registry = Registry::default_embedded().unwrap();
        let only_github = registry.filter(&owned(&["github"]), &[], &[], &[], false);
        assert_eq!(only_github.len(), 1);
        assert_eq!(only_github[0].name, "GitHub");

        let many = registry.filter(&owned(&["e"]), &[], &[], &[], false);
        assert!(many.len() > 1);
    }

    #[test]
    fn filter_exclude_drops_matches() {
        let registry = Registry::default_embedded().unwrap();
        let baseline = registry.filter(&[], &[], &[], &[], true);
        let without_github = registry.filter(&[], &owned(&["github"]), &[], &[], true);
        assert!(without_github.iter().all(|site| site.name != "GitHub"));
        assert_eq!(without_github.len(), baseline.len() - 1);
    }

    #[test]
    fn filter_include_and_exclude_compose() {
        let registry = Registry::default_embedded().unwrap();
        let filtered = registry.filter(&owned(&["git"]), &owned(&["lab"]), &[], &[], false);
        let names = names_of(&filtered);
        assert!(names.contains(&"GitHub"));
        assert!(!names.contains(&"GitLab"));
    }

    #[test]
    fn filter_with_no_matches_returns_empty() {
        let registry = Registry::default_embedded().unwrap();
        let filtered = registry.filter(&owned(&["does-not-exist-xyz"]), &[], &[], &[], false);
        assert!(filtered.is_empty());
    }

    #[test]
    fn disabled_sites_are_skipped_by_filter() {
        let json = r#"{
            "sites": [
                { "name": "Alive", "url": "https://alive.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "Parked", "url": "https://parked.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "disabled": true }
            ]
        }"#;
        let registry = Registry::from_json_str(json).unwrap();
        assert_eq!(registry.sites().len(), 2);
        let scanned = registry.filter(&[], &[], &[], &[], false);
        assert_eq!(names_of(&scanned), vec!["Alive"]);
    }

    #[test]
    fn disabled_matches_with_explains_parked_filter_hits() {
        let json = r#"{
            "sites": [
                { "name": "Alive", "url": "https://alive.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "TikTok", "url": "https://tiktok.example/@{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "disabled": true,
                  "disabled_reason": "Honest Limits: parked",
                  "tags": ["social"] }
            ]
        }"#;
        let registry = Registry::from_json_str(json).unwrap();
        let filter = SiteFilter {
            include: owned(&["tiktok"]),
            tags: owned(&["social"]),
            ..SiteFilter::default()
        };

        assert!(registry.filter_with(&filter).is_empty());
        let disabled = registry.disabled_matches_with(&filter);
        assert_eq!(disabled.len(), 1);
        assert_eq!(disabled[0].name, "TikTok");
        assert_eq!(
            disabled[0].disabled_reason.as_deref(),
            Some("Honest Limits: parked")
        );
    }

    #[test]
    fn threads_stays_parked_behind_login_wall() {
        let registry = Registry::default_embedded().unwrap();
        let threads = registry
            .sites()
            .iter()
            .find(|site| site.name == "Threads")
            .expect("Threads entry should document the login-wall limitation");

        assert!(threads.disabled, "Threads must not be probed by default");
        let reason = threads
            .disabled_reason
            .as_deref()
            .expect("disabled Threads entry should explain why it is parked");
        assert!(
            reason.contains("Honest Limits") && reason.contains("indistinguishable"),
            "unexpected Threads disabled_reason: {reason}"
        );

        let scanned = registry.filter(&owned(&["threads"]), &[], &[], &[], true);
        assert!(
            scanned.is_empty(),
            "disabled Threads entry must not leak into scan filters"
        );
    }

    #[test]
    fn reddit_uses_oauth_endpoint_and_requires_session() {
        let registry = Registry::default_embedded_with_wmn().unwrap();
        let reddit_entries = entries_named(&registry, "Reddit");

        assert_eq!(
            reddit_entries.len(),
            1,
            "WMN merge must not reintroduce a second Reddit probe"
        );
        let reddit = reddit_entries[0];
        assert!(!reddit.disabled, "Reddit OAuth probe should remain enabled");
        assert_eq!(
            reddit.url.as_str(),
            "https://oauth.reddit.com/user/{username}/about"
        );
        assert_eq!(reddit.access.session.as_deref(), Some("reddit"));
        assert!(
            reddit
                .protection
                .iter()
                .any(|kind| matches!(kind, ProtectionKind::UserAuth)),
            "Reddit should be classified as requiring user auth"
        );
        assert!(
            reddit.tags.iter().any(|tag| tag == "reddit-oauth"),
            "Reddit should be discoverable as an OAuth-gated site"
        );
        assert!(
            reddit
                .tags
                .iter()
                .all(|tag| !tag.eq_ignore_ascii_case("bot-protected")),
            "Reddit OAuth should use HTTP session headers, not browser routing"
        );

        let scanned = registry.filter(&owned(&["reddit"]), &[], &[], &[], true);
        assert_eq!(
            scanned.iter().filter(|site| site.name == "Reddit").count(),
            1,
            "enabled Reddit OAuth entry should be scan-filterable"
        );
    }

    #[test]
    fn tiktok_stays_parked_behind_hydration_wall() {
        let registry = Registry::default_embedded_with_wmn().unwrap();
        let tiktok_entries = entries_named(&registry, "TikTok");

        assert_eq!(
            tiktok_entries.len(),
            1,
            "WMN merge must not reintroduce TikTok's oEmbed probe"
        );
        let tiktok = tiktok_entries[0];
        assert!(tiktok.disabled, "TikTok must not be probed by default");
        assert!(
            tiktok
                .protection
                .iter()
                .any(|kind| matches!(kind, ProtectionKind::Captcha)),
            "TikTok should be classified as captcha/headless protected"
        );
        let reason = tiktok
            .disabled_reason
            .as_deref()
            .expect("disabled TikTok entry should explain why it is parked");
        assert!(
            reason.contains("Honest Limits")
                && reason.contains("JS-only SPA")
                && reason.contains("never hydrates"),
            "unexpected TikTok disabled_reason: {reason}"
        );

        let scanned = registry.filter(&owned(&["tiktok"]), &[], &[], &[], true);
        assert!(
            scanned.iter().all(|site| site.name != "TikTok"),
            "disabled TikTok entry must not leak into scan filters"
        );
    }

    #[test]
    fn pinterest_uses_oembed_instead_of_js_shell() {
        let registry = Registry::default_embedded_with_wmn().unwrap();
        let pinterest_entries = entries_named(&registry, "Pinterest");

        assert_eq!(
            pinterest_entries.len(),
            1,
            "WMN merge must not reintroduce Pinterest's canonical JS shell"
        );
        let pinterest = pinterest_entries[0];
        assert!(
            !pinterest.disabled,
            "Pinterest oEmbed probe should remain enabled"
        );
        assert!(
            pinterest.url.as_str().contains("/oembed.json"),
            "Pinterest should use the oEmbed endpoint, got {}",
            pinterest.url.as_str()
        );
        assert!(
            pinterest.url.as_str() != "https://www.pinterest.com/{username}/",
            "Pinterest must not fall back to the canonical JS shell"
        );

        let scanned = registry.filter(&owned(&["pinterest"]), &[], &[], &[], true);
        assert_eq!(
            scanned
                .iter()
                .filter(|site| site.name == "Pinterest")
                .count(),
            1,
            "enabled Pinterest oEmbed entry should be scan-filterable"
        );
    }

    #[test]
    fn source_field_round_trips() {
        let json = r#"{
            "sites": [
                { "name": "Nitter", "url": "https://nitter.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "source": "Twitter" }
            ]
        }"#;
        let registry = Registry::from_json_str(json).unwrap();
        assert_eq!(registry.sites()[0].source.as_deref(), Some("Twitter"));
    }

    /// Three-site fixture: one social/regional site, one dev site, one untagged.
    fn tagged_registry() -> Registry {
        let json = r#"{
            "sites": [
                { "name": "Soc", "url": "https://soc.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["social", "region:ru"] },
                { "name": "Dev", "url": "https://dev.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["dev"] },
                { "name": "Plain", "url": "https://plain.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] }
            ]
        }"#;
        Registry::from_json_str(json).unwrap()
    }

    #[test]
    fn tag_filter_keeps_only_matching_tags_and_drops_untagged() {
        let r = tagged_registry();
        let social = r.filter(&[], &[], &owned(&["social"]), &[], false);
        let names = names_of(&social);
        assert_eq!(names, ["Soc"], "tag filter should keep only tagged matches");
    }

    #[test]
    fn tag_filter_is_or_within_requested_tags_and_case_insensitive() {
        let r = tagged_registry();
        let either = r.filter(&[], &[], &owned(&["DEV", "social"]), &[], false);
        assert_eq!(names_of(&either), ["Soc", "Dev"]);
    }

    #[test]
    fn no_tag_filter_includes_untagged_sites() {
        let r = tagged_registry();
        assert_eq!(r.filter(&[], &[], &[], &[], false).len(), 3);
    }

    #[test]
    fn exclude_tag_drops_matching_sites() {
        let r = tagged_registry();
        let kept = r.filter(&[], &[], &[], &owned(&["social"]), false);
        let names = names_of(&kept);
        assert_eq!(names, ["Dev", "Plain"], "{names:?}");
    }

    /// Two-site fixture: one ordinary site and one tagged `nsfw`.
    fn nsfw_registry() -> Registry {
        let json = r#"{
            "sites": [
                { "name": "Family", "url": "https://family.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["social"] },
                { "name": "Adult", "url": "https://adult.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["nsfw"] }
            ]
        }"#;
        Registry::from_json_str(json).unwrap()
    }

    #[test]
    fn nsfw_sites_excluded_by_default() {
        let r = nsfw_registry();
        let kept = r.filter(&[], &[], &[], &[], false);
        let names = names_of(&kept);
        assert_eq!(names, ["Family"], "nsfw site must be excluded by default");
    }

    #[test]
    fn nsfw_sites_included_when_flag_set() {
        let r = nsfw_registry();
        let kept = r.filter(&[], &[], &[], &[], true);
        assert_eq!(kept.len(), 2, "both sites present with include_nsfw=true");
    }

    #[test]
    fn nsfw_sites_included_when_tag_asked_for_explicitly() {
        let r = nsfw_registry();
        let kept = r.filter(&[], &[], &owned(&["nsfw"]), &[], false);
        assert_eq!(names_of(&kept), ["Adult"]);
    }

    #[test]
    fn tag_counts_are_sorted_with_per_tag_totals() {
        let r = tagged_registry();
        assert_eq!(
            r.tag_counts(),
            vec![
                ("dev".to_owned(), 1),
                ("region:ru".to_owned(), 1),
                ("social".to_owned(), 1),
            ]
        );
    }

    #[test]
    fn engine_inheritance_fills_empty_site_signals() {
        let json = r#"{
            "engines": {
                "Discourse": {
                    "signals": [
                        { "kind": "status_found", "codes": [200] },
                        { "kind": "body_absent", "text": "Oops! That page doesn't exist" }
                    ]
                }
            },
            "sites": [
                { "name": "Mozilla Forum", "url": "https://discourse.mozilla.org/u/{username}",
                  "engine": "Discourse" }
            ]
        }"#;
        let r = Registry::from_json_str(json).unwrap();
        let site = &r.sites()[0];
        assert_eq!(site.signals.len(), 2);
        assert_eq!(site.engine.as_deref(), Some("Discourse"));
        assert!(r.engines().contains_key("Discourse"));
    }

    #[test]
    fn site_overrides_engine_signals_on_conflict() {
        let json = r#"{
            "engines": {
                "Discourse": {
                    "signals": [{ "kind": "status_found", "codes": [200] }]
                }
            },
            "sites": [
                { "name": "Custom", "url": "https://example.com/{username}",
                  "engine": "Discourse",
                  "signals": [
                      { "kind": "status_found", "codes": [200] },
                      { "kind": "status_not_found", "codes": [404] }
                  ] }
            ]
        }"#;
        let r = Registry::from_json_str(json).unwrap();
        assert_eq!(r.sites()[0].signals.len(), 2);
    }

    #[test]
    fn engine_headers_merge_with_site_headers_per_key() {
        let json = r#"{
            "engines": {
                "Foo": {
                    "signals": [{ "kind": "status_found", "codes": [200] }],
                    "request_headers": {
                        "X-Engine": "engine-value",
                        "User-Agent": "engine-ua"
                    }
                }
            },
            "sites": [
                { "name": "S", "url": "https://example.com/{username}",
                  "engine": "Foo",
                  "request_headers": { "User-Agent": "site-ua" } }
            ]
        }"#;
        let r = Registry::from_json_str(json).unwrap();
        let headers = &r.sites()[0].request_headers;
        assert_eq!(
            headers.get("X-Engine").map(String::as_str),
            Some("engine-value")
        );
        assert_eq!(
            headers.get("User-Agent").map(String::as_str),
            Some("site-ua")
        );
    }

    #[test]
    fn missing_engine_reference_fails_load() {
        let json = r#"{
            "engines": {},
            "sites": [
                { "name": "Mock", "url": "https://example.com/{username}",
                  "engine": "DoesNotExist" }
            ]
        }"#;
        let err = Registry::from_json_str(json).unwrap_err();
        assert!(
            err.to_string()
                .contains("references engine \"DoesNotExist\""),
            "expected missing-engine error, got: {err}"
        );
    }

    #[test]
    fn engine_regex_check_inherited_when_site_has_none() {
        let json = r#"{
            "engines": {
                "Bounded": {
                    "signals": [{ "kind": "status_found", "codes": [200] }],
                    "regex_check": "^[a-z]{3,16}$"
                }
            },
            "sites": [
                { "name": "S", "url": "https://example.com/{username}",
                  "engine": "Bounded" }
            ]
        }"#;
        let r = Registry::from_json_str(json).unwrap();
        assert_eq!(r.sites()[0].regex_check.as_deref(), Some("^[a-z]{3,16}$"));
    }

    #[test]
    fn region_tag_auto_populates_prefer_geo() {
        let json = r#"{
            "sites": [
                { "name": "vk.com", "url": "https://vk.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru", "social"] }
            ]
        }"#;
        let r = Registry::from_json_str(json).unwrap();
        let prefer = &r.sites()[0].access.prefer_geo;
        assert_eq!(prefer.len(), 1);
        assert_eq!(prefer[0].as_str(), "ru");
        assert!(r.sites()[0].access.geo.is_empty());
    }

    #[test]
    fn multiple_region_tags_stack() {
        let json = r#"{
            "sites": [
                { "name": "Pan-Slavic", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru", "region:by", "region:ua"] }
            ]
        }"#;
        let r = Registry::from_json_str(json).unwrap();
        assert_eq!(prefer_geo_codes(&r), vec!["ru", "by", "ua"]);
    }

    #[test]
    fn explicit_hard_geo_suppresses_tag_derived_soft() {
        let json = r#"{
            "sites": [
                { "name": "PL-only", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru"],
                  "access": { "geo": ["pl"] } }
            ]
        }"#;
        let r = Registry::from_json_str(json).unwrap();
        assert_eq!(r.sites()[0].access.geo[0].as_str(), "pl");
        assert!(r.sites()[0].access.prefer_geo.is_empty());
    }

    #[test]
    fn malformed_region_tag_is_ignored() {
        let json = r#"{
            "sites": [
                { "name": "Weird", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:eurasia", "region:r", "region:RU"] }
            ]
        }"#;
        let r = Registry::from_json_str(json).unwrap();
        assert_eq!(prefer_geo_codes(&r), vec!["ru"]);
    }

    #[test]
    fn load_from_path_round_trips_via_tempfile() {
        let file_name = format!("adler-test-registry-{}.json", std::process::id());
        let path = std::env::temp_dir().join(file_name);
        std::fs::write(
            &path,
            r#"{
                "sites": [
                    { "name": "Mock", "url": "https://example.com/{username}",
                      "signals": [{ "kind": "status_found", "codes": [200] }] }
                ]
            }"#,
        )
        .unwrap();
        let result = Registry::load_from_path(&path);
        // Best-effort cleanup before asserting, so a failure does not leave the file behind.
        let _ = std::fs::remove_file(&path);
        let registry = result.unwrap();
        assert_eq!(registry.len(), 1);
        assert_eq!(registry.sites()[0].name, "Mock");
    }
}