1use std::collections::{BTreeMap, HashSet};
8use std::path::Path;
9
10use serde::Deserialize;
11
12use crate::error::{Error, Result};
13use crate::site::{Engine, Site};
14
/// JSON text of the built-in site registry, embedded into the binary at
/// compile time from `../data/sites.json` (path relative to this source file).
const EMBEDDED_REGISTRY: &str = include_str!("../data/sites.json");
16
/// JSON text of the supplementary "WMN" registry, embedded at compile time
/// from `../data/sites_wmn.json`. `Registry::default_embedded_with_wmn`
/// merges it on top of the built-in registry.
// NOTE(review): "WMN" presumably refers to the WhatsMyName dataset — confirm.
const EMBEDDED_WMN_REGISTRY: &str = include_str!("../data/sites_wmn.json");
24
/// In-memory collection of site definitions together with the named engine
/// definitions that sites may reference.
///
/// Deserialized from a JSON object with a `sites` array and an optional
/// `engines` object.
#[derive(Debug, Clone, Deserialize)]
pub struct Registry {
    /// Engine definitions keyed by engine name. A missing `engines` key
    /// deserializes to an empty map.
    #[serde(default)]
    engines: BTreeMap<String, Engine>,
    /// Every site in the registry (enabled or not), in deserialized order.
    sites: Vec<Site>,
}
39
40impl Registry {
41 pub fn default_embedded() -> Result<Self> {
43 Self::from_json_str(EMBEDDED_REGISTRY)
44 }
45
46 pub fn default_embedded_with_wmn() -> Result<Self> {
61 let mut base = Self::default_embedded()?;
62 let wmn: Self = serde_json::from_str(EMBEDDED_WMN_REGISTRY)?;
63 let existing: HashSet<String> = base.sites.iter().map(|s| s.name.to_lowercase()).collect();
64 for (name, engine) in wmn.engines {
65 base.engines.entry(name).or_insert(engine);
66 }
67 for site in wmn.sites {
68 if !existing.contains(&site.name.to_lowercase()) {
69 base.sites.push(site);
70 }
71 }
72 base.resolve_engines()?;
73 base.validate()?;
74 Ok(base)
75 }
76
77 pub fn from_json_str(json: &str) -> Result<Self> {
82 let mut registry: Self = serde_json::from_str(json)?;
83 registry.resolve_engines()?;
84 registry.apply_tag_derived_policy();
85 registry.validate()?;
86 Ok(registry)
87 }
88
89 pub fn engines(&self) -> &BTreeMap<String, Engine> {
94 &self.engines
95 }
96
97 fn apply_tag_derived_policy(&mut self) {
116 for site in &mut self.sites {
117 if !site.access.geo.is_empty() {
118 continue;
119 }
120 for tag in &site.tags {
121 let Some(rest) = tag.strip_prefix("region:") else {
122 continue;
123 };
124 let Some(cc) = crate::access::CountryCode::new(rest) else {
125 continue;
126 };
127 if !site.access.prefer_geo.contains(&cc) {
128 site.access.prefer_geo.push(cc);
129 }
130 }
131 }
132 }
133
134 fn resolve_engines(&mut self) -> Result<()> {
143 for (name, engine) in &self.engines {
144 engine.validate(name)?;
145 }
146 for site in &mut self.sites {
147 let Some(name) = &site.engine else {
148 continue;
149 };
150 let Some(engine) = self.engines.get(name) else {
151 return Err(Error::InvalidSite {
152 reason: format!(
153 "site {:?}: references engine {name:?} which is not defined",
154 site.name
155 ),
156 });
157 };
158 engine.merge_into(site);
159 }
160 Ok(())
161 }
162
163 pub fn load_from_path(path: impl AsRef<Path>) -> Result<Self> {
165 let bytes = std::fs::read(path)?;
166 let json = std::str::from_utf8(&bytes).map_err(|e| Error::InvalidSite {
167 reason: format!("registry file is not valid UTF-8: {e}"),
168 })?;
169 Self::from_json_str(json)
170 }
171
172 pub fn sites(&self) -> &[Site] {
174 &self.sites
175 }
176
177 pub fn len(&self) -> usize {
179 self.sites.len()
180 }
181
182 pub fn is_empty(&self) -> bool {
185 self.sites.is_empty()
186 }
187
188 pub fn filter(
207 &self,
208 include: &[String],
209 exclude: &[String],
210 tags: &[String],
211 exclude_tags: &[String],
212 include_nsfw: bool,
213 ) -> Vec<Site> {
214 let include: Vec<String> = include.iter().map(|s| s.to_lowercase()).collect();
215 let exclude: Vec<String> = exclude.iter().map(|s| s.to_lowercase()).collect();
216 let want_tags: Vec<String> = tags.iter().map(|s| s.to_lowercase()).collect();
217 let mut drop_tags: Vec<String> = exclude_tags.iter().map(|s| s.to_lowercase()).collect();
218
219 let nsfw_tag = "nsfw".to_owned();
222 let asking_for_nsfw = want_tags.contains(&nsfw_tag);
223 if !include_nsfw && !asking_for_nsfw && !drop_tags.contains(&nsfw_tag) {
224 drop_tags.push(nsfw_tag);
225 }
226
227 self.sites
228 .iter()
229 .filter(|site| {
230 if site.disabled {
235 return false;
236 }
237 let name = site.name.to_lowercase();
238 let included = include.is_empty() || include.iter().any(|i| name.contains(i));
239 let excluded = exclude.iter().any(|x| name.contains(x));
240 let lower_tags: Vec<String> = site.tags.iter().map(|t| t.to_lowercase()).collect();
241 let tagged =
242 want_tags.is_empty() || lower_tags.iter().any(|t| want_tags.contains(t));
243 let tag_excluded = lower_tags.iter().any(|t| drop_tags.contains(t));
244 included && !excluded && tagged && !tag_excluded
245 })
246 .cloned()
247 .collect()
248 }
249
250 pub fn tag_counts(&self) -> Vec<(String, usize)> {
253 let mut counts: std::collections::BTreeMap<String, usize> =
254 std::collections::BTreeMap::new();
255 for site in &self.sites {
256 for tag in &site.tags {
257 *counts.entry(tag.clone()).or_insert(0) += 1;
258 }
259 }
260 counts.into_iter().collect()
261 }
262
263 fn validate(&self) -> Result<()> {
264 if self.sites.is_empty() {
265 return Err(Error::InvalidSite {
266 reason: "registry has no sites".into(),
267 });
268 }
269 for site in &self.sites {
270 site.validate()?;
271 }
272 let mut seen: HashSet<String> = HashSet::new();
273 for site in &self.sites {
274 let key = site.name.to_lowercase();
275 if !seen.insert(key) {
276 return Err(Error::InvalidSite {
277 reason: format!("duplicate site name: {:?}", site.name),
278 });
279 }
280 }
281 Ok(())
282 }
283}
284
285#[cfg(test)]
286mod tests {
287 use super::*;
288
/// The bundled registry loads, is reasonably large, and contains a few
/// well-known sites.
#[test]
fn embedded_registry_loads_and_validates() {
    let registry = Registry::default_embedded().expect("embedded registry must load");
    let total = registry.len();
    assert!(
        total >= 100,
        "imported registry should have ≥100 sites, got {}",
        total
    );
    for expected in ["GitHub", "Reddit", "Telegram"] {
        assert!(registry.sites().iter().any(|s| s.name == expected));
    }
}
309
/// The WMN-merged registry is a strict superset of the base registry and
/// carries at least one site tagged `source:wmn`.
#[test]
fn wmn_embedded_registry_loads_and_supersets_default() {
    let base = Registry::default_embedded().unwrap();
    let merged = Registry::default_embedded_with_wmn().expect("WMN-merged registry must load");

    let (base_len, merged_len) = (base.len(), merged.len());
    assert!(
        merged_len > base_len,
        "WMN merge must add sites: base={} merged={}",
        base_len,
        merged_len
    );

    let merged_names: HashSet<String> = merged
        .sites()
        .iter()
        .map(|s| s.name.to_lowercase())
        .collect();
    for site in base.sites() {
        let key = site.name.to_lowercase();
        assert!(
            merged_names.contains(&key),
            "merge dropped base-tranche site {:?}",
            site.name
        );
    }

    let wmn_tagged = merged
        .sites()
        .iter()
        .flat_map(|s| s.tags.iter())
        .any(|t| t == "source:wmn");
    assert!(wmn_tagged, "no site carries the source:wmn tag");
}
341
/// A registry with an empty `sites` array is rejected.
#[test]
fn rejects_empty_registry() {
    let outcome = Registry::from_json_str(r#"{ "sites": [] }"#);
    assert!(matches!(outcome, Err(Error::InvalidSite { .. })));
}
347
/// Two sites whose names differ only by case count as duplicates.
#[test]
fn rejects_duplicate_site_names() {
    let json = r#"{
        "sites": [
            { "name": "GitHub", "url": "https://github.com/{username}",
              "signals": [{ "kind": "status_found", "codes": [200] }] },
            { "name": "github", "url": "https://github.com/{username}",
              "signals": [{ "kind": "status_found", "codes": [200] }] }
        ]
    }"#;
    let failure = Registry::from_json_str(json).unwrap_err();
    let message = failure.to_string();
    assert!(matches!(failure, Error::InvalidSite { .. }));
    assert!(message.contains("duplicate"));
}
362
/// A site that fails its own validation makes the whole load fail.
// NOTE(review): the fixture URL has no `{username}` placeholder, which is
// presumably what `Site::validate` rejects — confirm.
#[test]
fn rejects_invalid_site_definition() {
    let json = r#"{
        "sites": [
            { "name": "Bad", "url": "https://example.com/",
              "signals": [{ "kind": "status_found", "codes": [200] }] }
        ]
    }"#;
    let outcome = Registry::from_json_str(json);
    assert!(outcome.is_err());
}
374
/// Syntactically broken JSON surfaces as the JSON error variant.
#[test]
fn rejects_malformed_json() {
    let outcome = Registry::from_json_str("{");
    assert!(matches!(outcome, Err(Error::Json(_))));
}
380
/// `include` matches names by case-insensitive substring.
#[test]
fn filter_include_is_case_insensitive_substring() {
    let registry = Registry::default_embedded().unwrap();

    let only_github = registry.filter(&["github".to_owned()], &[], &[], &[], false);
    let names: Vec<&str> = only_github.iter().map(|s| s.name.as_str()).collect();
    assert_eq!(names, ["GitHub"]);

    // A single common letter should match many site names.
    let broad = registry.filter(&["e".to_owned()], &[], &[], &[], false);
    assert!(broad.len() > 1);
}
391
/// `exclude` removes sites whose name contains the given substring.
// NOTE(review): the count assertion presumes the embedded data has exactly
// one site whose name contains "github" and no disabled sites — confirm.
#[test]
fn filter_exclude_drops_matches() {
    let registry = Registry::default_embedded().unwrap();
    let remaining = registry.filter(&[], &["github".to_owned()], &[], &[], true);
    assert!(!remaining.iter().any(|s| s.name == "GitHub"));
    assert_eq!(remaining.len(), registry.len() - 1);
}
401
/// `include` and `exclude` apply together: a site must match the former and
/// not the latter.
#[test]
fn filter_include_and_exclude_compose() {
    let registry = Registry::default_embedded().unwrap();
    let kept = registry.filter(&["git".to_owned()], &["lab".to_owned()], &[], &[], false);
    let has = |wanted: &str| kept.iter().any(|s| s.name == wanted);
    assert!(has("GitHub"));
    assert!(!has("GitLab"));
}
412
/// An `include` pattern that matches nothing yields an empty result.
#[test]
fn filter_with_no_matches_returns_empty() {
    let registry = Registry::default_embedded().unwrap();
    let needle = vec!["does-not-exist-xyz".to_owned()];
    let matches = registry.filter(&needle, &[], &[], &[], false);
    assert!(matches.is_empty());
}
419
/// Disabled sites stay in the registry but never come out of `filter`.
#[test]
fn disabled_sites_are_skipped_by_filter() {
    let registry = Registry::from_json_str(
        r#"{
            "sites": [
                { "name": "Alive", "url": "https://alive.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "Parked", "url": "https://parked.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "disabled": true }
            ]
        }"#,
    )
    .unwrap();
    // Both entries are loaded...
    assert_eq!(registry.sites().len(), 2);
    // ...but only the enabled one is selected.
    let selected = registry.filter(&[], &[], &[], &[], false);
    let names: Vec<&str> = selected.iter().map(|s| s.name.as_str()).collect();
    assert_eq!(names, ["Alive"]);
}
440
/// The optional `source` field is preserved through loading.
#[test]
fn source_field_round_trips() {
    let registry = Registry::from_json_str(
        r#"{
            "sites": [
                { "name": "Nitter", "url": "https://nitter.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "source": "Twitter" }
            ]
        }"#,
    )
    .unwrap();
    let first = &registry.sites()[0];
    assert_eq!(first.source.as_deref(), Some("Twitter"));
}
453
454 fn tagged_registry() -> Registry {
455 let json = r#"{
456 "sites": [
457 { "name": "Soc", "url": "https://soc.example/{username}",
458 "signals": [{ "kind": "status_found", "codes": [200] }],
459 "tags": ["social", "region:ru"] },
460 { "name": "Dev", "url": "https://dev.example/{username}",
461 "signals": [{ "kind": "status_found", "codes": [200] }],
462 "tags": ["dev"] },
463 { "name": "Plain", "url": "https://plain.example/{username}",
464 "signals": [{ "kind": "status_found", "codes": [200] }] }
465 ]
466 }"#;
467 Registry::from_json_str(json).unwrap()
468 }
469
/// A tag filter keeps only sites carrying the tag; untagged sites drop out.
#[test]
fn tag_filter_keeps_only_matching_tags_and_drops_untagged() {
    let registry = tagged_registry();
    let wanted = vec!["social".to_owned()];
    let selected = registry.filter(&[], &[], &wanted, &[], false);
    let names: Vec<&str> = selected.iter().map(|s| s.name.as_str()).collect();
    assert_eq!(names, ["Soc"], "tag filter should keep only tagged matches");
}
477
/// Several requested tags combine with OR, and tag matching ignores case.
#[test]
fn tag_filter_is_or_within_requested_tags_and_case_insensitive() {
    let registry = tagged_registry();
    let wanted = vec!["DEV".to_owned(), "social".to_owned()];
    let selected = registry.filter(&[], &[], &wanted, &[], false);
    let names: Vec<&str> = selected.iter().map(|s| s.name.as_str()).collect();
    assert_eq!(names, ["Soc", "Dev"]);
}
485
/// Without any tag filter, untagged sites are returned as well.
#[test]
fn no_tag_filter_includes_untagged_sites() {
    let registry = tagged_registry();
    let everything = registry.filter(&[], &[], &[], &[], false);
    assert_eq!(everything.len(), 3);
}
491
/// `exclude_tags` drops sites carrying the tag and keeps everything else.
#[test]
fn exclude_tag_drops_matching_sites() {
    let registry = tagged_registry();
    let unwanted = vec!["social".to_owned()];
    let remaining = registry.filter(&[], &[], &[], &unwanted, false);
    let names: Vec<&str> = remaining.iter().map(|s| s.name.as_str()).collect();
    assert_eq!(names, ["Dev", "Plain"], "{names:?}");
}
500
501 fn nsfw_registry() -> Registry {
502 let json = r#"{
503 "sites": [
504 { "name": "Family", "url": "https://family.example/{username}",
505 "signals": [{ "kind": "status_found", "codes": [200] }],
506 "tags": ["social"] },
507 { "name": "Adult", "url": "https://adult.example/{username}",
508 "signals": [{ "kind": "status_found", "codes": [200] }],
509 "tags": ["nsfw"] }
510 ]
511 }"#;
512 Registry::from_json_str(json).unwrap()
513 }
514
/// With `include_nsfw` off and no explicit request, nsfw sites are hidden.
#[test]
fn nsfw_sites_excluded_by_default() {
    let registry = nsfw_registry();
    let visible = registry.filter(&[], &[], &[], &[], false);
    let names: Vec<&str> = visible.iter().map(|s| s.name.as_str()).collect();
    assert_eq!(names, ["Family"], "nsfw site must be excluded by default");
}
522
/// Setting `include_nsfw` lets nsfw sites through.
#[test]
fn nsfw_sites_included_when_flag_set() {
    let registry = nsfw_registry();
    let visible = registry.filter(&[], &[], &[], &[], true);
    assert_eq!(visible.len(), 2, "both sites present with include_nsfw=true");
}
529
/// Asking for the `nsfw` tag explicitly overrides the default exclusion,
/// even with `include_nsfw` off.
#[test]
fn nsfw_sites_included_when_tag_asked_for_explicitly() {
    let registry = nsfw_registry();
    let wanted = vec!["nsfw".to_owned()];
    let selected = registry.filter(&[], &[], &wanted, &[], false);
    let names: Vec<&str> = selected.iter().map(|s| s.name.as_str()).collect();
    assert_eq!(names, ["Adult"]);
}
539
/// `tag_counts` returns one entry per tag, sorted by tag, with its total.
#[test]
fn tag_counts_are_sorted_with_per_tag_totals() {
    let registry = tagged_registry();
    let expected: Vec<(String, usize)> = ["dev", "region:ru", "social"]
        .iter()
        .map(|tag| ((*tag).to_owned(), 1usize))
        .collect();
    assert_eq!(registry.tag_counts(), expected);
}
552
/// A site without its own signals inherits them from its engine, and the
/// engine reference and definition both remain visible afterwards.
#[test]
fn engine_inheritance_fills_empty_site_signals() {
    let registry = Registry::from_json_str(
        r#"{
            "engines": {
                "Discourse": {
                    "signals": [
                        { "kind": "status_found", "codes": [200] },
                        { "kind": "body_absent", "text": "Oops! That page doesn't exist" }
                    ]
                }
            },
            "sites": [
                { "name": "Mozilla Forum", "url": "https://discourse.mozilla.org/u/{username}",
                  "engine": "Discourse" }
            ]
        }"#,
    )
    .unwrap();
    let forum = &registry.sites()[0];
    assert_eq!(forum.signals.len(), 2);
    assert_eq!(forum.engine.as_deref(), Some("Discourse"));
    assert!(registry.engines().contains_key("Discourse"));
}
577
/// When a site declares its own signals, the site's list is what remains
/// after the engine merge (two entries here, not the engine's one).
#[test]
fn site_overrides_engine_signals_on_conflict() {
    let registry = Registry::from_json_str(
        r#"{
            "engines": {
                "Discourse": {
                    "signals": [{ "kind": "status_found", "codes": [200] }]
                }
            },
            "sites": [
                { "name": "Custom", "url": "https://example.com/{username}",
                  "engine": "Discourse",
                  "signals": [
                      { "kind": "status_found", "codes": [200] },
                      { "kind": "status_not_found", "codes": [404] }
                  ] }
            ]
        }"#,
    )
    .unwrap();
    let custom = &registry.sites()[0];
    assert_eq!(custom.signals.len(), 2);
}
600
/// Request headers merge key by key: engine-only keys are inherited, and on
/// a shared key the site's value wins.
#[test]
fn engine_headers_merge_with_site_headers_per_key() {
    let registry = Registry::from_json_str(
        r#"{
            "engines": {
                "Foo": {
                    "signals": [{ "kind": "status_found", "codes": [200] }],
                    "request_headers": {
                        "X-Engine": "engine-value",
                        "User-Agent": "engine-ua"
                    }
                }
            },
            "sites": [
                { "name": "S", "url": "https://example.com/{username}",
                  "engine": "Foo",
                  "request_headers": { "User-Agent": "site-ua" } }
            ]
        }"#,
    )
    .unwrap();
    let headers = &registry.sites()[0].request_headers;
    let inherited = headers.get("X-Engine").map(String::as_str);
    let overridden = headers.get("User-Agent").map(String::as_str);
    assert_eq!(inherited, Some("engine-value"));
    assert_eq!(overridden, Some("site-ua"));
}
626
/// Referencing an engine that is not defined aborts the load with a
/// message naming the missing engine.
#[test]
fn missing_engine_reference_fails_load() {
    let json = r#"{
        "engines": {},
        "sites": [
            { "name": "Mock", "url": "https://example.com/{username}",
              "engine": "DoesNotExist" }
        ]
    }"#;
    let err = Registry::from_json_str(json).unwrap_err();
    let rendered = err.to_string();
    assert!(
        rendered.contains("references engine \"DoesNotExist\""),
        "expected missing-engine error, got: {err}"
    );
}
643
/// A site without its own `regex_check` inherits the engine's.
#[test]
fn engine_regex_check_inherited_when_site_has_none() {
    let registry = Registry::from_json_str(
        r#"{
            "engines": {
                "Bounded": {
                    "signals": [{ "kind": "status_found", "codes": [200] }],
                    "regex_check": "^[a-z]{3,16}$"
                }
            },
            "sites": [
                { "name": "S", "url": "https://example.com/{username}",
                  "engine": "Bounded" }
            ]
        }"#,
    )
    .unwrap();
    let inherited = registry.sites()[0].regex_check.as_deref();
    assert_eq!(inherited, Some("^[a-z]{3,16}$"));
}
661
/// A `region:<code>` tag becomes a soft geo preference and leaves the hard
/// geo list empty.
#[test]
fn region_tag_auto_populates_prefer_geo() {
    let registry = Registry::from_json_str(
        r#"{
            "sites": [
                { "name": "vk.com", "url": "https://vk.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru", "social"] }
            ]
        }"#,
    )
    .unwrap();
    let access = &registry.sites()[0].access;
    let preferred: Vec<&str> = access.prefer_geo.iter().map(|cc| cc.as_str()).collect();
    assert_eq!(preferred, ["ru"]);
    assert!(access.geo.is_empty());
}
678
/// Several region tags accumulate in `prefer_geo` in tag order.
#[test]
fn multiple_region_tags_stack() {
    let registry = Registry::from_json_str(
        r#"{
            "sites": [
                { "name": "Pan-Slavic", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru", "region:by", "region:ua"] }
            ]
        }"#,
    )
    .unwrap();
    let preferred = &registry.sites()[0].access.prefer_geo;
    let codes: Vec<&str> = preferred.iter().map(|cc| cc.as_str()).collect();
    assert_eq!(codes, ["ru", "by", "ua"]);
}
697
/// An explicit hard `geo` list disables tag-derived soft preferences.
#[test]
fn explicit_hard_geo_suppresses_tag_derived_soft() {
    let registry = Registry::from_json_str(
        r#"{
            "sites": [
                { "name": "PL-only", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru"],
                  "access": { "geo": ["pl"] } }
            ]
        }"#,
    )
    .unwrap();
    let access = &registry.sites()[0].access;
    assert_eq!(access.geo[0].as_str(), "pl");
    assert!(access.prefer_geo.is_empty());
}
716
/// Region tags that do not parse as a country code are skipped; the
/// upper-case `RU` is accepted and comes out as `ru`.
#[test]
fn malformed_region_tag_is_ignored() {
    let registry = Registry::from_json_str(
        r#"{
            "sites": [
                { "name": "Weird", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:eurasia", "region:r", "region:RU"] }
            ]
        }"#,
    )
    .unwrap();
    let preferred = &registry.sites()[0].access.prefer_geo;
    let codes: Vec<&str> = preferred.iter().map(|cc| cc.as_str()).collect();
    assert_eq!(codes, ["ru"]);
}
739
/// A registry written to a temp file loads back through `load_from_path`.
#[test]
fn load_from_path_round_trips_via_tempfile() {
    // The process id keeps the file name distinct between concurrent runs.
    let file_name = format!("adler-test-registry-{}.json", std::process::id());
    let path = std::env::temp_dir().join(file_name);
    let fixture = r#"{
        "sites": [
            { "name": "Mock", "url": "https://example.com/{username}",
              "signals": [{ "kind": "status_found", "codes": [200] }] }
        ]
    }"#;
    std::fs::write(&path, fixture).unwrap();

    // Load first, then clean up before unwrapping, so the file is removed
    // even when the load fails.
    let loaded = Registry::load_from_path(&path);
    let _ = std::fs::remove_file(&path);

    let registry = loaded.unwrap();
    assert_eq!(registry.len(), 1);
    assert_eq!(registry.sites()[0].name, "Mock");
}
760}