Skip to main content

adler_core/
registry.rs

1//! Site registry — loading, validation, filtering.
2//!
3//! The default registry is embedded into the binary at compile time via
4//! [`include_str!`]. Callers can override it with a file at runtime through
5//! [`Registry::load_from_path`].
6
7use std::collections::{BTreeMap, HashSet};
8use std::path::Path;
9
10use serde::Deserialize;
11
12use crate::error::{Error, Result};
13use crate::site::{Engine, Site};
14
15const EMBEDDED_REGISTRY: &str = include_str!("../data/sites.json");
16
17/// Supplementary registry derived from the `WhatsMyName` project
18/// (`WebBreacher/WhatsMyName`, CC BY-SA 4.0). Kept as a separate
19/// constant because its data license is incompatible with the
20/// MIT-only [`EMBEDDED_REGISTRY`] above; callers opt in explicitly
21/// via [`Registry::default_embedded_with_wmn`] to keep the default
22/// MIT-clean for downstream redistribution.
23const EMBEDDED_WMN_REGISTRY: &str = include_str!("../data/sites_wmn.json");
24
25/// A loaded, validated collection of site definitions.
26///
27/// Engines (shared signature templates referenced by [`Site::engine`])
28/// are resolved into sites at load time — by the time you call
29/// [`Registry::sites`] every entry already has its inherited
30/// `signals` / `request_headers` / `regex_check` materialised. The original
31/// [`Engine`] objects are kept on the registry for re-export and
32/// inspection via [`Registry::engines`].
33#[derive(Debug, Clone, Deserialize)]
34pub struct Registry {
35    #[serde(default)]
36    engines: BTreeMap<String, Engine>,
37    sites: Vec<Site>,
38}
39
40impl Registry {
41    /// Load the default site list embedded into the crate at build time.
42    pub fn default_embedded() -> Result<Self> {
43        Self::from_json_str(EMBEDDED_REGISTRY)
44    }
45
46    /// Load the default site list *plus* the `WhatsMyName`-derived
47    /// supplementary set. `WhatsMyName` data is licensed CC BY-SA 4.0
48    /// (see `LICENSE-CC-BY-SA-4.0` at the repo root); enabling this
49    /// path means downstream redistribution of the merged scan data
50    /// must respect the `ShareAlike` obligation. Sites contributed by
51    /// the `WhatsMyName` tranche carry the `source:wmn` tag for
52    /// provenance.
53    ///
54    /// Engines from the WMN tranche merge with the MIT tranche;
55    /// case-insensitive site-name collisions resolve in favour of the
56    /// MIT-tranche entry (the hand-curated Sherlock/Maigret-derived
57    /// signature wins; the WMN duplicate is dropped). Returns an
58    /// error only if either tranche fails its own validation —
59    /// engine references are checked across the merged set.
60    pub fn default_embedded_with_wmn() -> Result<Self> {
61        let mut base = Self::default_embedded()?;
62        let wmn: Self = serde_json::from_str(EMBEDDED_WMN_REGISTRY)?;
63        let existing: HashSet<String> = base.sites.iter().map(|s| s.name.to_lowercase()).collect();
64        for (name, engine) in wmn.engines {
65            base.engines.entry(name).or_insert(engine);
66        }
67        for site in wmn.sites {
68            if !existing.contains(&site.name.to_lowercase()) {
69                base.sites.push(site);
70            }
71        }
72        base.resolve_engines()?;
73        base.validate()?;
74        Ok(base)
75    }
76
77    /// Parse and validate a registry from a JSON string. Engine
78    /// references on each site are resolved before validation;
79    /// a site that names an engine which doesn't exist in the
80    /// `engines` block fails loading with [`Error::InvalidSite`].
81    pub fn from_json_str(json: &str) -> Result<Self> {
82        let mut registry: Self = serde_json::from_str(json)?;
83        registry.resolve_engines()?;
84        registry.apply_tag_derived_policy();
85        registry.validate()?;
86        Ok(registry)
87    }
88
89    /// Inheritable engine templates, keyed by name. Useful for
90    /// introspection and for serialising the registry back out;
91    /// detection paths read the resolved fields off the sites
92    /// directly and don't need to consult this map.
93    pub fn engines(&self) -> &BTreeMap<String, Engine> {
94        &self.engines
95    }
96
97    /// Walk every site's tags for `region:XX` markers and fill
98    /// [`AccessPolicy::prefer_geo`](crate::AccessPolicy::prefer_geo)
99    /// with the matching country codes. **Soft** routing only — a
100    /// site declaring `region:ru` *prefers* a Russian egress when one
101    /// is configured but still works from anywhere else; the router
102    /// falls back to the default egress on no match rather than
103    /// reporting `Uncertain(GeoUnavailable)`.
104    ///
105    /// Skips sites that already declare a hard
106    /// [`AccessPolicy::geo`](crate::AccessPolicy::geo) — explicit
107    /// policy wins on conflict, same convention as engine inheritance.
108    /// Existing `prefer_geo` entries are also preserved: tag-derived
109    /// codes are *added*, not replaced, so a hand-tuned policy can
110    /// stack on top of the tag. Invalid country codes (`region:xx` is
111    /// not exactly two ASCII letters) are silently skipped — they
112    /// shouldn't exist in the registry, and a parse error here would
113    /// break the load for a tag the scanner already ignores in
114    /// every other context.
115    fn apply_tag_derived_policy(&mut self) {
116        for site in &mut self.sites {
117            if !site.access.geo.is_empty() {
118                continue;
119            }
120            for tag in &site.tags {
121                let Some(rest) = tag.strip_prefix("region:") else {
122                    continue;
123                };
124                let Some(cc) = crate::access::CountryCode::new(rest) else {
125                    continue;
126                };
127                if !site.access.prefer_geo.contains(&cc) {
128                    site.access.prefer_geo.push(cc);
129                }
130            }
131        }
132    }
133
134    /// Merge each engine's inheritable fields into the sites that
135    /// reference it. After this call every site's `signals`,
136    /// `request_headers` and `regex_check` reflect the effective
137    /// values used by the scanner.
138    ///
139    /// Per-site fields are authoritative: anything declared
140    /// explicitly on a site wins on conflict; only empty / unset
141    /// fields are filled from the engine.
142    fn resolve_engines(&mut self) -> Result<()> {
143        for (name, engine) in &self.engines {
144            engine.validate(name)?;
145        }
146        for site in &mut self.sites {
147            let Some(name) = &site.engine else {
148                continue;
149            };
150            let Some(engine) = self.engines.get(name) else {
151                return Err(Error::InvalidSite {
152                    reason: format!(
153                        "site {:?}: references engine {name:?} which is not defined",
154                        site.name
155                    ),
156                });
157            };
158            engine.merge_into(site);
159        }
160        Ok(())
161    }
162
163    /// Read a registry from a JSON file.
164    pub fn load_from_path(path: impl AsRef<Path>) -> Result<Self> {
165        let bytes = std::fs::read(path)?;
166        let json = std::str::from_utf8(&bytes).map_err(|e| Error::InvalidSite {
167            reason: format!("registry file is not valid UTF-8: {e}"),
168        })?;
169        Self::from_json_str(json)
170    }
171
172    /// Borrow all sites in load order.
173    pub fn sites(&self) -> &[Site] {
174        &self.sites
175    }
176
177    /// Number of sites.
178    pub fn len(&self) -> usize {
179        self.sites.len()
180    }
181
182    /// True if the registry has no sites (always false for a valid load,
183    /// since we'd already have rejected it).
184    pub fn is_empty(&self) -> bool {
185        self.sites.is_empty()
186    }
187
188    /// Apply include/exclude name filters and a tag filter.
189    ///
190    /// - If `include` is non-empty, only sites whose name contains at least
191    ///   one include term (case-insensitive substring) are kept.
192    /// - Sites whose name contains any exclude term are dropped.
193    /// - If `tags` is non-empty, only sites carrying at least one of the
194    ///   requested tags are kept (case-insensitive). A site with no tags is
195    ///   therefore dropped by a tag filter — asking for `--tag social` means
196    ///   "only social-tagged sites".
197    /// - Sites carrying any tag in `exclude_tags` are dropped (e.g.
198    ///   `--exclude-tag bot-protected` for a fast clean run).
199    /// - **NSFW sites are auto-excluded** (the `nsfw` tag) unless
200    ///   `include_nsfw` is `true` or `tags` explicitly asks for `nsfw`.
201    ///   This matches Sherlock's `--nsfw` opt-in pattern and prevents
202    ///   the default `adler <username>` from surfacing adult-site URLs
203    ///   the user didn't ask for.
204    /// - Sites are returned by value (cloned) so the result is independent
205    ///   of the registry's lifetime — convenient for handing to the executor.
206    pub fn filter(
207        &self,
208        include: &[String],
209        exclude: &[String],
210        tags: &[String],
211        exclude_tags: &[String],
212        include_nsfw: bool,
213    ) -> Vec<Site> {
214        let include: Vec<String> = include.iter().map(|s| s.to_lowercase()).collect();
215        let exclude: Vec<String> = exclude.iter().map(|s| s.to_lowercase()).collect();
216        let want_tags: Vec<String> = tags.iter().map(|s| s.to_lowercase()).collect();
217        let mut drop_tags: Vec<String> = exclude_tags.iter().map(|s| s.to_lowercase()).collect();
218
219        // NSFW gate: auto-exclude unless the caller explicitly opted in,
220        // either via `include_nsfw` or by asking for the `nsfw` tag.
221        let nsfw_tag = "nsfw".to_owned();
222        let asking_for_nsfw = want_tags.contains(&nsfw_tag);
223        if !include_nsfw && !asking_for_nsfw && !drop_tags.contains(&nsfw_tag) {
224            drop_tags.push(nsfw_tag);
225        }
226
227        self.sites
228            .iter()
229            .filter(|site| {
230                // Disabled sites are skipped unconditionally — the bool
231                // is meant for parking known-broken entries with a
232                // reason comment instead of deleting them, so they
233                // never get probed even with a fresh include filter.
234                if site.disabled {
235                    return false;
236                }
237                let name = site.name.to_lowercase();
238                let included = include.is_empty() || include.iter().any(|i| name.contains(i));
239                let excluded = exclude.iter().any(|x| name.contains(x));
240                let lower_tags: Vec<String> = site.tags.iter().map(|t| t.to_lowercase()).collect();
241                let tagged =
242                    want_tags.is_empty() || lower_tags.iter().any(|t| want_tags.contains(t));
243                let tag_excluded = lower_tags.iter().any(|t| drop_tags.contains(t));
244                included && !excluded && tagged && !tag_excluded
245            })
246            .cloned()
247            .collect()
248    }
249
250    /// Distinct tags across all sites, sorted, with the count of sites
251    /// carrying each. Powers `--list-tags`.
252    pub fn tag_counts(&self) -> Vec<(String, usize)> {
253        let mut counts: std::collections::BTreeMap<String, usize> =
254            std::collections::BTreeMap::new();
255        for site in &self.sites {
256            for tag in &site.tags {
257                *counts.entry(tag.clone()).or_insert(0) += 1;
258            }
259        }
260        counts.into_iter().collect()
261    }
262
263    fn validate(&self) -> Result<()> {
264        if self.sites.is_empty() {
265            return Err(Error::InvalidSite {
266                reason: "registry has no sites".into(),
267            });
268        }
269        for site in &self.sites {
270            site.validate()?;
271        }
272        let mut seen: HashSet<String> = HashSet::new();
273        for site in &self.sites {
274            let key = site.name.to_lowercase();
275            if !seen.insert(key) {
276                return Err(Error::InvalidSite {
277                    reason: format!("duplicate site name: {:?}", site.name),
278                });
279            }
280        }
281        Ok(())
282    }
283}
284
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse a JSON fixture, panicking if it does not load.
    fn load(json: &str) -> Registry {
        Registry::from_json_str(json).unwrap()
    }

    /// Names of the given sites, in order.
    fn site_names(sites: &[Site]) -> Vec<&str> {
        sites.iter().map(|s| s.name.as_str()).collect()
    }

    /// The soft geo preferences of a site as plain strings.
    fn preferred_codes(site: &Site) -> Vec<&str> {
        site.access
            .prefer_geo
            .iter()
            .map(crate::access::CountryCode::as_str)
            .collect()
    }

    // ---- embedded data -------------------------------------------------

    #[test]
    fn embedded_registry_loads_and_validates() {
        let registry = Registry::default_embedded().expect("embedded registry must load");
        // The registry is imported from Sherlock (~450 sites); a floor well
        // above the old hand-written 15 guards against accidental truncation.
        assert!(
            registry.len() >= 100,
            "imported registry should have ≥100 sites, got {}",
            registry.len()
        );
        // Spot-check a couple of well-known entries. (HackerNews used
        // to be here but was pruned 2026-05-26 — its Sherlock-side
        // known_present went stale and the imported signature
        // doctor-failed; can be restored via OVERRIDES in
        // import_sherlock.py with a working account.)
        let present = site_names(registry.sites());
        for expected in ["GitHub", "Reddit", "Telegram"] {
            assert!(present.contains(&expected));
        }
    }

    #[test]
    fn wmn_embedded_registry_loads_and_supersets_default() {
        let base = Registry::default_embedded().unwrap();
        let merged = Registry::default_embedded_with_wmn().expect("WMN-merged registry must load");
        assert!(
            merged.len() > base.len(),
            "WMN merge must add sites: base={} merged={}",
            base.len(),
            merged.len()
        );

        // Every base-tranche name survives the merge; case-insensitive
        // collisions resolve in favour of the MIT-tranche entry.
        let merged_names: HashSet<String> = merged
            .sites()
            .iter()
            .map(|s| s.name.to_lowercase())
            .collect();
        for s in base.sites() {
            assert!(
                merged_names.contains(&s.name.to_lowercase()),
                "merge dropped base-tranche site {:?}",
                s.name
            );
        }

        // At least one WMN-only site carries the provenance tag.
        let has_wmn_tag = merged
            .sites()
            .iter()
            .flat_map(|s| s.tags.iter())
            .any(|t| t == "source:wmn");
        assert!(has_wmn_tag, "no site carries the source:wmn tag");
    }

    // ---- load failures -------------------------------------------------

    #[test]
    fn rejects_empty_registry() {
        let outcome = Registry::from_json_str(r#"{ "sites": [] }"#);
        assert!(matches!(outcome.unwrap_err(), Error::InvalidSite { .. }));
    }

    #[test]
    fn rejects_duplicate_site_names() {
        // Names differ only by case, which still counts as a duplicate.
        let json = r#"{
            "sites": [
                { "name": "GitHub", "url": "https://github.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "github", "url": "https://github.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] }
            ]
        }"#;
        let err = Registry::from_json_str(json).unwrap_err();
        assert!(matches!(err, Error::InvalidSite { .. }));
        assert!(err.to_string().contains("duplicate"));
    }

    #[test]
    fn rejects_invalid_site_definition() {
        // Missing {username} placeholder.
        let json = r#"{
            "sites": [
                { "name": "Bad", "url": "https://example.com/",
                  "signals": [{ "kind": "status_found", "codes": [200] }] }
            ]
        }"#;
        let outcome = Registry::from_json_str(json);
        assert!(outcome.is_err());
    }

    #[test]
    fn rejects_malformed_json() {
        let outcome = Registry::from_json_str("{");
        assert!(matches!(outcome.unwrap_err(), Error::Json(_)));
    }

    // ---- name filters --------------------------------------------------

    #[test]
    fn filter_include_is_case_insensitive_substring() {
        let registry = Registry::default_embedded().unwrap();

        let only_github = registry.filter(&["github".into()], &[], &[], &[], false);
        assert_eq!(only_github.len(), 1);
        assert_eq!(only_github[0].name, "GitHub");

        // "e" matches anything with an "e" in its name.
        let many = registry.filter(&["e".into()], &[], &[], &[], false);
        assert!(many.len() > 1);
    }

    #[test]
    fn filter_exclude_drops_matches() {
        let registry = Registry::default_embedded().unwrap();
        // Include NSFW to keep the test focused on the name-exclude
        // path; the NSFW auto-exclusion is exercised separately.
        let without_github = registry.filter(&[], &["github".into()], &[], &[], true);
        assert!(without_github.iter().all(|s| s.name != "GitHub"));
        assert_eq!(without_github.len(), registry.len() - 1);
    }

    #[test]
    fn filter_include_and_exclude_compose() {
        let registry = Registry::default_embedded().unwrap();
        // Include "git", then exclude "lab" → keep GitHub, drop GitLab.
        let filtered = registry.filter(&["git".into()], &["lab".into()], &[], &[], false);
        let names = site_names(&filtered);
        assert!(names.contains(&"GitHub"));
        assert!(!names.contains(&"GitLab"));
        // Exclude wins over include for sites containing both terms (none here).
    }

    #[test]
    fn filter_with_no_matches_returns_empty() {
        let registry = Registry::default_embedded().unwrap();
        let filtered = registry.filter(&["does-not-exist-xyz".into()], &[], &[], &[], false);
        assert!(filtered.is_empty());
    }

    #[test]
    fn disabled_sites_are_skipped_by_filter() {
        let registry = load(
            r#"{
            "sites": [
                { "name": "Alive", "url": "https://alive.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] },
                { "name": "Parked", "url": "https://parked.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "disabled": true }
            ]
        }"#,
        );
        // sites() returns everything including disabled — it's the
        // serialisation view. filter() is the scan view and drops
        // disabled entries.
        assert_eq!(registry.sites().len(), 2);
        let scanned = registry.filter(&[], &[], &[], &[], false);
        assert_eq!(site_names(&scanned), vec!["Alive"]);
    }

    #[test]
    fn source_field_round_trips() {
        let registry = load(
            r#"{
            "sites": [
                { "name": "Nitter", "url": "https://nitter.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "source": "Twitter" }
            ]
        }"#,
        );
        let first = &registry.sites()[0];
        assert_eq!(first.source.as_deref(), Some("Twitter"));
    }

    // ---- tag filters ---------------------------------------------------

    /// Three sites: one social + region-tagged, one dev-tagged, one untagged.
    fn tagged_registry() -> Registry {
        load(
            r#"{
            "sites": [
                { "name": "Soc", "url": "https://soc.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["social", "region:ru"] },
                { "name": "Dev", "url": "https://dev.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["dev"] },
                { "name": "Plain", "url": "https://plain.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }] }
            ]
        }"#,
        )
    }

    #[test]
    fn tag_filter_keeps_only_matching_tags_and_drops_untagged() {
        let social = tagged_registry().filter(&[], &[], &["social".into()], &[], false);
        let names = site_names(&social);
        assert_eq!(names, ["Soc"], "tag filter should keep only tagged matches");
    }

    #[test]
    fn tag_filter_is_or_within_requested_tags_and_case_insensitive() {
        let wanted = ["DEV".into(), "social".into()];
        let either = tagged_registry().filter(&[], &[], &wanted, &[], false);
        assert_eq!(site_names(&either), ["Soc", "Dev"]);
    }

    #[test]
    fn no_tag_filter_includes_untagged_sites() {
        let everything = tagged_registry().filter(&[], &[], &[], &[], false);
        assert_eq!(everything.len(), 3);
    }

    #[test]
    fn exclude_tag_drops_matching_sites() {
        let kept = tagged_registry().filter(&[], &[], &[], &["social".into()], false);
        let names = site_names(&kept);
        // Soc carries "social" → dropped; Dev and untagged Plain remain.
        assert_eq!(names, ["Dev", "Plain"], "{names:?}");
    }

    // ---- NSFW gate -----------------------------------------------------

    /// Two sites: one ordinary, one tagged `nsfw`.
    fn nsfw_registry() -> Registry {
        load(
            r#"{
            "sites": [
                { "name": "Family", "url": "https://family.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["social"] },
                { "name": "Adult", "url": "https://adult.example/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["nsfw"] }
            ]
        }"#,
        )
    }

    #[test]
    fn nsfw_sites_excluded_by_default() {
        let kept = nsfw_registry().filter(&[], &[], &[], &[], false);
        let names = site_names(&kept);
        assert_eq!(names, ["Family"], "nsfw site must be excluded by default");
    }

    #[test]
    fn nsfw_sites_included_when_flag_set() {
        let kept = nsfw_registry().filter(&[], &[], &[], &[], true);
        assert_eq!(kept.len(), 2, "both sites present with include_nsfw=true");
    }

    #[test]
    fn nsfw_sites_included_when_tag_asked_for_explicitly() {
        // `--tag nsfw` is an explicit opt-in; should bypass the default
        // auto-exclusion even with include_nsfw=false.
        let kept = nsfw_registry().filter(&[], &[], &["nsfw".into()], &[], false);
        assert_eq!(site_names(&kept), ["Adult"]);
    }

    #[test]
    fn tag_counts_are_sorted_with_per_tag_totals() {
        let expected: Vec<(String, usize)> = ["dev", "region:ru", "social"]
            .into_iter()
            .map(|tag| (tag.to_owned(), 1))
            .collect();
        assert_eq!(tagged_registry().tag_counts(), expected);
    }

    // ---- engine inheritance --------------------------------------------

    #[test]
    fn engine_inheritance_fills_empty_site_signals() {
        // Site has no `signals` block — should inherit the engine's.
        let registry = load(
            r#"{
            "engines": {
                "Discourse": {
                    "signals": [
                        { "kind": "status_found", "codes": [200] },
                        { "kind": "body_absent", "text": "Oops! That page doesn't exist" }
                    ]
                }
            },
            "sites": [
                { "name": "Mozilla Forum", "url": "https://discourse.mozilla.org/u/{username}",
                  "engine": "Discourse" }
            ]
        }"#,
        );
        let site = &registry.sites()[0];
        assert_eq!(site.signals.len(), 2);
        assert_eq!(site.engine.as_deref(), Some("Discourse"));
        // engines map preserved
        assert!(registry.engines().contains_key("Discourse"));
    }

    #[test]
    fn site_overrides_engine_signals_on_conflict() {
        // Site declares its own `signals` — engine's must NOT replace them.
        let registry = load(
            r#"{
            "engines": {
                "Discourse": {
                    "signals": [{ "kind": "status_found", "codes": [200] }]
                }
            },
            "sites": [
                { "name": "Custom", "url": "https://example.com/{username}",
                  "engine": "Discourse",
                  "signals": [
                    { "kind": "status_found", "codes": [200] },
                    { "kind": "status_not_found", "codes": [404] }
                  ] }
            ]
        }"#,
        );
        // The site-declared 2 signals win over the engine's 1 signal.
        let site = &registry.sites()[0];
        assert_eq!(site.signals.len(), 2);
    }

    #[test]
    fn engine_headers_merge_with_site_headers_per_key() {
        // Engine declares one header; site declares another. Resolved
        // site should carry both. On per-key conflict the site wins.
        let registry = load(
            r#"{
            "engines": {
                "Foo": {
                    "signals": [{ "kind": "status_found", "codes": [200] }],
                    "request_headers": {
                        "X-Engine": "engine-value",
                        "User-Agent": "engine-ua"
                    }
                }
            },
            "sites": [
                { "name": "S", "url": "https://example.com/{username}",
                  "engine": "Foo",
                  "request_headers": { "User-Agent": "site-ua" } }
            ]
        }"#,
        );
        let headers = &registry.sites()[0].request_headers;
        assert_eq!(
            headers.get("X-Engine").map(String::as_str),
            Some("engine-value")
        );
        assert_eq!(
            headers.get("User-Agent").map(String::as_str),
            Some("site-ua")
        );
    }

    #[test]
    fn missing_engine_reference_fails_load() {
        let json = r#"{
            "engines": {},
            "sites": [
                { "name": "Mock", "url": "https://example.com/{username}",
                  "engine": "DoesNotExist" }
            ]
        }"#;
        let err = Registry::from_json_str(json).unwrap_err();
        let rendered = err.to_string();
        assert!(
            rendered.contains("references engine \"DoesNotExist\""),
            "expected missing-engine error, got: {err}"
        );
    }

    #[test]
    fn engine_regex_check_inherited_when_site_has_none() {
        let registry = load(
            r#"{
            "engines": {
                "Bounded": {
                    "signals": [{ "kind": "status_found", "codes": [200] }],
                    "regex_check": "^[a-z]{3,16}$"
                }
            },
            "sites": [
                { "name": "S", "url": "https://example.com/{username}",
                  "engine": "Bounded" }
            ]
        }"#,
        );
        let site = &registry.sites()[0];
        assert_eq!(site.regex_check.as_deref(), Some("^[a-z]{3,16}$"));
    }

    // ---- tag-derived geo preferences -----------------------------------

    #[test]
    fn region_tag_auto_populates_prefer_geo() {
        let registry = load(
            r#"{
            "sites": [
                { "name": "vk.com", "url": "https://vk.com/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru", "social"] }
            ]
        }"#,
        );
        let site = &registry.sites()[0];
        assert_eq!(site.access.prefer_geo.len(), 1);
        assert_eq!(site.access.prefer_geo[0].as_str(), "ru");
        // Hard geo stays empty — the tag is soft.
        assert!(site.access.geo.is_empty());
    }

    #[test]
    fn multiple_region_tags_stack() {
        let registry = load(
            r#"{
            "sites": [
                { "name": "Pan-Slavic", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru", "region:by", "region:ua"] }
            ]
        }"#,
        );
        let codes = preferred_codes(&registry.sites()[0]);
        assert_eq!(codes, vec!["ru", "by", "ua"]);
    }

    #[test]
    fn explicit_hard_geo_suppresses_tag_derived_soft() {
        // A site with hard `access.geo = ["pl"]` AND a `region:ru` tag:
        // the explicit hard policy wins, prefer_geo stays empty.
        // Otherwise tag-derived soft would silently re-route a probe
        // that the maintainer deliberately pinned to PL.
        let registry = load(
            r#"{
            "sites": [
                { "name": "PL-only", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:ru"],
                  "access": { "geo": ["pl"] } }
            ]
        }"#,
        );
        let access = &registry.sites()[0].access;
        assert_eq!(access.geo[0].as_str(), "pl");
        assert!(access.prefer_geo.is_empty());
    }

    #[test]
    fn malformed_region_tag_is_ignored() {
        // `region:` followed by something that isn't a 2-letter code:
        // skip it silently rather than reject the whole load. The tag
        // already had no routing semantics in older versions.
        let registry = load(
            r#"{
            "sites": [
                { "name": "Weird", "url": "https://example.test/{username}",
                  "signals": [{ "kind": "status_found", "codes": [200] }],
                  "tags": ["region:eurasia", "region:r", "region:RU"] }
            ]
        }"#,
        );
        // Only the valid 2-letter "RU" survives (lowercased to "ru").
        let codes = preferred_codes(&registry.sites()[0]);
        assert_eq!(codes, vec!["ru"]);
    }

    // ---- file loading --------------------------------------------------

    #[test]
    fn load_from_path_round_trips_via_tempfile() {
        // The process id keeps the file name unique across concurrent
        // test processes.
        let file_name = format!("adler-test-registry-{}.json", std::process::id());
        let path = std::env::temp_dir().join(file_name);
        std::fs::write(
            &path,
            r#"{
                "sites": [
                    { "name": "Mock", "url": "https://example.com/{username}",
                      "signals": [{ "kind": "status_found", "codes": [200] }] }
                ]
            }"#,
        )
        .unwrap();

        // Clean up before unwrapping so a failed load does not leak the file.
        let result = Registry::load_from_path(&path);
        let _ = std::fs::remove_file(&path);

        let registry = result.unwrap();
        assert_eq!(registry.len(), 1);
        assert_eq!(registry.sites()[0].name, "Mock");
    }
}