adler_core/
registry.rs

1//! Site registry — loading, validation, filtering.
2//!
3//! The default registry is embedded into the binary at compile time via
4//! [`include_str!`]. Callers can override it with a file at runtime through
5//! [`Registry::load_from_path`].
6
7use std::collections::{BTreeMap, HashMap, HashSet};
8use std::path::Path;
9
10use serde::Deserialize;
11
12use crate::error::{Error, Result};
13use crate::site::{Engine, Site};
14
/// Default site registry as JSON text, pulled into the binary at compile
/// time from `../data/sites.json` (path relative to this source file).
/// Parsed by [`Registry::default_embedded`].
const EMBEDDED_REGISTRY: &str = include_str!("../data/sites.json");

/// Supplementary registry derived from the `WhatsMyName` project
/// (`WebBreacher/WhatsMyName`, CC BY-SA 4.0). Kept as a separate
/// constant because its data license is incompatible with the
/// MIT-only [`EMBEDDED_REGISTRY`] above; callers opt in explicitly
/// via [`Registry::default_embedded_with_wmn`] to keep the default
/// MIT-clean for downstream redistribution.
const EMBEDDED_WMN_REGISTRY: &str = include_str!("../data/sites_wmn.json");
24
/// A loaded, validated collection of site definitions.
///
/// Engines (shared signature templates referenced by [`Site::engine`])
/// are resolved into sites at load time — by the time you call
/// [`Registry::sites`] every entry already has its inherited
/// `signals` / `request_headers` / `regex_check` materialised. The original
/// [`Engine`] objects are kept on the registry for re-export and
/// inspection via [`Registry::engines`].
#[derive(Debug, Clone, Deserialize)]
pub struct Registry {
    /// Engine templates keyed by name. Optional in the JSON: a missing
    /// `engines` key deserialises to an empty map.
    #[serde(default)]
    engines: BTreeMap<String, Engine>,
    /// Site definitions in load order, disabled entries included.
    sites: Vec<Site>,
}
39
40impl Registry {
41    /// Load the default site list embedded into the crate at build time.
42    pub fn default_embedded() -> Result<Self> {
43        Self::from_json_str(EMBEDDED_REGISTRY)
44    }
45
46    /// Load the default site list *plus* the `WhatsMyName`-derived
47    /// supplementary set. `WhatsMyName` data is licensed CC BY-SA 4.0
48    /// (see `LICENSE-CC-BY-SA-4.0` at the repo root); enabling this
49    /// path means downstream redistribution of the merged scan data
50    /// must respect the `ShareAlike` obligation. Sites contributed by
51    /// the `WhatsMyName` tranche carry the `source:wmn` tag for
52    /// provenance.
53    ///
54    /// Engines from the WMN tranche merge with the MIT tranche;
55    /// case-insensitive site-name collisions resolve in favour of the
56    /// MIT-tranche entry (the hand-curated Sherlock/Maigret-derived
57    /// signature wins; the WMN duplicate is dropped). Returns an
58    /// error only if either tranche fails its own validation —
59    /// engine references are checked across the merged set.
60    pub fn default_embedded_with_wmn() -> Result<Self> {
61        let mut base = Self::default_embedded()?;
62        let wmn: Self = serde_json::from_str(EMBEDDED_WMN_REGISTRY)?;
63        let existing_names: HashSet<String> =
64            base.sites.iter().map(|s| s.name.to_lowercase()).collect();
65        // URL-claim only counts enabled base entries — the dedup pattern
66        // keeps disabled siblings at the canonical's URL, and a WMN
67        // entry colliding with one of *those* is no worse than colliding
68        // with the canonical.
69        let claimed_urls: HashSet<String> = base
70            .sites
71            .iter()
72            .filter(|s| !s.disabled)
73            .map(|s| s.url.as_str().to_owned())
74            .collect();
75        for (name, engine) in wmn.engines {
76            base.engines.entry(name).or_insert(engine);
77        }
78        for site in wmn.sites {
79            if existing_names.contains(&site.name.to_lowercase()) {
80                continue;
81            }
82            if !site.disabled && claimed_urls.contains(site.url.as_str()) {
83                // Base already has an enabled site at this URL; WMN's
84                // version would just produce a doubled probe, and
85                // validate() would refuse the merged registry. Drop the
86                // WMN entry; base canonical wins (same precedence rule
87                // we apply for name collisions).
88                continue;
89            }
90            base.sites.push(site);
91        }
92        base.resolve_engines()?;
93        base.validate()?;
94        Ok(base)
95    }
96
97    /// Parse and validate a registry from a JSON string. Engine
98    /// references on each site are resolved before validation;
99    /// a site that names an engine which doesn't exist in the
100    /// `engines` block fails loading with [`Error::InvalidSite`].
101    pub fn from_json_str(json: &str) -> Result<Self> {
102        let mut registry: Self = serde_json::from_str(json)?;
103        registry.resolve_engines()?;
104        registry.apply_tag_derived_policy();
105        registry.validate()?;
106        Ok(registry)
107    }
108
109    /// Inheritable engine templates, keyed by name. Useful for
110    /// introspection and for serialising the registry back out;
111    /// detection paths read the resolved fields off the sites
112    /// directly and don't need to consult this map.
113    pub fn engines(&self) -> &BTreeMap<String, Engine> {
114        &self.engines
115    }
116
117    /// Walk every site's tags for `region:XX` markers and fill
118    /// [`AccessPolicy::prefer_geo`](crate::AccessPolicy::prefer_geo)
119    /// with the matching country codes. **Soft** routing only — a
120    /// site declaring `region:ru` *prefers* a Russian egress when one
121    /// is configured but still works from anywhere else; the router
122    /// falls back to the default egress on no match rather than
123    /// reporting `Uncertain(GeoUnavailable)`.
124    ///
125    /// Skips sites that already declare a hard
126    /// [`AccessPolicy::geo`](crate::AccessPolicy::geo) — explicit
127    /// policy wins on conflict, same convention as engine inheritance.
128    /// Existing `prefer_geo` entries are also preserved: tag-derived
129    /// codes are *added*, not replaced, so a hand-tuned policy can
130    /// stack on top of the tag. Invalid country codes (`region:xx` is
131    /// not exactly two ASCII letters) are silently skipped — they
132    /// shouldn't exist in the registry, and a parse error here would
133    /// break the load for a tag the scanner already ignores in
134    /// every other context.
135    fn apply_tag_derived_policy(&mut self) {
136        for site in &mut self.sites {
137            if !site.access.geo.is_empty() {
138                continue;
139            }
140            for tag in &site.tags {
141                let Some(rest) = tag.strip_prefix("region:") else {
142                    continue;
143                };
144                let Some(cc) = crate::access::CountryCode::new(rest) else {
145                    continue;
146                };
147                if !site.access.prefer_geo.contains(&cc) {
148                    site.access.prefer_geo.push(cc);
149                }
150            }
151        }
152    }
153
154    /// Merge each engine's inheritable fields into the sites that
155    /// reference it. After this call every site's `signals`,
156    /// `request_headers` and `regex_check` reflect the effective
157    /// values used by the scanner.
158    ///
159    /// Per-site fields are authoritative: anything declared
160    /// explicitly on a site wins on conflict; only empty / unset
161    /// fields are filled from the engine.
162    fn resolve_engines(&mut self) -> Result<()> {
163        for (name, engine) in &self.engines {
164            engine.validate(name)?;
165        }
166        for site in &mut self.sites {
167            let Some(name) = &site.engine else {
168                continue;
169            };
170            let Some(engine) = self.engines.get(name) else {
171                return Err(Error::InvalidSite {
172                    reason: format!(
173                        "site {:?}: references engine {name:?} which is not defined",
174                        site.name
175                    ),
176                });
177            };
178            engine.merge_into(site);
179        }
180        Ok(())
181    }
182
183    /// Read a registry from a JSON file.
184    pub fn load_from_path(path: impl AsRef<Path>) -> Result<Self> {
185        let bytes = std::fs::read(path)?;
186        let json = std::str::from_utf8(&bytes).map_err(|e| Error::InvalidSite {
187            reason: format!("registry file is not valid UTF-8: {e}"),
188        })?;
189        Self::from_json_str(json)
190    }
191
192    /// Borrow all sites in load order.
193    pub fn sites(&self) -> &[Site] {
194        &self.sites
195    }
196
197    /// Number of sites.
198    pub fn len(&self) -> usize {
199        self.sites.len()
200    }
201
202    /// True if the registry has no sites (always false for a valid load,
203    /// since we'd already have rejected it).
204    pub fn is_empty(&self) -> bool {
205        self.sites.is_empty()
206    }
207
208    /// Apply include/exclude name filters and a tag filter.
209    ///
210    /// - If `include` is non-empty, only sites whose name contains at least
211    ///   one include term (case-insensitive substring) are kept.
212    /// - Sites whose name contains any exclude term are dropped.
213    /// - If `tags` is non-empty, only sites carrying at least one of the
214    ///   requested tags are kept (case-insensitive). A site with no tags is
215    ///   therefore dropped by a tag filter — asking for `--tag social` means
216    ///   "only social-tagged sites".
217    /// - Sites carrying any tag in `exclude_tags` are dropped (e.g.
218    ///   `--exclude-tag bot-protected` for a fast clean run).
219    /// - **NSFW sites are auto-excluded** (the `nsfw` tag) unless
220    ///   `include_nsfw` is `true` or `tags` explicitly asks for `nsfw`.
221    ///   This matches Sherlock's `--nsfw` opt-in pattern and prevents
222    ///   the default `adler <username>` from surfacing adult-site URLs
223    ///   the user didn't ask for.
224    /// - Sites are returned by value (cloned) so the result is independent
225    ///   of the registry's lifetime — convenient for handing to the executor.
226    pub fn filter(
227        &self,
228        include: &[String],
229        exclude: &[String],
230        tags: &[String],
231        exclude_tags: &[String],
232        include_nsfw: bool,
233    ) -> Vec<Site> {
234        let include: Vec<String> = include.iter().map(|s| s.to_lowercase()).collect();
235        let exclude: Vec<String> = exclude.iter().map(|s| s.to_lowercase()).collect();
236        let want_tags: Vec<String> = tags.iter().map(|s| s.to_lowercase()).collect();
237        let mut drop_tags: Vec<String> = exclude_tags.iter().map(|s| s.to_lowercase()).collect();
238
239        // NSFW gate: auto-exclude unless the caller explicitly opted in,
240        // either via `include_nsfw` or by asking for the `nsfw` tag.
241        let nsfw_tag = "nsfw".to_owned();
242        let asking_for_nsfw = want_tags.contains(&nsfw_tag);
243        if !include_nsfw && !asking_for_nsfw && !drop_tags.contains(&nsfw_tag) {
244            drop_tags.push(nsfw_tag);
245        }
246
247        self.sites
248            .iter()
249            .filter(|site| {
250                // Disabled sites are skipped unconditionally — the bool
251                // is meant for parking known-broken entries with a
252                // reason comment instead of deleting them, so they
253                // never get probed even with a fresh include filter.
254                if site.disabled {
255                    return false;
256                }
257                let name = site.name.to_lowercase();
258                let included = include.is_empty() || include.iter().any(|i| name.contains(i));
259                let excluded = exclude.iter().any(|x| name.contains(x));
260                let lower_tags: Vec<String> = site.tags.iter().map(|t| t.to_lowercase()).collect();
261                let tagged =
262                    want_tags.is_empty() || lower_tags.iter().any(|t| want_tags.contains(t));
263                let tag_excluded = lower_tags.iter().any(|t| drop_tags.contains(t));
264                included && !excluded && tagged && !tag_excluded
265            })
266            .cloned()
267            .collect()
268    }
269
270    /// Distinct tags across all sites, sorted, with the count of sites
271    /// carrying each. Powers `--list-tags`.
272    pub fn tag_counts(&self) -> Vec<(String, usize)> {
273        let mut counts: std::collections::BTreeMap<String, usize> =
274            std::collections::BTreeMap::new();
275        for site in &self.sites {
276            for tag in &site.tags {
277                *counts.entry(tag.clone()).or_insert(0) += 1;
278            }
279        }
280        counts.into_iter().collect()
281    }
282
283    fn validate(&self) -> Result<()> {
284        if self.sites.is_empty() {
285            return Err(Error::InvalidSite {
286                reason: "registry has no sites".into(),
287            });
288        }
289        for site in &self.sites {
290            site.validate()?;
291        }
292        let mut seen: HashSet<String> = HashSet::new();
293        for site in &self.sites {
294            let key = site.name.to_lowercase();
295            if !seen.insert(key) {
296                return Err(Error::InvalidSite {
297                    reason: format!("duplicate site name: {:?}", site.name),
298                });
299            }
300        }
301        // (URL, signals) uniqueness among ENABLED sites: each
302        // (URL template, signal-set) pair should back exactly one live
303        // entry. Disabled entries can legitimately share URLs with
304        // their canonicals — that's how the `duplicate of <canonical>`
305        // dedup pattern works. A second enabled hit at the same URL
306        // *and* the same signal array is almost always an importer
307        // re-introducing a known duplicate
308        // (Sherlock/Maigret/WhatsMyName each name the same site
309        // slightly differently); the doctor would otherwise
310        // double-probe the URL for an identical verdict.
311        //
312        // Same URL with *distinct* signals is the legitimate-alias
313        // shape — WordPress.com (Public/Private/Deleted) hit the same
314        // API endpoint and disambiguate via their `body_present`
315        // marker, which the doctor reads as three independent verdicts.
316        let mut seen_url_sig: HashMap<(String, String), &str> = HashMap::new();
317        for site in &self.sites {
318            if site.disabled {
319                continue;
320            }
321            // `Debug` on `Signal` is deterministic and avoids pulling
322            // serde_json into the validation hot path.
323            let key = (site.url.as_str().to_owned(), format!("{:?}", site.signals));
324            if let Some(prev) = seen_url_sig.insert(key, site.name.as_str()) {
325                return Err(Error::InvalidSite {
326                    reason: format!(
327                        "duplicate (URL, signals) among enabled sites: {:?} and {:?} both back \
328                         {:?} with identical signals. Mark one `disabled: true` with \
329                         `disabled_reason: \"duplicate of {prev}\"` (or, if the two entries are \
330                         supposed to disambiguate via different markers, give each a distinct \
331                         signal set).",
332                        prev,
333                        site.name,
334                        site.url.as_str(),
335                    ),
336                });
337            }
338        }
339        Ok(())
340    }
341}
342
343#[cfg(test)]
344mod tests {
345    use super::*;
346
347    #[test]
348    fn embedded_registry_loads_and_validates() {
349        let registry = Registry::default_embedded().expect("embedded registry must load");
350        // The registry is imported from Sherlock (~450 sites); a floor well
351        // above the old hand-written 15 guards against accidental truncation.
352        assert!(
353            registry.len() >= 100,
354            "imported registry should have ≥100 sites, got {}",
355            registry.len()
356        );
357        // Spot-check a couple of well-known entries. (HackerNews used
358        // to be here but was pruned 2026-05-26 — its Sherlock-side
359        // known_present went stale and the imported signature
360        // doctor-failed; can be restored via OVERRIDES in
361        // import_sherlock.py with a working account.)
362        let names: Vec<&str> = registry.sites().iter().map(|s| s.name.as_str()).collect();
363        assert!(names.contains(&"GitHub"));
364        assert!(names.contains(&"Reddit"));
365        assert!(names.contains(&"Telegram"));
366    }
367
368    #[test]
369    fn wmn_embedded_registry_loads_and_supersets_default() {
370        let base = Registry::default_embedded().unwrap();
371        let merged = Registry::default_embedded_with_wmn().expect("WMN-merged registry must load");
372        assert!(
373            merged.len() > base.len(),
374            "WMN merge must add sites: base={} merged={}",
375            base.len(),
376            merged.len()
377        );
378        // Every base-tranche name survives the merge; case-insensitive
379        // collisions resolve in favour of the MIT-tranche entry.
380        let merged_names: HashSet<String> = merged
381            .sites()
382            .iter()
383            .map(|s| s.name.to_lowercase())
384            .collect();
385        for s in base.sites() {
386            assert!(
387                merged_names.contains(&s.name.to_lowercase()),
388                "merge dropped base-tranche site {:?}",
389                s.name
390            );
391        }
392        // At least one WMN-only site carries the provenance tag.
393        let has_wmn_tag = merged
394            .sites()
395            .iter()
396            .any(|s| s.tags.iter().any(|t| t == "source:wmn"));
397        assert!(has_wmn_tag, "no site carries the source:wmn tag");
398    }
399
400    #[test]
401    fn rejects_empty_registry() {
402        let err = Registry::from_json_str(r#"{ "sites": [] }"#).unwrap_err();
403        assert!(matches!(err, Error::InvalidSite { .. }));
404    }
405
406    #[test]
407    fn rejects_duplicate_site_names() {
408        let json = r#"{
409            "sites": [
410                { "name": "GitHub", "url": "https://github.com/{username}",
411                  "signals": [{ "kind": "status_found", "codes": [200] }] },
412                { "name": "github", "url": "https://github.com/{username}",
413                  "signals": [{ "kind": "status_found", "codes": [200] }] }
414            ]
415        }"#;
416        let err = Registry::from_json_str(json).unwrap_err();
417        assert!(matches!(err, Error::InvalidSite { .. }));
418        assert!(err.to_string().contains("duplicate"));
419    }
420
421    #[test]
422    fn rejects_duplicate_enabled_urls() {
423        // Two enabled sites at the same URL is almost always an importer
424        // re-introducing a known duplicate. Reject at load time with a
425        // message naming both entries.
426        let json = r#"{
427            "sites": [
428                { "name": "Hub Code", "url": "https://example.com/{username}",
429                  "signals": [{ "kind": "status_found", "codes": [200] }] },
430                { "name": "HubCode", "url": "https://example.com/{username}",
431                  "signals": [{ "kind": "status_found", "codes": [200] }] }
432            ]
433        }"#;
434        let err = Registry::from_json_str(json).unwrap_err();
435        assert!(matches!(err, Error::InvalidSite { .. }));
436        let msg = err.to_string();
437        assert!(msg.contains("duplicate (URL, signals)"), "msg: {msg}");
438        assert!(msg.contains("Hub Code"), "msg: {msg}");
439        assert!(msg.contains("HubCode"), "msg: {msg}");
440    }
441
442    #[test]
443    fn allows_duplicate_urls_with_distinct_signals() {
444        // Same URL, distinct signal sets — this is the legitimate-alias
445        // shape (e.g. WordPress.com (Public/Private/Deleted) hit one
446        // endpoint and disambiguate via the body marker). Must NOT
447        // trigger the URL-uniqueness rule.
448        let json = r#"{
449            "sites": [
450                { "name": "Site Public", "url": "https://example.com/{username}",
451                  "signals": [{ "kind": "status_found", "codes": [200] }] },
452                { "name": "Site Private", "url": "https://example.com/{username}",
453                  "signals": [{ "kind": "status_found", "codes": [403] }] }
454            ]
455        }"#;
456        let registry = Registry::from_json_str(json).expect("distinct-signal alias must validate");
457        assert_eq!(registry.len(), 2);
458    }
459
460    #[test]
461    fn allows_duplicate_urls_when_one_side_is_disabled() {
462        // The dedup pattern that the v0.14 hygiene pass established:
463        // canonical stays enabled, the surplus entry gets
464        // `disabled: true` + `disabled_reason: "duplicate of <canonical>"`.
465        // This shape must continue loading cleanly.
466        let json = r#"{
467            "sites": [
468                { "name": "Hub Code", "url": "https://example.com/{username}",
469                  "signals": [{ "kind": "status_found", "codes": [200] }] },
470                { "name": "HubCode", "url": "https://example.com/{username}",
471                  "signals": [{ "kind": "status_found", "codes": [200] }],
472                  "disabled": true,
473                  "disabled_reason": "duplicate of Hub Code" }
474            ]
475        }"#;
476        let registry = Registry::from_json_str(json).expect("dedup pattern must validate");
477        assert_eq!(registry.len(), 2);
478    }
479
480    #[test]
481    fn rejects_invalid_site_definition() {
482        // Missing {username} placeholder.
483        let json = r#"{
484            "sites": [
485                { "name": "Bad", "url": "https://example.com/",
486                  "signals": [{ "kind": "status_found", "codes": [200] }] }
487            ]
488        }"#;
489        assert!(Registry::from_json_str(json).is_err());
490    }
491
492    #[test]
493    fn rejects_malformed_json() {
494        let err = Registry::from_json_str("{").unwrap_err();
495        assert!(matches!(err, Error::Json(_)));
496    }
497
498    #[test]
499    fn filter_include_is_case_insensitive_substring() {
500        let registry = Registry::default_embedded().unwrap();
501        let only_github = registry.filter(&["github".into()], &[], &[], &[], false);
502        assert_eq!(only_github.len(), 1);
503        assert_eq!(only_github[0].name, "GitHub");
504
505        let many = registry.filter(&["e".into()], &[], &[], &[], false); // matches anything with "e"
506        assert!(many.len() > 1);
507    }
508
509    #[test]
510    fn filter_exclude_drops_matches() {
511        let registry = Registry::default_embedded().unwrap();
512        // Include NSFW to keep the test focused on the name-exclude
513        // path; the NSFW auto-exclusion is exercised separately.
514        let baseline = registry.filter(&[], &[], &[], &[], true);
515        let without_github = registry.filter(&[], &["github".into()], &[], &[], true);
516        assert!(without_github.iter().all(|s| s.name != "GitHub"));
517        // Asserting against the baseline (filtered) count rather than
518        // `registry.len()` so this test is robust to changes in the
519        // disabled-site count — `len()` includes disabled entries,
520        // `filter()` does not.
521        assert_eq!(without_github.len(), baseline.len() - 1);
522    }
523
524    #[test]
525    fn filter_include_and_exclude_compose() {
526        let registry = Registry::default_embedded().unwrap();
527        // Include "git", then exclude "lab" → keep GitHub, drop GitLab.
528        let filtered = registry.filter(&["git".into()], &["lab".into()], &[], &[], false);
529        let names: Vec<&str> = filtered.iter().map(|s| s.name.as_str()).collect();
530        assert!(names.contains(&"GitHub"));
531        assert!(!names.contains(&"GitLab"));
532        // Exclude wins over include for sites containing both terms (none here).
533    }
534
535    #[test]
536    fn filter_with_no_matches_returns_empty() {
537        let registry = Registry::default_embedded().unwrap();
538        let filtered = registry.filter(&["does-not-exist-xyz".into()], &[], &[], &[], false);
539        assert!(filtered.is_empty());
540    }
541
542    #[test]
543    fn disabled_sites_are_skipped_by_filter() {
544        let json = r#"{
545            "sites": [
546                { "name": "Alive", "url": "https://alive.example/{username}",
547                  "signals": [{ "kind": "status_found", "codes": [200] }] },
548                { "name": "Parked", "url": "https://parked.example/{username}",
549                  "signals": [{ "kind": "status_found", "codes": [200] }],
550                  "disabled": true }
551            ]
552        }"#;
553        let registry = Registry::from_json_str(json).unwrap();
554        // sites() returns everything including disabled — it's the
555        // serialisation view. filter() is the scan view and drops
556        // disabled entries.
557        assert_eq!(registry.sites().len(), 2);
558        let scanned = registry.filter(&[], &[], &[], &[], false);
559        let names: Vec<&str> = scanned.iter().map(|s| s.name.as_str()).collect();
560        assert_eq!(names, vec!["Alive"]);
561    }
562
563    #[test]
564    fn source_field_round_trips() {
565        let json = r#"{
566            "sites": [
567                { "name": "Nitter", "url": "https://nitter.example/{username}",
568                  "signals": [{ "kind": "status_found", "codes": [200] }],
569                  "source": "Twitter" }
570            ]
571        }"#;
572        let registry = Registry::from_json_str(json).unwrap();
573        assert_eq!(registry.sites()[0].source.as_deref(), Some("Twitter"));
574    }
575
576    fn tagged_registry() -> Registry {
577        let json = r#"{
578            "sites": [
579                { "name": "Soc", "url": "https://soc.example/{username}",
580                  "signals": [{ "kind": "status_found", "codes": [200] }],
581                  "tags": ["social", "region:ru"] },
582                { "name": "Dev", "url": "https://dev.example/{username}",
583                  "signals": [{ "kind": "status_found", "codes": [200] }],
584                  "tags": ["dev"] },
585                { "name": "Plain", "url": "https://plain.example/{username}",
586                  "signals": [{ "kind": "status_found", "codes": [200] }] }
587            ]
588        }"#;
589        Registry::from_json_str(json).unwrap()
590    }
591
592    #[test]
593    fn tag_filter_keeps_only_matching_tags_and_drops_untagged() {
594        let r = tagged_registry();
595        let social = r.filter(&[], &[], &["social".into()], &[], false);
596        let names: Vec<&str> = social.iter().map(|s| s.name.as_str()).collect();
597        assert_eq!(names, ["Soc"], "tag filter should keep only tagged matches");
598    }
599
600    #[test]
601    fn tag_filter_is_or_within_requested_tags_and_case_insensitive() {
602        let r = tagged_registry();
603        let either = r.filter(&[], &[], &["DEV".into(), "social".into()], &[], false);
604        let names: Vec<&str> = either.iter().map(|s| s.name.as_str()).collect();
605        assert_eq!(names, ["Soc", "Dev"]);
606    }
607
608    #[test]
609    fn no_tag_filter_includes_untagged_sites() {
610        let r = tagged_registry();
611        assert_eq!(r.filter(&[], &[], &[], &[], false).len(), 3);
612    }
613
614    #[test]
615    fn exclude_tag_drops_matching_sites() {
616        let r = tagged_registry();
617        let kept = r.filter(&[], &[], &[], &["social".into()], false);
618        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
619        // Soc carries "social" → dropped; Dev and untagged Plain remain.
620        assert_eq!(names, ["Dev", "Plain"], "{names:?}");
621    }
622
623    fn nsfw_registry() -> Registry {
624        let json = r#"{
625            "sites": [
626                { "name": "Family", "url": "https://family.example/{username}",
627                  "signals": [{ "kind": "status_found", "codes": [200] }],
628                  "tags": ["social"] },
629                { "name": "Adult", "url": "https://adult.example/{username}",
630                  "signals": [{ "kind": "status_found", "codes": [200] }],
631                  "tags": ["nsfw"] }
632            ]
633        }"#;
634        Registry::from_json_str(json).unwrap()
635    }
636
637    #[test]
638    fn nsfw_sites_excluded_by_default() {
639        let r = nsfw_registry();
640        let kept = r.filter(&[], &[], &[], &[], false);
641        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
642        assert_eq!(names, ["Family"], "nsfw site must be excluded by default");
643    }
644
645    #[test]
646    fn nsfw_sites_included_when_flag_set() {
647        let r = nsfw_registry();
648        let kept = r.filter(&[], &[], &[], &[], true);
649        assert_eq!(kept.len(), 2, "both sites present with include_nsfw=true");
650    }
651
652    #[test]
653    fn nsfw_sites_included_when_tag_asked_for_explicitly() {
654        // `--tag nsfw` is an explicit opt-in; should bypass the default
655        // auto-exclusion even with include_nsfw=false.
656        let r = nsfw_registry();
657        let kept = r.filter(&[], &[], &["nsfw".into()], &[], false);
658        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
659        assert_eq!(names, ["Adult"]);
660    }
661
662    #[test]
663    fn tag_counts_are_sorted_with_per_tag_totals() {
664        let r = tagged_registry();
665        assert_eq!(
666            r.tag_counts(),
667            vec![
668                ("dev".to_owned(), 1),
669                ("region:ru".to_owned(), 1),
670                ("social".to_owned(), 1),
671            ]
672        );
673    }
674
675    #[test]
676    fn engine_inheritance_fills_empty_site_signals() {
677        // Site has no `signals` block — should inherit the engine's.
678        let json = r#"{
679            "engines": {
680                "Discourse": {
681                    "signals": [
682                        { "kind": "status_found", "codes": [200] },
683                        { "kind": "body_absent", "text": "Oops! That page doesn't exist" }
684                    ]
685                }
686            },
687            "sites": [
688                { "name": "Mozilla Forum", "url": "https://discourse.mozilla.org/u/{username}",
689                  "engine": "Discourse" }
690            ]
691        }"#;
692        let r = Registry::from_json_str(json).unwrap();
693        let site = &r.sites()[0];
694        assert_eq!(site.signals.len(), 2);
695        assert_eq!(site.engine.as_deref(), Some("Discourse"));
696        // engines map preserved
697        assert!(r.engines().contains_key("Discourse"));
698    }
699
700    #[test]
701    fn site_overrides_engine_signals_on_conflict() {
702        // Site declares its own `signals` — engine's must NOT replace them.
703        let json = r#"{
704            "engines": {
705                "Discourse": {
706                    "signals": [{ "kind": "status_found", "codes": [200] }]
707                }
708            },
709            "sites": [
710                { "name": "Custom", "url": "https://example.com/{username}",
711                  "engine": "Discourse",
712                  "signals": [
713                    { "kind": "status_found", "codes": [200] },
714                    { "kind": "status_not_found", "codes": [404] }
715                  ] }
716            ]
717        }"#;
718        let r = Registry::from_json_str(json).unwrap();
719        // The site-declared 2 signals win over the engine's 1 signal.
720        assert_eq!(r.sites()[0].signals.len(), 2);
721    }
722
723    #[test]
724    fn engine_headers_merge_with_site_headers_per_key() {
725        // Engine declares one header; site declares another. Resolved
726        // site should carry both. On per-key conflict the site wins.
727        let json = r#"{
728            "engines": {
729                "Foo": {
730                    "signals": [{ "kind": "status_found", "codes": [200] }],
731                    "request_headers": {
732                        "X-Engine": "engine-value",
733                        "User-Agent": "engine-ua"
734                    }
735                }
736            },
737            "sites": [
738                { "name": "S", "url": "https://example.com/{username}",
739                  "engine": "Foo",
740                  "request_headers": { "User-Agent": "site-ua" } }
741            ]
742        }"#;
743        let r = Registry::from_json_str(json).unwrap();
744        let h = &r.sites()[0].request_headers;
745        assert_eq!(h.get("X-Engine").map(String::as_str), Some("engine-value"));
746        assert_eq!(h.get("User-Agent").map(String::as_str), Some("site-ua"));
747    }
748
749    #[test]
750    fn missing_engine_reference_fails_load() {
751        let json = r#"{
752            "engines": {},
753            "sites": [
754                { "name": "Mock", "url": "https://example.com/{username}",
755                  "engine": "DoesNotExist" }
756            ]
757        }"#;
758        let err = Registry::from_json_str(json).unwrap_err();
759        assert!(
760            err.to_string()
761                .contains("references engine \"DoesNotExist\""),
762            "expected missing-engine error, got: {err}"
763        );
764    }
765
766    #[test]
767    fn engine_regex_check_inherited_when_site_has_none() {
768        let json = r#"{
769            "engines": {
770                "Bounded": {
771                    "signals": [{ "kind": "status_found", "codes": [200] }],
772                    "regex_check": "^[a-z]{3,16}$"
773                }
774            },
775            "sites": [
776                { "name": "S", "url": "https://example.com/{username}",
777                  "engine": "Bounded" }
778            ]
779        }"#;
780        let r = Registry::from_json_str(json).unwrap();
781        assert_eq!(r.sites()[0].regex_check.as_deref(), Some("^[a-z]{3,16}$"));
782    }
783
784    #[test]
785    fn region_tag_auto_populates_prefer_geo() {
786        let json = r#"{
787            "sites": [
788                { "name": "vk.com", "url": "https://vk.com/{username}",
789                  "signals": [{ "kind": "status_found", "codes": [200] }],
790                  "tags": ["region:ru", "social"] }
791            ]
792        }"#;
793        let r = Registry::from_json_str(json).unwrap();
794        let prefer = &r.sites()[0].access.prefer_geo;
795        assert_eq!(prefer.len(), 1);
796        assert_eq!(prefer[0].as_str(), "ru");
797        // Hard geo stays empty — the tag is soft.
798        assert!(r.sites()[0].access.geo.is_empty());
799    }
800
801    #[test]
802    fn multiple_region_tags_stack() {
803        let json = r#"{
804            "sites": [
805                { "name": "Pan-Slavic", "url": "https://example.test/{username}",
806                  "signals": [{ "kind": "status_found", "codes": [200] }],
807                  "tags": ["region:ru", "region:by", "region:ua"] }
808            ]
809        }"#;
810        let r = Registry::from_json_str(json).unwrap();
811        let codes: Vec<&str> = r.sites()[0]
812            .access
813            .prefer_geo
814            .iter()
815            .map(super::super::access::CountryCode::as_str)
816            .collect();
817        assert_eq!(codes, vec!["ru", "by", "ua"]);
818    }
819
820    #[test]
821    fn explicit_hard_geo_suppresses_tag_derived_soft() {
822        // A site with hard `access.geo = ["pl"]` AND a `region:ru` tag:
823        // the explicit hard policy wins, prefer_geo stays empty.
824        // Otherwise tag-derived soft would silently re-route a probe
825        // that the maintainer deliberately pinned to PL.
826        let json = r#"{
827            "sites": [
828                { "name": "PL-only", "url": "https://example.test/{username}",
829                  "signals": [{ "kind": "status_found", "codes": [200] }],
830                  "tags": ["region:ru"],
831                  "access": { "geo": ["pl"] } }
832            ]
833        }"#;
834        let r = Registry::from_json_str(json).unwrap();
835        assert_eq!(r.sites()[0].access.geo[0].as_str(), "pl");
836        assert!(r.sites()[0].access.prefer_geo.is_empty());
837    }
838
839    #[test]
840    fn malformed_region_tag_is_ignored() {
841        // `region:` followed by something that isn't a 2-letter code:
842        // skip it silently rather than reject the whole load. The tag
843        // already had no routing semantics in older versions.
844        let json = r#"{
845            "sites": [
846                { "name": "Weird", "url": "https://example.test/{username}",
847                  "signals": [{ "kind": "status_found", "codes": [200] }],
848                  "tags": ["region:eurasia", "region:r", "region:RU"] }
849            ]
850        }"#;
851        let r = Registry::from_json_str(json).unwrap();
852        // Only the valid 2-letter "RU" survives (lowercased to "ru").
853        let codes: Vec<&str> = r.sites()[0]
854            .access
855            .prefer_geo
856            .iter()
857            .map(super::super::access::CountryCode::as_str)
858            .collect();
859        assert_eq!(codes, vec!["ru"]);
860    }
861
862    #[test]
863    fn load_from_path_round_trips_via_tempfile() {
864        let mut path = std::env::temp_dir();
865        path.push(format!("adler-test-registry-{}.json", std::process::id()));
866        std::fs::write(
867            &path,
868            r#"{
869                "sites": [
870                    { "name": "Mock", "url": "https://example.com/{username}",
871                      "signals": [{ "kind": "status_found", "codes": [200] }] }
872                ]
873            }"#,
874        )
875        .unwrap();
876        let result = Registry::load_from_path(&path);
877        let _ = std::fs::remove_file(&path);
878        let registry = result.unwrap();
879        assert_eq!(registry.len(), 1);
880        assert_eq!(registry.sites()[0].name, "Mock");
881    }
882}
adler_core/registry.rs

adler_core/
registry.rs