adler_core/
registry.rs

1//! Site registry — loading, validation, filtering.
2//!
3//! The default registry is embedded into the binary at compile time via
4//! [`include_str!`]. Callers can override it with a file at runtime through
5//! [`Registry::load_from_path`].
6
7use std::collections::{BTreeMap, HashMap, HashSet};
8use std::path::Path;
9
10use serde::Deserialize;
11
12use crate::error::{Error, Result};
13use crate::site::{Engine, Site};
14
/// Default site registry as JSON, embedded at compile time from
/// `../data/sites.json`. Parsed by [`Registry::default_embedded`].
const EMBEDDED_REGISTRY: &str = include_str!("../data/sites.json");

/// Supplementary registry derived from the `WhatsMyName` project
/// (`WebBreacher/WhatsMyName`, CC BY-SA 4.0). Kept as a separate
/// constant because its data license is incompatible with the
/// MIT-only [`EMBEDDED_REGISTRY`] above; callers opt in explicitly
/// via [`Registry::default_embedded_with_wmn`] to keep the default
/// MIT-clean for downstream redistribution.
const EMBEDDED_WMN_REGISTRY: &str = include_str!("../data/sites_wmn.json");
24
/// A loaded, validated collection of site definitions.
///
/// Engines (shared signature templates referenced by [`Site::engine`])
/// are resolved into sites at load time — by the time you call
/// [`Registry::sites`] every entry already has its inherited
/// `signals` / `request_headers` / `regex_check` materialised. The original
/// [`Engine`] objects are kept on the registry for re-export and
/// inspection via [`Registry::engines`].
#[derive(Debug, Clone, Deserialize)]
pub struct Registry {
    /// Engine templates keyed by name. Falls back to an empty map when
    /// the `engines` key is absent from the input JSON.
    #[serde(default)]
    engines: BTreeMap<String, Engine>,
    /// Site definitions in load order. The `sites` key is required;
    /// disabled entries are kept here and only skipped by
    /// [`Registry::filter`].
    sites: Vec<Site>,
}
39
40impl Registry {
41    /// Load the default site list embedded into the crate at build time.
42    pub fn default_embedded() -> Result<Self> {
43        Self::from_json_str(EMBEDDED_REGISTRY)
44    }
45
46    /// Load the default site list *plus* the `WhatsMyName`-derived
47    /// supplementary set. `WhatsMyName` data is licensed CC BY-SA 4.0
48    /// (see `LICENSE-CC-BY-SA-4.0` at the repo root); enabling this
49    /// path means downstream redistribution of the merged scan data
50    /// must respect the `ShareAlike` obligation. Sites contributed by
51    /// the `WhatsMyName` tranche carry the `source:wmn` tag for
52    /// provenance.
53    ///
54    /// Engines from the WMN tranche merge with the MIT tranche;
55    /// case-insensitive site-name collisions resolve in favour of the
56    /// MIT-tranche entry (the hand-curated Sherlock/Maigret-derived
57    /// signature wins; the WMN duplicate is dropped). Returns an
58    /// error only if either tranche fails its own validation —
59    /// engine references are checked across the merged set.
60    pub fn default_embedded_with_wmn() -> Result<Self> {
61        let mut base = Self::default_embedded()?;
62        let wmn: Self = serde_json::from_str(EMBEDDED_WMN_REGISTRY)?;
63        let existing_names: HashSet<String> =
64            base.sites.iter().map(|s| s.name.to_lowercase()).collect();
65        // URL-claim only counts enabled base entries — the dedup pattern
66        // keeps disabled siblings at the canonical's URL, and a WMN
67        // entry colliding with one of *those* is no worse than colliding
68        // with the canonical.
69        let claimed_urls: HashSet<String> = base
70            .sites
71            .iter()
72            .filter(|s| !s.disabled)
73            .map(|s| s.url.as_str().to_owned())
74            .collect();
75        for (name, engine) in wmn.engines {
76            base.engines.entry(name).or_insert(engine);
77        }
78        for site in wmn.sites {
79            if existing_names.contains(&site.name.to_lowercase()) {
80                continue;
81            }
82            if !site.disabled && claimed_urls.contains(site.url.as_str()) {
83                // Base already has an enabled site at this URL; WMN's
84                // version would just produce a doubled probe, and
85                // validate() would refuse the merged registry. Drop the
86                // WMN entry; base canonical wins (same precedence rule
87                // we apply for name collisions).
88                continue;
89            }
90            base.sites.push(site);
91        }
92        base.resolve_engines()?;
93        base.validate()?;
94        Ok(base)
95    }
96
97    /// Parse and validate a registry from a JSON string. Engine
98    /// references on each site are resolved before validation;
99    /// a site that names an engine which doesn't exist in the
100    /// `engines` block fails loading with [`Error::InvalidSite`].
101    pub fn from_json_str(json: &str) -> Result<Self> {
102        let mut registry: Self = serde_json::from_str(json)?;
103        registry.resolve_engines()?;
104        registry.apply_tag_derived_policy();
105        registry.validate()?;
106        Ok(registry)
107    }
108
109    /// Inheritable engine templates, keyed by name. Useful for
110    /// introspection and for serialising the registry back out;
111    /// detection paths read the resolved fields off the sites
112    /// directly and don't need to consult this map.
113    pub fn engines(&self) -> &BTreeMap<String, Engine> {
114        &self.engines
115    }
116
117    /// Walk every site's tags for `region:XX` markers and fill
118    /// [`AccessPolicy::prefer_geo`](crate::AccessPolicy::prefer_geo)
119    /// with the matching country codes. **Soft** routing only — a
120    /// site declaring `region:ru` *prefers* a Russian egress when one
121    /// is configured but still works from anywhere else; the router
122    /// falls back to the default egress on no match rather than
123    /// reporting `Uncertain(GeoUnavailable)`.
124    ///
125    /// Skips sites that already declare a hard
126    /// [`AccessPolicy::geo`](crate::AccessPolicy::geo) — explicit
127    /// policy wins on conflict, same convention as engine inheritance.
128    /// Existing `prefer_geo` entries are also preserved: tag-derived
129    /// codes are *added*, not replaced, so a hand-tuned policy can
130    /// stack on top of the tag. Invalid country codes (`region:xx` is
131    /// not exactly two ASCII letters) are silently skipped — they
132    /// shouldn't exist in the registry, and a parse error here would
133    /// break the load for a tag the scanner already ignores in
134    /// every other context.
135    fn apply_tag_derived_policy(&mut self) {
136        for site in &mut self.sites {
137            if !site.access.geo.is_empty() {
138                continue;
139            }
140            for tag in &site.tags {
141                let Some(rest) = tag.strip_prefix("region:") else {
142                    continue;
143                };
144                let Some(cc) = crate::access::CountryCode::new(rest) else {
145                    continue;
146                };
147                if !site.access.prefer_geo.contains(&cc) {
148                    site.access.prefer_geo.push(cc);
149                }
150            }
151        }
152    }
153
154    /// Merge each engine's inheritable fields into the sites that
155    /// reference it. After this call every site's `signals`,
156    /// `request_headers` and `regex_check` reflect the effective
157    /// values used by the scanner.
158    ///
159    /// Per-site fields are authoritative: anything declared
160    /// explicitly on a site wins on conflict; only empty / unset
161    /// fields are filled from the engine.
162    fn resolve_engines(&mut self) -> Result<()> {
163        for (name, engine) in &self.engines {
164            engine.validate(name)?;
165        }
166        for site in &mut self.sites {
167            let Some(name) = &site.engine else {
168                continue;
169            };
170            let Some(engine) = self.engines.get(name) else {
171                return Err(Error::InvalidSite {
172                    reason: format!(
173                        "site {:?}: references engine {name:?} which is not defined",
174                        site.name
175                    ),
176                });
177            };
178            engine.merge_into(site);
179        }
180        Ok(())
181    }
182
183    /// Read a registry from a JSON file.
184    pub fn load_from_path(path: impl AsRef<Path>) -> Result<Self> {
185        let bytes = std::fs::read(path)?;
186        let json = std::str::from_utf8(&bytes).map_err(|e| Error::InvalidSite {
187            reason: format!("registry file is not valid UTF-8: {e}"),
188        })?;
189        Self::from_json_str(json)
190    }
191
192    /// Borrow all sites in load order.
193    pub fn sites(&self) -> &[Site] {
194        &self.sites
195    }
196
197    /// Number of sites.
198    pub fn len(&self) -> usize {
199        self.sites.len()
200    }
201
202    /// True if the registry has no sites (always false for a valid load,
203    /// since we'd already have rejected it).
204    pub fn is_empty(&self) -> bool {
205        self.sites.is_empty()
206    }
207
208    /// Apply include/exclude name filters and a tag filter.
209    ///
210    /// - If `include` is non-empty, only sites whose name contains at least
211    ///   one include term (case-insensitive substring) are kept.
212    /// - Sites whose name contains any exclude term are dropped.
213    /// - If `tags` is non-empty, only sites carrying at least one of the
214    ///   requested tags are kept (case-insensitive). A site with no tags is
215    ///   therefore dropped by a tag filter — asking for `--tag social` means
216    ///   "only social-tagged sites".
217    /// - Sites carrying any tag in `exclude_tags` are dropped (e.g.
218    ///   `--exclude-tag bot-protected` for a fast clean run).
219    /// - **NSFW sites are auto-excluded** (the `nsfw` tag) unless
220    ///   `include_nsfw` is `true` or `tags` explicitly asks for `nsfw`.
221    ///   This matches Sherlock's `--nsfw` opt-in pattern and prevents
222    ///   the default `adler <username>` from surfacing adult-site URLs
223    ///   the user didn't ask for.
224    /// - Sites are returned by value (cloned) so the result is independent
225    ///   of the registry's lifetime — convenient for handing to the executor.
226    pub fn filter(
227        &self,
228        include: &[String],
229        exclude: &[String],
230        tags: &[String],
231        exclude_tags: &[String],
232        include_nsfw: bool,
233    ) -> Vec<Site> {
234        let include: Vec<String> = include.iter().map(|s| s.to_lowercase()).collect();
235        let exclude: Vec<String> = exclude.iter().map(|s| s.to_lowercase()).collect();
236        let want_tags: Vec<String> = tags.iter().map(|s| s.to_lowercase()).collect();
237        let mut drop_tags: Vec<String> = exclude_tags.iter().map(|s| s.to_lowercase()).collect();
238
239        // NSFW gate: auto-exclude unless the caller explicitly opted in,
240        // either via `include_nsfw` or by asking for the `nsfw` tag.
241        let nsfw_tag = "nsfw".to_owned();
242        let asking_for_nsfw = want_tags.contains(&nsfw_tag);
243        if !include_nsfw && !asking_for_nsfw && !drop_tags.contains(&nsfw_tag) {
244            drop_tags.push(nsfw_tag);
245        }
246
247        self.sites
248            .iter()
249            .filter(|site| {
250                // Disabled sites are skipped unconditionally — the bool
251                // is meant for parking known-broken entries with a
252                // reason comment instead of deleting them, so they
253                // never get probed even with a fresh include filter.
254                if site.disabled {
255                    return false;
256                }
257                let name = site.name.to_lowercase();
258                let included = include.is_empty() || include.iter().any(|i| name.contains(i));
259                let excluded = exclude.iter().any(|x| name.contains(x));
260                let lower_tags: Vec<String> = site.tags.iter().map(|t| t.to_lowercase()).collect();
261                let tagged =
262                    want_tags.is_empty() || lower_tags.iter().any(|t| want_tags.contains(t));
263                let tag_excluded = lower_tags.iter().any(|t| drop_tags.contains(t));
264                included && !excluded && tagged && !tag_excluded
265            })
266            .cloned()
267            .collect()
268    }
269
270    /// Distinct tags across all sites, sorted, with the count of sites
271    /// carrying each. Powers `--list-tags`.
272    pub fn tag_counts(&self) -> Vec<(String, usize)> {
273        let mut counts: std::collections::BTreeMap<String, usize> =
274            std::collections::BTreeMap::new();
275        for site in &self.sites {
276            for tag in &site.tags {
277                *counts.entry(tag.clone()).or_insert(0) += 1;
278            }
279        }
280        counts.into_iter().collect()
281    }
282
283    fn validate(&self) -> Result<()> {
284        if self.sites.is_empty() {
285            return Err(Error::InvalidSite {
286                reason: "registry has no sites".into(),
287            });
288        }
289        for site in &self.sites {
290            site.validate()?;
291        }
292        let mut seen: HashSet<String> = HashSet::new();
293        for site in &self.sites {
294            let key = site.name.to_lowercase();
295            if !seen.insert(key) {
296                return Err(Error::InvalidSite {
297                    reason: format!("duplicate site name: {:?}", site.name),
298                });
299            }
300        }
301        // (URL, signals) uniqueness among ENABLED sites: each
302        // (URL template, signal-set) pair should back exactly one live
303        // entry. Disabled entries can legitimately share URLs with
304        // their canonicals — that's how the `duplicate of <canonical>`
305        // dedup pattern works. A second enabled hit at the same URL
306        // *and* the same signal array is almost always an importer
307        // re-introducing a known duplicate
308        // (Sherlock/Maigret/WhatsMyName each name the same site
309        // slightly differently); the doctor would otherwise
310        // double-probe the URL for an identical verdict.
311        //
312        // Same URL with *distinct* signals is the legitimate-alias
313        // shape — WordPress.com (Public/Private/Deleted) hit the same
314        // API endpoint and disambiguate via their `body_present`
315        // marker, which the doctor reads as three independent verdicts.
316        let mut seen_url_sig: HashMap<(String, String), &str> = HashMap::new();
317        for site in &self.sites {
318            if site.disabled {
319                continue;
320            }
321            // `serde_json` for the signal key gives a canonical
322            // serialisation that doesn't depend on field-order or
323            // `Debug` formatting, both of which could shift between
324            // Rust releases or after a `#[derive(Debug)]` rearrange.
325            // serde_json is already a workspace dep; the cost is
326            // ~one allocation per enabled site at load time.
327            let sigs_key = serde_json::to_string(&site.signals)
328                .expect("Signal derives Serialize and contains no Map<_, _> with non-string keys");
329            let key = (site.url.as_str().to_owned(), sigs_key);
330            if let Some(prev) = seen_url_sig.insert(key, site.name.as_str()) {
331                return Err(Error::InvalidSite {
332                    reason: format!(
333                        "duplicate (URL, signals) among enabled sites: {:?} and {:?} both back \
334                         {:?} with identical signals. Mark one `disabled: true` with \
335                         `disabled_reason: \"duplicate of {prev}\"` (or, if the two entries are \
336                         supposed to disambiguate via different markers, give each a distinct \
337                         signal set).",
338                        prev,
339                        site.name,
340                        site.url.as_str(),
341                    ),
342                });
343            }
344        }
345        Ok(())
346    }
347}
348
349#[cfg(test)]
350mod tests {
351    use super::*;
352
353    #[test]
354    fn embedded_registry_loads_and_validates() {
355        let registry = Registry::default_embedded().expect("embedded registry must load");
356        // The registry is imported from Sherlock (~450 sites); a floor well
357        // above the old hand-written 15 guards against accidental truncation.
358        assert!(
359            registry.len() >= 100,
360            "imported registry should have ≥100 sites, got {}",
361            registry.len()
362        );
363        // Spot-check a couple of well-known entries. (HackerNews used
364        // to be here but was pruned 2026-05-26 — its Sherlock-side
365        // known_present went stale and the imported signature
366        // doctor-failed; can be restored via OVERRIDES in
367        // import_sherlock.py with a working account.)
368        let names: Vec<&str> = registry.sites().iter().map(|s| s.name.as_str()).collect();
369        assert!(names.contains(&"GitHub"));
370        assert!(names.contains(&"Reddit"));
371        assert!(names.contains(&"Telegram"));
372    }
373
374    #[test]
375    fn wmn_embedded_registry_loads_and_supersets_default() {
376        let base = Registry::default_embedded().unwrap();
377        let merged = Registry::default_embedded_with_wmn().expect("WMN-merged registry must load");
378        assert!(
379            merged.len() > base.len(),
380            "WMN merge must add sites: base={} merged={}",
381            base.len(),
382            merged.len()
383        );
384        // Every base-tranche name survives the merge; case-insensitive
385        // collisions resolve in favour of the MIT-tranche entry.
386        let merged_names: HashSet<String> = merged
387            .sites()
388            .iter()
389            .map(|s| s.name.to_lowercase())
390            .collect();
391        for s in base.sites() {
392            assert!(
393                merged_names.contains(&s.name.to_lowercase()),
394                "merge dropped base-tranche site {:?}",
395                s.name
396            );
397        }
398        // At least one WMN-only site carries the provenance tag.
399        let has_wmn_tag = merged
400            .sites()
401            .iter()
402            .any(|s| s.tags.iter().any(|t| t == "source:wmn"));
403        assert!(has_wmn_tag, "no site carries the source:wmn tag");
404    }
405
406    #[test]
407    fn rejects_empty_registry() {
408        let err = Registry::from_json_str(r#"{ "sites": [] }"#).unwrap_err();
409        assert!(matches!(err, Error::InvalidSite { .. }));
410    }
411
412    #[test]
413    fn rejects_duplicate_site_names() {
414        let json = r#"{
415            "sites": [
416                { "name": "GitHub", "url": "https://github.com/{username}",
417                  "signals": [{ "kind": "status_found", "codes": [200] }] },
418                { "name": "github", "url": "https://github.com/{username}",
419                  "signals": [{ "kind": "status_found", "codes": [200] }] }
420            ]
421        }"#;
422        let err = Registry::from_json_str(json).unwrap_err();
423        assert!(matches!(err, Error::InvalidSite { .. }));
424        assert!(err.to_string().contains("duplicate"));
425    }
426
427    #[test]
428    fn rejects_duplicate_enabled_urls() {
429        // Two enabled sites at the same URL is almost always an importer
430        // re-introducing a known duplicate. Reject at load time with a
431        // message naming both entries.
432        let json = r#"{
433            "sites": [
434                { "name": "Hub Code", "url": "https://example.com/{username}",
435                  "signals": [{ "kind": "status_found", "codes": [200] }] },
436                { "name": "HubCode", "url": "https://example.com/{username}",
437                  "signals": [{ "kind": "status_found", "codes": [200] }] }
438            ]
439        }"#;
440        let err = Registry::from_json_str(json).unwrap_err();
441        assert!(matches!(err, Error::InvalidSite { .. }));
442        let msg = err.to_string();
443        assert!(msg.contains("duplicate (URL, signals)"), "msg: {msg}");
444        assert!(msg.contains("Hub Code"), "msg: {msg}");
445        assert!(msg.contains("HubCode"), "msg: {msg}");
446    }
447
448    #[test]
449    fn allows_duplicate_urls_with_distinct_signals() {
450        // Same URL, distinct signal sets — this is the legitimate-alias
451        // shape (e.g. WordPress.com (Public/Private/Deleted) hit one
452        // endpoint and disambiguate via the body marker). Must NOT
453        // trigger the URL-uniqueness rule.
454        let json = r#"{
455            "sites": [
456                { "name": "Site Public", "url": "https://example.com/{username}",
457                  "signals": [{ "kind": "status_found", "codes": [200] }] },
458                { "name": "Site Private", "url": "https://example.com/{username}",
459                  "signals": [{ "kind": "status_found", "codes": [403] }] }
460            ]
461        }"#;
462        let registry = Registry::from_json_str(json).expect("distinct-signal alias must validate");
463        assert_eq!(registry.len(), 2);
464    }
465
466    #[test]
467    fn allows_duplicate_urls_when_one_side_is_disabled() {
468        // The dedup pattern that the v0.14 hygiene pass established:
469        // canonical stays enabled, the surplus entry gets
470        // `disabled: true` + `disabled_reason: "duplicate of <canonical>"`.
471        // This shape must continue loading cleanly.
472        let json = r#"{
473            "sites": [
474                { "name": "Hub Code", "url": "https://example.com/{username}",
475                  "signals": [{ "kind": "status_found", "codes": [200] }] },
476                { "name": "HubCode", "url": "https://example.com/{username}",
477                  "signals": [{ "kind": "status_found", "codes": [200] }],
478                  "disabled": true,
479                  "disabled_reason": "duplicate of Hub Code" }
480            ]
481        }"#;
482        let registry = Registry::from_json_str(json).expect("dedup pattern must validate");
483        assert_eq!(registry.len(), 2);
484    }
485
486    #[test]
487    fn rejects_invalid_site_definition() {
488        // Missing {username} placeholder.
489        let json = r#"{
490            "sites": [
491                { "name": "Bad", "url": "https://example.com/",
492                  "signals": [{ "kind": "status_found", "codes": [200] }] }
493            ]
494        }"#;
495        assert!(Registry::from_json_str(json).is_err());
496    }
497
498    #[test]
499    fn rejects_malformed_json() {
500        let err = Registry::from_json_str("{").unwrap_err();
501        assert!(matches!(err, Error::Json(_)));
502    }
503
504    #[test]
505    fn filter_include_is_case_insensitive_substring() {
506        let registry = Registry::default_embedded().unwrap();
507        let only_github = registry.filter(&["github".into()], &[], &[], &[], false);
508        assert_eq!(only_github.len(), 1);
509        assert_eq!(only_github[0].name, "GitHub");
510
511        let many = registry.filter(&["e".into()], &[], &[], &[], false); // matches anything with "e"
512        assert!(many.len() > 1);
513    }
514
515    #[test]
516    fn filter_exclude_drops_matches() {
517        let registry = Registry::default_embedded().unwrap();
518        // Include NSFW to keep the test focused on the name-exclude
519        // path; the NSFW auto-exclusion is exercised separately.
520        let baseline = registry.filter(&[], &[], &[], &[], true);
521        let without_github = registry.filter(&[], &["github".into()], &[], &[], true);
522        assert!(without_github.iter().all(|s| s.name != "GitHub"));
523        // Asserting against the baseline (filtered) count rather than
524        // `registry.len()` so this test is robust to changes in the
525        // disabled-site count — `len()` includes disabled entries,
526        // `filter()` does not.
527        assert_eq!(without_github.len(), baseline.len() - 1);
528    }
529
530    #[test]
531    fn filter_include_and_exclude_compose() {
532        let registry = Registry::default_embedded().unwrap();
533        // Include "git", then exclude "lab" → keep GitHub, drop GitLab.
534        let filtered = registry.filter(&["git".into()], &["lab".into()], &[], &[], false);
535        let names: Vec<&str> = filtered.iter().map(|s| s.name.as_str()).collect();
536        assert!(names.contains(&"GitHub"));
537        assert!(!names.contains(&"GitLab"));
538        // Exclude wins over include for sites containing both terms (none here).
539    }
540
541    #[test]
542    fn filter_with_no_matches_returns_empty() {
543        let registry = Registry::default_embedded().unwrap();
544        let filtered = registry.filter(&["does-not-exist-xyz".into()], &[], &[], &[], false);
545        assert!(filtered.is_empty());
546    }
547
548    #[test]
549    fn disabled_sites_are_skipped_by_filter() {
550        let json = r#"{
551            "sites": [
552                { "name": "Alive", "url": "https://alive.example/{username}",
553                  "signals": [{ "kind": "status_found", "codes": [200] }] },
554                { "name": "Parked", "url": "https://parked.example/{username}",
555                  "signals": [{ "kind": "status_found", "codes": [200] }],
556                  "disabled": true }
557            ]
558        }"#;
559        let registry = Registry::from_json_str(json).unwrap();
560        // sites() returns everything including disabled — it's the
561        // serialisation view. filter() is the scan view and drops
562        // disabled entries.
563        assert_eq!(registry.sites().len(), 2);
564        let scanned = registry.filter(&[], &[], &[], &[], false);
565        let names: Vec<&str> = scanned.iter().map(|s| s.name.as_str()).collect();
566        assert_eq!(names, vec!["Alive"]);
567    }
568
569    #[test]
570    fn source_field_round_trips() {
571        let json = r#"{
572            "sites": [
573                { "name": "Nitter", "url": "https://nitter.example/{username}",
574                  "signals": [{ "kind": "status_found", "codes": [200] }],
575                  "source": "Twitter" }
576            ]
577        }"#;
578        let registry = Registry::from_json_str(json).unwrap();
579        assert_eq!(registry.sites()[0].source.as_deref(), Some("Twitter"));
580    }
581
582    fn tagged_registry() -> Registry {
583        let json = r#"{
584            "sites": [
585                { "name": "Soc", "url": "https://soc.example/{username}",
586                  "signals": [{ "kind": "status_found", "codes": [200] }],
587                  "tags": ["social", "region:ru"] },
588                { "name": "Dev", "url": "https://dev.example/{username}",
589                  "signals": [{ "kind": "status_found", "codes": [200] }],
590                  "tags": ["dev"] },
591                { "name": "Plain", "url": "https://plain.example/{username}",
592                  "signals": [{ "kind": "status_found", "codes": [200] }] }
593            ]
594        }"#;
595        Registry::from_json_str(json).unwrap()
596    }
597
598    #[test]
599    fn tag_filter_keeps_only_matching_tags_and_drops_untagged() {
600        let r = tagged_registry();
601        let social = r.filter(&[], &[], &["social".into()], &[], false);
602        let names: Vec<&str> = social.iter().map(|s| s.name.as_str()).collect();
603        assert_eq!(names, ["Soc"], "tag filter should keep only tagged matches");
604    }
605
606    #[test]
607    fn tag_filter_is_or_within_requested_tags_and_case_insensitive() {
608        let r = tagged_registry();
609        let either = r.filter(&[], &[], &["DEV".into(), "social".into()], &[], false);
610        let names: Vec<&str> = either.iter().map(|s| s.name.as_str()).collect();
611        assert_eq!(names, ["Soc", "Dev"]);
612    }
613
614    #[test]
615    fn no_tag_filter_includes_untagged_sites() {
616        let r = tagged_registry();
617        assert_eq!(r.filter(&[], &[], &[], &[], false).len(), 3);
618    }
619
620    #[test]
621    fn exclude_tag_drops_matching_sites() {
622        let r = tagged_registry();
623        let kept = r.filter(&[], &[], &[], &["social".into()], false);
624        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
625        // Soc carries "social" → dropped; Dev and untagged Plain remain.
626        assert_eq!(names, ["Dev", "Plain"], "{names:?}");
627    }
628
629    fn nsfw_registry() -> Registry {
630        let json = r#"{
631            "sites": [
632                { "name": "Family", "url": "https://family.example/{username}",
633                  "signals": [{ "kind": "status_found", "codes": [200] }],
634                  "tags": ["social"] },
635                { "name": "Adult", "url": "https://adult.example/{username}",
636                  "signals": [{ "kind": "status_found", "codes": [200] }],
637                  "tags": ["nsfw"] }
638            ]
639        }"#;
640        Registry::from_json_str(json).unwrap()
641    }
642
643    #[test]
644    fn nsfw_sites_excluded_by_default() {
645        let r = nsfw_registry();
646        let kept = r.filter(&[], &[], &[], &[], false);
647        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
648        assert_eq!(names, ["Family"], "nsfw site must be excluded by default");
649    }
650
651    #[test]
652    fn nsfw_sites_included_when_flag_set() {
653        let r = nsfw_registry();
654        let kept = r.filter(&[], &[], &[], &[], true);
655        assert_eq!(kept.len(), 2, "both sites present with include_nsfw=true");
656    }
657
658    #[test]
659    fn nsfw_sites_included_when_tag_asked_for_explicitly() {
660        // `--tag nsfw` is an explicit opt-in; should bypass the default
661        // auto-exclusion even with include_nsfw=false.
662        let r = nsfw_registry();
663        let kept = r.filter(&[], &[], &["nsfw".into()], &[], false);
664        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
665        assert_eq!(names, ["Adult"]);
666    }
667
668    #[test]
669    fn tag_counts_are_sorted_with_per_tag_totals() {
670        let r = tagged_registry();
671        assert_eq!(
672            r.tag_counts(),
673            vec![
674                ("dev".to_owned(), 1),
675                ("region:ru".to_owned(), 1),
676                ("social".to_owned(), 1),
677            ]
678        );
679    }
680
681    #[test]
682    fn engine_inheritance_fills_empty_site_signals() {
683        // Site has no `signals` block — should inherit the engine's.
684        let json = r#"{
685            "engines": {
686                "Discourse": {
687                    "signals": [
688                        { "kind": "status_found", "codes": [200] },
689                        { "kind": "body_absent", "text": "Oops! That page doesn't exist" }
690                    ]
691                }
692            },
693            "sites": [
694                { "name": "Mozilla Forum", "url": "https://discourse.mozilla.org/u/{username}",
695                  "engine": "Discourse" }
696            ]
697        }"#;
698        let r = Registry::from_json_str(json).unwrap();
699        let site = &r.sites()[0];
700        assert_eq!(site.signals.len(), 2);
701        assert_eq!(site.engine.as_deref(), Some("Discourse"));
702        // engines map preserved
703        assert!(r.engines().contains_key("Discourse"));
704    }
705
706    #[test]
707    fn site_overrides_engine_signals_on_conflict() {
708        // Site declares its own `signals` — engine's must NOT replace them.
709        let json = r#"{
710            "engines": {
711                "Discourse": {
712                    "signals": [{ "kind": "status_found", "codes": [200] }]
713                }
714            },
715            "sites": [
716                { "name": "Custom", "url": "https://example.com/{username}",
717                  "engine": "Discourse",
718                  "signals": [
719                    { "kind": "status_found", "codes": [200] },
720                    { "kind": "status_not_found", "codes": [404] }
721                  ] }
722            ]
723        }"#;
724        let r = Registry::from_json_str(json).unwrap();
725        // The site-declared 2 signals win over the engine's 1 signal.
726        assert_eq!(r.sites()[0].signals.len(), 2);
727    }
728
729    #[test]
730    fn engine_headers_merge_with_site_headers_per_key() {
731        // Engine declares one header; site declares another. Resolved
732        // site should carry both. On per-key conflict the site wins.
733        let json = r#"{
734            "engines": {
735                "Foo": {
736                    "signals": [{ "kind": "status_found", "codes": [200] }],
737                    "request_headers": {
738                        "X-Engine": "engine-value",
739                        "User-Agent": "engine-ua"
740                    }
741                }
742            },
743            "sites": [
744                { "name": "S", "url": "https://example.com/{username}",
745                  "engine": "Foo",
746                  "request_headers": { "User-Agent": "site-ua" } }
747            ]
748        }"#;
749        let r = Registry::from_json_str(json).unwrap();
750        let h = &r.sites()[0].request_headers;
751        assert_eq!(h.get("X-Engine").map(String::as_str), Some("engine-value"));
752        assert_eq!(h.get("User-Agent").map(String::as_str), Some("site-ua"));
753    }
754
755    #[test]
756    fn missing_engine_reference_fails_load() {
757        let json = r#"{
758            "engines": {},
759            "sites": [
760                { "name": "Mock", "url": "https://example.com/{username}",
761                  "engine": "DoesNotExist" }
762            ]
763        }"#;
764        let err = Registry::from_json_str(json).unwrap_err();
765        assert!(
766            err.to_string()
767                .contains("references engine \"DoesNotExist\""),
768            "expected missing-engine error, got: {err}"
769        );
770    }
771
772    #[test]
773    fn engine_regex_check_inherited_when_site_has_none() {
774        let json = r#"{
775            "engines": {
776                "Bounded": {
777                    "signals": [{ "kind": "status_found", "codes": [200] }],
778                    "regex_check": "^[a-z]{3,16}$"
779                }
780            },
781            "sites": [
782                { "name": "S", "url": "https://example.com/{username}",
783                  "engine": "Bounded" }
784            ]
785        }"#;
786        let r = Registry::from_json_str(json).unwrap();
787        assert_eq!(r.sites()[0].regex_check.as_deref(), Some("^[a-z]{3,16}$"));
788    }
789
790    #[test]
791    fn region_tag_auto_populates_prefer_geo() {
792        let json = r#"{
793            "sites": [
794                { "name": "vk.com", "url": "https://vk.com/{username}",
795                  "signals": [{ "kind": "status_found", "codes": [200] }],
796                  "tags": ["region:ru", "social"] }
797            ]
798        }"#;
799        let r = Registry::from_json_str(json).unwrap();
800        let prefer = &r.sites()[0].access.prefer_geo;
801        assert_eq!(prefer.len(), 1);
802        assert_eq!(prefer[0].as_str(), "ru");
803        // Hard geo stays empty — the tag is soft.
804        assert!(r.sites()[0].access.geo.is_empty());
805    }
806
807    #[test]
808    fn multiple_region_tags_stack() {
809        let json = r#"{
810            "sites": [
811                { "name": "Pan-Slavic", "url": "https://example.test/{username}",
812                  "signals": [{ "kind": "status_found", "codes": [200] }],
813                  "tags": ["region:ru", "region:by", "region:ua"] }
814            ]
815        }"#;
816        let r = Registry::from_json_str(json).unwrap();
817        let codes: Vec<&str> = r.sites()[0]
818            .access
819            .prefer_geo
820            .iter()
821            .map(super::super::access::CountryCode::as_str)
822            .collect();
823        assert_eq!(codes, vec!["ru", "by", "ua"]);
824    }
825
826    #[test]
827    fn explicit_hard_geo_suppresses_tag_derived_soft() {
828        // A site with hard `access.geo = ["pl"]` AND a `region:ru` tag:
829        // the explicit hard policy wins, prefer_geo stays empty.
830        // Otherwise tag-derived soft would silently re-route a probe
831        // that the maintainer deliberately pinned to PL.
832        let json = r#"{
833            "sites": [
834                { "name": "PL-only", "url": "https://example.test/{username}",
835                  "signals": [{ "kind": "status_found", "codes": [200] }],
836                  "tags": ["region:ru"],
837                  "access": { "geo": ["pl"] } }
838            ]
839        }"#;
840        let r = Registry::from_json_str(json).unwrap();
841        assert_eq!(r.sites()[0].access.geo[0].as_str(), "pl");
842        assert!(r.sites()[0].access.prefer_geo.is_empty());
843    }
844
845    #[test]
846    fn malformed_region_tag_is_ignored() {
847        // `region:` followed by something that isn't a 2-letter code:
848        // skip it silently rather than reject the whole load. The tag
849        // already had no routing semantics in older versions.
850        let json = r#"{
851            "sites": [
852                { "name": "Weird", "url": "https://example.test/{username}",
853                  "signals": [{ "kind": "status_found", "codes": [200] }],
854                  "tags": ["region:eurasia", "region:r", "region:RU"] }
855            ]
856        }"#;
857        let r = Registry::from_json_str(json).unwrap();
858        // Only the valid 2-letter "RU" survives (lowercased to "ru").
859        let codes: Vec<&str> = r.sites()[0]
860            .access
861            .prefer_geo
862            .iter()
863            .map(super::super::access::CountryCode::as_str)
864            .collect();
865        assert_eq!(codes, vec!["ru"]);
866    }
867
868    #[test]
869    fn load_from_path_round_trips_via_tempfile() {
870        let mut path = std::env::temp_dir();
871        path.push(format!("adler-test-registry-{}.json", std::process::id()));
872        std::fs::write(
873            &path,
874            r#"{
875                "sites": [
876                    { "name": "Mock", "url": "https://example.com/{username}",
877                      "signals": [{ "kind": "status_found", "codes": [200] }] }
878                ]
879            }"#,
880        )
881        .unwrap();
882        let result = Registry::load_from_path(&path);
883        let _ = std::fs::remove_file(&path);
884        let registry = result.unwrap();
885        assert_eq!(registry.len(), 1);
886        assert_eq!(registry.sites()[0].name, "Mock");
887    }
888}
adler_core/registry.rs

adler_core/
registry.rs