adler_core/
registry.rs

1//! Site registry — loading, validation, filtering.
2//!
3//! The default registry is embedded into the binary at compile time via
4//! [`include_str!`]. Callers can override it with a file at runtime through
5//! [`Registry::load_from_path`].
6
7use std::collections::{BTreeMap, HashMap, HashSet};
8use std::path::Path;
9
10use serde::Deserialize;
11
12use crate::error::{Error, Result};
13use crate::site::{Engine, Site};
14
/// Default site registry, compiled into the binary from
/// `../data/sites.json` via [`include_str!`]. Parsed and validated on
/// demand by [`Registry::default_embedded`].
const EMBEDDED_REGISTRY: &str = include_str!("../data/sites.json");

/// Supplementary registry derived from the `WhatsMyName` project
/// (`WebBreacher/WhatsMyName`, CC BY-SA 4.0). Kept as a separate
/// constant because its data license is incompatible with the
/// MIT-only [`EMBEDDED_REGISTRY`] above; callers opt in explicitly
/// via [`Registry::default_embedded_with_wmn`] to keep the default
/// MIT-clean for downstream redistribution.
const EMBEDDED_WMN_REGISTRY: &str = include_str!("../data/sites_wmn.json");
24
/// A loaded, validated collection of site definitions.
///
/// Engines (shared signature templates referenced by [`Site::engine`])
/// are resolved into sites at load time — by the time you call
/// [`Registry::sites`] every entry already has its inherited
/// `signals` / `request_headers` / `regex_check` materialised. The original
/// [`Engine`] objects are kept on the registry for re-export and
/// inspection via [`Registry::engines`].
///
/// The `Deserialize` derive only maps the JSON shape onto the fields;
/// it performs no engine resolution or validation. Use
/// [`Registry::from_json_str`] (or one of the loaders built on it) to
/// obtain a registry that has gone through both.
#[derive(Debug, Clone, Deserialize)]
pub struct Registry {
    /// Engine templates keyed by name. `#[serde(default)]` lets a
    /// registry document omit the `engines` block entirely, in which
    /// case this is an empty map.
    #[serde(default)]
    engines: BTreeMap<String, Engine>,
    /// Site definitions in load order (disabled entries included).
    sites: Vec<Site>,
}
39
/// Reusable site-filter specification shared by CLI, server, and MCP surfaces.
///
/// Filtering semantics match [`Registry::filter`]: name include/exclude terms
/// are case-insensitive substrings, tag filters are case-insensitive exact
/// matches, and `nsfw` sites are hidden unless
/// [`include_nsfw`](Self::include_nsfw) is true or the `nsfw` tag is
/// requested explicitly.
///
/// Disabled sites are skipped by [`apply`](Self::apply) (the scan view);
/// [`apply_including_disabled`](Self::apply_including_disabled) keeps them
/// for catalogue/diagnostic surfaces.
///
/// The derived `Default` is the "no filtering" specification: every list is
/// empty, `include_nsfw` is `false` and `top` is `None`.
#[derive(Debug, Clone, Default)]
pub struct SiteFilter {
    /// Only keep sites whose name contains at least one term. Empty = no
    /// include filter.
    pub include: Vec<String>,
    /// Drop sites whose name contains any term.
    pub exclude: Vec<String>,
    /// Only keep sites carrying at least one requested tag. Empty = no tag
    /// include filter.
    pub tags: Vec<String>,
    /// Drop sites carrying any of these tags.
    pub exclude_tags: Vec<String>,
    /// Include sites tagged `nsfw`. When `false`, the `nsfw` tag is added
    /// to the exclusion list automatically unless `tags` asks for it.
    pub include_nsfw: bool,
    /// Optional popularity-rank ceiling (`popularity <= top`). Sites
    /// without a popularity rank are dropped when this is set; returned
    /// sites are sorted by rank.
    pub top: Option<u32>,
}
66
67impl SiteFilter {
68    /// Apply this filter to a site slice, returning cloned sites in scan order
69    /// (or popularity order when [`top`](Self::top) is set).
70    pub fn apply(&self, sites: &[Site]) -> Vec<Site> {
71        self.apply_inner(sites, DisabledMode::Exclude)
72    }
73
74    /// Apply this filter to a site slice without dropping disabled entries.
75    /// Useful for catalogue/diagnostic surfaces that need to explain parked
76    /// sites while scan paths continue to call [`apply`](Self::apply).
77    pub fn apply_including_disabled(&self, sites: &[Site]) -> Vec<Site> {
78        self.apply_inner(sites, DisabledMode::Include)
79    }
80
81    fn apply_inner(&self, sites: &[Site], disabled_mode: DisabledMode) -> Vec<Site> {
82        let include: Vec<String> = self.include.iter().map(|s| s.to_lowercase()).collect();
83        let exclude: Vec<String> = self.exclude.iter().map(|s| s.to_lowercase()).collect();
84        let want_tags: Vec<String> = self.tags.iter().map(|s| s.to_lowercase()).collect();
85        let mut drop_tags: Vec<String> =
86            self.exclude_tags.iter().map(|s| s.to_lowercase()).collect();
87
88        // NSFW gate: auto-exclude unless the caller explicitly opted in,
89        // either via `include_nsfw` or by asking for the `nsfw` tag.
90        let nsfw_tag = "nsfw".to_owned();
91        let asking_for_nsfw = want_tags.contains(&nsfw_tag);
92        if !self.include_nsfw && !asking_for_nsfw && !drop_tags.contains(&nsfw_tag) {
93            drop_tags.push(nsfw_tag);
94        }
95
96        let mut filtered: Vec<Site> = sites
97            .iter()
98            .filter(|site| {
99                match disabled_mode {
100                    DisabledMode::Exclude if site.disabled => {
101                        // Disabled sites are skipped unconditionally in
102                        // the scan view — the bool is meant for parking
103                        // known-broken entries with a reason comment
104                        // instead of deleting them.
105                        return false;
106                    }
107                    DisabledMode::Only if !site.disabled => return false,
108                    DisabledMode::Exclude | DisabledMode::Only | DisabledMode::Include => {}
109                }
110                let name = site.name.to_lowercase();
111                let included = include.is_empty() || include.iter().any(|i| name.contains(i));
112                let excluded = exclude.iter().any(|x| name.contains(x));
113                let lower_tags: Vec<String> = site.tags.iter().map(|t| t.to_lowercase()).collect();
114                let tagged =
115                    want_tags.is_empty() || lower_tags.iter().any(|t| want_tags.contains(t));
116                let tag_excluded = lower_tags.iter().any(|t| drop_tags.contains(t));
117                included && !excluded && tagged && !tag_excluded
118            })
119            .cloned()
120            .collect();
121
122        if let Some(n) = self.top {
123            filtered.retain(|s| s.popularity.is_some_and(|p| p <= n));
124            filtered.sort_by_key(|s| s.popularity.unwrap_or(u32::MAX));
125        }
126        filtered
127    }
128}
129
/// How [`SiteFilter::apply_inner`] treats a site's `disabled` flag.
#[derive(Debug, Clone, Copy)]
enum DisabledMode {
    /// Drop disabled sites (the scan view).
    Exclude,
    /// Keep only disabled sites; enabled ones are dropped.
    Only,
    /// Ignore the flag: enabled and disabled sites both pass this gate.
    Include,
}
136
137impl Registry {
    /// Load the default site list embedded into the crate at build time.
    ///
    /// Delegates to [`Registry::from_json_str`] on the compiled-in
    /// document, so the result goes through the same engine resolution
    /// and validation as any other JSON-loaded registry.
    ///
    /// # Errors
    ///
    /// Propagates whatever [`Registry::from_json_str`] returns for the
    /// embedded document.
    pub fn default_embedded() -> Result<Self> {
        Self::from_json_str(EMBEDDED_REGISTRY)
    }
142
143    /// Load the default site list *plus* the `WhatsMyName`-derived
144    /// supplementary set. `WhatsMyName` data is licensed CC BY-SA 4.0
145    /// (see `LICENSE-CC-BY-SA-4.0` at the repo root); enabling this
146    /// path means downstream redistribution of the merged scan data
147    /// must respect the `ShareAlike` obligation. Sites contributed by
148    /// the `WhatsMyName` tranche carry the `source:wmn` tag for
149    /// provenance.
150    ///
151    /// Engines from the WMN tranche merge with the MIT tranche;
152    /// case-insensitive site-name collisions resolve in favour of the
153    /// MIT-tranche entry (the hand-curated Sherlock/Maigret-derived
154    /// signature wins; the WMN duplicate is dropped). Returns an
155    /// error only if either tranche fails its own validation —
156    /// engine references are checked across the merged set.
157    pub fn default_embedded_with_wmn() -> Result<Self> {
158        let mut base = Self::default_embedded()?;
159        let wmn: Self = serde_json::from_str(EMBEDDED_WMN_REGISTRY)?;
160        let existing_names: HashSet<String> =
161            base.sites.iter().map(|s| s.name.to_lowercase()).collect();
162        // URL-claim only counts enabled base entries — the dedup pattern
163        // keeps disabled siblings at the canonical's URL, and a WMN
164        // entry colliding with one of *those* is no worse than colliding
165        // with the canonical.
166        let claimed_urls: HashSet<String> = base
167            .sites
168            .iter()
169            .filter(|s| !s.disabled)
170            .map(|s| s.url.as_str().to_owned())
171            .collect();
172        for (name, engine) in wmn.engines {
173            base.engines.entry(name).or_insert(engine);
174        }
175        for site in wmn.sites {
176            if existing_names.contains(&site.name.to_lowercase()) {
177                continue;
178            }
179            if !site.disabled && claimed_urls.contains(site.url.as_str()) {
180                // Base already has an enabled site at this URL; WMN's
181                // version would just produce a doubled probe, and
182                // validate() would refuse the merged registry. Drop the
183                // WMN entry; base canonical wins (same precedence rule
184                // we apply for name collisions).
185                continue;
186            }
187            base.sites.push(site);
188        }
189        base.resolve_engines()?;
190        base.validate()?;
191        Ok(base)
192    }
193
    /// Parse and validate a registry from a JSON string. Engine
    /// references on each site are resolved before validation;
    /// a site that names an engine which doesn't exist in the
    /// `engines` block fails loading with [`Error::InvalidSite`].
    ///
    /// Between engine resolution and validation, `region:XX` tags are
    /// turned into soft `prefer_geo` routing hints.
    ///
    /// # Errors
    ///
    /// - [`Error::Json`] when `json` is not well-formed or does not match
    ///   the registry shape.
    /// - [`Error::InvalidSite`] when an engine reference is undefined or
    ///   the registry-level checks fail (no sites, duplicate site names,
    ///   duplicate `(URL, signals)` among enabled sites).
    /// - Any error reported by engine or per-site validation.
    pub fn from_json_str(json: &str) -> Result<Self> {
        let mut registry: Self = serde_json::from_str(json)?;
        // Inheritance and tag-derived policy run first so that validation
        // sees the effective site definitions.
        registry.resolve_engines()?;
        registry.apply_tag_derived_policy();
        registry.validate()?;
        Ok(registry)
    }
205
    /// Inheritable engine templates, keyed by name. Useful for
    /// introspection and for serialising the registry back out;
    /// detection paths read the resolved fields off the sites
    /// directly and don't need to consult this map.
    ///
    /// The map is a [`BTreeMap`], so iteration is ordered by engine name.
    pub fn engines(&self) -> &BTreeMap<String, Engine> {
        &self.engines
    }
213
214    /// Walk every site's tags for `region:XX` markers and fill
215    /// [`AccessPolicy::prefer_geo`](crate::AccessPolicy::prefer_geo)
216    /// with the matching country codes. **Soft** routing only — a
217    /// site declaring `region:ru` *prefers* a Russian egress when one
218    /// is configured but still works from anywhere else; the router
219    /// falls back to the default egress on no match rather than
220    /// reporting `Uncertain(GeoUnavailable)`.
221    ///
222    /// Skips sites that already declare a hard
223    /// [`AccessPolicy::geo`](crate::AccessPolicy::geo) — explicit
224    /// policy wins on conflict, same convention as engine inheritance.
225    /// Existing `prefer_geo` entries are also preserved: tag-derived
226    /// codes are *added*, not replaced, so a hand-tuned policy can
227    /// stack on top of the tag. Invalid country codes (`region:xx` is
228    /// not exactly two ASCII letters) are silently skipped — they
229    /// shouldn't exist in the registry, and a parse error here would
230    /// break the load for a tag the scanner already ignores in
231    /// every other context.
232    fn apply_tag_derived_policy(&mut self) {
233        for site in &mut self.sites {
234            if !site.access.geo.is_empty() {
235                continue;
236            }
237            for tag in &site.tags {
238                let Some(rest) = tag.strip_prefix("region:") else {
239                    continue;
240                };
241                let Some(cc) = crate::access::CountryCode::new(rest) else {
242                    continue;
243                };
244                if !site.access.prefer_geo.contains(&cc) {
245                    site.access.prefer_geo.push(cc);
246                }
247            }
248        }
249    }
250
251    /// Merge each engine's inheritable fields into the sites that
252    /// reference it. After this call every site's `signals`,
253    /// `request_headers` and `regex_check` reflect the effective
254    /// values used by the scanner.
255    ///
256    /// Per-site fields are authoritative: anything declared
257    /// explicitly on a site wins on conflict; only empty / unset
258    /// fields are filled from the engine.
259    fn resolve_engines(&mut self) -> Result<()> {
260        for (name, engine) in &self.engines {
261            engine.validate(name)?;
262        }
263        for site in &mut self.sites {
264            let Some(name) = &site.engine else {
265                continue;
266            };
267            let Some(engine) = self.engines.get(name) else {
268                return Err(Error::InvalidSite {
269                    reason: format!(
270                        "site {:?}: references engine {name:?} which is not defined",
271                        site.name
272                    ),
273                });
274            };
275            engine.merge_into(site);
276        }
277        Ok(())
278    }
279
280    /// Read a registry from a JSON file.
281    pub fn load_from_path(path: impl AsRef<Path>) -> Result<Self> {
282        let bytes = std::fs::read(path)?;
283        let json = std::str::from_utf8(&bytes).map_err(|e| Error::InvalidSite {
284            reason: format!("registry file is not valid UTF-8: {e}"),
285        })?;
286        Self::from_json_str(json)
287    }
288
    /// Borrow all sites in load order.
    ///
    /// This is the unfiltered view: disabled entries are included. Use
    /// [`Registry::filter`] / [`Registry::filter_with`] for the scan view.
    pub fn sites(&self) -> &[Site] {
        &self.sites
    }
293
    /// Number of sites, counting disabled entries as well (this is the
    /// length of [`Registry::sites`], not of a filtered scan set).
    pub fn len(&self) -> usize {
        self.sites.len()
    }
298
    /// True if the registry has no sites (always false for a valid load,
    /// since we'd already have rejected it — validation fails with
    /// "registry has no sites").
    pub fn is_empty(&self) -> bool {
        self.sites.is_empty()
    }
304
305    /// Apply include/exclude name filters and a tag filter.
306    ///
307    /// - If `include` is non-empty, only sites whose name contains at least
308    ///   one include term (case-insensitive substring) are kept.
309    /// - Sites whose name contains any exclude term are dropped.
310    /// - If `tags` is non-empty, only sites carrying at least one of the
311    ///   requested tags are kept (case-insensitive). A site with no tags is
312    ///   therefore dropped by a tag filter — asking for `--tag social` means
313    ///   "only social-tagged sites".
314    /// - Sites carrying any tag in `exclude_tags` are dropped (e.g.
315    ///   `--exclude-tag bot-protected` for a fast clean run).
316    /// - **NSFW sites are auto-excluded** (the `nsfw` tag) unless
317    ///   `include_nsfw` is `true` or `tags` explicitly asks for `nsfw`.
318    ///   This matches Sherlock's `--nsfw` opt-in pattern and prevents
319    ///   the default `adler <username>` from surfacing adult-site URLs
320    ///   the user didn't ask for.
321    /// - Sites are returned by value (cloned) so the result is independent
322    ///   of the registry's lifetime — convenient for handing to the executor.
323    pub fn filter(
324        &self,
325        include: &[String],
326        exclude: &[String],
327        tags: &[String],
328        exclude_tags: &[String],
329        include_nsfw: bool,
330    ) -> Vec<Site> {
331        self.filter_with(&SiteFilter {
332            include: include.to_vec(),
333            exclude: exclude.to_vec(),
334            tags: tags.to_vec(),
335            exclude_tags: exclude_tags.to_vec(),
336            include_nsfw,
337            top: None,
338        })
339    }
340
    /// Apply a reusable [`SiteFilter`] to this registry.
    ///
    /// This is the scan view: it goes through [`SiteFilter::apply`], so
    /// disabled sites are dropped. The matching sites are returned as
    /// clones.
    pub fn filter_with(&self, filter: &SiteFilter) -> Vec<Site> {
        filter.apply(&self.sites)
    }
345
    /// Apply a reusable [`SiteFilter`] without dropping disabled entries.
    /// This is the catalogue view: scans still call
    /// [`filter_with`](Self::filter_with), while UI/agent surfaces can keep
    /// parked entries visible with their reasons.
    ///
    /// All other criteria of the filter (names, tags, NSFW gate, `top`)
    /// still apply; only the `disabled` flag is ignored.
    pub fn matches_with(&self, filter: &SiteFilter) -> Vec<Site> {
        filter.apply_including_disabled(&self.sites)
    }
353
    /// Apply a reusable [`SiteFilter`] but return only disabled/parked
    /// entries. This is for diagnostics: scan surfaces still call
    /// [`filter_with`](Self::filter_with), while CLIs and UIs can use this
    /// to explain why an otherwise matching site is unavailable.
    ///
    /// Enabled sites are dropped; a disabled site is returned only if it
    /// also satisfies the filter's name, tag, NSFW and `top` criteria.
    pub fn disabled_matches_with(&self, filter: &SiteFilter) -> Vec<Site> {
        filter.apply_inner(&self.sites, DisabledMode::Only)
    }
361
362    /// Distinct tags across all sites, sorted, with the count of sites
363    /// carrying each. Powers `--list-tags`.
364    pub fn tag_counts(&self) -> Vec<(String, usize)> {
365        let mut counts: std::collections::BTreeMap<String, usize> =
366            std::collections::BTreeMap::new();
367        for site in &self.sites {
368            for tag in &site.tags {
369                *counts.entry(tag.clone()).or_insert(0) += 1;
370            }
371        }
372        counts.into_iter().collect()
373    }
374
375    fn validate(&self) -> Result<()> {
376        if self.sites.is_empty() {
377            return Err(Error::InvalidSite {
378                reason: "registry has no sites".into(),
379            });
380        }
381        for site in &self.sites {
382            site.validate()?;
383        }
384        let mut seen: HashSet<String> = HashSet::new();
385        for site in &self.sites {
386            let key = site.name.to_lowercase();
387            if !seen.insert(key) {
388                return Err(Error::InvalidSite {
389                    reason: format!("duplicate site name: {:?}", site.name),
390                });
391            }
392        }
393        // (URL, signals) uniqueness among ENABLED sites: each
394        // (URL template, signal-set) pair should back exactly one live
395        // entry. Disabled entries can legitimately share URLs with
396        // their canonicals — that's how the `duplicate of <canonical>`
397        // dedup pattern works. A second enabled hit at the same URL
398        // *and* the same signal array is almost always an importer
399        // re-introducing a known duplicate
400        // (Sherlock/Maigret/WhatsMyName each name the same site
401        // slightly differently); the doctor would otherwise
402        // double-probe the URL for an identical verdict.
403        //
404        // Same URL with *distinct* signals is the legitimate-alias
405        // shape — WordPress.com (Public/Private/Deleted) hit the same
406        // API endpoint and disambiguate via their `body_present`
407        // marker, which the doctor reads as three independent verdicts.
408        let mut seen_url_sig: HashMap<(String, String), &str> = HashMap::new();
409        for site in &self.sites {
410            if site.disabled {
411                continue;
412            }
413            // `serde_json` for the signal key gives a canonical
414            // serialisation that doesn't depend on field-order or
415            // `Debug` formatting, both of which could shift between
416            // Rust releases or after a `#[derive(Debug)]` rearrange.
417            // serde_json is already a workspace dep; the cost is
418            // ~one allocation per enabled site at load time.
419            let sigs_key = serde_json::to_string(&site.signals)
420                .expect("Signal derives Serialize and contains no Map<_, _> with non-string keys");
421            let key = (site.url.as_str().to_owned(), sigs_key);
422            if let Some(prev) = seen_url_sig.insert(key, site.name.as_str()) {
423                return Err(Error::InvalidSite {
424                    reason: format!(
425                        "duplicate (URL, signals) among enabled sites: {:?} and {:?} both back \
426                         {:?} with identical signals. Mark one `disabled: true` with \
427                         `disabled_reason: \"duplicate of {prev}\"` (or, if the two entries are \
428                         supposed to disambiguate via different markers, give each a distinct \
429                         signal set).",
430                        prev,
431                        site.name,
432                        site.url.as_str(),
433                    ),
434                });
435            }
436        }
437        Ok(())
438    }
439}
440
441#[cfg(test)]
442mod tests {
443    use super::*;
444
445    #[test]
446    fn embedded_registry_loads_and_validates() {
447        let registry = Registry::default_embedded().expect("embedded registry must load");
448        // The registry is imported from Sherlock (~450 sites); a floor well
449        // above the old hand-written 15 guards against accidental truncation.
450        assert!(
451            registry.len() >= 100,
452            "imported registry should have ≥100 sites, got {}",
453            registry.len()
454        );
455        // Spot-check a couple of well-known entries. (HackerNews used
456        // to be here but was pruned 2026-05-26 — its Sherlock-side
457        // known_present went stale and the imported signature
458        // doctor-failed; can be restored via OVERRIDES in
459        // import_sherlock.py with a working account.)
460        let names: Vec<&str> = registry.sites().iter().map(|s| s.name.as_str()).collect();
461        assert!(names.contains(&"GitHub"));
462        assert!(names.contains(&"Reddit"));
463        assert!(names.contains(&"Telegram"));
464    }
465
466    #[test]
467    fn wmn_embedded_registry_loads_and_supersets_default() {
468        let base = Registry::default_embedded().unwrap();
469        let merged = Registry::default_embedded_with_wmn().expect("WMN-merged registry must load");
470        assert!(
471            merged.len() > base.len(),
472            "WMN merge must add sites: base={} merged={}",
473            base.len(),
474            merged.len()
475        );
476        // Every base-tranche name survives the merge; case-insensitive
477        // collisions resolve in favour of the MIT-tranche entry.
478        let merged_names: HashSet<String> = merged
479            .sites()
480            .iter()
481            .map(|s| s.name.to_lowercase())
482            .collect();
483        for s in base.sites() {
484            assert!(
485                merged_names.contains(&s.name.to_lowercase()),
486                "merge dropped base-tranche site {:?}",
487                s.name
488            );
489        }
490        // At least one WMN-only site carries the provenance tag.
491        let has_wmn_tag = merged
492            .sites()
493            .iter()
494            .any(|s| s.tags.iter().any(|t| t == "source:wmn"));
495        assert!(has_wmn_tag, "no site carries the source:wmn tag");
496    }
497
498    #[test]
499    fn rejects_empty_registry() {
500        let err = Registry::from_json_str(r#"{ "sites": [] }"#).unwrap_err();
501        assert!(matches!(err, Error::InvalidSite { .. }));
502    }
503
504    #[test]
505    fn rejects_duplicate_site_names() {
506        let json = r#"{
507            "sites": [
508                { "name": "GitHub", "url": "https://github.com/{username}",
509                  "signals": [{ "kind": "status_found", "codes": [200] }] },
510                { "name": "github", "url": "https://github.com/{username}",
511                  "signals": [{ "kind": "status_found", "codes": [200] }] }
512            ]
513        }"#;
514        let err = Registry::from_json_str(json).unwrap_err();
515        assert!(matches!(err, Error::InvalidSite { .. }));
516        assert!(err.to_string().contains("duplicate"));
517    }
518
519    #[test]
520    fn rejects_duplicate_enabled_urls() {
521        // Two enabled sites at the same URL is almost always an importer
522        // re-introducing a known duplicate. Reject at load time with a
523        // message naming both entries.
524        let json = r#"{
525            "sites": [
526                { "name": "Hub Code", "url": "https://example.com/{username}",
527                  "signals": [{ "kind": "status_found", "codes": [200] }] },
528                { "name": "HubCode", "url": "https://example.com/{username}",
529                  "signals": [{ "kind": "status_found", "codes": [200] }] }
530            ]
531        }"#;
532        let err = Registry::from_json_str(json).unwrap_err();
533        assert!(matches!(err, Error::InvalidSite { .. }));
534        let msg = err.to_string();
535        assert!(msg.contains("duplicate (URL, signals)"), "msg: {msg}");
536        assert!(msg.contains("Hub Code"), "msg: {msg}");
537        assert!(msg.contains("HubCode"), "msg: {msg}");
538    }
539
540    #[test]
541    fn allows_duplicate_urls_with_distinct_signals() {
542        // Same URL, distinct signal sets — this is the legitimate-alias
543        // shape (e.g. WordPress.com (Public/Private/Deleted) hit one
544        // endpoint and disambiguate via the body marker). Must NOT
545        // trigger the URL-uniqueness rule.
546        let json = r#"{
547            "sites": [
548                { "name": "Site Public", "url": "https://example.com/{username}",
549                  "signals": [{ "kind": "status_found", "codes": [200] }] },
550                { "name": "Site Private", "url": "https://example.com/{username}",
551                  "signals": [{ "kind": "status_found", "codes": [403] }] }
552            ]
553        }"#;
554        let registry = Registry::from_json_str(json).expect("distinct-signal alias must validate");
555        assert_eq!(registry.len(), 2);
556    }
557
558    #[test]
559    fn allows_duplicate_urls_when_one_side_is_disabled() {
560        // The dedup pattern that the v0.14 hygiene pass established:
561        // canonical stays enabled, the surplus entry gets
562        // `disabled: true` + `disabled_reason: "duplicate of <canonical>"`.
563        // This shape must continue loading cleanly.
564        let json = r#"{
565            "sites": [
566                { "name": "Hub Code", "url": "https://example.com/{username}",
567                  "signals": [{ "kind": "status_found", "codes": [200] }] },
568                { "name": "HubCode", "url": "https://example.com/{username}",
569                  "signals": [{ "kind": "status_found", "codes": [200] }],
570                  "disabled": true,
571                  "disabled_reason": "duplicate of Hub Code" }
572            ]
573        }"#;
574        let registry = Registry::from_json_str(json).expect("dedup pattern must validate");
575        assert_eq!(registry.len(), 2);
576    }
577
578    #[test]
579    fn rejects_invalid_site_definition() {
580        // Missing {username} placeholder.
581        let json = r#"{
582            "sites": [
583                { "name": "Bad", "url": "https://example.com/",
584                  "signals": [{ "kind": "status_found", "codes": [200] }] }
585            ]
586        }"#;
587        assert!(Registry::from_json_str(json).is_err());
588    }
589
590    #[test]
591    fn rejects_malformed_json() {
592        let err = Registry::from_json_str("{").unwrap_err();
593        assert!(matches!(err, Error::Json(_)));
594    }
595
596    #[test]
597    fn filter_include_is_case_insensitive_substring() {
598        let registry = Registry::default_embedded().unwrap();
599        let only_github = registry.filter(&["github".into()], &[], &[], &[], false);
600        assert_eq!(only_github.len(), 1);
601        assert_eq!(only_github[0].name, "GitHub");
602
603        let many = registry.filter(&["e".into()], &[], &[], &[], false); // matches anything with "e"
604        assert!(many.len() > 1);
605    }
606
607    #[test]
608    fn filter_exclude_drops_matches() {
609        let registry = Registry::default_embedded().unwrap();
610        // Include NSFW to keep the test focused on the name-exclude
611        // path; the NSFW auto-exclusion is exercised separately.
612        let baseline = registry.filter(&[], &[], &[], &[], true);
613        let without_github = registry.filter(&[], &["github".into()], &[], &[], true);
614        assert!(without_github.iter().all(|s| s.name != "GitHub"));
615        // Asserting against the baseline (filtered) count rather than
616        // `registry.len()` so this test is robust to changes in the
617        // disabled-site count — `len()` includes disabled entries,
618        // `filter()` does not.
619        assert_eq!(without_github.len(), baseline.len() - 1);
620    }
621
622    #[test]
623    fn filter_include_and_exclude_compose() {
624        let registry = Registry::default_embedded().unwrap();
625        // Include "git", then exclude "lab" → keep GitHub, drop GitLab.
626        let filtered = registry.filter(&["git".into()], &["lab".into()], &[], &[], false);
627        let names: Vec<&str> = filtered.iter().map(|s| s.name.as_str()).collect();
628        assert!(names.contains(&"GitHub"));
629        assert!(!names.contains(&"GitLab"));
630        // Exclude wins over include for sites containing both terms (none here).
631    }
632
633    #[test]
634    fn filter_with_no_matches_returns_empty() {
635        let registry = Registry::default_embedded().unwrap();
636        let filtered = registry.filter(&["does-not-exist-xyz".into()], &[], &[], &[], false);
637        assert!(filtered.is_empty());
638    }
639
640    #[test]
641    fn disabled_sites_are_skipped_by_filter() {
642        let json = r#"{
643            "sites": [
644                { "name": "Alive", "url": "https://alive.example/{username}",
645                  "signals": [{ "kind": "status_found", "codes": [200] }] },
646                { "name": "Parked", "url": "https://parked.example/{username}",
647                  "signals": [{ "kind": "status_found", "codes": [200] }],
648                  "disabled": true }
649            ]
650        }"#;
651        let registry = Registry::from_json_str(json).unwrap();
652        // sites() returns everything including disabled — it's the
653        // serialisation view. filter() is the scan view and drops
654        // disabled entries.
655        assert_eq!(registry.sites().len(), 2);
656        let scanned = registry.filter(&[], &[], &[], &[], false);
657        let names: Vec<&str> = scanned.iter().map(|s| s.name.as_str()).collect();
658        assert_eq!(names, vec!["Alive"]);
659    }
660
661    #[test]
662    fn disabled_matches_with_explains_parked_filter_hits() {
663        let json = r#"{
664            "sites": [
665                { "name": "Alive", "url": "https://alive.example/{username}",
666                  "signals": [{ "kind": "status_found", "codes": [200] }] },
667                { "name": "TikTok", "url": "https://tiktok.example/@{username}",
668                  "signals": [{ "kind": "status_found", "codes": [200] }],
669                  "disabled": true,
670                  "disabled_reason": "Honest Limits: parked",
671                  "tags": ["social"] }
672            ]
673        }"#;
674        let registry = Registry::from_json_str(json).unwrap();
675        let filter = SiteFilter {
676            include: vec!["tiktok".into()],
677            tags: vec!["social".into()],
678            ..SiteFilter::default()
679        };
680
681        assert!(registry.filter_with(&filter).is_empty());
682        let disabled = registry.disabled_matches_with(&filter);
683        assert_eq!(disabled.len(), 1);
684        assert_eq!(disabled[0].name, "TikTok");
685        assert_eq!(
686            disabled[0].disabled_reason.as_deref(),
687            Some("Honest Limits: parked")
688        );
689    }
690
691    #[test]
692    fn threads_stays_parked_behind_login_wall() {
693        let registry = Registry::default_embedded().unwrap();
694        let threads = registry
695            .sites()
696            .iter()
697            .find(|s| s.name == "Threads")
698            .expect("Threads entry should document the login-wall limitation");
699
700        assert!(threads.disabled, "Threads must not be probed by default");
701        let reason = threads
702            .disabled_reason
703            .as_deref()
704            .expect("disabled Threads entry should explain why it is parked");
705        assert!(
706            reason.contains("Honest Limits") && reason.contains("indistinguishable"),
707            "unexpected Threads disabled_reason: {reason}"
708        );
709
710        let scanned = registry.filter(&["threads".into()], &[], &[], &[], true);
711        assert!(
712            scanned.is_empty(),
713            "disabled Threads entry must not leak into scan filters"
714        );
715    }
716
717    #[test]
718    fn reddit_uses_oauth_endpoint_and_requires_session() {
719        let registry = Registry::default_embedded_with_wmn().unwrap();
720        let reddit_entries: Vec<&Site> = registry
721            .sites()
722            .iter()
723            .filter(|s| s.name == "Reddit")
724            .collect();
725
726        assert_eq!(
727            reddit_entries.len(),
728            1,
729            "WMN merge must not reintroduce a second Reddit probe"
730        );
731        let reddit = reddit_entries[0];
732        assert!(!reddit.disabled, "Reddit OAuth probe should remain enabled");
733        assert_eq!(
734            reddit.url.as_str(),
735            "https://oauth.reddit.com/user/{username}/about"
736        );
737        assert_eq!(reddit.access.session.as_deref(), Some("reddit"));
738        assert!(
739            reddit
740                .protection
741                .iter()
742                .any(|p| matches!(p, super::super::site::ProtectionKind::UserAuth)),
743            "Reddit should be classified as requiring user auth"
744        );
745        assert!(
746            reddit.tags.iter().any(|t| t == "reddit-oauth"),
747            "Reddit should be discoverable as an OAuth-gated site"
748        );
749        assert!(
750            reddit
751                .tags
752                .iter()
753                .all(|t| !t.eq_ignore_ascii_case("bot-protected")),
754            "Reddit OAuth should use HTTP session headers, not browser routing"
755        );
756
757        let scanned = registry.filter(&["reddit".into()], &[], &[], &[], true);
758        assert_eq!(
759            scanned.iter().filter(|s| s.name == "Reddit").count(),
760            1,
761            "enabled Reddit OAuth entry should be scan-filterable"
762        );
763    }
764
765    #[test]
766    fn tiktok_stays_parked_behind_hydration_wall() {
767        let registry = Registry::default_embedded_with_wmn().unwrap();
768        let tiktok_entries: Vec<&Site> = registry
769            .sites()
770            .iter()
771            .filter(|s| s.name == "TikTok")
772            .collect();
773
774        assert_eq!(
775            tiktok_entries.len(),
776            1,
777            "WMN merge must not reintroduce TikTok's oEmbed probe"
778        );
779        let tiktok = tiktok_entries[0];
780        assert!(tiktok.disabled, "TikTok must not be probed by default");
781        assert!(
782            tiktok
783                .protection
784                .iter()
785                .any(|p| matches!(p, super::super::site::ProtectionKind::Captcha)),
786            "TikTok should be classified as captcha/headless protected"
787        );
788        let reason = tiktok
789            .disabled_reason
790            .as_deref()
791            .expect("disabled TikTok entry should explain why it is parked");
792        assert!(
793            reason.contains("Honest Limits")
794                && reason.contains("JS-only SPA")
795                && reason.contains("never hydrates"),
796            "unexpected TikTok disabled_reason: {reason}"
797        );
798
799        let scanned = registry.filter(&["tiktok".into()], &[], &[], &[], true);
800        assert!(
801            scanned.iter().all(|s| s.name != "TikTok"),
802            "disabled TikTok entry must not leak into scan filters"
803        );
804    }
805
806    #[test]
807    fn pinterest_uses_oembed_instead_of_js_shell() {
808        let registry = Registry::default_embedded_with_wmn().unwrap();
809        let pinterest_entries: Vec<&Site> = registry
810            .sites()
811            .iter()
812            .filter(|s| s.name == "Pinterest")
813            .collect();
814
815        assert_eq!(
816            pinterest_entries.len(),
817            1,
818            "WMN merge must not reintroduce Pinterest's canonical JS shell"
819        );
820        let pinterest = pinterest_entries[0];
821        assert!(
822            !pinterest.disabled,
823            "Pinterest oEmbed probe should remain enabled"
824        );
825        assert!(
826            pinterest.url.as_str().contains("/oembed.json"),
827            "Pinterest should use the oEmbed endpoint, got {}",
828            pinterest.url.as_str()
829        );
830        assert!(
831            pinterest.url.as_str() != "https://www.pinterest.com/{username}/",
832            "Pinterest must not fall back to the canonical JS shell"
833        );
834
835        let scanned = registry.filter(&["pinterest".into()], &[], &[], &[], true);
836        assert_eq!(
837            scanned.iter().filter(|s| s.name == "Pinterest").count(),
838            1,
839            "enabled Pinterest oEmbed entry should be scan-filterable"
840        );
841    }
842
843    #[test]
844    fn source_field_round_trips() {
845        let json = r#"{
846            "sites": [
847                { "name": "Nitter", "url": "https://nitter.example/{username}",
848                  "signals": [{ "kind": "status_found", "codes": [200] }],
849                  "source": "Twitter" }
850            ]
851        }"#;
852        let registry = Registry::from_json_str(json).unwrap();
853        assert_eq!(registry.sites()[0].source.as_deref(), Some("Twitter"));
854    }
855
856    fn tagged_registry() -> Registry {
857        let json = r#"{
858            "sites": [
859                { "name": "Soc", "url": "https://soc.example/{username}",
860                  "signals": [{ "kind": "status_found", "codes": [200] }],
861                  "tags": ["social", "region:ru"] },
862                { "name": "Dev", "url": "https://dev.example/{username}",
863                  "signals": [{ "kind": "status_found", "codes": [200] }],
864                  "tags": ["dev"] },
865                { "name": "Plain", "url": "https://plain.example/{username}",
866                  "signals": [{ "kind": "status_found", "codes": [200] }] }
867            ]
868        }"#;
869        Registry::from_json_str(json).unwrap()
870    }
871
872    #[test]
873    fn tag_filter_keeps_only_matching_tags_and_drops_untagged() {
874        let r = tagged_registry();
875        let social = r.filter(&[], &[], &["social".into()], &[], false);
876        let names: Vec<&str> = social.iter().map(|s| s.name.as_str()).collect();
877        assert_eq!(names, ["Soc"], "tag filter should keep only tagged matches");
878    }
879
880    #[test]
881    fn tag_filter_is_or_within_requested_tags_and_case_insensitive() {
882        let r = tagged_registry();
883        let either = r.filter(&[], &[], &["DEV".into(), "social".into()], &[], false);
884        let names: Vec<&str> = either.iter().map(|s| s.name.as_str()).collect();
885        assert_eq!(names, ["Soc", "Dev"]);
886    }
887
888    #[test]
889    fn no_tag_filter_includes_untagged_sites() {
890        let r = tagged_registry();
891        assert_eq!(r.filter(&[], &[], &[], &[], false).len(), 3);
892    }
893
894    #[test]
895    fn exclude_tag_drops_matching_sites() {
896        let r = tagged_registry();
897        let kept = r.filter(&[], &[], &[], &["social".into()], false);
898        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
899        // Soc carries "social" → dropped; Dev and untagged Plain remain.
900        assert_eq!(names, ["Dev", "Plain"], "{names:?}");
901    }
902
903    fn nsfw_registry() -> Registry {
904        let json = r#"{
905            "sites": [
906                { "name": "Family", "url": "https://family.example/{username}",
907                  "signals": [{ "kind": "status_found", "codes": [200] }],
908                  "tags": ["social"] },
909                { "name": "Adult", "url": "https://adult.example/{username}",
910                  "signals": [{ "kind": "status_found", "codes": [200] }],
911                  "tags": ["nsfw"] }
912            ]
913        }"#;
914        Registry::from_json_str(json).unwrap()
915    }
916
917    #[test]
918    fn nsfw_sites_excluded_by_default() {
919        let r = nsfw_registry();
920        let kept = r.filter(&[], &[], &[], &[], false);
921        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
922        assert_eq!(names, ["Family"], "nsfw site must be excluded by default");
923    }
924
925    #[test]
926    fn nsfw_sites_included_when_flag_set() {
927        let r = nsfw_registry();
928        let kept = r.filter(&[], &[], &[], &[], true);
929        assert_eq!(kept.len(), 2, "both sites present with include_nsfw=true");
930    }
931
932    #[test]
933    fn nsfw_sites_included_when_tag_asked_for_explicitly() {
934        // `--tag nsfw` is an explicit opt-in; should bypass the default
935        // auto-exclusion even with include_nsfw=false.
936        let r = nsfw_registry();
937        let kept = r.filter(&[], &[], &["nsfw".into()], &[], false);
938        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
939        assert_eq!(names, ["Adult"]);
940    }
941
942    #[test]
943    fn tag_counts_are_sorted_with_per_tag_totals() {
944        let r = tagged_registry();
945        assert_eq!(
946            r.tag_counts(),
947            vec![
948                ("dev".to_owned(), 1),
949                ("region:ru".to_owned(), 1),
950                ("social".to_owned(), 1),
951            ]
952        );
953    }
954
955    #[test]
956    fn engine_inheritance_fills_empty_site_signals() {
957        // Site has no `signals` block — should inherit the engine's.
958        let json = r#"{
959            "engines": {
960                "Discourse": {
961                    "signals": [
962                        { "kind": "status_found", "codes": [200] },
963                        { "kind": "body_absent", "text": "Oops! That page doesn't exist" }
964                    ]
965                }
966            },
967            "sites": [
968                { "name": "Mozilla Forum", "url": "https://discourse.mozilla.org/u/{username}",
969                  "engine": "Discourse" }
970            ]
971        }"#;
972        let r = Registry::from_json_str(json).unwrap();
973        let site = &r.sites()[0];
974        assert_eq!(site.signals.len(), 2);
975        assert_eq!(site.engine.as_deref(), Some("Discourse"));
976        // engines map preserved
977        assert!(r.engines().contains_key("Discourse"));
978    }
979
980    #[test]
981    fn site_overrides_engine_signals_on_conflict() {
982        // Site declares its own `signals` — engine's must NOT replace them.
983        let json = r#"{
984            "engines": {
985                "Discourse": {
986                    "signals": [{ "kind": "status_found", "codes": [200] }]
987                }
988            },
989            "sites": [
990                { "name": "Custom", "url": "https://example.com/{username}",
991                  "engine": "Discourse",
992                  "signals": [
993                    { "kind": "status_found", "codes": [200] },
994                    { "kind": "status_not_found", "codes": [404] }
995                  ] }
996            ]
997        }"#;
998        let r = Registry::from_json_str(json).unwrap();
999        // The site-declared 2 signals win over the engine's 1 signal.
1000        assert_eq!(r.sites()[0].signals.len(), 2);
1001    }
1002
1003    #[test]
1004    fn engine_headers_merge_with_site_headers_per_key() {
1005        // Engine declares one header; site declares another. Resolved
1006        // site should carry both. On per-key conflict the site wins.
1007        let json = r#"{
1008            "engines": {
1009                "Foo": {
1010                    "signals": [{ "kind": "status_found", "codes": [200] }],
1011                    "request_headers": {
1012                        "X-Engine": "engine-value",
1013                        "User-Agent": "engine-ua"
1014                    }
1015                }
1016            },
1017            "sites": [
1018                { "name": "S", "url": "https://example.com/{username}",
1019                  "engine": "Foo",
1020                  "request_headers": { "User-Agent": "site-ua" } }
1021            ]
1022        }"#;
1023        let r = Registry::from_json_str(json).unwrap();
1024        let h = &r.sites()[0].request_headers;
1025        assert_eq!(h.get("X-Engine").map(String::as_str), Some("engine-value"));
1026        assert_eq!(h.get("User-Agent").map(String::as_str), Some("site-ua"));
1027    }
1028
1029    #[test]
1030    fn missing_engine_reference_fails_load() {
1031        let json = r#"{
1032            "engines": {},
1033            "sites": [
1034                { "name": "Mock", "url": "https://example.com/{username}",
1035                  "engine": "DoesNotExist" }
1036            ]
1037        }"#;
1038        let err = Registry::from_json_str(json).unwrap_err();
1039        assert!(
1040            err.to_string()
1041                .contains("references engine \"DoesNotExist\""),
1042            "expected missing-engine error, got: {err}"
1043        );
1044    }
1045
1046    #[test]
1047    fn engine_regex_check_inherited_when_site_has_none() {
1048        let json = r#"{
1049            "engines": {
1050                "Bounded": {
1051                    "signals": [{ "kind": "status_found", "codes": [200] }],
1052                    "regex_check": "^[a-z]{3,16}$"
1053                }
1054            },
1055            "sites": [
1056                { "name": "S", "url": "https://example.com/{username}",
1057                  "engine": "Bounded" }
1058            ]
1059        }"#;
1060        let r = Registry::from_json_str(json).unwrap();
1061        assert_eq!(r.sites()[0].regex_check.as_deref(), Some("^[a-z]{3,16}$"));
1062    }
1063
1064    #[test]
1065    fn region_tag_auto_populates_prefer_geo() {
1066        let json = r#"{
1067            "sites": [
1068                { "name": "vk.com", "url": "https://vk.com/{username}",
1069                  "signals": [{ "kind": "status_found", "codes": [200] }],
1070                  "tags": ["region:ru", "social"] }
1071            ]
1072        }"#;
1073        let r = Registry::from_json_str(json).unwrap();
1074        let prefer = &r.sites()[0].access.prefer_geo;
1075        assert_eq!(prefer.len(), 1);
1076        assert_eq!(prefer[0].as_str(), "ru");
1077        // Hard geo stays empty — the tag is soft.
1078        assert!(r.sites()[0].access.geo.is_empty());
1079    }
1080
1081    #[test]
1082    fn multiple_region_tags_stack() {
1083        let json = r#"{
1084            "sites": [
1085                { "name": "Pan-Slavic", "url": "https://example.test/{username}",
1086                  "signals": [{ "kind": "status_found", "codes": [200] }],
1087                  "tags": ["region:ru", "region:by", "region:ua"] }
1088            ]
1089        }"#;
1090        let r = Registry::from_json_str(json).unwrap();
1091        let codes: Vec<&str> = r.sites()[0]
1092            .access
1093            .prefer_geo
1094            .iter()
1095            .map(super::super::access::CountryCode::as_str)
1096            .collect();
1097        assert_eq!(codes, vec!["ru", "by", "ua"]);
1098    }
1099
1100    #[test]
1101    fn explicit_hard_geo_suppresses_tag_derived_soft() {
1102        // A site with hard `access.geo = ["pl"]` AND a `region:ru` tag:
1103        // the explicit hard policy wins, prefer_geo stays empty.
1104        // Otherwise tag-derived soft would silently re-route a probe
1105        // that the maintainer deliberately pinned to PL.
1106        let json = r#"{
1107            "sites": [
1108                { "name": "PL-only", "url": "https://example.test/{username}",
1109                  "signals": [{ "kind": "status_found", "codes": [200] }],
1110                  "tags": ["region:ru"],
1111                  "access": { "geo": ["pl"] } }
1112            ]
1113        }"#;
1114        let r = Registry::from_json_str(json).unwrap();
1115        assert_eq!(r.sites()[0].access.geo[0].as_str(), "pl");
1116        assert!(r.sites()[0].access.prefer_geo.is_empty());
1117    }
1118
1119    #[test]
1120    fn malformed_region_tag_is_ignored() {
1121        // `region:` followed by something that isn't a 2-letter code:
1122        // skip it silently rather than reject the whole load. The tag
1123        // already had no routing semantics in older versions.
1124        let json = r#"{
1125            "sites": [
1126                { "name": "Weird", "url": "https://example.test/{username}",
1127                  "signals": [{ "kind": "status_found", "codes": [200] }],
1128                  "tags": ["region:eurasia", "region:r", "region:RU"] }
1129            ]
1130        }"#;
1131        let r = Registry::from_json_str(json).unwrap();
1132        // Only the valid 2-letter "RU" survives (lowercased to "ru").
1133        let codes: Vec<&str> = r.sites()[0]
1134            .access
1135            .prefer_geo
1136            .iter()
1137            .map(super::super::access::CountryCode::as_str)
1138            .collect();
1139        assert_eq!(codes, vec!["ru"]);
1140    }
1141
1142    #[test]
1143    fn load_from_path_round_trips_via_tempfile() {
1144        let mut path = std::env::temp_dir();
1145        path.push(format!("adler-test-registry-{}.json", std::process::id()));
1146        std::fs::write(
1147            &path,
1148            r#"{
1149                "sites": [
1150                    { "name": "Mock", "url": "https://example.com/{username}",
1151                      "signals": [{ "kind": "status_found", "codes": [200] }] }
1152                ]
1153            }"#,
1154        )
1155        .unwrap();
1156        let result = Registry::load_from_path(&path);
1157        let _ = std::fs::remove_file(&path);
1158        let registry = result.unwrap();
1159        assert_eq!(registry.len(), 1);
1160        assert_eq!(registry.sites()[0].name, "Mock");
1161    }
1162}
adler_core/registry.rs

adler_core/
registry.rs