Skip to main content

adler_core/
registry.rs

1//! Site registry — loading, validation, filtering.
2//!
3//! The default registry is embedded into the binary at compile time via
4//! [`include_str!`]. Callers can override it with a file at runtime through
5//! [`Registry::load_from_path`].
6
7use std::collections::{BTreeMap, HashSet};
8use std::path::Path;
9
10use serde::Deserialize;
11
12use crate::error::{Error, Result};
13use crate::site::{Engine, Site};
14
/// Default site registry as raw JSON, embedded into the binary at
/// compile time from `../data/sites.json`. Parsed and validated by
/// [`Registry::default_embedded`].
const EMBEDDED_REGISTRY: &str = include_str!("../data/sites.json");
16
/// Supplementary registry JSON derived from the `WhatsMyName` project
/// (`WebBreacher/WhatsMyName`, CC BY-SA 4.0), embedded at compile time
/// from `../data/sites_wmn.json`. Kept as a separate constant because
/// its data license is incompatible with the MIT-only
/// [`EMBEDDED_REGISTRY`] above; callers opt in explicitly via
/// [`Registry::default_embedded_with_wmn`] to keep the default
/// MIT-clean for downstream redistribution.
const EMBEDDED_WMN_REGISTRY: &str = include_str!("../data/sites_wmn.json");
24
/// A loaded, validated collection of site definitions.
///
/// Engines (shared signature templates referenced by [`Site::engine`])
/// are resolved into sites at load time — by the time you call
/// [`Registry::sites`] every entry already has its inherited
/// `signals` / `request_headers` / `regex_check` materialised. The original
/// [`Engine`] objects are kept on the registry for re-export and
/// inspection via [`Registry::engines`].
///
/// Deserialising this type directly only parses the JSON; engine
/// resolution and validation are performed by the constructors
/// (for example [`Registry::from_json_str`]).
#[derive(Debug, Clone, Deserialize)]
pub struct Registry {
    /// Engine templates keyed by name. Optional in the JSON document;
    /// an absent `engines` block deserialises to an empty map.
    #[serde(default)]
    engines: BTreeMap<String, Engine>,
    /// Site definitions in document order. The `sites` key is required.
    sites: Vec<Site>,
}
39
40impl Registry {
41    /// Load the default site list embedded into the crate at build time.
42    pub fn default_embedded() -> Result<Self> {
43        Self::from_json_str(EMBEDDED_REGISTRY)
44    }
45
46    /// Load the default site list *plus* the `WhatsMyName`-derived
47    /// supplementary set. `WhatsMyName` data is licensed CC BY-SA 4.0
48    /// (see `LICENSE-CC-BY-SA-4.0` at the repo root); enabling this
49    /// path means downstream redistribution of the merged scan data
50    /// must respect the `ShareAlike` obligation. Sites contributed by
51    /// the `WhatsMyName` tranche carry the `source:wmn` tag for
52    /// provenance.
53    ///
54    /// Engines from the WMN tranche merge with the MIT tranche;
55    /// case-insensitive site-name collisions resolve in favour of the
56    /// MIT-tranche entry (the hand-curated Sherlock/Maigret-derived
57    /// signature wins; the WMN duplicate is dropped). Returns an
58    /// error only if either tranche fails its own validation —
59    /// engine references are checked across the merged set.
60    pub fn default_embedded_with_wmn() -> Result<Self> {
61        let mut base = Self::default_embedded()?;
62        let wmn: Self = serde_json::from_str(EMBEDDED_WMN_REGISTRY)?;
63        let existing: HashSet<String> = base.sites.iter().map(|s| s.name.to_lowercase()).collect();
64        for (name, engine) in wmn.engines {
65            base.engines.entry(name).or_insert(engine);
66        }
67        for site in wmn.sites {
68            if !existing.contains(&site.name.to_lowercase()) {
69                base.sites.push(site);
70            }
71        }
72        base.resolve_engines()?;
73        base.validate()?;
74        Ok(base)
75    }
76
77    /// Parse and validate a registry from a JSON string. Engine
78    /// references on each site are resolved before validation;
79    /// a site that names an engine which doesn't exist in the
80    /// `engines` block fails loading with [`Error::InvalidSite`].
81    pub fn from_json_str(json: &str) -> Result<Self> {
82        let mut registry: Self = serde_json::from_str(json)?;
83        registry.resolve_engines()?;
84        registry.validate()?;
85        Ok(registry)
86    }
87
88    /// Inheritable engine templates, keyed by name. Useful for
89    /// introspection and for serialising the registry back out;
90    /// detection paths read the resolved fields off the sites
91    /// directly and don't need to consult this map.
92    pub fn engines(&self) -> &BTreeMap<String, Engine> {
93        &self.engines
94    }
95
96    /// Merge each engine's inheritable fields into the sites that
97    /// reference it. After this call every site's `signals`,
98    /// `request_headers` and `regex_check` reflect the effective
99    /// values used by the scanner.
100    ///
101    /// Per-site fields are authoritative: anything declared
102    /// explicitly on a site wins on conflict; only empty / unset
103    /// fields are filled from the engine.
104    fn resolve_engines(&mut self) -> Result<()> {
105        for (name, engine) in &self.engines {
106            engine.validate(name)?;
107        }
108        for site in &mut self.sites {
109            let Some(name) = &site.engine else {
110                continue;
111            };
112            let Some(engine) = self.engines.get(name) else {
113                return Err(Error::InvalidSite {
114                    reason: format!(
115                        "site {:?}: references engine {name:?} which is not defined",
116                        site.name
117                    ),
118                });
119            };
120            engine.merge_into(site);
121        }
122        Ok(())
123    }
124
125    /// Read a registry from a JSON file.
126    pub fn load_from_path(path: impl AsRef<Path>) -> Result<Self> {
127        let bytes = std::fs::read(path)?;
128        let json = std::str::from_utf8(&bytes).map_err(|e| Error::InvalidSite {
129            reason: format!("registry file is not valid UTF-8: {e}"),
130        })?;
131        Self::from_json_str(json)
132    }
133
134    /// Borrow all sites in load order.
135    pub fn sites(&self) -> &[Site] {
136        &self.sites
137    }
138
139    /// Number of sites.
140    pub fn len(&self) -> usize {
141        self.sites.len()
142    }
143
144    /// True if the registry has no sites (always false for a valid load,
145    /// since we'd already have rejected it).
146    pub fn is_empty(&self) -> bool {
147        self.sites.is_empty()
148    }
149
150    /// Apply include/exclude name filters and a tag filter.
151    ///
152    /// - If `include` is non-empty, only sites whose name contains at least
153    ///   one include term (case-insensitive substring) are kept.
154    /// - Sites whose name contains any exclude term are dropped.
155    /// - If `tags` is non-empty, only sites carrying at least one of the
156    ///   requested tags are kept (case-insensitive). A site with no tags is
157    ///   therefore dropped by a tag filter — asking for `--tag social` means
158    ///   "only social-tagged sites".
159    /// - Sites carrying any tag in `exclude_tags` are dropped (e.g.
160    ///   `--exclude-tag bot-protected` for a fast clean run).
161    /// - **NSFW sites are auto-excluded** (the `nsfw` tag) unless
162    ///   `include_nsfw` is `true` or `tags` explicitly asks for `nsfw`.
163    ///   This matches Sherlock's `--nsfw` opt-in pattern and prevents
164    ///   the default `adler <username>` from surfacing adult-site URLs
165    ///   the user didn't ask for.
166    /// - Sites are returned by value (cloned) so the result is independent
167    ///   of the registry's lifetime — convenient for handing to the executor.
168    pub fn filter(
169        &self,
170        include: &[String],
171        exclude: &[String],
172        tags: &[String],
173        exclude_tags: &[String],
174        include_nsfw: bool,
175    ) -> Vec<Site> {
176        let include: Vec<String> = include.iter().map(|s| s.to_lowercase()).collect();
177        let exclude: Vec<String> = exclude.iter().map(|s| s.to_lowercase()).collect();
178        let want_tags: Vec<String> = tags.iter().map(|s| s.to_lowercase()).collect();
179        let mut drop_tags: Vec<String> = exclude_tags.iter().map(|s| s.to_lowercase()).collect();
180
181        // NSFW gate: auto-exclude unless the caller explicitly opted in,
182        // either via `include_nsfw` or by asking for the `nsfw` tag.
183        let nsfw_tag = "nsfw".to_owned();
184        let asking_for_nsfw = want_tags.contains(&nsfw_tag);
185        if !include_nsfw && !asking_for_nsfw && !drop_tags.contains(&nsfw_tag) {
186            drop_tags.push(nsfw_tag);
187        }
188
189        self.sites
190            .iter()
191            .filter(|site| {
192                let name = site.name.to_lowercase();
193                let included = include.is_empty() || include.iter().any(|i| name.contains(i));
194                let excluded = exclude.iter().any(|x| name.contains(x));
195                let lower_tags: Vec<String> = site.tags.iter().map(|t| t.to_lowercase()).collect();
196                let tagged =
197                    want_tags.is_empty() || lower_tags.iter().any(|t| want_tags.contains(t));
198                let tag_excluded = lower_tags.iter().any(|t| drop_tags.contains(t));
199                included && !excluded && tagged && !tag_excluded
200            })
201            .cloned()
202            .collect()
203    }
204
205    /// Distinct tags across all sites, sorted, with the count of sites
206    /// carrying each. Powers `--list-tags`.
207    pub fn tag_counts(&self) -> Vec<(String, usize)> {
208        let mut counts: std::collections::BTreeMap<String, usize> =
209            std::collections::BTreeMap::new();
210        for site in &self.sites {
211            for tag in &site.tags {
212                *counts.entry(tag.clone()).or_insert(0) += 1;
213            }
214        }
215        counts.into_iter().collect()
216    }
217
218    fn validate(&self) -> Result<()> {
219        if self.sites.is_empty() {
220            return Err(Error::InvalidSite {
221                reason: "registry has no sites".into(),
222            });
223        }
224        for site in &self.sites {
225            site.validate()?;
226        }
227        let mut seen: HashSet<String> = HashSet::new();
228        for site in &self.sites {
229            let key = site.name.to_lowercase();
230            if !seen.insert(key) {
231                return Err(Error::InvalidSite {
232                    reason: format!("duplicate site name: {:?}", site.name),
233                });
234            }
235        }
236        Ok(())
237    }
238}
239
240#[cfg(test)]
241mod tests {
242    use super::*;
243
244    #[test]
245    fn embedded_registry_loads_and_validates() {
246        let registry = Registry::default_embedded().expect("embedded registry must load");
247        // The registry is imported from Sherlock (~450 sites); a floor well
248        // above the old hand-written 15 guards against accidental truncation.
249        assert!(
250            registry.len() >= 100,
251            "imported registry should have ≥100 sites, got {}",
252            registry.len()
253        );
254        // Spot-check a couple of well-known entries. (HackerNews used
255        // to be here but was pruned 2026-05-26 — its Sherlock-side
256        // known_present went stale and the imported signature
257        // doctor-failed; can be restored via OVERRIDES in
258        // import_sherlock.py with a working account.)
259        let names: Vec<&str> = registry.sites().iter().map(|s| s.name.as_str()).collect();
260        assert!(names.contains(&"GitHub"));
261        assert!(names.contains(&"Reddit"));
262        assert!(names.contains(&"Telegram"));
263    }
264
265    #[test]
266    fn wmn_embedded_registry_loads_and_supersets_default() {
267        let base = Registry::default_embedded().unwrap();
268        let merged = Registry::default_embedded_with_wmn().expect("WMN-merged registry must load");
269        assert!(
270            merged.len() > base.len(),
271            "WMN merge must add sites: base={} merged={}",
272            base.len(),
273            merged.len()
274        );
275        // Every base-tranche name survives the merge; case-insensitive
276        // collisions resolve in favour of the MIT-tranche entry.
277        let merged_names: HashSet<String> = merged
278            .sites()
279            .iter()
280            .map(|s| s.name.to_lowercase())
281            .collect();
282        for s in base.sites() {
283            assert!(
284                merged_names.contains(&s.name.to_lowercase()),
285                "merge dropped base-tranche site {:?}",
286                s.name
287            );
288        }
289        // At least one WMN-only site carries the provenance tag.
290        let has_wmn_tag = merged
291            .sites()
292            .iter()
293            .any(|s| s.tags.iter().any(|t| t == "source:wmn"));
294        assert!(has_wmn_tag, "no site carries the source:wmn tag");
295    }
296
297    #[test]
298    fn rejects_empty_registry() {
299        let err = Registry::from_json_str(r#"{ "sites": [] }"#).unwrap_err();
300        assert!(matches!(err, Error::InvalidSite { .. }));
301    }
302
303    #[test]
304    fn rejects_duplicate_site_names() {
305        let json = r#"{
306            "sites": [
307                { "name": "GitHub", "url": "https://github.com/{username}",
308                  "signals": [{ "kind": "status_found", "codes": [200] }] },
309                { "name": "github", "url": "https://github.com/{username}",
310                  "signals": [{ "kind": "status_found", "codes": [200] }] }
311            ]
312        }"#;
313        let err = Registry::from_json_str(json).unwrap_err();
314        assert!(matches!(err, Error::InvalidSite { .. }));
315        assert!(err.to_string().contains("duplicate"));
316    }
317
318    #[test]
319    fn rejects_invalid_site_definition() {
320        // Missing {username} placeholder.
321        let json = r#"{
322            "sites": [
323                { "name": "Bad", "url": "https://example.com/",
324                  "signals": [{ "kind": "status_found", "codes": [200] }] }
325            ]
326        }"#;
327        assert!(Registry::from_json_str(json).is_err());
328    }
329
330    #[test]
331    fn rejects_malformed_json() {
332        let err = Registry::from_json_str("{").unwrap_err();
333        assert!(matches!(err, Error::Json(_)));
334    }
335
336    #[test]
337    fn filter_include_is_case_insensitive_substring() {
338        let registry = Registry::default_embedded().unwrap();
339        let only_github = registry.filter(&["github".into()], &[], &[], &[], false);
340        assert_eq!(only_github.len(), 1);
341        assert_eq!(only_github[0].name, "GitHub");
342
343        let many = registry.filter(&["e".into()], &[], &[], &[], false); // matches anything with "e"
344        assert!(many.len() > 1);
345    }
346
347    #[test]
348    fn filter_exclude_drops_matches() {
349        let registry = Registry::default_embedded().unwrap();
350        // Include NSFW to keep the test focused on the name-exclude
351        // path; the NSFW auto-exclusion is exercised separately.
352        let without_github = registry.filter(&[], &["github".into()], &[], &[], true);
353        assert!(without_github.iter().all(|s| s.name != "GitHub"));
354        assert_eq!(without_github.len(), registry.len() - 1);
355    }
356
357    #[test]
358    fn filter_include_and_exclude_compose() {
359        let registry = Registry::default_embedded().unwrap();
360        // Include "git", then exclude "lab" → keep GitHub, drop GitLab.
361        let filtered = registry.filter(&["git".into()], &["lab".into()], &[], &[], false);
362        let names: Vec<&str> = filtered.iter().map(|s| s.name.as_str()).collect();
363        assert!(names.contains(&"GitHub"));
364        assert!(!names.contains(&"GitLab"));
365        // Exclude wins over include for sites containing both terms (none here).
366    }
367
368    #[test]
369    fn filter_with_no_matches_returns_empty() {
370        let registry = Registry::default_embedded().unwrap();
371        let filtered = registry.filter(&["does-not-exist-xyz".into()], &[], &[], &[], false);
372        assert!(filtered.is_empty());
373    }
374
375    fn tagged_registry() -> Registry {
376        let json = r#"{
377            "sites": [
378                { "name": "Soc", "url": "https://soc.example/{username}",
379                  "signals": [{ "kind": "status_found", "codes": [200] }],
380                  "tags": ["social", "region:ru"] },
381                { "name": "Dev", "url": "https://dev.example/{username}",
382                  "signals": [{ "kind": "status_found", "codes": [200] }],
383                  "tags": ["dev"] },
384                { "name": "Plain", "url": "https://plain.example/{username}",
385                  "signals": [{ "kind": "status_found", "codes": [200] }] }
386            ]
387        }"#;
388        Registry::from_json_str(json).unwrap()
389    }
390
391    #[test]
392    fn tag_filter_keeps_only_matching_tags_and_drops_untagged() {
393        let r = tagged_registry();
394        let social = r.filter(&[], &[], &["social".into()], &[], false);
395        let names: Vec<&str> = social.iter().map(|s| s.name.as_str()).collect();
396        assert_eq!(names, ["Soc"], "tag filter should keep only tagged matches");
397    }
398
399    #[test]
400    fn tag_filter_is_or_within_requested_tags_and_case_insensitive() {
401        let r = tagged_registry();
402        let either = r.filter(&[], &[], &["DEV".into(), "social".into()], &[], false);
403        let names: Vec<&str> = either.iter().map(|s| s.name.as_str()).collect();
404        assert_eq!(names, ["Soc", "Dev"]);
405    }
406
407    #[test]
408    fn no_tag_filter_includes_untagged_sites() {
409        let r = tagged_registry();
410        assert_eq!(r.filter(&[], &[], &[], &[], false).len(), 3);
411    }
412
413    #[test]
414    fn exclude_tag_drops_matching_sites() {
415        let r = tagged_registry();
416        let kept = r.filter(&[], &[], &[], &["social".into()], false);
417        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
418        // Soc carries "social" → dropped; Dev and untagged Plain remain.
419        assert_eq!(names, ["Dev", "Plain"], "{names:?}");
420    }
421
422    fn nsfw_registry() -> Registry {
423        let json = r#"{
424            "sites": [
425                { "name": "Family", "url": "https://family.example/{username}",
426                  "signals": [{ "kind": "status_found", "codes": [200] }],
427                  "tags": ["social"] },
428                { "name": "Adult", "url": "https://adult.example/{username}",
429                  "signals": [{ "kind": "status_found", "codes": [200] }],
430                  "tags": ["nsfw"] }
431            ]
432        }"#;
433        Registry::from_json_str(json).unwrap()
434    }
435
436    #[test]
437    fn nsfw_sites_excluded_by_default() {
438        let r = nsfw_registry();
439        let kept = r.filter(&[], &[], &[], &[], false);
440        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
441        assert_eq!(names, ["Family"], "nsfw site must be excluded by default");
442    }
443
444    #[test]
445    fn nsfw_sites_included_when_flag_set() {
446        let r = nsfw_registry();
447        let kept = r.filter(&[], &[], &[], &[], true);
448        assert_eq!(kept.len(), 2, "both sites present with include_nsfw=true");
449    }
450
451    #[test]
452    fn nsfw_sites_included_when_tag_asked_for_explicitly() {
453        // `--tag nsfw` is an explicit opt-in; should bypass the default
454        // auto-exclusion even with include_nsfw=false.
455        let r = nsfw_registry();
456        let kept = r.filter(&[], &[], &["nsfw".into()], &[], false);
457        let names: Vec<&str> = kept.iter().map(|s| s.name.as_str()).collect();
458        assert_eq!(names, ["Adult"]);
459    }
460
461    #[test]
462    fn tag_counts_are_sorted_with_per_tag_totals() {
463        let r = tagged_registry();
464        assert_eq!(
465            r.tag_counts(),
466            vec![
467                ("dev".to_owned(), 1),
468                ("region:ru".to_owned(), 1),
469                ("social".to_owned(), 1),
470            ]
471        );
472    }
473
474    #[test]
475    fn engine_inheritance_fills_empty_site_signals() {
476        // Site has no `signals` block — should inherit the engine's.
477        let json = r#"{
478            "engines": {
479                "Discourse": {
480                    "signals": [
481                        { "kind": "status_found", "codes": [200] },
482                        { "kind": "body_absent", "text": "Oops! That page doesn't exist" }
483                    ]
484                }
485            },
486            "sites": [
487                { "name": "Mozilla Forum", "url": "https://discourse.mozilla.org/u/{username}",
488                  "engine": "Discourse" }
489            ]
490        }"#;
491        let r = Registry::from_json_str(json).unwrap();
492        let site = &r.sites()[0];
493        assert_eq!(site.signals.len(), 2);
494        assert_eq!(site.engine.as_deref(), Some("Discourse"));
495        // engines map preserved
496        assert!(r.engines().contains_key("Discourse"));
497    }
498
499    #[test]
500    fn site_overrides_engine_signals_on_conflict() {
501        // Site declares its own `signals` — engine's must NOT replace them.
502        let json = r#"{
503            "engines": {
504                "Discourse": {
505                    "signals": [{ "kind": "status_found", "codes": [200] }]
506                }
507            },
508            "sites": [
509                { "name": "Custom", "url": "https://example.com/{username}",
510                  "engine": "Discourse",
511                  "signals": [
512                    { "kind": "status_found", "codes": [200] },
513                    { "kind": "status_not_found", "codes": [404] }
514                  ] }
515            ]
516        }"#;
517        let r = Registry::from_json_str(json).unwrap();
518        // The site-declared 2 signals win over the engine's 1 signal.
519        assert_eq!(r.sites()[0].signals.len(), 2);
520    }
521
522    #[test]
523    fn engine_headers_merge_with_site_headers_per_key() {
524        // Engine declares one header; site declares another. Resolved
525        // site should carry both. On per-key conflict the site wins.
526        let json = r#"{
527            "engines": {
528                "Foo": {
529                    "signals": [{ "kind": "status_found", "codes": [200] }],
530                    "request_headers": {
531                        "X-Engine": "engine-value",
532                        "User-Agent": "engine-ua"
533                    }
534                }
535            },
536            "sites": [
537                { "name": "S", "url": "https://example.com/{username}",
538                  "engine": "Foo",
539                  "request_headers": { "User-Agent": "site-ua" } }
540            ]
541        }"#;
542        let r = Registry::from_json_str(json).unwrap();
543        let h = &r.sites()[0].request_headers;
544        assert_eq!(h.get("X-Engine").map(String::as_str), Some("engine-value"));
545        assert_eq!(h.get("User-Agent").map(String::as_str), Some("site-ua"));
546    }
547
548    #[test]
549    fn missing_engine_reference_fails_load() {
550        let json = r#"{
551            "engines": {},
552            "sites": [
553                { "name": "Mock", "url": "https://example.com/{username}",
554                  "engine": "DoesNotExist" }
555            ]
556        }"#;
557        let err = Registry::from_json_str(json).unwrap_err();
558        assert!(
559            err.to_string()
560                .contains("references engine \"DoesNotExist\""),
561            "expected missing-engine error, got: {err}"
562        );
563    }
564
565    #[test]
566    fn engine_regex_check_inherited_when_site_has_none() {
567        let json = r#"{
568            "engines": {
569                "Bounded": {
570                    "signals": [{ "kind": "status_found", "codes": [200] }],
571                    "regex_check": "^[a-z]{3,16}$"
572                }
573            },
574            "sites": [
575                { "name": "S", "url": "https://example.com/{username}",
576                  "engine": "Bounded" }
577            ]
578        }"#;
579        let r = Registry::from_json_str(json).unwrap();
580        assert_eq!(r.sites()[0].regex_check.as_deref(), Some("^[a-z]{3,16}$"));
581    }
582
583    #[test]
584    fn load_from_path_round_trips_via_tempfile() {
585        let mut path = std::env::temp_dir();
586        path.push(format!("adler-test-registry-{}.json", std::process::id()));
587        std::fs::write(
588            &path,
589            r#"{
590                "sites": [
591                    { "name": "Mock", "url": "https://example.com/{username}",
592                      "signals": [{ "kind": "status_found", "codes": [200] }] }
593                ]
594            }"#,
595        )
596        .unwrap();
597        let result = Registry::load_from_path(&path);
598        let _ = std::fs::remove_file(&path);
599        let registry = result.unwrap();
600        assert_eq!(registry.len(), 1);
601        assert_eq!(registry.sites()[0].name, "Mock");
602    }
603}