use std::collections::{BTreeMap, HashSet};
use std::path::Path;
use serde::Deserialize;
use crate::error::{Error, Result};
use crate::site::{Engine, Site};
/// JSON text of the built-in site registry, embedded into the binary at compile time
/// from `../data/sites.json` (path relative to this source file).
const EMBEDDED_REGISTRY: &str = include_str!("../data/sites.json");
/// JSON text of the supplementary "WMN" registry tranche, embedded at compile time
/// from `../data/sites_wmn.json`. It is only consulted by
/// `Registry::default_embedded_with_wmn`, which merges it into the base registry.
const EMBEDDED_WMN_REGISTRY: &str = include_str!("../data/sites_wmn.json");
/// A set of site definitions together with the named engines they may reference.
///
/// Deserialized from a JSON object with a `sites` array and an optional `engines` map.
/// The constructors on this type additionally resolve engine references and validate
/// the result before handing a `Registry` out.
#[derive(Debug, Clone, Deserialize)]
pub struct Registry {
/// Engine definitions keyed by engine name; empty when the JSON omits `engines`.
#[serde(default)]
engines: BTreeMap<String, Engine>,
/// Site definitions, in load order.
sites: Vec<Site>,
}
impl Registry {
pub fn default_embedded() -> Result<Self> {
Self::from_json_str(EMBEDDED_REGISTRY)
}
pub fn default_embedded_with_wmn() -> Result<Self> {
let mut base = Self::default_embedded()?;
let wmn: Self = serde_json::from_str(EMBEDDED_WMN_REGISTRY)?;
let existing: HashSet<String> = base.sites.iter().map(|s| s.name.to_lowercase()).collect();
for (name, engine) in wmn.engines {
base.engines.entry(name).or_insert(engine);
}
for site in wmn.sites {
if !existing.contains(&site.name.to_lowercase()) {
base.sites.push(site);
}
}
base.resolve_engines()?;
base.validate()?;
Ok(base)
}
pub fn from_json_str(json: &str) -> Result<Self> {
let mut registry: Self = serde_json::from_str(json)?;
registry.resolve_engines()?;
registry.validate()?;
Ok(registry)
}
pub fn engines(&self) -> &BTreeMap<String, Engine> {
&self.engines
}
fn resolve_engines(&mut self) -> Result<()> {
for (name, engine) in &self.engines {
engine.validate(name)?;
}
for site in &mut self.sites {
let Some(name) = &site.engine else {
continue;
};
let Some(engine) = self.engines.get(name) else {
return Err(Error::InvalidSite {
reason: format!(
"site {:?}: references engine {name:?} which is not defined",
site.name
),
});
};
engine.merge_into(site);
}
Ok(())
}
pub fn load_from_path(path: impl AsRef<Path>) -> Result<Self> {
let bytes = std::fs::read(path)?;
let json = std::str::from_utf8(&bytes).map_err(|e| Error::InvalidSite {
reason: format!("registry file is not valid UTF-8: {e}"),
})?;
Self::from_json_str(json)
}
/// Returns all site definitions, in registry order.
pub fn sites(&self) -> &[Site] {
    self.sites.as_slice()
}
/// Returns the number of sites in the registry.
pub fn len(&self) -> usize {
    self.sites().len()
}
/// Returns `true` when the registry holds no sites.
pub fn is_empty(&self) -> bool {
    self.sites().is_empty()
}
/// Returns clones of the sites that pass every name and tag criterion, in registry order.
///
/// All comparisons are case-insensitive.
///
/// * `include` — when non-empty, a site is kept only if its name contains at least
///   one of these substrings.
/// * `exclude` — a site is dropped if its name contains any of these substrings.
/// * `tags` — when non-empty, a site is kept only if it carries at least one of
///   these tags, so untagged sites are dropped.
/// * `exclude_tags` — a site is dropped if it carries any of these tags.
/// * `include_nsfw` — unless this is `true`, or `tags` itself asks for `nsfw`,
///   the `nsfw` tag is treated as one of the excluded tags.
pub fn filter(
    &self,
    include: &[String],
    exclude: &[String],
    tags: &[String],
    exclude_tags: &[String],
    include_nsfw: bool,
) -> Vec<Site> {
    let lowercase_all = |items: &[String]| -> Vec<String> {
        items.iter().map(|item| item.to_lowercase()).collect()
    };
    let name_allow = lowercase_all(include);
    let name_deny = lowercase_all(exclude);
    let tag_allow = lowercase_all(tags);
    let mut tag_deny = lowercase_all(exclude_tags);

    // NSFW sites are hidden unless the caller opted in by flag or by tag.
    const NSFW: &str = "nsfw";
    let nsfw_wanted = include_nsfw || tag_allow.iter().any(|tag| tag == NSFW);
    if !nsfw_wanted && !tag_deny.iter().any(|tag| tag == NSFW) {
        tag_deny.push(NSFW.to_owned());
    }

    let mut kept = Vec::new();
    for site in &self.sites {
        let name = site.name.to_lowercase();
        let name_allowed = name_allow.is_empty()
            || name_allow.iter().any(|needle| name.contains(needle.as_str()));
        if !name_allowed {
            continue;
        }
        if name_deny.iter().any(|needle| name.contains(needle.as_str())) {
            continue;
        }

        let site_tags: Vec<String> = site.tags.iter().map(|t| t.to_lowercase()).collect();
        let tag_allowed =
            tag_allow.is_empty() || site_tags.iter().any(|tag| tag_allow.contains(tag));
        if !tag_allowed {
            continue;
        }
        if site_tags.iter().any(|tag| tag_deny.contains(tag)) {
            continue;
        }

        kept.push(site.clone());
    }
    kept
}
pub fn tag_counts(&self) -> Vec<(String, usize)> {
let mut counts: std::collections::BTreeMap<String, usize> =
std::collections::BTreeMap::new();
for site in &self.sites {
for tag in &site.tags {
*counts.entry(tag.clone()).or_insert(0) += 1;
}
}
counts.into_iter().collect()
}
fn validate(&self) -> Result<()> {
if self.sites.is_empty() {
return Err(Error::InvalidSite {
reason: "registry has no sites".into(),
});
}
for site in &self.sites {
site.validate()?;
}
let mut seen: HashSet<String> = HashSet::new();
for site in &self.sites {
let key = site.name.to_lowercase();
if !seen.insert(key) {
return Err(Error::InvalidSite {
reason: format!("duplicate site name: {:?}", site.name),
});
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects site names in slice order, for order-sensitive assertions.
    fn names_of(sites: &[Site]) -> Vec<&str> {
        sites.iter().map(|site| site.name.as_str()).collect()
    }

    /// The embedded registry parses, validates, and contains a few well-known sites.
    #[test]
    fn embedded_registry_loads_and_validates() {
        let registry = Registry::default_embedded().expect("embedded registry must load");
        let total = registry.len();
        assert!(
            total >= 100,
            "imported registry should have ≥100 sites, got {}",
            total
        );
        let names = names_of(registry.sites());
        for expected in ["GitHub", "Reddit", "Telegram"] {
            assert!(names.contains(&expected));
        }
    }

    /// The WMN merge adds sites, keeps every base site, and carries the WMN source tag.
    #[test]
    fn wmn_embedded_registry_loads_and_supersets_default() {
        let base = Registry::default_embedded().unwrap();
        let merged = Registry::default_embedded_with_wmn().expect("WMN-merged registry must load");
        assert!(
            merged.len() > base.len(),
            "WMN merge must add sites: base={} merged={}",
            base.len(),
            merged.len()
        );

        let merged_names: HashSet<String> = merged
            .sites()
            .iter()
            .map(|site| site.name.to_lowercase())
            .collect();
        for site in base.sites() {
            let key = site.name.to_lowercase();
            assert!(
                merged_names.contains(&key),
                "merge dropped base-tranche site {:?}",
                site.name
            );
        }

        let has_wmn_tag = merged
            .sites()
            .iter()
            .flat_map(|site| site.tags.iter())
            .any(|tag| tag == "source:wmn");
        assert!(has_wmn_tag, "no site carries the source:wmn tag");
    }

    /// A registry without sites is rejected as invalid.
    #[test]
    fn rejects_empty_registry() {
        let outcome = Registry::from_json_str(r#"{ "sites": [] }"#);
        assert!(matches!(outcome, Err(Error::InvalidSite { .. })));
    }

    /// Site names that differ only by case count as duplicates.
    #[test]
    fn rejects_duplicate_site_names() {
        let json = r#"{
"sites": [
{ "name": "GitHub", "url": "https://github.com/{username}",
"signals": [{ "kind": "status_found", "codes": [200] }] },
{ "name": "github", "url": "https://github.com/{username}",
"signals": [{ "kind": "status_found", "codes": [200] }] }
]
}"#;
        let err = Registry::from_json_str(json).unwrap_err();
        let message = err.to_string();
        assert!(matches!(err, Error::InvalidSite { .. }));
        assert!(message.contains("duplicate"));
    }

    /// A site definition that fails its own validation fails the whole load.
    #[test]
    fn rejects_invalid_site_definition() {
        let json = r#"{
"sites": [
{ "name": "Bad", "url": "https://example.com/",
"signals": [{ "kind": "status_found", "codes": [200] }] }
]
}"#;
        let outcome = Registry::from_json_str(json);
        assert!(outcome.is_err());
    }

    /// Syntactically broken JSON surfaces as the JSON error variant.
    #[test]
    fn rejects_malformed_json() {
        let outcome = Registry::from_json_str("{");
        assert!(matches!(outcome, Err(Error::Json(_))));
    }

    /// Name inclusion matches substrings regardless of case.
    #[test]
    fn filter_include_is_case_insensitive_substring() {
        let registry = Registry::default_embedded().unwrap();

        let only_github = registry.filter(&["github".to_owned()], &[], &[], &[], false);
        assert_eq!(names_of(&only_github), ["GitHub"]);

        let many = registry.filter(&["e".to_owned()], &[], &[], &[], false);
        assert!(many.len() > 1);
    }

    /// Name exclusion removes exactly the matching site.
    #[test]
    fn filter_exclude_drops_matches() {
        let registry = Registry::default_embedded().unwrap();
        let without_github = registry.filter(&[], &["github".to_owned()], &[], &[], true);
        assert!(!without_github.iter().any(|site| site.name == "GitHub"));
        assert_eq!(without_github.len() + 1, registry.len());
    }

    /// Inclusion and exclusion apply together.
    #[test]
    fn filter_include_and_exclude_compose() {
        let registry = Registry::default_embedded().unwrap();
        let filtered = registry.filter(&["git".to_owned()], &["lab".to_owned()], &[], &[], false);
        let names = names_of(&filtered);
        assert!(names.contains(&"GitHub"));
        assert!(!names.contains(&"GitLab"));
    }

    /// An inclusion pattern that matches nothing yields an empty result.
    #[test]
    fn filter_with_no_matches_returns_empty() {
        let registry = Registry::default_embedded().unwrap();
        let filtered = registry.filter(&["does-not-exist-xyz".to_owned()], &[], &[], &[], false);
        assert_eq!(filtered.len(), 0);
    }

    /// Three-site fixture: two tagged sites and one without tags.
    fn tagged_registry() -> Registry {
        let json = r#"{
"sites": [
{ "name": "Soc", "url": "https://soc.example/{username}",
"signals": [{ "kind": "status_found", "codes": [200] }],
"tags": ["social", "region:ru"] },
{ "name": "Dev", "url": "https://dev.example/{username}",
"signals": [{ "kind": "status_found", "codes": [200] }],
"tags": ["dev"] },
{ "name": "Plain", "url": "https://plain.example/{username}",
"signals": [{ "kind": "status_found", "codes": [200] }] }
]
}"#;
        Registry::from_json_str(json).unwrap()
    }

    /// Asking for a tag keeps only sites carrying it; untagged sites are dropped.
    #[test]
    fn tag_filter_keeps_only_matching_tags_and_drops_untagged() {
        let registry = tagged_registry();
        let social = registry.filter(&[], &[], &["social".to_owned()], &[], false);
        let names = names_of(&social);
        assert_eq!(names, ["Soc"], "tag filter should keep only tagged matches");
    }

    /// Several requested tags are OR-ed together and matched case-insensitively.
    #[test]
    fn tag_filter_is_or_within_requested_tags_and_case_insensitive() {
        let registry = tagged_registry();
        let wanted = ["DEV".to_owned(), "social".to_owned()];
        let either = registry.filter(&[], &[], &wanted, &[], false);
        let names = names_of(&either);
        assert_eq!(names, ["Soc", "Dev"]);
    }

    /// Without a tag filter, untagged sites are kept.
    #[test]
    fn no_tag_filter_includes_untagged_sites() {
        let registry = tagged_registry();
        let everything = registry.filter(&[], &[], &[], &[], false);
        assert_eq!(everything.len(), 3);
    }

    /// An excluded tag removes the sites carrying it.
    #[test]
    fn exclude_tag_drops_matching_sites() {
        let registry = tagged_registry();
        let kept = registry.filter(&[], &[], &[], &["social".to_owned()], false);
        let names = names_of(&kept);
        assert_eq!(names, ["Dev", "Plain"], "{names:?}");
    }

    /// Two-site fixture: one ordinary site and one tagged `nsfw`.
    fn nsfw_registry() -> Registry {
        let json = r#"{
"sites": [
{ "name": "Family", "url": "https://family.example/{username}",
"signals": [{ "kind": "status_found", "codes": [200] }],
"tags": ["social"] },
{ "name": "Adult", "url": "https://adult.example/{username}",
"signals": [{ "kind": "status_found", "codes": [200] }],
"tags": ["nsfw"] }
]
}"#;
        Registry::from_json_str(json).unwrap()
    }

    /// With no opt-in, `nsfw` sites are filtered out.
    #[test]
    fn nsfw_sites_excluded_by_default() {
        let registry = nsfw_registry();
        let kept = registry.filter(&[], &[], &[], &[], false);
        let names = names_of(&kept);
        assert_eq!(names, ["Family"], "nsfw site must be excluded by default");
    }

    /// The `include_nsfw` flag keeps `nsfw` sites.
    #[test]
    fn nsfw_sites_included_when_flag_set() {
        let registry = nsfw_registry();
        let kept = registry.filter(&[], &[], &[], &[], true);
        assert_eq!(kept.len(), 2, "both sites present with include_nsfw=true");
    }

    /// Requesting the `nsfw` tag explicitly overrides the default exclusion.
    #[test]
    fn nsfw_sites_included_when_tag_asked_for_explicitly() {
        let registry = nsfw_registry();
        let kept = registry.filter(&[], &[], &["nsfw".to_owned()], &[], false);
        let names = names_of(&kept);
        assert_eq!(names, ["Adult"]);
    }

    /// Tag totals come back ordered by tag.
    #[test]
    fn tag_counts_are_sorted_with_per_tag_totals() {
        let registry = tagged_registry();
        let expected: Vec<(String, usize)> = ["dev", "region:ru", "social"]
            .iter()
            .map(|tag| (tag.to_string(), 1))
            .collect();
        assert_eq!(registry.tag_counts(), expected);
    }

    /// A site with no signals of its own takes them from its engine.
    #[test]
    fn engine_inheritance_fills_empty_site_signals() {
        let json = r#"{
"engines": {
"Discourse": {
"signals": [
{ "kind": "status_found", "codes": [200] },
{ "kind": "body_absent", "text": "Oops! That page doesn't exist" }
]
}
},
"sites": [
{ "name": "Mozilla Forum", "url": "https://discourse.mozilla.org/u/{username}",
"engine": "Discourse" }
]
}"#;
        let registry = Registry::from_json_str(json).unwrap();
        let site = &registry.sites()[0];
        assert_eq!(site.signals.len(), 2);
        assert_eq!(site.engine.as_deref(), Some("Discourse"));
        assert!(registry.engines().contains_key("Discourse"));
    }

    /// A site that declares its own signals ends up with two, not the engine's one.
    #[test]
    fn site_overrides_engine_signals_on_conflict() {
        let json = r#"{
"engines": {
"Discourse": {
"signals": [{ "kind": "status_found", "codes": [200] }]
}
},
"sites": [
{ "name": "Custom", "url": "https://example.com/{username}",
"engine": "Discourse",
"signals": [
{ "kind": "status_found", "codes": [200] },
{ "kind": "status_not_found", "codes": [404] }
] }
]
}"#;
        let registry = Registry::from_json_str(json).unwrap();
        let signal_count = registry.sites()[0].signals.len();
        assert_eq!(signal_count, 2);
    }

    /// Engine headers are inherited per key, with the site's value winning on clashes.
    #[test]
    fn engine_headers_merge_with_site_headers_per_key() {
        let json = r#"{
"engines": {
"Foo": {
"signals": [{ "kind": "status_found", "codes": [200] }],
"request_headers": {
"X-Engine": "engine-value",
"User-Agent": "engine-ua"
}
}
},
"sites": [
{ "name": "S", "url": "https://example.com/{username}",
"engine": "Foo",
"request_headers": { "User-Agent": "site-ua" } }
]
}"#;
        let registry = Registry::from_json_str(json).unwrap();
        let headers = &registry.sites()[0].request_headers;
        let inherited = headers.get("X-Engine").map(String::as_str);
        let overridden = headers.get("User-Agent").map(String::as_str);
        assert_eq!(inherited, Some("engine-value"));
        assert_eq!(overridden, Some("site-ua"));
    }

    /// Referencing an engine that is not defined fails the load with a clear message.
    #[test]
    fn missing_engine_reference_fails_load() {
        let json = r#"{
"engines": {},
"sites": [
{ "name": "Mock", "url": "https://example.com/{username}",
"engine": "DoesNotExist" }
]
}"#;
        let err = Registry::from_json_str(json).unwrap_err();
        let message = err.to_string();
        assert!(
            message.contains("references engine \"DoesNotExist\""),
            "expected missing-engine error, got: {err}"
        );
    }

    /// A site without its own `regex_check` takes the engine's.
    #[test]
    fn engine_regex_check_inherited_when_site_has_none() {
        let json = r#"{
"engines": {
"Bounded": {
"signals": [{ "kind": "status_found", "codes": [200] }],
"regex_check": "^[a-z]{3,16}$"
}
},
"sites": [
{ "name": "S", "url": "https://example.com/{username}",
"engine": "Bounded" }
]
}"#;
        let registry = Registry::from_json_str(json).unwrap();
        let inherited = registry.sites()[0].regex_check.as_deref();
        assert_eq!(inherited, Some("^[a-z]{3,16}$"));
    }

    /// A registry written to a temporary file loads back with the same content.
    #[test]
    fn load_from_path_round_trips_via_tempfile() {
        let file_name = format!("adler-test-registry-{}.json", std::process::id());
        let path = std::env::temp_dir().join(file_name);
        std::fs::write(
            &path,
            r#"{
"sites": [
{ "name": "Mock", "url": "https://example.com/{username}",
"signals": [{ "kind": "status_found", "codes": [200] }] }
]
}"#,
        )
        .unwrap();

        // Remove the file before unwrapping so a failed load does not leave it behind.
        let loaded = Registry::load_from_path(&path);
        let _ = std::fs::remove_file(&path);

        let registry = loaded.unwrap();
        assert_eq!(registry.len(), 1);
        assert_eq!(registry.sites()[0].name, "Mock");
    }
}