1use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use adler_core::{CheckOutcome, IdentityCluster, MatchKind, ProfileEvidence, Site};
17use serde::{Deserialize, Serialize};
18use tokio::fs;
19
20use crate::error::{Error, Result};
21use crate::scan::{FinishedScan, ScanId, Summary};
22
/// Retention limit for persisted scans.
///
/// NOTE(review): not referenced in the visible part of this file — presumably
/// handed to [`prune`] as `keep_newest` by a caller; confirm.
pub(crate) const MAX_PERSISTED_SCANS: usize = 200;
/// Schema version stamped on records built by [`PersistedScan::from_finished`];
/// also the serde default for scan files that carry no `schema_version` field.
pub(crate) const PERSISTED_SCAN_SCHEMA_VERSION: u16 = 3;
29
/// Serializable record of one finished scan, stored by [`save`] as
/// `<scan_id>.json`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistedScan {
    /// Schema version of this record; falls back to
    /// [`PERSISTED_SCAN_SCHEMA_VERSION`] when the field is absent from the JSON.
    #[serde(default = "default_schema_version")]
    pub schema_version: u16,
    /// Identifier of the scan; also the file stem used by `save`/`load`.
    pub scan_id: ScanId,
    /// Username attached to the scan; passed to
    /// `adler_core::build_identity_clusters` when clusters are rebuilt.
    pub username: String,
    /// Request parameters recorded with the scan, if any; omitted from the JSON
    /// when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_context: Option<ScanRequestContext>,
    /// Site count supplied by the caller of `from_finished`.
    /// NOTE(review): presumably the number of sites checked — confirm.
    pub site_count: usize,
    /// Creation timestamp in milliseconds; the sort key used by `load_all` and
    /// `build_scan_timeline`.
    pub created_at_ms: u64,
    /// Verdict counts; recomputed from `outcomes` by `refresh_derived_fields`.
    pub summary: Summary,
    /// Per-site check results.
    pub outcomes: Vec<CheckOutcome>,
    /// Identity clusters derived from `username` and `outcomes`; recomputed by
    /// `refresh_derived_fields` and omitted from the JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub identity_clusters: Vec<IdentityCluster>,
    /// Elapsed time in milliseconds, copied from `FinishedScan::elapsed_ms`.
    pub elapsed_ms: u64,
}
61
62impl PersistedScan {
63 #[must_use]
65 pub fn from_finished(
66 scan_id: ScanId,
67 username: String,
68 site_count: usize,
69 created_at_ms: u64,
70 finished: FinishedScan,
71 ) -> Self {
72 let mut scan = Self {
73 schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
74 scan_id,
75 username,
76 request_context: None,
77 site_count,
78 created_at_ms,
79 summary: finished.summary,
80 outcomes: finished.outcomes,
81 identity_clusters: finished.identity_clusters,
82 elapsed_ms: finished.elapsed_ms,
83 };
84 scan.refresh_derived_fields();
85 scan
86 }
87
88 #[must_use]
90 pub fn with_request_context(mut self, context: ScanRequestContext) -> Self {
91 self.request_context = Some(context);
92 self
93 }
94
95 pub(crate) fn refresh_derived_fields(&mut self) {
96 for outcome in &mut self.outcomes {
97 outcome.refresh_confidence();
98 }
99 self.summary = Summary::from_outcomes(&self.outcomes);
100 self.identity_clusters =
101 adler_core::build_identity_clusters(&self.username, &self.outcomes);
102 }
103}
104
/// Serde default for [`PersistedScan::schema_version`]: scan files without the
/// field deserialize with the current [`PERSISTED_SCAN_SCHEMA_VERSION`].
const fn default_schema_version() -> u16 {
    PERSISTED_SCAN_SCHEMA_VERSION
}
108
/// Request parameters stored alongside a [`PersistedScan`].
///
/// Optional and list fields are left out of the JSON when `None`/empty and
/// default to `None`/empty when missing on load.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScanRequestContext {
    /// Username recorded with the request.
    pub username: String,
    /// Scan this request was derived from, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub derived_from: Option<ScanId>,
    /// NOTE(review): presumably a site-name include filter — confirm against the
    /// request handler.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub only: Vec<String>,
    /// NOTE(review): presumably a site-name exclude filter — confirm.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude: Vec<String>,
    /// NOTE(review): presumably a tag include filter — confirm.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tag: Vec<String>,
    /// NOTE(review): presumably a tag exclude filter — confirm.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude_tag: Vec<String>,
    /// NOTE(review): presumably limits the scan to the top-N sites — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub top: Option<u32>,
    /// NSFW flag of the request. Has no serde default, so it is required when
    /// deserializing.
    pub nsfw: bool,
    /// Requested concurrency, if one was given.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrency: Option<usize>,
    /// Requested deadline, if one was given.
    /// NOTE(review): unit is seconds judging by the name — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub deadline_secs: Option<u64>,
    /// Names of the egress routes recorded with the request.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub egress_names: Vec<String>,
    /// Disabled registry sites recorded with the request; see
    /// [`PersistedDisabledMatch`].
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub disabled_matches: Vec<PersistedDisabledMatch>,
}
150
/// Persisted snapshot of a registry [`Site`] that is disabled; built through
/// `From<&Site>`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PersistedDisabledMatch {
    /// Site name, copied from `Site::name`.
    pub name: String,
    /// Site URL as a string, copied from `Site::url`.
    pub url: String,
    /// Site tags; omitted from the JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Why the site is disabled; `"disabled in registry"` when the site itself
    /// carries no reason.
    pub disabled_reason: String,
}
164
165impl From<&Site> for PersistedDisabledMatch {
166 fn from(site: &Site) -> Self {
167 Self {
168 name: site.name.clone(),
169 url: site.url.as_str().to_owned(),
170 tags: site.tags.clone(),
171 disabled_reason: site
172 .disabled_reason
173 .clone()
174 .unwrap_or_else(|| "disabled in registry".to_owned()),
175 }
176 }
177}
178
/// Difference between two persisted scans, as produced by [`diff_scans`].
///
/// Every list is omitted from the JSON when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDiff {
    /// Id of the older ("previous") scan.
    pub from_scan_id: ScanId,
    /// Id of the newer ("current") scan.
    pub to_scan_id: ScanId,
    /// Outcomes that are `Found` in the current scan but were not `Found`
    /// (or not present at all) in the previous one.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub added_found: Vec<CheckOutcome>,
    /// Outcomes that were `Found` in the previous scan but are not `Found`
    /// (or not present at all) in the current one.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub removed_found: Vec<CheckOutcome>,
    /// Sites present in both scans whose verdict differs.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub verdict_changes: Vec<VerdictChange>,
    /// Sites `Found` in both scans whose enrichment or profile evidence differs.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub evidence_changes: Vec<EvidenceChange>,
}
200
/// A site whose verdict differs between two scans.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerdictChange {
    /// Site name.
    pub site: String,
    /// Verdict in the previous scan.
    pub before: MatchKind,
    /// Verdict in the current scan.
    pub after: MatchKind,
}
211
/// Before/after snapshot of a site's enrichment and profile evidence.
///
/// Every map and list is omitted from the JSON when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceChange {
    /// Site name.
    pub site: String,
    /// Enrichment of the earlier outcome.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub before_enrichment: BTreeMap<String, String>,
    /// Enrichment of the later outcome.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub after_enrichment: BTreeMap<String, String>,
    /// Profile evidence of the earlier outcome.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub before_profile_evidence: Vec<ProfileEvidence>,
    /// Profile evidence of the later outcome.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub after_profile_evidence: Vec<ProfileEvidence>,
}
230
/// Timeline over a set of scans, as produced by [`build_scan_timeline`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanTimeline {
    /// Username of the earliest scan; empty when no scans were supplied.
    pub username: String,
    /// Number of scans the timeline was built from.
    pub scan_count: usize,
    /// `created_at_ms` of the earliest scan, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub from_ms: Option<u64>,
    /// `created_at_ms` of the latest scan, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub to_ms: Option<u64>,
    /// One entry per site that was `Found` in at least one scan, ordered by
    /// site name.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub profiles: Vec<TimelineProfile>,
    /// Events in the order they were detected: scan by scan, and by site name
    /// within a scan.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub events: Vec<TimelineEvent>,
}
251
/// Final per-site state of a [`ScanTimeline`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineProfile {
    /// Site name.
    pub site: String,
    /// Most recently recorded URL for the site.
    pub url: String,
    /// `created_at_ms` of the scan in which the site was first `Found`.
    pub first_seen_ms: u64,
    /// `created_at_ms` of the most recent scan in which the site was `Found`.
    pub last_seen_ms: u64,
    /// `true` while the site's latest processed state is `Found`; cleared when
    /// a disappearance is recorded.
    pub present_in_latest: bool,
    /// Most recently recorded verdict; `None` after the site was missing from a
    /// scan in which it disappeared.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_verdict: Option<MatchKind>,
}
269
/// Kind of a [`TimelineEvent`]; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TimelineEventKind {
    /// The site was `Found` and had no earlier timeline state.
    FirstSeen,
    /// A previously present site is no longer `Found`, or is missing from the scan.
    Disappeared,
    /// The site is `Found` again after not being present.
    Reappeared,
    /// The site is still `Found`, but its enrichment or profile evidence differs.
    EvidenceChanged,
}
283
/// A single change detected while walking scans in chronological order.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineEvent {
    /// Scan in which the change was detected.
    pub scan_id: ScanId,
    /// `created_at_ms` of that scan.
    pub at_ms: u64,
    /// Site name.
    pub site: String,
    /// URL associated with the event.
    pub url: String,
    /// What happened.
    pub kind: TimelineEventKind,
    /// Verdict recorded before the event, if any; omitted from the JSON when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub before: Option<MatchKind>,
    /// Verdict in the scan that triggered the event; `None` when the site was
    /// missing from that scan.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub after: Option<MatchKind>,
    /// Evidence snapshot; only set for `EvidenceChanged` events.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub evidence_change: Option<EvidenceChange>,
}
307
308#[must_use]
315pub fn diff_scans(previous: &PersistedScan, current: &PersistedScan) -> ScanDiff {
316 let previous_by_site = outcomes_by_site(&previous.outcomes);
317 let current_by_site = outcomes_by_site(¤t.outcomes);
318
319 let mut added_found = Vec::new();
320 let mut removed_found = Vec::new();
321 let mut verdict_changes = Vec::new();
322 let mut evidence_changes = Vec::new();
323
324 for (site, current_outcome) in ¤t_by_site {
325 let previous_outcome = previous_by_site.get(site);
326 if current_outcome.kind == MatchKind::Found
327 && previous_outcome.is_none_or(|o| o.kind != MatchKind::Found)
328 {
329 added_found.push((*current_outcome).clone());
330 }
331 if let Some(previous_outcome) = previous_outcome {
332 if previous_outcome.kind != current_outcome.kind {
333 verdict_changes.push(VerdictChange {
334 site: site.clone(),
335 before: previous_outcome.kind,
336 after: current_outcome.kind,
337 });
338 }
339 if previous_outcome.kind == MatchKind::Found
340 && current_outcome.kind == MatchKind::Found
341 && profile_evidence_changed(previous_outcome, current_outcome)
342 {
343 evidence_changes.push(EvidenceChange {
344 site: site.clone(),
345 before_enrichment: previous_outcome.enrichment.clone(),
346 after_enrichment: current_outcome.enrichment.clone(),
347 before_profile_evidence: previous_outcome.profile_evidence.clone(),
348 after_profile_evidence: current_outcome.profile_evidence.clone(),
349 });
350 }
351 }
352 }
353
354 for (site, previous_outcome) in &previous_by_site {
355 if previous_outcome.kind == MatchKind::Found
356 && current_by_site
357 .get(site)
358 .is_none_or(|o| o.kind != MatchKind::Found)
359 {
360 removed_found.push((*previous_outcome).clone());
361 }
362 }
363
364 ScanDiff {
365 from_scan_id: previous.scan_id.clone(),
366 to_scan_id: current.scan_id.clone(),
367 added_found,
368 removed_found,
369 verdict_changes,
370 evidence_changes,
371 }
372}
373
374#[must_use]
380pub fn build_scan_timeline(scans: &[PersistedScan]) -> ScanTimeline {
381 let mut ordered: Vec<&PersistedScan> = scans.iter().collect();
382 ordered.sort_by(|left, right| {
383 left.created_at_ms
384 .cmp(&right.created_at_ms)
385 .then_with(|| left.scan_id.as_str().cmp(right.scan_id.as_str()))
386 });
387
388 let username = ordered
389 .first()
390 .map(|scan| scan.username.clone())
391 .unwrap_or_default();
392 let from_ms = ordered.first().map(|scan| scan.created_at_ms);
393 let to_ms = ordered.last().map(|scan| scan.created_at_ms);
394 let mut states: BTreeMap<String, TimelineProfileState> = BTreeMap::new();
395 let mut events = Vec::new();
396
397 for scan in &ordered {
398 let current_by_site = outcomes_by_site(&scan.outcomes);
399 let sites = timeline_site_names(&states, ¤t_by_site);
400
401 for site in sites {
402 apply_timeline_site(
403 scan,
404 &site,
405 current_by_site.get(&site).copied(),
406 &mut states,
407 &mut events,
408 );
409 }
410 }
411
412 let profiles = states
413 .into_iter()
414 .map(|(site, state)| TimelineProfile {
415 site,
416 url: state.url,
417 first_seen_ms: state.first_seen_ms,
418 last_seen_ms: state.last_seen_ms,
419 present_in_latest: state.present_in_latest,
420 last_verdict: state.last_verdict,
421 })
422 .collect();
423
424 ScanTimeline {
425 username,
426 scan_count: ordered.len(),
427 from_ms,
428 to_ms,
429 profiles,
430 events,
431 }
432}
433
434fn timeline_site_names(
435 states: &BTreeMap<String, TimelineProfileState>,
436 current_by_site: &BTreeMap<String, &CheckOutcome>,
437) -> Vec<String> {
438 let mut sites: Vec<String> = states.keys().cloned().collect();
439 for site in current_by_site.keys() {
440 if !states.contains_key(site.as_str()) {
441 sites.push((*site).clone());
442 }
443 }
444 sites.sort();
445 sites.dedup();
446 sites
447}
448
449fn apply_timeline_site(
450 scan: &PersistedScan,
451 site: &str,
452 current: Option<&CheckOutcome>,
453 states: &mut BTreeMap<String, TimelineProfileState>,
454 events: &mut Vec<TimelineEvent>,
455) {
456 let current_kind = current.map(|outcome| outcome.kind);
457 let was_present = states
458 .get(site)
459 .is_some_and(|state| state.present_in_latest);
460
461 if current_kind == Some(MatchKind::Found) {
462 apply_found_timeline_site(scan, site, current.expect("found outcome"), states, events);
463 } else if was_present {
464 apply_disappeared_timeline_site(scan, site, current, current_kind, states, events);
465 } else if let (Some(state), Some(outcome)) = (states.get_mut(site), current) {
466 state.last_verdict = Some(outcome.kind);
467 state.url.clone_from(&outcome.url);
468 }
469}
470
471fn apply_found_timeline_site(
472 scan: &PersistedScan,
473 site: &str,
474 outcome: &CheckOutcome,
475 states: &mut BTreeMap<String, TimelineProfileState>,
476 events: &mut Vec<TimelineEvent>,
477) {
478 let current_kind = Some(outcome.kind);
479 let had_state = states.contains_key(site);
480 let was_present = states
481 .get(site)
482 .is_some_and(|state| state.present_in_latest);
483 let state = states
484 .entry(site.to_owned())
485 .or_insert_with(|| TimelineProfileState::new(outcome, scan.created_at_ms));
486
487 if !had_state {
488 events.push(timeline_event(
489 scan,
490 site,
491 &outcome.url,
492 TimelineEventKind::FirstSeen,
493 None,
494 current_kind,
495 None,
496 ));
497 } else if !was_present {
498 events.push(timeline_event(
499 scan,
500 site,
501 &outcome.url,
502 TimelineEventKind::Reappeared,
503 state.last_verdict,
504 current_kind,
505 None,
506 ));
507 } else if state.profile_evidence_changed(outcome) {
508 events.push(timeline_event(
509 scan,
510 site,
511 &outcome.url,
512 TimelineEventKind::EvidenceChanged,
513 Some(MatchKind::Found),
514 current_kind,
515 Some(EvidenceChange {
516 site: site.to_owned(),
517 before_enrichment: state.last_found_enrichment.clone(),
518 after_enrichment: outcome.enrichment.clone(),
519 before_profile_evidence: state.last_found_profile_evidence.clone(),
520 after_profile_evidence: outcome.profile_evidence.clone(),
521 }),
522 ));
523 }
524
525 states
526 .get_mut(site)
527 .expect("state inserted before found update")
528 .update_found(outcome, scan.created_at_ms);
529}
530
531fn apply_disappeared_timeline_site(
532 scan: &PersistedScan,
533 site: &str,
534 current: Option<&CheckOutcome>,
535 current_kind: Option<MatchKind>,
536 states: &mut BTreeMap<String, TimelineProfileState>,
537 events: &mut Vec<TimelineEvent>,
538) {
539 let state = states
540 .get_mut(site)
541 .expect("present state exists before disappearance");
542 let url = current.map_or_else(|| state.url.clone(), |outcome| outcome.url.clone());
543 events.push(timeline_event(
544 scan,
545 site,
546 &url,
547 TimelineEventKind::Disappeared,
548 state.last_verdict,
549 current_kind,
550 None,
551 ));
552 state.present_in_latest = false;
553 state.last_verdict = current_kind;
554 if let Some(outcome) = current {
555 state.url.clone_from(&outcome.url);
556 }
557}
558
559fn timeline_event(
560 scan: &PersistedScan,
561 site: &str,
562 url: &str,
563 kind: TimelineEventKind,
564 before: Option<MatchKind>,
565 after: Option<MatchKind>,
566 evidence_change: Option<EvidenceChange>,
567) -> TimelineEvent {
568 TimelineEvent {
569 scan_id: scan.scan_id.clone(),
570 at_ms: scan.created_at_ms,
571 site: site.to_owned(),
572 url: url.to_owned(),
573 kind,
574 before,
575 after,
576 evidence_change,
577 }
578}
579
/// Mutable per-site bookkeeping used while a timeline is being built.
#[derive(Debug, Clone)]
struct TimelineProfileState {
    /// Most recently recorded URL for the site.
    url: String,
    /// Timestamp of the scan in which the site was first `Found`.
    first_seen_ms: u64,
    /// Timestamp of the latest scan in which the site was `Found`.
    last_seen_ms: u64,
    /// Whether the latest processed state of the site is `Found`.
    present_in_latest: bool,
    /// Most recently recorded verdict, if any.
    last_verdict: Option<MatchKind>,
    /// Enrichment of the latest `Found` outcome.
    last_found_enrichment: BTreeMap<String, String>,
    /// Profile evidence of the latest `Found` outcome.
    last_found_profile_evidence: Vec<ProfileEvidence>,
}
590
591impl TimelineProfileState {
592 fn new(outcome: &CheckOutcome, at_ms: u64) -> Self {
593 Self {
594 url: outcome.url.clone(),
595 first_seen_ms: at_ms,
596 last_seen_ms: at_ms,
597 present_in_latest: true,
598 last_verdict: Some(outcome.kind),
599 last_found_enrichment: outcome.enrichment.clone(),
600 last_found_profile_evidence: outcome.profile_evidence.clone(),
601 }
602 }
603
604 fn update_found(&mut self, outcome: &CheckOutcome, at_ms: u64) {
605 self.url.clone_from(&outcome.url);
606 self.last_seen_ms = at_ms;
607 self.present_in_latest = true;
608 self.last_verdict = Some(outcome.kind);
609 self.last_found_enrichment = outcome.enrichment.clone();
610 self.last_found_profile_evidence
611 .clone_from(&outcome.profile_evidence);
612 }
613
614 fn profile_evidence_changed(&self, outcome: &CheckOutcome) -> bool {
615 self.last_found_enrichment != outcome.enrichment
616 || self.last_found_profile_evidence != outcome.profile_evidence
617 }
618}
619
620fn outcomes_by_site(outcomes: &[CheckOutcome]) -> BTreeMap<String, &CheckOutcome> {
621 outcomes
622 .iter()
623 .map(|outcome| (outcome.site.clone(), outcome))
624 .collect()
625}
626
627fn profile_evidence_changed(previous: &CheckOutcome, current: &CheckOutcome) -> bool {
628 previous.enrichment != current.enrichment
629 || previous.profile_evidence != current.profile_evidence
630}
631
/// Default directory for persisted scans.
///
/// Resolution order:
/// 1. `$XDG_CACHE_HOME/adler/scans`
/// 2. `$HOME/.cache/adler/scans`
/// 3. the relative path `adler-scans`
///
/// A variable that is set but empty is treated as unset, as the XDG Base
/// Directory specification requires. Otherwise an empty `XDG_CACHE_HOME`
/// would silently resolve to the relative path `adler/scans`.
#[must_use]
pub fn default_dir() -> PathBuf {
    let non_empty_var = |name: &str| std::env::var_os(name).filter(|value| !value.is_empty());

    if let Some(xdg) = non_empty_var("XDG_CACHE_HOME") {
        return PathBuf::from(xdg).join("adler").join("scans");
    }
    if let Some(home) = non_empty_var("HOME") {
        return PathBuf::from(home)
            .join(".cache")
            .join("adler")
            .join("scans");
    }
    PathBuf::from("adler-scans")
}
650
651pub(crate) async fn save(dir: &Path, scan: &PersistedScan) -> Result<()> {
653 fs::create_dir_all(dir).await.map_err(Error::Persist)?;
654 let path = dir.join(format!("{}.json", scan.scan_id));
655 let tmp = dir.join(format!("{}.json.tmp", scan.scan_id));
656 let mut scan = scan.clone();
657 scan.refresh_derived_fields();
658 let body = serde_json::to_vec_pretty(&scan).map_err(Error::PersistEncode)?;
659 fs::write(&tmp, &body).await.map_err(Error::Persist)?;
660 fs::rename(&tmp, &path).await.map_err(Error::Persist)?;
661 Ok(())
662}
663
664pub(crate) async fn load(dir: &Path, scan_id: &ScanId) -> Option<PersistedScan> {
668 let path = dir.join(format!("{scan_id}.json"));
669 let bytes = fs::read(&path).await.ok()?;
670 serde_json::from_slice(&bytes)
671 .ok()
672 .map(refresh_derived_fields)
673}
674
675pub(crate) async fn load_all(dir: &Path) -> Vec<PersistedScan> {
679 let Ok(mut entries) = fs::read_dir(dir).await else {
680 return Vec::new();
681 };
682 let mut out = Vec::new();
683 while let Ok(Some(entry)) = entries.next_entry().await {
684 let path = entry.path();
685 if path.extension().and_then(|s| s.to_str()) != Some("json") {
686 continue;
687 }
688 let Ok(bytes) = fs::read(&path).await else {
689 continue;
690 };
691 let Ok(scan) = serde_json::from_slice::<PersistedScan>(&bytes) else {
692 continue;
693 };
694 out.push(refresh_derived_fields(scan));
695 }
696 out.sort_by_key(|s| std::cmp::Reverse(s.created_at_ms));
697 out
698}
699
/// By-value adapter around [`PersistedScan::refresh_derived_fields`], handy in
/// `Option::map` chains and when pushing freshly loaded scans.
fn refresh_derived_fields(mut scan: PersistedScan) -> PersistedScan {
    scan.refresh_derived_fields();
    scan
}
704
705pub(crate) async fn prune(dir: &Path, keep_newest: usize) -> usize {
708 let scans = load_all(dir).await;
709 if scans.len() <= keep_newest {
710 return 0;
711 }
712 let mut removed = 0;
713 for s in &scans[keep_newest..] {
714 let path = dir.join(format!("{}.json", s.scan_id));
715 if fs::remove_file(&path).await.is_ok() {
716 removed += 1;
717 }
718 }
719 removed
720}
721
722#[cfg(test)]
723mod tests {
724 use super::*;
725 use adler_core::{
726 EvidenceAccessPath, MatchKind, ProfileEvidence, TransportTier, UncertainReason,
727 };
728 use std::collections::BTreeMap;
729 use tempfile::TempDir;
730
731 fn sample(scan_id: &str, ts: u64) -> PersistedScan {
732 PersistedScan {
733 schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
734 scan_id: ScanId::from(scan_id.to_owned()),
735 username: "alice".into(),
736 request_context: None,
737 site_count: 2,
738 created_at_ms: ts,
739 summary: Summary {
740 found: 1,
741 not_found: 1,
742 uncertain: 0,
743 },
744 outcomes: vec![
745 CheckOutcome {
746 site: "GitHub".into(),
747 url: "https://github.com/alice".into(),
748 kind: MatchKind::Found,
749 reason: None,
750 elapsed_ms: 120,
751 enrichment: BTreeMap::new(),
752 evidence: vec!["HTTP 200 (status_found)".into()],
753 profile_evidence: Vec::new(),
754 confidence: adler_core::ConfidenceScore::default(),
755 transport: None,
756 escalations: 0,
757 },
758 CheckOutcome {
759 site: "GitLab".into(),
760 url: "https://gitlab.com/alice".into(),
761 kind: MatchKind::NotFound,
762 reason: None,
763 elapsed_ms: 90,
764 enrichment: BTreeMap::new(),
765 evidence: vec!["HTTP 404 (status_not_found)".into()],
766 profile_evidence: Vec::new(),
767 confidence: adler_core::ConfidenceScore::default(),
768 transport: None,
769 escalations: 0,
770 },
771 ],
772 identity_clusters: Vec::new(),
773 elapsed_ms: 210,
774 }
775 }
776
777 fn outcome(site: &str, kind: MatchKind) -> CheckOutcome {
778 CheckOutcome {
779 site: site.into(),
780 url: format!("https://{site}.example/alice"),
781 kind,
782 reason: None,
783 elapsed_ms: 10,
784 enrichment: BTreeMap::new(),
785 evidence: Vec::new(),
786 profile_evidence: Vec::new(),
787 confidence: adler_core::ConfidenceScore::default(),
788 transport: None,
789 escalations: 0,
790 }
791 }
792
793 fn found_with_website(site: &str, website: &str) -> CheckOutcome {
794 let mut outcome = outcome(site, MatchKind::Found);
795 outcome
796 .profile_evidence
797 .push(ProfileEvidence::from_enrichment(
798 site,
799 &outcome.url,
800 "website",
801 website,
802 ));
803 outcome
804 }
805
806 fn large_outcomes(count: usize, generation: usize) -> Vec<CheckOutcome> {
807 (0..count)
808 .map(|idx| large_outcome(idx, generation))
809 .collect()
810 }
811
    /// Fixture: one synthetic outcome whose shape is driven by `idx` and
    /// `generation`.
    ///
    /// Per block of 20 indices: `0` and `1` are found, `3` is uncertain, the
    /// rest not found. In generations after the first, index `0` flips to not
    /// found, index `2` flips to found, and index `1` gets a different bio.
    fn large_outcome(idx: usize, generation: usize) -> CheckOutcome {
        let site = format!("LargeSite{idx:04}");
        let url = format!("https://large{idx:04}.example/alice");
        let mut kind = match idx % 20 {
            0 | 1 => MatchKind::Found,
            3 => MatchKind::Uncertain,
            _ => MatchKind::NotFound,
        };
        // Later generations remove one profile and add another per block of 20.
        if generation > 0 && idx % 20 == 0 {
            kind = MatchKind::NotFound;
        } else if generation > 0 && idx % 20 == 2 {
            kind = MatchKind::Found;
        }

        let mut outcome = CheckOutcome {
            site: site.clone(),
            url: url.clone(),
            kind,
            reason: (kind == MatchKind::Uncertain).then_some(UncertainReason::RateLimited),
            elapsed_ms: 10 + (idx % 75) as u64,
            enrichment: BTreeMap::new(),
            evidence: Vec::new(),
            profile_evidence: Vec::new(),
            confidence: adler_core::ConfidenceScore::default(),
            // Every seventh outcome uses the browser tier with one escalation.
            transport: Some(if idx % 7 == 0 {
                TransportTier::Browser
            } else {
                TransportTier::Http
            }),
            escalations: u8::from(idx % 7 == 0),
        };

        match kind {
            MatchKind::Found => {
                let observed_at_ms = 1_781_192_451_000 + generation as u64 * 1_000 + idx as u64;
                // Website and name values repeat across indices (25 and 50
                // distinct values respectively).
                let website = format!("https://identity-{:02}.example", idx % 25);
                let name = format!("Alice Group {:02}", idx % 50);
                let bio = if generation > 0 && idx % 20 == 1 {
                    format!("updated profile generation {generation} for {idx}")
                } else {
                    format!("stable profile generation 0 for {idx}")
                };
                for (field, value) in [
                    ("website", website.as_str()),
                    ("name", name.as_str()),
                    ("bio", bio.as_str()),
                ] {
                    outcome
                        .enrichment
                        .insert(field.to_owned(), value.to_owned());
                    outcome
                        .profile_evidence
                        .push(ProfileEvidence::from_enrichment_with_source(
                            &site,
                            &url,
                            field,
                            value,
                            Some(observed_at_ms),
                            Some(EvidenceAccessPath::new(
                                outcome.transport.unwrap_or(TransportTier::Http),
                                outcome.escalations,
                                idx % 11 == 0,
                            )),
                        ));
                }
                outcome.evidence = vec![
                    "HTTP 200 (status_found)".to_owned(),
                    "body matched profile marker".to_owned(),
                ];
            }
            MatchKind::NotFound => {
                outcome.evidence = vec!["HTTP 404 (status_not_found)".to_owned()];
            }
            MatchKind::Uncertain => {}
        }
        outcome.refresh_confidence();
        outcome
    }
890
891 fn large_persisted_scan(scan_id: &str, generation: usize) -> PersistedScan {
892 let outcomes = large_outcomes(2_500, generation);
893 let finished = FinishedScan {
894 summary: Summary::from_outcomes(&outcomes),
895 identity_clusters: adler_core::build_identity_clusters("alice", &outcomes),
896 elapsed_ms: 30_000 + generation as u64,
897 outcomes,
898 };
899 PersistedScan::from_finished(
900 ScanId::from(scan_id.to_owned()),
901 "alice".to_owned(),
902 2_500,
903 1_781_192_451_000 + generation as u64 * 10_000,
904 finished,
905 )
906 }
907
    /// A saved scan can be loaded back with its id, schema version, username,
    /// outcomes and summary intact.
    #[tokio::test]
    async fn save_then_load_roundtrips() {
        let tmp = TempDir::new().unwrap();
        let s = sample("abc123", 1_700_000_000_000);
        save(tmp.path(), &s).await.unwrap();

        let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
        assert_eq!(loaded.scan_id, s.scan_id);
        assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
        assert_eq!(loaded.username, "alice");
        assert_eq!(loaded.outcomes.len(), 2);
        assert_eq!(loaded.outcomes[0].site, "GitHub");
        assert_eq!(loaded.summary.found, 1);
    }
922
    /// The JSON written by `save` carries the current schema version.
    #[tokio::test]
    async fn save_writes_schema_version() {
        let tmp = TempDir::new().unwrap();
        let s = sample("abc123", 1_700_000_000_000);
        save(tmp.path(), &s).await.unwrap();

        let raw = fs::read_to_string(tmp.path().join("abc123.json"))
            .await
            .unwrap();
        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
        assert_eq!(
            value["schema_version"],
            serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
        );
    }
938
    /// When no identity clusters are derived, the `identity_clusters` key is
    /// absent from the persisted JSON.
    #[tokio::test]
    async fn save_skips_empty_identity_clusters() {
        let tmp = TempDir::new().unwrap();
        let s = sample("empty-clusters", 1_700_000_000_000);
        save(tmp.path(), &s).await.unwrap();

        let raw = fs::read_to_string(tmp.path().join("empty-clusters.json"))
            .await
            .unwrap();
        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
        assert_eq!(
            value["schema_version"],
            serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
        );
        assert!(
            value.get("identity_clusters").is_none(),
            "empty cluster cache should stay absent from persisted JSON"
        );
    }
958
    /// `save` recomputes identity clusters: two found profiles sharing a
    /// website end up in a single two-member cluster in the JSON.
    #[tokio::test]
    async fn save_writes_derived_identity_clusters() {
        let tmp = TempDir::new().unwrap();
        let mut s = sample("clusters", 1_700_000_000_000);
        s.outcomes = vec![
            found_with_website("GitHub", "https://alice.dev"),
            found_with_website("GitLab", "https://alice.dev"),
        ];

        save(tmp.path(), &s).await.unwrap();

        let raw = fs::read_to_string(tmp.path().join("clusters.json"))
            .await
            .unwrap();
        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
        assert_eq!(value["identity_clusters"].as_array().unwrap().len(), 1);
        assert_eq!(
            value["identity_clusters"][0]["members"]
                .as_array()
                .unwrap()
                .len(),
            2
        );
    }
983
    /// A fully populated request context survives a save/load round trip
    /// unchanged.
    #[tokio::test]
    async fn save_roundtrips_request_context() {
        let tmp = TempDir::new().unwrap();
        let context = ScanRequestContext {
            username: "alice".into(),
            derived_from: Some(ScanId::from("previous".to_owned())),
            only: vec!["Git".into()],
            exclude: Vec::new(),
            tag: vec!["coding".into()],
            exclude_tag: vec!["nsfw".into()],
            top: Some(100),
            nsfw: false,
            concurrency: Some(8),
            deadline_secs: Some(30),
            egress_names: vec!["us-resi".into()],
            disabled_matches: vec![PersistedDisabledMatch {
                name: "TikTok".into(),
                url: "https://www.tiktok.com/@{username}".into(),
                tags: vec!["social".into()],
                disabled_reason: "Honest Limits: JS hydration".into(),
            }],
        };
        let s = sample("ctx", 1_700_000_000_000).with_request_context(context.clone());
        save(tmp.path(), &s).await.unwrap();

        let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
        assert_eq!(loaded.request_context, Some(context));
    }
1012
1013 #[test]
1014 fn diff_scans_reports_added_removed_and_verdict_changes() {
1015 let mut previous = sample("old", 1_000);
1016 previous.outcomes = vec![
1017 outcome("GitHub", MatchKind::Found),
1018 outcome("Reddit", MatchKind::Found),
1019 outcome("Mastodon", MatchKind::NotFound),
1020 ];
1021 let mut current = sample("new", 2_000);
1022 current.outcomes = vec![
1023 outcome("GitHub", MatchKind::Found),
1024 outcome("Reddit", MatchKind::NotFound),
1025 outcome("Mastodon", MatchKind::Found),
1026 ];
1027
1028 let diff = diff_scans(&previous, ¤t);
1029
1030 assert_eq!(diff.from_scan_id.as_str(), "old");
1031 assert_eq!(diff.to_scan_id.as_str(), "new");
1032 assert_eq!(
1033 diff.added_found
1034 .iter()
1035 .map(|outcome| outcome.site.as_str())
1036 .collect::<Vec<_>>(),
1037 ["Mastodon"]
1038 );
1039 assert_eq!(
1040 diff.removed_found
1041 .iter()
1042 .map(|outcome| outcome.site.as_str())
1043 .collect::<Vec<_>>(),
1044 ["Reddit"]
1045 );
1046 assert_eq!(diff.verdict_changes.len(), 2);
1047 assert_eq!(diff.verdict_changes[0].site, "Mastodon");
1048 assert_eq!(diff.verdict_changes[0].before, MatchKind::NotFound);
1049 assert_eq!(diff.verdict_changes[0].after, MatchKind::Found);
1050 assert_eq!(diff.verdict_changes[1].site, "Reddit");
1051 assert!(diff.evidence_changes.is_empty());
1052 }
1053
1054 #[test]
1055 fn diff_scans_reports_profile_evidence_changes_for_still_found_sites() {
1056 let mut previous = sample("old", 1_000);
1057 let mut old_github = outcome("GitHub", MatchKind::Found);
1058 old_github.enrichment.insert("name".into(), "Alice".into());
1059 old_github
1060 .profile_evidence
1061 .push(adler_core::ProfileEvidence::from_enrichment(
1062 "GitHub",
1063 "https://github.example/alice",
1064 "name",
1065 "Alice",
1066 ));
1067 previous.outcomes = vec![old_github];
1068
1069 let mut current = sample("new", 2_000);
1070 let mut new_github = outcome("GitHub", MatchKind::Found);
1071 new_github
1072 .enrichment
1073 .insert("name".into(), "Alice Liddell".into());
1074 new_github
1075 .profile_evidence
1076 .push(adler_core::ProfileEvidence::from_enrichment(
1077 "GitHub",
1078 "https://github.example/alice",
1079 "name",
1080 "Alice Liddell",
1081 ));
1082 current.outcomes = vec![new_github];
1083
1084 let diff = diff_scans(&previous, ¤t);
1085
1086 assert!(diff.added_found.is_empty());
1087 assert!(diff.removed_found.is_empty());
1088 assert!(diff.verdict_changes.is_empty());
1089 assert_eq!(diff.evidence_changes.len(), 1);
1090 assert_eq!(diff.evidence_changes[0].site, "GitHub");
1091 assert_eq!(
1092 diff.evidence_changes[0]
1093 .before_enrichment
1094 .get("name")
1095 .unwrap(),
1096 "Alice"
1097 );
1098 assert_eq!(
1099 diff.evidence_changes[0]
1100 .after_enrichment
1101 .get("name")
1102 .unwrap(),
1103 "Alice Liddell"
1104 );
1105 }
1106
    /// Scans passed out of order are processed chronologically and produce
    /// FirstSeen, Disappeared and Reappeared events for a site that is found,
    /// then not found, then found again.
    #[test]
    fn timeline_tracks_first_seen_disappeared_and_reappeared() {
        let mut first = sample("first", 1_000);
        first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
        let mut second = sample("second", 2_000);
        second.outcomes = vec![outcome("GitHub", MatchKind::NotFound)];
        let mut third = sample("third", 3_000);
        third.outcomes = vec![outcome("GitHub", MatchKind::Found)];

        let timeline = build_scan_timeline(&[third, first, second]);

        assert_eq!(timeline.username, "alice");
        assert_eq!(timeline.scan_count, 3);
        assert_eq!(timeline.from_ms, Some(1_000));
        assert_eq!(timeline.to_ms, Some(3_000));
        assert_eq!(timeline.profiles.len(), 1);
        assert_eq!(timeline.profiles[0].site, "GitHub");
        assert_eq!(timeline.profiles[0].first_seen_ms, 1_000);
        assert_eq!(timeline.profiles[0].last_seen_ms, 3_000);
        assert!(timeline.profiles[0].present_in_latest);
        assert_eq!(
            timeline
                .events
                .iter()
                .map(|event| event.kind)
                .collect::<Vec<_>>(),
            [
                TimelineEventKind::FirstSeen,
                TimelineEventKind::Disappeared,
                TimelineEventKind::Reappeared
            ]
        );
        assert_eq!(timeline.events[1].before, Some(MatchKind::Found));
        assert_eq!(timeline.events[1].after, Some(MatchKind::NotFound));
    }
1142
    /// A tracked site that is absent from a later scan yields a Disappeared
    /// event with no `after` verdict.
    #[test]
    fn timeline_treats_missing_site_as_disappeared() {
        let mut first = sample("first", 1_000);
        first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
        let mut second = sample("second", 2_000);
        second.outcomes = vec![outcome("GitLab", MatchKind::NotFound)];

        let timeline = build_scan_timeline(&[first, second]);

        assert_eq!(timeline.profiles.len(), 1);
        assert!(!timeline.profiles[0].present_in_latest);
        assert_eq!(timeline.events.len(), 2);
        assert_eq!(timeline.events[1].kind, TimelineEventKind::Disappeared);
        assert_eq!(timeline.events[1].site, "GitHub");
        assert_eq!(timeline.events[1].after, None);
    }
1159
    /// A site found in consecutive scans with a changed "name" yields an
    /// EvidenceChanged event carrying the before and after enrichment.
    #[test]
    fn timeline_tracks_evidence_changes_for_still_found_profile() {
        let mut first = sample("first", 1_000);
        let mut old_github = outcome("GitHub", MatchKind::Found);
        old_github.enrichment.insert("name".into(), "Alice".into());
        old_github
            .profile_evidence
            .push(adler_core::ProfileEvidence::from_enrichment(
                "GitHub",
                "https://github.example/alice",
                "name",
                "Alice",
            ));
        first.outcomes = vec![old_github];

        let mut second = sample("second", 2_000);
        let mut new_github = outcome("GitHub", MatchKind::Found);
        new_github
            .enrichment
            .insert("name".into(), "Alice Liddell".into());
        new_github
            .profile_evidence
            .push(adler_core::ProfileEvidence::from_enrichment(
                "GitHub",
                "https://github.example/alice",
                "name",
                "Alice Liddell",
            ));
        second.outcomes = vec![new_github];

        let timeline = build_scan_timeline(&[first, second]);

        assert_eq!(
            timeline
                .events
                .iter()
                .map(|event| event.kind)
                .collect::<Vec<_>>(),
            [
                TimelineEventKind::FirstSeen,
                TimelineEventKind::EvidenceChanged
            ]
        );
        let evidence_change = timeline.events[1].evidence_change.as_ref().unwrap();
        assert_eq!(
            evidence_change.before_enrichment.get("name").unwrap(),
            "Alice"
        );
        assert_eq!(
            evidence_change.after_enrichment.get("name").unwrap(),
            "Alice Liddell"
        );
    }
1213
1214 #[tokio::test]
1215 async fn load_all_returns_newest_first() {
1216 let tmp = TempDir::new().unwrap();
1217 save(tmp.path(), &sample("old", 1_000)).await.unwrap();
1218 save(tmp.path(), &sample("mid", 2_000)).await.unwrap();
1219 save(tmp.path(), &sample("new", 3_000)).await.unwrap();
1220 let all = load_all(tmp.path()).await;
1221 assert_eq!(all.len(), 3);
1222 assert_eq!(all[0].scan_id.as_str(), "new");
1223 assert_eq!(all[1].scan_id.as_str(), "mid");
1224 assert_eq!(all[2].scan_id.as_str(), "old");
1225 }
1226
1227 #[tokio::test]
1228 async fn load_returns_none_for_missing() {
1229 let tmp = TempDir::new().unwrap();
1230 let missing = load(tmp.path(), &ScanId::from("nope".to_owned())).await;
1231 assert!(missing.is_none());
1232 }
1233
1234 #[tokio::test]
1235 async fn load_defaults_schema_version_for_legacy_scan_json() {
1236 let tmp = TempDir::new().unwrap();
1237 let path = tmp.path().join("legacy.json");
1238 fs::write(
1239 &path,
1240 br#"{
1241 "scan_id": "legacy",
1242 "username": "alice",
1243 "site_count": 0,
1244 "created_at_ms": 1700000000000,
1245 "summary": { "found": 0, "not_found": 0, "uncertain": 0 },
1246 "outcomes": [],
1247 "elapsed_ms": 0
1248 }"#,
1249 )
1250 .await
1251 .unwrap();
1252
1253 let loaded = load(tmp.path(), &ScanId::from("legacy".to_owned()))
1254 .await
1255 .expect("legacy scan loads");
1256 assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1257 }
1258
1259 #[tokio::test]
1260 async fn load_accepts_v2_scan_json_after_schema_bump() {
1261 let tmp = TempDir::new().unwrap();
1262 let path = tmp.path().join("v2.json");
1263 fs::write(
1264 &path,
1265 br#"{
1266 "schema_version": 2,
1267 "scan_id": "v2",
1268 "username": "alice",
1269 "site_count": 1,
1270 "created_at_ms": 1700000000000,
1271 "summary": { "found": 1, "not_found": 0, "uncertain": 0 },
1272 "outcomes": [
1273 {
1274 "site": "GitHub",
1275 "url": "https://github.example/alice",
1276 "kind": "found",
1277 "elapsed_ms": 10,
1278 "evidence": ["HTTP 200 (status_found)"]
1279 }
1280 ],
1281 "elapsed_ms": 10
1282 }"#,
1283 )
1284 .await
1285 .unwrap();
1286
1287 let loaded = load(tmp.path(), &ScanId::from("v2".to_owned()))
1288 .await
1289 .expect("v2 scan loads");
1290
1291 assert_eq!(loaded.schema_version, 2);
1292 assert_eq!(loaded.summary.found, 1);
1293 assert_eq!(
1294 loaded.outcomes[0].confidence.label,
1295 adler_core::ConfidenceLabel::Medium
1296 );
1297 }
1298
1299 #[tokio::test]
1300 async fn load_derives_identity_clusters_for_legacy_scan_json() {
1301 let tmp = TempDir::new().unwrap();
1302 let path = tmp.path().join("legacy-clusters.json");
1303 fs::write(
1304 &path,
1305 br#"{
1306 "schema_version": 1,
1307 "scan_id": "legacy-clusters",
1308 "username": "alice",
1309 "site_count": 2,
1310 "created_at_ms": 1700000000000,
1311 "summary": { "found": 2, "not_found": 0, "uncertain": 0 },
1312 "outcomes": [
1313 {
1314 "site": "GitHub",
1315 "url": "https://github.example/alice",
1316 "kind": "found",
1317 "elapsed_ms": 10,
1318 "profile_evidence": [
1319 {
1320 "kind": "external_link",
1321 "field": "website",
1322 "value": "https://alice.dev",
1323 "source": {
1324 "site": "GitHub",
1325 "url": "https://github.example/alice",
1326 "origin": "extractor"
1327 }
1328 }
1329 ]
1330 },
1331 {
1332 "site": "GitLab",
1333 "url": "https://gitlab.example/alice",
1334 "kind": "found",
1335 "elapsed_ms": 10,
1336 "profile_evidence": [
1337 {
1338 "kind": "external_link",
1339 "field": "website",
1340 "value": "https://alice.dev/",
1341 "source": {
1342 "site": "GitLab",
1343 "url": "https://gitlab.example/alice",
1344 "origin": "extractor"
1345 }
1346 }
1347 ]
1348 }
1349 ],
1350 "elapsed_ms": 20
1351 }"#,
1352 )
1353 .await
1354 .unwrap();
1355
1356 let loaded = load(tmp.path(), &ScanId::from("legacy-clusters".to_owned()))
1357 .await
1358 .expect("legacy scan loads");
1359
1360 assert_eq!(loaded.identity_clusters.len(), 1);
1361 assert_eq!(loaded.identity_clusters[0].members.len(), 2);
1362 assert!(!loaded.identity_clusters[0].uncertain);
1363 }
1364
/// Runs the JSON round trip, `diff_scans`, and `build_scan_timeline` over two
/// fixtures from `large_persisted_scan`, checking that a 2,500-outcome scan
/// with identity clusters survives each path.
#[test]
fn large_scan_artifact_paths_handle_identity_graph_payloads() {
    let previous = large_persisted_scan("large-old", 0);
    let current = large_persisted_scan("large-new", 1);

    // Sanity-check the fixture: 2,500 outcomes, a matching site count, a summary
    // that accounts for every outcome, and at least one identity cluster.
    assert_eq!(previous.outcomes.len(), 2_500);
    assert_eq!(previous.site_count, 2_500);
    assert_eq!(
        previous.summary.found + previous.summary.not_found + previous.summary.uncertain,
        2_500
    );
    assert!(!previous.identity_clusters.is_empty());

    // JSON round trip must keep every outcome and every cluster.
    let raw = serde_json::to_string(&previous).unwrap();
    let decoded: PersistedScan = serde_json::from_str(&raw).unwrap();
    assert_eq!(decoded.outcomes.len(), 2_500);
    assert_eq!(
        decoded.identity_clusters.len(),
        previous.identity_clusters.len()
    );

    // The two fixtures are expected to differ in every diff category.
    // The second argument is `&current`; it had been garbled into `¤t` by an
    // HTML-entity decode of the `&curren` prefix.
    let diff = diff_scans(&previous, &current);
    assert!(!diff.added_found.is_empty());
    assert!(!diff.removed_found.is_empty());
    assert!(!diff.verdict_changes.is_empty());
    assert!(!diff.evidence_changes.is_empty());

    // Timeline over both scans: 375 tracked profiles and more events than profiles.
    let timeline = build_scan_timeline(&[previous, current]);
    assert_eq!(timeline.scan_count, 2);
    assert_eq!(timeline.profiles.len(), 375);
    assert!(timeline.events.len() > timeline.profiles.len());
}
1397
1398 #[tokio::test]
1399 async fn load_all_skips_unrelated_files() {
1400 let tmp = TempDir::new().unwrap();
1401 fs::write(tmp.path().join("README"), b"not json")
1403 .await
1404 .unwrap();
1405 fs::write(tmp.path().join("broken.json"), b"{ invalid")
1406 .await
1407 .unwrap();
1408 save(tmp.path(), &sample("good", 9_999)).await.unwrap();
1409 let all = load_all(tmp.path()).await;
1410 assert_eq!(all.len(), 1);
1411 assert_eq!(all[0].scan_id.as_str(), "good");
1412 }
1413
1414 #[tokio::test]
1415 async fn prune_keeps_only_newest_n() {
1416 let tmp = TempDir::new().unwrap();
1417 for i in 0u64..5 {
1418 save(tmp.path(), &sample(&format!("s{i}"), i * 1_000))
1419 .await
1420 .unwrap();
1421 }
1422 let removed = prune(tmp.path(), 2).await;
1423 assert_eq!(removed, 3);
1424 let remaining = load_all(tmp.path()).await;
1425 assert_eq!(remaining.len(), 2);
1426 assert_eq!(remaining[0].scan_id.as_str(), "s4");
1427 assert_eq!(remaining[1].scan_id.as_str(), "s3");
1428 }
1429}