1use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use adler_core::{
17 CheckOutcome, HistoricalScanRef, IdentityCluster, InvestigationReport, MatchKind,
18 ProfileEvidence, ReportDisabledSite, ReportTimelineEvent, ReportTimelineEventKind, Site,
19};
20use serde::{Deserialize, Serialize};
21use tokio::fs;
22
23use crate::error::{Error, Result};
24use crate::scan::{FinishedScan, ScanId, Summary};
25
/// Retention cap for scans kept on disk.
///
/// NOTE(review): presumably passed to `prune` as `keep_newest`; the call site
/// is not in this file — confirm.
pub(crate) const MAX_PERSISTED_SCANS: usize = 200;
/// Schema version stamped on every scan built by `PersistedScan::from_finished`
/// and assumed for stored records that carry no `schema_version` field.
pub(crate) const PERSISTED_SCAN_SCHEMA_VERSION: u16 = 3;
32
/// A completed scan in the shape that is serialized to, and read back from,
/// `<scan_id>.json` inside the scan directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistedScan {
    /// Schema version of the record. Records without this field deserialize
    /// to the current `PERSISTED_SCAN_SCHEMA_VERSION`.
    #[serde(default = "default_schema_version")]
    pub schema_version: u16,
    /// Identifier of the scan; also used as the file stem on disk.
    pub scan_id: ScanId,
    /// Username the scan was run for.
    pub username: String,
    /// Request parameters recorded alongside the scan, if any. Left out of
    /// the JSON when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_context: Option<ScanRequestContext>,
    /// Site count supplied by the caller when the record was created.
    pub site_count: usize,
    /// Creation time in milliseconds; the key used to order scans.
    pub created_at_ms: u64,
    /// Per-verdict totals. Recomputed from `outcomes` whenever the derived
    /// fields are refreshed.
    pub summary: Summary,
    /// One outcome per checked site.
    pub outcomes: Vec<CheckOutcome>,
    /// Identity clusters derived from `username` and `outcomes`. Left out of
    /// the JSON when empty and recomputed whenever the derived fields are
    /// refreshed.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub identity_clusters: Vec<IdentityCluster>,
    /// Scan duration in milliseconds, copied from the finished scan.
    pub elapsed_ms: u64,
}
64
65impl PersistedScan {
66 #[must_use]
68 pub fn from_finished(
69 scan_id: ScanId,
70 username: String,
71 site_count: usize,
72 created_at_ms: u64,
73 finished: FinishedScan,
74 ) -> Self {
75 let mut scan = Self {
76 schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
77 scan_id,
78 username,
79 request_context: None,
80 site_count,
81 created_at_ms,
82 summary: finished.summary,
83 outcomes: finished.outcomes,
84 identity_clusters: finished.identity_clusters,
85 elapsed_ms: finished.elapsed_ms,
86 };
87 scan.refresh_derived_fields();
88 scan
89 }
90
91 #[must_use]
93 pub fn with_request_context(mut self, context: ScanRequestContext) -> Self {
94 self.request_context = Some(context);
95 self
96 }
97
98 pub(crate) fn refresh_derived_fields(&mut self) {
99 for outcome in &mut self.outcomes {
100 outcome.refresh_confidence();
101 }
102 self.summary = Summary::from_outcomes(&self.outcomes);
103 self.identity_clusters =
104 adler_core::build_identity_clusters(&self.username, &self.outcomes);
105 }
106}
107
108pub fn apply_historical_confidence_overlay(
115 current: &mut PersistedScan,
116 related_scans: &[PersistedScan],
117) {
118 current.refresh_derived_fields();
119 let history_counts = historical_consistency_counts(current, related_scans);
120
121 for outcome in &mut current.outcomes {
122 let count = history_counts.get(&outcome.site).copied().unwrap_or(0);
123 outcome.refresh_confidence_with_history(count);
124 }
125
126 current.identity_clusters = historical_identity_clusters(current, related_scans);
127}
128
129#[must_use]
134pub fn build_investigation_report(
135 mut scan: PersistedScan,
136 related_scans: &[PersistedScan],
137) -> InvestigationReport {
138 apply_historical_confidence_overlay(&mut scan, related_scans);
139 let timeline = report_timeline_from_scans(related_scans, &scan);
140 let disabled_sites = scan
141 .request_context
142 .as_ref()
143 .map(|context| {
144 context
145 .disabled_matches
146 .iter()
147 .map(|site| ReportDisabledSite {
148 name: site.name.clone(),
149 url: site.url.clone(),
150 tags: site.tags.clone(),
151 disabled_reason: site.disabled_reason.clone(),
152 })
153 .collect()
154 })
155 .unwrap_or_default();
156
157 InvestigationReport::builder(scan.username, &scan.outcomes)
158 .identity_clusters(scan.identity_clusters)
159 .timeline(timeline)
160 .disabled_sites(disabled_sites)
161 .build()
162}
163
164fn report_timeline_from_scans(
165 related_scans: &[PersistedScan],
166 current: &PersistedScan,
167) -> Vec<ReportTimelineEvent> {
168 let mut scans = related_scans.to_vec();
169 if !scans.iter().any(|scan| scan.scan_id == current.scan_id) {
170 scans.push(current.clone());
171 }
172 build_scan_timeline(&scans)
173 .events
174 .into_iter()
175 .map(report_timeline_event)
176 .collect()
177}
178
179fn report_timeline_event(event: TimelineEvent) -> ReportTimelineEvent {
180 ReportTimelineEvent {
181 kind: match event.kind {
182 TimelineEventKind::FirstSeen => ReportTimelineEventKind::AddedFound,
183 TimelineEventKind::Disappeared => ReportTimelineEventKind::RemovedFound,
184 TimelineEventKind::Reappeared => ReportTimelineEventKind::Reappeared,
185 TimelineEventKind::EvidenceChanged => ReportTimelineEventKind::EvidenceChanged,
186 },
187 site: Some(event.site),
188 scan_id: Some(event.scan_id.to_string()),
189 observed_at_ms: Some(event.at_ms),
190 detail: Some(timeline_detail(event.before, event.after)),
191 }
192}
193
194fn timeline_detail(before: Option<MatchKind>, after: Option<MatchKind>) -> String {
195 match (before, after) {
196 (Some(before), Some(after)) => format!("{} -> {}", kind_label(before), kind_label(after)),
197 (None, Some(after)) => format!("new {}", kind_label(after)),
198 (Some(before), None) => format!("after {}", kind_label(before)),
199 (None, None) => "changed".to_owned(),
200 }
201}
202
203fn kind_label(kind: MatchKind) -> &'static str {
204 match kind {
205 MatchKind::Found => "found",
206 MatchKind::NotFound => "not_found",
207 MatchKind::Uncertain => "uncertain",
208 }
209}
210
211fn historical_consistency_counts(
212 current: &PersistedScan,
213 related_scans: &[PersistedScan],
214) -> BTreeMap<String, usize> {
215 let current_ref = HistoricalScanRef {
216 scan_id: current.scan_id.as_str(),
217 username: ¤t.username,
218 created_at_ms: current.created_at_ms,
219 outcomes: ¤t.outcomes,
220 };
221 let related_refs = related_scans.iter().map(|scan| HistoricalScanRef {
222 scan_id: scan.scan_id.as_str(),
223 username: &scan.username,
224 created_at_ms: scan.created_at_ms,
225 outcomes: &scan.outcomes,
226 });
227 adler_core::historical_consistency_counts(current_ref, related_refs)
228}
229
230fn historical_identity_clusters(
231 current: &PersistedScan,
232 related_scans: &[PersistedScan],
233) -> Vec<IdentityCluster> {
234 let current_ref = HistoricalScanRef {
235 scan_id: current.scan_id.as_str(),
236 username: ¤t.username,
237 created_at_ms: current.created_at_ms,
238 outcomes: ¤t.outcomes,
239 };
240 let related_refs = related_scans.iter().map(|scan| HistoricalScanRef {
241 scan_id: scan.scan_id.as_str(),
242 username: &scan.username,
243 created_at_ms: scan.created_at_ms,
244 outcomes: &scan.outcomes,
245 });
246 adler_core::build_identity_clusters_with_history(current_ref, related_refs)
247}
248
/// Serde default for `PersistedScan::schema_version`: records that lack the
/// field are read as the current `PERSISTED_SCAN_SCHEMA_VERSION`.
const fn default_schema_version() -> u16 {
    PERSISTED_SCAN_SCHEMA_VERSION
}
252
/// Request parameters stored next to a persisted scan.
///
/// Optional and list-valued fields are left out of the JSON when `None` or
/// empty, and fall back to their default when missing on read.
///
/// NOTE(review): the field meanings below are inferred from the names only —
/// the code that fills this struct is not visible here; confirm against it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScanRequestContext {
    /// Username given in the request.
    pub username: String,
    /// Presumably the scan this request was derived from, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub derived_from: Option<ScanId>,
    /// Presumably an allow-list of site names.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub only: Vec<String>,
    /// Presumably site names excluded from the scan.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude: Vec<String>,
    /// Presumably tags a site must carry to be included.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tag: Vec<String>,
    /// Presumably tags that exclude a site.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude_tag: Vec<String>,
    /// Presumably a "top N sites" limit.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub top: Option<u32>,
    /// Presumably whether NSFW sites were included. Always serialized and
    /// required on read (no serde default).
    pub nsfw: bool,
    /// Presumably the requested concurrency limit.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrency: Option<usize>,
    /// Presumably the requested overall deadline, in seconds.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub deadline_secs: Option<u64>,
    /// Presumably the names of the egress routes used.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub egress_names: Vec<String>,
    /// Disabled sites recorded with the request; surfaced as the report's
    /// disabled-site list by `build_investigation_report`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub disabled_matches: Vec<PersistedDisabledMatch>,
}
294
/// Snapshot of a disabled site as stored in a scan's request context.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PersistedDisabledMatch {
    /// Site name.
    pub name: String,
    /// Site URL, stored as a plain string.
    pub url: String,
    /// Site tags; left out of the JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Why the site is disabled. The `From<&Site>` conversion fills in
    /// `"disabled in registry"` when the site carries no reason.
    pub disabled_reason: String,
}
308
309impl From<&Site> for PersistedDisabledMatch {
310 fn from(site: &Site) -> Self {
311 Self {
312 name: site.name.clone(),
313 url: site.url.as_str().to_owned(),
314 tags: site.tags.clone(),
315 disabled_reason: site
316 .disabled_reason
317 .clone()
318 .unwrap_or_else(|| "disabled in registry".to_owned()),
319 }
320 }
321}
322
/// Difference between two scans, as produced by `diff_scans`.
///
/// Every list is left out of the JSON when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDiff {
    /// Id of the earlier ("previous") scan.
    pub from_scan_id: ScanId,
    /// Id of the later ("current") scan.
    pub to_scan_id: ScanId,
    /// Outcomes that are `Found` in the later scan but were absent or not
    /// `Found` in the earlier one.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub added_found: Vec<CheckOutcome>,
    /// Outcomes that were `Found` in the earlier scan but are absent or not
    /// `Found` in the later one.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub removed_found: Vec<CheckOutcome>,
    /// Sites present in both scans whose verdict differs.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub verdict_changes: Vec<VerdictChange>,
    /// Sites `Found` in both scans whose enrichment or profile evidence
    /// differs.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub evidence_changes: Vec<EvidenceChange>,
}
344
/// A site whose verdict differs between two scans.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerdictChange {
    /// Site name.
    pub site: String,
    /// Verdict in the earlier scan.
    pub before: MatchKind,
    /// Verdict in the later scan.
    pub after: MatchKind,
}
355
/// Before/after snapshot of the evidence for a site that stayed `Found` while
/// its enrichment or profile evidence changed.
///
/// Every collection is left out of the JSON when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceChange {
    /// Site name.
    pub site: String,
    /// Enrichment map from the earlier observation.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub before_enrichment: BTreeMap<String, String>,
    /// Enrichment map from the later observation.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub after_enrichment: BTreeMap<String, String>,
    /// Profile evidence from the earlier observation.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub before_profile_evidence: Vec<ProfileEvidence>,
    /// Profile evidence from the later observation.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub after_profile_evidence: Vec<ProfileEvidence>,
}
374
/// Chronological view over a set of scans, as produced by
/// `build_scan_timeline`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanTimeline {
    /// Username of the earliest scan in the set; empty when there are no
    /// scans.
    pub username: String,
    /// Number of scans the timeline was built from.
    pub scan_count: usize,
    /// Creation time of the earliest scan; `None` when there are no scans.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub from_ms: Option<u64>,
    /// Creation time of the latest scan; `None` when there are no scans.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub to_ms: Option<u64>,
    /// One entry per site that was `Found` at least once, ordered by site
    /// name.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub profiles: Vec<TimelineProfile>,
    /// Events in the order they were detected while walking the scans
    /// chronologically.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub events: Vec<TimelineEvent>,
}
395
/// Final per-site state after all scans of a timeline were processed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineProfile {
    /// Site name.
    pub site: String,
    /// URL from the most recent outcome recorded for the site.
    pub url: String,
    /// Creation time of the scan in which the site was first `Found`.
    pub first_seen_ms: u64,
    /// Creation time of the most recent scan in which the site was `Found`.
    pub last_seen_ms: u64,
    /// `true` while the site's most recently recorded state is `Found`;
    /// cleared when a disappearance is recorded.
    pub present_in_latest: bool,
    /// Most recently recorded verdict; `None` when the site disappeared
    /// because it was missing from a scan altogether.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_verdict: Option<MatchKind>,
}
413
/// Kind of change recorded by a timeline event; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TimelineEventKind {
    /// The site was `Found` and the timeline had no earlier state for it.
    FirstSeen,
    /// A site that was present is no longer `Found` (other verdict, or missing
    /// from the scan).
    Disappeared,
    /// A site that had disappeared is `Found` again.
    Reappeared,
    /// The site is still `Found`, but its enrichment or profile evidence
    /// differs from the last `Found` observation.
    EvidenceChanged,
}
427
/// A single change detected while walking scans in chronological order.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineEvent {
    /// Scan in which the change was observed.
    pub scan_id: ScanId,
    /// Creation time of that scan, in milliseconds.
    pub at_ms: u64,
    /// Site the event is about.
    pub site: String,
    /// URL attached to the event.
    pub url: String,
    /// What kind of change happened.
    pub kind: TimelineEventKind,
    /// Verdict recorded before the change, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub before: Option<MatchKind>,
    /// Verdict observed in this scan, when the site was part of it.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub after: Option<MatchKind>,
    /// Evidence snapshot; only populated for `EvidenceChanged` events.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub evidence_change: Option<EvidenceChange>,
}
451
452#[must_use]
459pub fn diff_scans(previous: &PersistedScan, current: &PersistedScan) -> ScanDiff {
460 let previous_by_site = outcomes_by_site(&previous.outcomes);
461 let current_by_site = outcomes_by_site(¤t.outcomes);
462
463 let mut added_found = Vec::new();
464 let mut removed_found = Vec::new();
465 let mut verdict_changes = Vec::new();
466 let mut evidence_changes = Vec::new();
467
468 for (site, current_outcome) in ¤t_by_site {
469 let previous_outcome = previous_by_site.get(site);
470 if current_outcome.kind == MatchKind::Found
471 && previous_outcome.is_none_or(|o| o.kind != MatchKind::Found)
472 {
473 added_found.push((*current_outcome).clone());
474 }
475 if let Some(previous_outcome) = previous_outcome {
476 if previous_outcome.kind != current_outcome.kind {
477 verdict_changes.push(VerdictChange {
478 site: site.clone(),
479 before: previous_outcome.kind,
480 after: current_outcome.kind,
481 });
482 }
483 if previous_outcome.kind == MatchKind::Found
484 && current_outcome.kind == MatchKind::Found
485 && profile_evidence_changed(previous_outcome, current_outcome)
486 {
487 evidence_changes.push(EvidenceChange {
488 site: site.clone(),
489 before_enrichment: previous_outcome.enrichment.clone(),
490 after_enrichment: current_outcome.enrichment.clone(),
491 before_profile_evidence: previous_outcome.profile_evidence.clone(),
492 after_profile_evidence: current_outcome.profile_evidence.clone(),
493 });
494 }
495 }
496 }
497
498 for (site, previous_outcome) in &previous_by_site {
499 if previous_outcome.kind == MatchKind::Found
500 && current_by_site
501 .get(site)
502 .is_none_or(|o| o.kind != MatchKind::Found)
503 {
504 removed_found.push((*previous_outcome).clone());
505 }
506 }
507
508 ScanDiff {
509 from_scan_id: previous.scan_id.clone(),
510 to_scan_id: current.scan_id.clone(),
511 added_found,
512 removed_found,
513 verdict_changes,
514 evidence_changes,
515 }
516}
517
518#[must_use]
524pub fn build_scan_timeline(scans: &[PersistedScan]) -> ScanTimeline {
525 let mut ordered: Vec<&PersistedScan> = scans.iter().collect();
526 ordered.sort_by(|left, right| {
527 left.created_at_ms
528 .cmp(&right.created_at_ms)
529 .then_with(|| left.scan_id.as_str().cmp(right.scan_id.as_str()))
530 });
531
532 let username = ordered
533 .first()
534 .map(|scan| scan.username.clone())
535 .unwrap_or_default();
536 let from_ms = ordered.first().map(|scan| scan.created_at_ms);
537 let to_ms = ordered.last().map(|scan| scan.created_at_ms);
538 let mut states: BTreeMap<String, TimelineProfileState> = BTreeMap::new();
539 let mut events = Vec::new();
540
541 for scan in &ordered {
542 let current_by_site = outcomes_by_site(&scan.outcomes);
543 let sites = timeline_site_names(&states, ¤t_by_site);
544
545 for site in sites {
546 apply_timeline_site(
547 scan,
548 &site,
549 current_by_site.get(&site).copied(),
550 &mut states,
551 &mut events,
552 );
553 }
554 }
555
556 let profiles = states
557 .into_iter()
558 .map(|(site, state)| TimelineProfile {
559 site,
560 url: state.url,
561 first_seen_ms: state.first_seen_ms,
562 last_seen_ms: state.last_seen_ms,
563 present_in_latest: state.present_in_latest,
564 last_verdict: state.last_verdict,
565 })
566 .collect();
567
568 ScanTimeline {
569 username,
570 scan_count: ordered.len(),
571 from_ms,
572 to_ms,
573 profiles,
574 events,
575 }
576}
577
578fn timeline_site_names(
579 states: &BTreeMap<String, TimelineProfileState>,
580 current_by_site: &BTreeMap<String, &CheckOutcome>,
581) -> Vec<String> {
582 let mut sites: Vec<String> = states.keys().cloned().collect();
583 for site in current_by_site.keys() {
584 if !states.contains_key(site.as_str()) {
585 sites.push((*site).clone());
586 }
587 }
588 sites.sort();
589 sites.dedup();
590 sites
591}
592
593fn apply_timeline_site(
594 scan: &PersistedScan,
595 site: &str,
596 current: Option<&CheckOutcome>,
597 states: &mut BTreeMap<String, TimelineProfileState>,
598 events: &mut Vec<TimelineEvent>,
599) {
600 let current_kind = current.map(|outcome| outcome.kind);
601 let was_present = states
602 .get(site)
603 .is_some_and(|state| state.present_in_latest);
604
605 if current_kind == Some(MatchKind::Found) {
606 apply_found_timeline_site(scan, site, current.expect("found outcome"), states, events);
607 } else if was_present {
608 apply_disappeared_timeline_site(scan, site, current, current_kind, states, events);
609 } else if let (Some(state), Some(outcome)) = (states.get_mut(site), current) {
610 state.last_verdict = Some(outcome.kind);
611 state.url.clone_from(&outcome.url);
612 }
613}
614
615fn apply_found_timeline_site(
616 scan: &PersistedScan,
617 site: &str,
618 outcome: &CheckOutcome,
619 states: &mut BTreeMap<String, TimelineProfileState>,
620 events: &mut Vec<TimelineEvent>,
621) {
622 let current_kind = Some(outcome.kind);
623 let had_state = states.contains_key(site);
624 let was_present = states
625 .get(site)
626 .is_some_and(|state| state.present_in_latest);
627 let state = states
628 .entry(site.to_owned())
629 .or_insert_with(|| TimelineProfileState::new(outcome, scan.created_at_ms));
630
631 if !had_state {
632 events.push(timeline_event(
633 scan,
634 site,
635 &outcome.url,
636 TimelineEventKind::FirstSeen,
637 None,
638 current_kind,
639 None,
640 ));
641 } else if !was_present {
642 events.push(timeline_event(
643 scan,
644 site,
645 &outcome.url,
646 TimelineEventKind::Reappeared,
647 state.last_verdict,
648 current_kind,
649 None,
650 ));
651 } else if state.profile_evidence_changed(outcome) {
652 events.push(timeline_event(
653 scan,
654 site,
655 &outcome.url,
656 TimelineEventKind::EvidenceChanged,
657 Some(MatchKind::Found),
658 current_kind,
659 Some(EvidenceChange {
660 site: site.to_owned(),
661 before_enrichment: state.last_found_enrichment.clone(),
662 after_enrichment: outcome.enrichment.clone(),
663 before_profile_evidence: state.last_found_profile_evidence.clone(),
664 after_profile_evidence: outcome.profile_evidence.clone(),
665 }),
666 ));
667 }
668
669 states
670 .get_mut(site)
671 .expect("state inserted before found update")
672 .update_found(outcome, scan.created_at_ms);
673}
674
675fn apply_disappeared_timeline_site(
676 scan: &PersistedScan,
677 site: &str,
678 current: Option<&CheckOutcome>,
679 current_kind: Option<MatchKind>,
680 states: &mut BTreeMap<String, TimelineProfileState>,
681 events: &mut Vec<TimelineEvent>,
682) {
683 let state = states
684 .get_mut(site)
685 .expect("present state exists before disappearance");
686 let url = current.map_or_else(|| state.url.clone(), |outcome| outcome.url.clone());
687 events.push(timeline_event(
688 scan,
689 site,
690 &url,
691 TimelineEventKind::Disappeared,
692 state.last_verdict,
693 current_kind,
694 None,
695 ));
696 state.present_in_latest = false;
697 state.last_verdict = current_kind;
698 if let Some(outcome) = current {
699 state.url.clone_from(&outcome.url);
700 }
701}
702
703fn timeline_event(
704 scan: &PersistedScan,
705 site: &str,
706 url: &str,
707 kind: TimelineEventKind,
708 before: Option<MatchKind>,
709 after: Option<MatchKind>,
710 evidence_change: Option<EvidenceChange>,
711) -> TimelineEvent {
712 TimelineEvent {
713 scan_id: scan.scan_id.clone(),
714 at_ms: scan.created_at_ms,
715 site: site.to_owned(),
716 url: url.to_owned(),
717 kind,
718 before,
719 after,
720 evidence_change,
721 }
722}
723
/// Mutable per-site bookkeeping used while a timeline is being built.
#[derive(Debug, Clone)]
struct TimelineProfileState {
    /// URL from the most recent outcome recorded for the site.
    url: String,
    /// Timestamp of the scan in which the site was first `Found`.
    first_seen_ms: u64,
    /// Timestamp of the most recent scan in which the site was `Found`.
    last_seen_ms: u64,
    /// Whether the most recently recorded state of the site is `Found`.
    present_in_latest: bool,
    /// Most recently recorded verdict; `None` after the site went missing from
    /// a scan.
    last_verdict: Option<MatchKind>,
    /// Enrichment captured at the last `Found` observation.
    last_found_enrichment: BTreeMap<String, String>,
    /// Profile evidence captured at the last `Found` observation.
    last_found_profile_evidence: Vec<ProfileEvidence>,
}
734
735impl TimelineProfileState {
736 fn new(outcome: &CheckOutcome, at_ms: u64) -> Self {
737 Self {
738 url: outcome.url.clone(),
739 first_seen_ms: at_ms,
740 last_seen_ms: at_ms,
741 present_in_latest: true,
742 last_verdict: Some(outcome.kind),
743 last_found_enrichment: outcome.enrichment.clone(),
744 last_found_profile_evidence: outcome.profile_evidence.clone(),
745 }
746 }
747
748 fn update_found(&mut self, outcome: &CheckOutcome, at_ms: u64) {
749 self.url.clone_from(&outcome.url);
750 self.last_seen_ms = at_ms;
751 self.present_in_latest = true;
752 self.last_verdict = Some(outcome.kind);
753 self.last_found_enrichment = outcome.enrichment.clone();
754 self.last_found_profile_evidence
755 .clone_from(&outcome.profile_evidence);
756 }
757
758 fn profile_evidence_changed(&self, outcome: &CheckOutcome) -> bool {
759 self.last_found_enrichment != outcome.enrichment
760 || self.last_found_profile_evidence != outcome.profile_evidence
761 }
762}
763
764fn outcomes_by_site(outcomes: &[CheckOutcome]) -> BTreeMap<String, &CheckOutcome> {
765 outcomes
766 .iter()
767 .map(|outcome| (outcome.site.clone(), outcome))
768 .collect()
769}
770
771fn profile_evidence_changed(previous: &CheckOutcome, current: &CheckOutcome) -> bool {
772 previous.enrichment != current.enrichment
773 || previous.profile_evidence != current.profile_evidence
774}
775
/// Resolves the directory in which persisted scans are stored.
///
/// Resolution order:
/// 1. `$XDG_CACHE_HOME/adler/scans` when `XDG_CACHE_HOME` holds an absolute
///    path;
/// 2. `$HOME/.cache/adler/scans` when `HOME` is set and non-empty;
/// 3. the relative path `adler-scans` otherwise.
///
/// Following the XDG Base Directory Specification, an empty or relative
/// `XDG_CACHE_HOME` is treated as if it were unset. Without that rule an empty
/// value would silently resolve to `adler/scans` under the current working
/// directory. An empty `HOME` is skipped for the same reason.
#[must_use]
pub fn default_dir() -> PathBuf {
    let xdg_cache = std::env::var_os("XDG_CACHE_HOME")
        .map(PathBuf::from)
        // An empty path is never absolute, so this also rejects `""`.
        .filter(|path| path.is_absolute());
    if let Some(cache) = xdg_cache {
        return cache.join("adler").join("scans");
    }

    let home = std::env::var_os("HOME").filter(|home| !home.is_empty());
    if let Some(home) = home {
        return PathBuf::from(home)
            .join(".cache")
            .join("adler")
            .join("scans");
    }

    PathBuf::from("adler-scans")
}
794
795pub(crate) async fn save(dir: &Path, scan: &PersistedScan) -> Result<()> {
797 fs::create_dir_all(dir).await.map_err(Error::Persist)?;
798 let path = dir.join(format!("{}.json", scan.scan_id));
799 let tmp = dir.join(format!("{}.json.tmp", scan.scan_id));
800 let mut scan = scan.clone();
801 scan.refresh_derived_fields();
802 let body = serde_json::to_vec_pretty(&scan).map_err(Error::PersistEncode)?;
803 fs::write(&tmp, &body).await.map_err(Error::Persist)?;
804 fs::rename(&tmp, &path).await.map_err(Error::Persist)?;
805 Ok(())
806}
807
808pub(crate) async fn load(dir: &Path, scan_id: &ScanId) -> Option<PersistedScan> {
812 let path = dir.join(format!("{scan_id}.json"));
813 let bytes = fs::read(&path).await.ok()?;
814 serde_json::from_slice(&bytes)
815 .ok()
816 .map(refresh_derived_fields)
817}
818
819pub(crate) async fn load_all(dir: &Path) -> Vec<PersistedScan> {
823 let Ok(mut entries) = fs::read_dir(dir).await else {
824 return Vec::new();
825 };
826 let mut out = Vec::new();
827 while let Ok(Some(entry)) = entries.next_entry().await {
828 let path = entry.path();
829 if path.extension().and_then(|s| s.to_str()) != Some("json") {
830 continue;
831 }
832 let Ok(bytes) = fs::read(&path).await else {
833 continue;
834 };
835 let Ok(scan) = serde_json::from_slice::<PersistedScan>(&bytes) else {
836 continue;
837 };
838 out.push(refresh_derived_fields(scan));
839 }
840 out.sort_by_key(|s| std::cmp::Reverse(s.created_at_ms));
841 out
842}
843
844fn refresh_derived_fields(mut scan: PersistedScan) -> PersistedScan {
845 scan.refresh_derived_fields();
846 scan
847}
848
849pub(crate) async fn prune(dir: &Path, keep_newest: usize) -> usize {
852 let scans = load_all(dir).await;
853 if scans.len() <= keep_newest {
854 return 0;
855 }
856 let mut removed = 0;
857 for s in &scans[keep_newest..] {
858 let path = dir.join(format!("{}.json", s.scan_id));
859 if fs::remove_file(&path).await.is_ok() {
860 removed += 1;
861 }
862 }
863 removed
864}
865
866#[cfg(test)]
867mod tests {
868 use super::*;
869 use adler_core::{
870 ConfidenceLabel, ConfidenceReason, EvidenceAccessPath, MatchKind, ProfileEvidence,
871 TransportTier, UncertainReason,
872 };
873 use std::collections::BTreeMap;
874 use tempfile::TempDir;
875
876 fn sample(scan_id: &str, ts: u64) -> PersistedScan {
877 PersistedScan {
878 schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
879 scan_id: ScanId::from(scan_id.to_owned()),
880 username: "alice".into(),
881 request_context: None,
882 site_count: 2,
883 created_at_ms: ts,
884 summary: Summary {
885 found: 1,
886 not_found: 1,
887 uncertain: 0,
888 },
889 outcomes: vec![
890 CheckOutcome {
891 site: "GitHub".into(),
892 url: "https://github.com/alice".into(),
893 kind: MatchKind::Found,
894 reason: None,
895 elapsed_ms: 120,
896 enrichment: BTreeMap::new(),
897 evidence: vec!["HTTP 200 (status_found)".into()],
898 profile_evidence: Vec::new(),
899 confidence: adler_core::ConfidenceScore::default(),
900 transport: None,
901 escalations: 0,
902 },
903 CheckOutcome {
904 site: "GitLab".into(),
905 url: "https://gitlab.com/alice".into(),
906 kind: MatchKind::NotFound,
907 reason: None,
908 elapsed_ms: 90,
909 enrichment: BTreeMap::new(),
910 evidence: vec!["HTTP 404 (status_not_found)".into()],
911 profile_evidence: Vec::new(),
912 confidence: adler_core::ConfidenceScore::default(),
913 transport: None,
914 escalations: 0,
915 },
916 ],
917 identity_clusters: Vec::new(),
918 elapsed_ms: 210,
919 }
920 }
921
922 fn outcome(site: &str, kind: MatchKind) -> CheckOutcome {
923 CheckOutcome {
924 site: site.into(),
925 url: format!("https://{site}.example/alice"),
926 kind,
927 reason: None,
928 elapsed_ms: 10,
929 enrichment: BTreeMap::new(),
930 evidence: Vec::new(),
931 profile_evidence: Vec::new(),
932 confidence: adler_core::ConfidenceScore::default(),
933 transport: None,
934 escalations: 0,
935 }
936 }
937
938 fn scan_with_outcomes(
939 scan_id: &str,
940 username: &str,
941 ts: u64,
942 outcomes: Vec<CheckOutcome>,
943 ) -> PersistedScan {
944 PersistedScan {
945 schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
946 scan_id: ScanId::from(scan_id.to_owned()),
947 username: username.to_owned(),
948 request_context: None,
949 site_count: outcomes.len(),
950 created_at_ms: ts,
951 summary: Summary::from_outcomes(&outcomes),
952 outcomes,
953 identity_clusters: Vec::new(),
954 elapsed_ms: 10,
955 }
956 }
957
958 fn found_with_website(site: &str, website: &str) -> CheckOutcome {
959 found_with_website_at(site, website, None)
960 }
961
962 fn found_with_website_at(
963 site: &str,
964 website: &str,
965 observed_at_ms: Option<u64>,
966 ) -> CheckOutcome {
967 let mut outcome = outcome(site, MatchKind::Found);
968 outcome
969 .profile_evidence
970 .push(ProfileEvidence::from_enrichment_with_source(
971 site,
972 &outcome.url,
973 "website",
974 website,
975 observed_at_ms,
976 None,
977 ));
978 outcome
979 }
980
981 fn has_historical_reason(outcome: &CheckOutcome, count: usize) -> bool {
982 outcome.confidence.reasons.iter().any(|reason| {
983 matches!(
984 reason,
985 ConfidenceReason::HistoricalConsistency { count: actual } if *actual == count
986 )
987 })
988 }
989
990 fn large_outcomes(count: usize, generation: usize) -> Vec<CheckOutcome> {
991 (0..count)
992 .map(|idx| large_outcome(idx, generation))
993 .collect()
994 }
995
996 fn large_outcome(idx: usize, generation: usize) -> CheckOutcome {
997 let site = format!("LargeSite{idx:04}");
998 let url = format!("https://large{idx:04}.example/alice");
999 let mut kind = match idx % 20 {
1000 0 | 1 => MatchKind::Found,
1001 3 => MatchKind::Uncertain,
1002 _ => MatchKind::NotFound,
1003 };
1004 if generation > 0 && idx % 20 == 0 {
1005 kind = MatchKind::NotFound;
1006 } else if generation > 0 && idx % 20 == 2 {
1007 kind = MatchKind::Found;
1008 }
1009
1010 let mut outcome = CheckOutcome {
1011 site: site.clone(),
1012 url: url.clone(),
1013 kind,
1014 reason: (kind == MatchKind::Uncertain).then_some(UncertainReason::RateLimited),
1015 elapsed_ms: 10 + (idx % 75) as u64,
1016 enrichment: BTreeMap::new(),
1017 evidence: Vec::new(),
1018 profile_evidence: Vec::new(),
1019 confidence: adler_core::ConfidenceScore::default(),
1020 transport: Some(if idx % 7 == 0 {
1021 TransportTier::Browser
1022 } else {
1023 TransportTier::Http
1024 }),
1025 escalations: u8::from(idx % 7 == 0),
1026 };
1027
1028 match kind {
1029 MatchKind::Found => {
1030 let observed_at_ms = 1_781_192_451_000 + generation as u64 * 1_000 + idx as u64;
1031 let website = format!("https://identity-{:02}.example", idx % 25);
1032 let name = format!("Alice Group {:02}", idx % 50);
1033 let bio = if generation > 0 && idx % 20 == 1 {
1034 format!("updated profile generation {generation} for {idx}")
1035 } else {
1036 format!("stable profile generation 0 for {idx}")
1037 };
1038 for (field, value) in [
1039 ("website", website.as_str()),
1040 ("name", name.as_str()),
1041 ("bio", bio.as_str()),
1042 ] {
1043 outcome
1044 .enrichment
1045 .insert(field.to_owned(), value.to_owned());
1046 outcome
1047 .profile_evidence
1048 .push(ProfileEvidence::from_enrichment_with_source(
1049 &site,
1050 &url,
1051 field,
1052 value,
1053 Some(observed_at_ms),
1054 Some(EvidenceAccessPath::new(
1055 outcome.transport.unwrap_or(TransportTier::Http),
1056 outcome.escalations,
1057 idx % 11 == 0,
1058 )),
1059 ));
1060 }
1061 outcome.evidence = vec![
1062 "HTTP 200 (status_found)".to_owned(),
1063 "body matched profile marker".to_owned(),
1064 ];
1065 }
1066 MatchKind::NotFound => {
1067 outcome.evidence = vec!["HTTP 404 (status_not_found)".to_owned()];
1068 }
1069 MatchKind::Uncertain => {}
1070 }
1071 outcome.refresh_confidence();
1072 outcome
1073 }
1074
1075 fn large_persisted_scan(scan_id: &str, generation: usize) -> PersistedScan {
1076 let outcomes = large_outcomes(2_500, generation);
1077 let finished = FinishedScan {
1078 summary: Summary::from_outcomes(&outcomes),
1079 identity_clusters: adler_core::build_identity_clusters("alice", &outcomes),
1080 elapsed_ms: 30_000 + generation as u64,
1081 outcomes,
1082 };
1083 PersistedScan::from_finished(
1084 ScanId::from(scan_id.to_owned()),
1085 "alice".to_owned(),
1086 2_500,
1087 1_781_192_451_000 + generation as u64 * 10_000,
1088 finished,
1089 )
1090 }
1091
1092 #[tokio::test]
1093 async fn save_then_load_roundtrips() {
1094 let tmp = TempDir::new().unwrap();
1095 let s = sample("abc123", 1_700_000_000_000);
1096 save(tmp.path(), &s).await.unwrap();
1097
1098 let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1099 assert_eq!(loaded.scan_id, s.scan_id);
1100 assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1101 assert_eq!(loaded.username, "alice");
1102 assert_eq!(loaded.outcomes.len(), 2);
1103 assert_eq!(loaded.outcomes[0].site, "GitHub");
1104 assert_eq!(loaded.summary.found, 1);
1105 }
1106
1107 #[test]
1108 fn historical_overlay_adds_reason_after_two_prior_stable_found_observations() {
1109 let mut current = scan_with_outcomes(
1110 "current",
1111 "alice",
1112 30,
1113 vec![found_with_website("GitHub", "https://alice.dev")],
1114 );
1115 let previous = scan_with_outcomes(
1116 "previous",
1117 "alice",
1118 20,
1119 vec![found_with_website("GitHub", "https://alice.dev")],
1120 );
1121 let older = scan_with_outcomes(
1122 "older",
1123 "alice",
1124 10,
1125 vec![found_with_website("GitHub", "https://alice.dev")],
1126 );
1127
1128 apply_historical_confidence_overlay(&mut current, &[previous, older]);
1129
1130 assert!(has_historical_reason(¤t.outcomes[0], 2));
1131 assert_eq!(current.outcomes[0].confidence.score, 79);
1132 }
1133
1134 #[test]
1135 fn historical_overlay_ignores_single_prior_found() {
1136 let mut current = scan_with_outcomes(
1137 "current",
1138 "alice",
1139 20,
1140 vec![found_with_website("GitHub", "https://alice.dev")],
1141 );
1142 let previous = scan_with_outcomes(
1143 "previous",
1144 "alice",
1145 10,
1146 vec![found_with_website("GitHub", "https://alice.dev")],
1147 );
1148
1149 apply_historical_confidence_overlay(&mut current, &[previous]);
1150
1151 assert!(!has_historical_reason(¤t.outcomes[0], 1));
1152 assert_eq!(current.outcomes[0].confidence.score, 75);
1153 }
1154
1155 #[test]
1156 fn historical_overlay_resets_on_explicit_non_found() {
1157 let mut current = scan_with_outcomes(
1158 "current",
1159 "alice",
1160 40,
1161 vec![found_with_website("GitHub", "https://alice.dev")],
1162 );
1163 let previous = scan_with_outcomes(
1164 "previous",
1165 "alice",
1166 30,
1167 vec![outcome("GitHub", MatchKind::NotFound)],
1168 );
1169 let older = scan_with_outcomes(
1170 "older",
1171 "alice",
1172 20,
1173 vec![found_with_website("GitHub", "https://alice.dev")],
1174 );
1175 let oldest = scan_with_outcomes(
1176 "oldest",
1177 "alice",
1178 10,
1179 vec![found_with_website("GitHub", "https://alice.dev")],
1180 );
1181
1182 apply_historical_confidence_overlay(&mut current, &[previous, older, oldest]);
1183
1184 assert!(!has_historical_reason(¤t.outcomes[0], 2));
1185 assert_eq!(current.outcomes[0].confidence.score, 75);
1186 }
1187
1188 #[test]
1189 fn historical_overlay_ignores_source_timestamp_changes() {
1190 let mut current = scan_with_outcomes(
1191 "current",
1192 "alice",
1193 30,
1194 vec![found_with_website_at(
1195 "GitHub",
1196 "https://alice.dev",
1197 Some(30),
1198 )],
1199 );
1200 let previous = scan_with_outcomes(
1201 "previous",
1202 "alice",
1203 20,
1204 vec![found_with_website_at(
1205 "GitHub",
1206 "https://alice.dev",
1207 Some(20),
1208 )],
1209 );
1210 let older = scan_with_outcomes(
1211 "older",
1212 "alice",
1213 10,
1214 vec![found_with_website_at(
1215 "GitHub",
1216 "https://alice.dev",
1217 Some(10),
1218 )],
1219 );
1220
1221 apply_historical_confidence_overlay(&mut current, &[previous, older]);
1222
1223 assert!(has_historical_reason(¤t.outcomes[0], 2));
1224 }
1225
1226 #[test]
1227 fn historical_overlay_adds_cluster_co_occurrence_reason() {
1228 let mut current = scan_with_outcomes(
1229 "current",
1230 "alice",
1231 30,
1232 vec![
1233 found_with_website("GitHub", "https://alice.dev"),
1234 found_with_website("GitLab", "https://alice.dev"),
1235 ],
1236 );
1237 let previous = scan_with_outcomes(
1238 "previous",
1239 "alice",
1240 20,
1241 vec![
1242 found_with_website("GitHub", "https://alice.dev"),
1243 found_with_website("GitLab", "https://alice.dev"),
1244 ],
1245 );
1246 let older = scan_with_outcomes(
1247 "older",
1248 "alice",
1249 10,
1250 vec![
1251 found_with_website("GitHub", "https://alice.dev"),
1252 found_with_website("GitLab", "https://alice.dev"),
1253 ],
1254 );
1255
1256 apply_historical_confidence_overlay(&mut current, &[previous, older]);
1257
1258 assert_eq!(current.identity_clusters.len(), 1);
1259 assert_eq!(current.identity_clusters[0].confidence, 95);
1260 assert!(
1261 current.identity_clusters[0]
1262 .reasons
1263 .contains(&adler_core::ClusterReason::HistoricalCoOccurrence)
1264 );
1265 }
1266
1267 #[test]
1268 fn weak_status_only_result_remains_medium_capped_with_history() {
1269 let mut current_outcome = outcome("GitHub", MatchKind::Found);
1270 current_outcome.evidence = vec!["HTTP 200 (status_found)".to_owned()];
1271 let mut previous_outcome = outcome("GitHub", MatchKind::Found);
1272 previous_outcome.evidence = current_outcome.evidence.clone();
1273 let mut older_outcome = outcome("GitHub", MatchKind::Found);
1274 older_outcome.evidence = current_outcome.evidence.clone();
1275
1276 let mut current = scan_with_outcomes("current", "alice", 30, vec![current_outcome]);
1277 let previous = scan_with_outcomes("previous", "alice", 20, vec![previous_outcome]);
1278 let older = scan_with_outcomes("older", "alice", 10, vec![older_outcome]);
1279
1280 apply_historical_confidence_overlay(&mut current, &[previous, older]);
1281
1282 assert!(has_historical_reason(¤t.outcomes[0], 2));
1283 assert_eq!(
1284 current.outcomes[0].confidence.label,
1285 ConfidenceLabel::Medium
1286 );
1287 assert_eq!(current.outcomes[0].confidence.score, 70);
1288 }
1289
1290 #[tokio::test]
1291 async fn historical_overlay_does_not_rewrite_persisted_json() {
1292 let tmp = TempDir::new().unwrap();
1293 let current = scan_with_outcomes(
1294 "current",
1295 "alice",
1296 30,
1297 vec![found_with_website("GitHub", "https://alice.dev")],
1298 );
1299 let previous = scan_with_outcomes(
1300 "previous",
1301 "alice",
1302 20,
1303 vec![found_with_website("GitHub", "https://alice.dev")],
1304 );
1305 let older = scan_with_outcomes(
1306 "older",
1307 "alice",
1308 10,
1309 vec![found_with_website("GitHub", "https://alice.dev")],
1310 );
1311 save(tmp.path(), ¤t).await.unwrap();
1312 save(tmp.path(), &previous).await.unwrap();
1313 save(tmp.path(), &older).await.unwrap();
1314
1315 let current_path = tmp.path().join("current.json");
1316 let before = fs::read(¤t_path).await.unwrap();
1317 let related = load_all(tmp.path()).await;
1318 let mut loaded = load(tmp.path(), &ScanId::from("current".to_owned()))
1319 .await
1320 .unwrap();
1321
1322 apply_historical_confidence_overlay(&mut loaded, &related);
1323
1324 let after = fs::read(¤t_path).await.unwrap();
1325 assert_eq!(before, after);
1326 assert!(has_historical_reason(&loaded.outcomes[0], 2));
1327 }
1328
1329 #[tokio::test]
1330 async fn save_writes_schema_version() {
1331 let tmp = TempDir::new().unwrap();
1332 let s = sample("abc123", 1_700_000_000_000);
1333 save(tmp.path(), &s).await.unwrap();
1334
1335 let raw = fs::read_to_string(tmp.path().join("abc123.json"))
1336 .await
1337 .unwrap();
1338 let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1339 assert_eq!(
1340 value["schema_version"],
1341 serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
1342 );
1343 }
1344
1345 #[tokio::test]
1346 async fn save_skips_empty_identity_clusters() {
1347 let tmp = TempDir::new().unwrap();
1348 let s = sample("empty-clusters", 1_700_000_000_000);
1349 save(tmp.path(), &s).await.unwrap();
1350
1351 let raw = fs::read_to_string(tmp.path().join("empty-clusters.json"))
1352 .await
1353 .unwrap();
1354 let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1355 assert_eq!(
1356 value["schema_version"],
1357 serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
1358 );
1359 assert!(
1360 value.get("identity_clusters").is_none(),
1361 "empty cluster cache should stay absent from persisted JSON"
1362 );
1363 }
1364
1365 #[tokio::test]
1366 async fn save_writes_derived_identity_clusters() {
1367 let tmp = TempDir::new().unwrap();
1368 let mut s = sample("clusters", 1_700_000_000_000);
1369 s.outcomes = vec![
1370 found_with_website("GitHub", "https://alice.dev"),
1371 found_with_website("GitLab", "https://alice.dev"),
1372 ];
1373
1374 save(tmp.path(), &s).await.unwrap();
1375
1376 let raw = fs::read_to_string(tmp.path().join("clusters.json"))
1377 .await
1378 .unwrap();
1379 let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1380 assert_eq!(value["identity_clusters"].as_array().unwrap().len(), 1);
1381 assert_eq!(
1382 value["identity_clusters"][0]["members"]
1383 .as_array()
1384 .unwrap()
1385 .len(),
1386 2
1387 );
1388 }
1389
1390 #[tokio::test]
1391 async fn save_roundtrips_request_context() {
1392 let tmp = TempDir::new().unwrap();
1393 let context = ScanRequestContext {
1394 username: "alice".into(),
1395 derived_from: Some(ScanId::from("previous".to_owned())),
1396 only: vec!["Git".into()],
1397 exclude: Vec::new(),
1398 tag: vec!["coding".into()],
1399 exclude_tag: vec!["nsfw".into()],
1400 top: Some(100),
1401 nsfw: false,
1402 concurrency: Some(8),
1403 deadline_secs: Some(30),
1404 egress_names: vec!["us-resi".into()],
1405 disabled_matches: vec![PersistedDisabledMatch {
1406 name: "Threads".into(),
1407 url: "https://www.threads.net/@{username}".into(),
1408 tags: vec!["social".into()],
1409 disabled_reason: "Honest Limits: login wall".into(),
1410 }],
1411 };
1412 let s = sample("ctx", 1_700_000_000_000).with_request_context(context.clone());
1413 save(tmp.path(), &s).await.unwrap();
1414
1415 let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1416 assert_eq!(loaded.request_context, Some(context));
1417 }
1418
1419 #[test]
1420 fn diff_scans_reports_added_removed_and_verdict_changes() {
1421 let mut previous = sample("old", 1_000);
1422 previous.outcomes = vec![
1423 outcome("GitHub", MatchKind::Found),
1424 outcome("Reddit", MatchKind::Found),
1425 outcome("Mastodon", MatchKind::NotFound),
1426 ];
1427 let mut current = sample("new", 2_000);
1428 current.outcomes = vec![
1429 outcome("GitHub", MatchKind::Found),
1430 outcome("Reddit", MatchKind::NotFound),
1431 outcome("Mastodon", MatchKind::Found),
1432 ];
1433
1434 let diff = diff_scans(&previous, ¤t);
1435
1436 assert_eq!(diff.from_scan_id.as_str(), "old");
1437 assert_eq!(diff.to_scan_id.as_str(), "new");
1438 assert_eq!(
1439 diff.added_found
1440 .iter()
1441 .map(|outcome| outcome.site.as_str())
1442 .collect::<Vec<_>>(),
1443 ["Mastodon"]
1444 );
1445 assert_eq!(
1446 diff.removed_found
1447 .iter()
1448 .map(|outcome| outcome.site.as_str())
1449 .collect::<Vec<_>>(),
1450 ["Reddit"]
1451 );
1452 assert_eq!(diff.verdict_changes.len(), 2);
1453 assert_eq!(diff.verdict_changes[0].site, "Mastodon");
1454 assert_eq!(diff.verdict_changes[0].before, MatchKind::NotFound);
1455 assert_eq!(diff.verdict_changes[0].after, MatchKind::Found);
1456 assert_eq!(diff.verdict_changes[1].site, "Reddit");
1457 assert!(diff.evidence_changes.is_empty());
1458 }
1459
1460 #[test]
1461 fn diff_scans_reports_profile_evidence_changes_for_still_found_sites() {
1462 let mut previous = sample("old", 1_000);
1463 let mut old_github = outcome("GitHub", MatchKind::Found);
1464 old_github.enrichment.insert("name".into(), "Alice".into());
1465 old_github
1466 .profile_evidence
1467 .push(adler_core::ProfileEvidence::from_enrichment(
1468 "GitHub",
1469 "https://github.example/alice",
1470 "name",
1471 "Alice",
1472 ));
1473 previous.outcomes = vec![old_github];
1474
1475 let mut current = sample("new", 2_000);
1476 let mut new_github = outcome("GitHub", MatchKind::Found);
1477 new_github
1478 .enrichment
1479 .insert("name".into(), "Alice Liddell".into());
1480 new_github
1481 .profile_evidence
1482 .push(adler_core::ProfileEvidence::from_enrichment(
1483 "GitHub",
1484 "https://github.example/alice",
1485 "name",
1486 "Alice Liddell",
1487 ));
1488 current.outcomes = vec![new_github];
1489
1490 let diff = diff_scans(&previous, ¤t);
1491
1492 assert!(diff.added_found.is_empty());
1493 assert!(diff.removed_found.is_empty());
1494 assert!(diff.verdict_changes.is_empty());
1495 assert_eq!(diff.evidence_changes.len(), 1);
1496 assert_eq!(diff.evidence_changes[0].site, "GitHub");
1497 assert_eq!(
1498 diff.evidence_changes[0]
1499 .before_enrichment
1500 .get("name")
1501 .unwrap(),
1502 "Alice"
1503 );
1504 assert_eq!(
1505 diff.evidence_changes[0]
1506 .after_enrichment
1507 .get("name")
1508 .unwrap(),
1509 "Alice Liddell"
1510 );
1511 }
1512
1513 #[test]
1514 fn timeline_tracks_first_seen_disappeared_and_reappeared() {
1515 let mut first = sample("first", 1_000);
1516 first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1517 let mut second = sample("second", 2_000);
1518 second.outcomes = vec![outcome("GitHub", MatchKind::NotFound)];
1519 let mut third = sample("third", 3_000);
1520 third.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1521
1522 let timeline = build_scan_timeline(&[third, first, second]);
1523
1524 assert_eq!(timeline.username, "alice");
1525 assert_eq!(timeline.scan_count, 3);
1526 assert_eq!(timeline.from_ms, Some(1_000));
1527 assert_eq!(timeline.to_ms, Some(3_000));
1528 assert_eq!(timeline.profiles.len(), 1);
1529 assert_eq!(timeline.profiles[0].site, "GitHub");
1530 assert_eq!(timeline.profiles[0].first_seen_ms, 1_000);
1531 assert_eq!(timeline.profiles[0].last_seen_ms, 3_000);
1532 assert!(timeline.profiles[0].present_in_latest);
1533 assert_eq!(
1534 timeline
1535 .events
1536 .iter()
1537 .map(|event| event.kind)
1538 .collect::<Vec<_>>(),
1539 [
1540 TimelineEventKind::FirstSeen,
1541 TimelineEventKind::Disappeared,
1542 TimelineEventKind::Reappeared
1543 ]
1544 );
1545 assert_eq!(timeline.events[1].before, Some(MatchKind::Found));
1546 assert_eq!(timeline.events[1].after, Some(MatchKind::NotFound));
1547 }
1548
1549 #[test]
1550 fn timeline_treats_missing_site_as_disappeared() {
1551 let mut first = sample("first", 1_000);
1552 first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1553 let mut second = sample("second", 2_000);
1554 second.outcomes = vec![outcome("GitLab", MatchKind::NotFound)];
1555
1556 let timeline = build_scan_timeline(&[first, second]);
1557
1558 assert_eq!(timeline.profiles.len(), 1);
1559 assert!(!timeline.profiles[0].present_in_latest);
1560 assert_eq!(timeline.events.len(), 2);
1561 assert_eq!(timeline.events[1].kind, TimelineEventKind::Disappeared);
1562 assert_eq!(timeline.events[1].site, "GitHub");
1563 assert_eq!(timeline.events[1].after, None);
1564 }
1565
1566 #[test]
1567 fn timeline_tracks_evidence_changes_for_still_found_profile() {
1568 let mut first = sample("first", 1_000);
1569 let mut old_github = outcome("GitHub", MatchKind::Found);
1570 old_github.enrichment.insert("name".into(), "Alice".into());
1571 old_github
1572 .profile_evidence
1573 .push(adler_core::ProfileEvidence::from_enrichment(
1574 "GitHub",
1575 "https://github.example/alice",
1576 "name",
1577 "Alice",
1578 ));
1579 first.outcomes = vec![old_github];
1580
1581 let mut second = sample("second", 2_000);
1582 let mut new_github = outcome("GitHub", MatchKind::Found);
1583 new_github
1584 .enrichment
1585 .insert("name".into(), "Alice Liddell".into());
1586 new_github
1587 .profile_evidence
1588 .push(adler_core::ProfileEvidence::from_enrichment(
1589 "GitHub",
1590 "https://github.example/alice",
1591 "name",
1592 "Alice Liddell",
1593 ));
1594 second.outcomes = vec![new_github];
1595
1596 let timeline = build_scan_timeline(&[first, second]);
1597
1598 assert_eq!(
1599 timeline
1600 .events
1601 .iter()
1602 .map(|event| event.kind)
1603 .collect::<Vec<_>>(),
1604 [
1605 TimelineEventKind::FirstSeen,
1606 TimelineEventKind::EvidenceChanged
1607 ]
1608 );
1609 let evidence_change = timeline.events[1].evidence_change.as_ref().unwrap();
1610 assert_eq!(
1611 evidence_change.before_enrichment.get("name").unwrap(),
1612 "Alice"
1613 );
1614 assert_eq!(
1615 evidence_change.after_enrichment.get("name").unwrap(),
1616 "Alice Liddell"
1617 );
1618 }
1619
1620 #[tokio::test]
1621 async fn load_all_returns_newest_first() {
1622 let tmp = TempDir::new().unwrap();
1623 save(tmp.path(), &sample("old", 1_000)).await.unwrap();
1624 save(tmp.path(), &sample("mid", 2_000)).await.unwrap();
1625 save(tmp.path(), &sample("new", 3_000)).await.unwrap();
1626 let all = load_all(tmp.path()).await;
1627 assert_eq!(all.len(), 3);
1628 assert_eq!(all[0].scan_id.as_str(), "new");
1629 assert_eq!(all[1].scan_id.as_str(), "mid");
1630 assert_eq!(all[2].scan_id.as_str(), "old");
1631 }
1632
1633 #[tokio::test]
1634 async fn load_returns_none_for_missing() {
1635 let tmp = TempDir::new().unwrap();
1636 let missing = load(tmp.path(), &ScanId::from("nope".to_owned())).await;
1637 assert!(missing.is_none());
1638 }
1639
1640 #[tokio::test]
1641 async fn load_defaults_schema_version_for_legacy_scan_json() {
1642 let tmp = TempDir::new().unwrap();
1643 let path = tmp.path().join("legacy.json");
1644 fs::write(
1645 &path,
1646 br#"{
1647 "scan_id": "legacy",
1648 "username": "alice",
1649 "site_count": 0,
1650 "created_at_ms": 1700000000000,
1651 "summary": { "found": 0, "not_found": 0, "uncertain": 0 },
1652 "outcomes": [],
1653 "elapsed_ms": 0
1654 }"#,
1655 )
1656 .await
1657 .unwrap();
1658
1659 let loaded = load(tmp.path(), &ScanId::from("legacy".to_owned()))
1660 .await
1661 .expect("legacy scan loads");
1662 assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1663 }
1664
1665 #[tokio::test]
1666 async fn load_accepts_v2_scan_json_after_schema_bump() {
1667 let tmp = TempDir::new().unwrap();
1668 let path = tmp.path().join("v2.json");
1669 fs::write(
1670 &path,
1671 br#"{
1672 "schema_version": 2,
1673 "scan_id": "v2",
1674 "username": "alice",
1675 "site_count": 1,
1676 "created_at_ms": 1700000000000,
1677 "summary": { "found": 1, "not_found": 0, "uncertain": 0 },
1678 "outcomes": [
1679 {
1680 "site": "GitHub",
1681 "url": "https://github.example/alice",
1682 "kind": "found",
1683 "elapsed_ms": 10,
1684 "evidence": ["HTTP 200 (status_found)"]
1685 }
1686 ],
1687 "elapsed_ms": 10
1688 }"#,
1689 )
1690 .await
1691 .unwrap();
1692
1693 let loaded = load(tmp.path(), &ScanId::from("v2".to_owned()))
1694 .await
1695 .expect("v2 scan loads");
1696
1697 assert_eq!(loaded.schema_version, 2);
1698 assert_eq!(loaded.summary.found, 1);
1699 assert_eq!(
1700 loaded.outcomes[0].confidence.label,
1701 adler_core::ConfidenceLabel::Medium
1702 );
1703 }
1704
1705 #[tokio::test]
1706 async fn load_derives_identity_clusters_for_legacy_scan_json() {
1707 let tmp = TempDir::new().unwrap();
1708 let path = tmp.path().join("legacy-clusters.json");
1709 fs::write(
1710 &path,
1711 br#"{
1712 "schema_version": 1,
1713 "scan_id": "legacy-clusters",
1714 "username": "alice",
1715 "site_count": 2,
1716 "created_at_ms": 1700000000000,
1717 "summary": { "found": 2, "not_found": 0, "uncertain": 0 },
1718 "outcomes": [
1719 {
1720 "site": "GitHub",
1721 "url": "https://github.example/alice",
1722 "kind": "found",
1723 "elapsed_ms": 10,
1724 "profile_evidence": [
1725 {
1726 "kind": "external_link",
1727 "field": "website",
1728 "value": "https://alice.dev",
1729 "source": {
1730 "site": "GitHub",
1731 "url": "https://github.example/alice",
1732 "origin": "extractor"
1733 }
1734 }
1735 ]
1736 },
1737 {
1738 "site": "GitLab",
1739 "url": "https://gitlab.example/alice",
1740 "kind": "found",
1741 "elapsed_ms": 10,
1742 "profile_evidence": [
1743 {
1744 "kind": "external_link",
1745 "field": "website",
1746 "value": "https://alice.dev/",
1747 "source": {
1748 "site": "GitLab",
1749 "url": "https://gitlab.example/alice",
1750 "origin": "extractor"
1751 }
1752 }
1753 ]
1754 }
1755 ],
1756 "elapsed_ms": 20
1757 }"#,
1758 )
1759 .await
1760 .unwrap();
1761
1762 let loaded = load(tmp.path(), &ScanId::from("legacy-clusters".to_owned()))
1763 .await
1764 .expect("legacy scan loads");
1765
1766 assert_eq!(loaded.identity_clusters.len(), 1);
1767 assert_eq!(loaded.identity_clusters[0].members.len(), 2);
1768 assert!(!loaded.identity_clusters[0].uncertain);
1769 }
1770
1771 #[test]
1772 fn large_scan_artifact_paths_handle_identity_graph_payloads() {
1773 let previous = large_persisted_scan("large-old", 0);
1774 let current = large_persisted_scan("large-new", 1);
1775
1776 assert_eq!(previous.outcomes.len(), 2_500);
1777 assert_eq!(previous.site_count, 2_500);
1778 assert_eq!(
1779 previous.summary.found + previous.summary.not_found + previous.summary.uncertain,
1780 2_500
1781 );
1782 assert!(!previous.identity_clusters.is_empty());
1783
1784 let raw = serde_json::to_string(&previous).unwrap();
1785 let decoded: PersistedScan = serde_json::from_str(&raw).unwrap();
1786 assert_eq!(decoded.outcomes.len(), 2_500);
1787 assert_eq!(
1788 decoded.identity_clusters.len(),
1789 previous.identity_clusters.len()
1790 );
1791
1792 let diff = diff_scans(&previous, ¤t);
1793 assert!(!diff.added_found.is_empty());
1794 assert!(!diff.removed_found.is_empty());
1795 assert!(!diff.verdict_changes.is_empty());
1796 assert!(!diff.evidence_changes.is_empty());
1797
1798 let timeline = build_scan_timeline(&[previous, current]);
1799 assert_eq!(timeline.scan_count, 2);
1800 assert_eq!(timeline.profiles.len(), 375);
1801 assert!(timeline.events.len() > timeline.profiles.len());
1802 }
1803
1804 #[tokio::test]
1805 async fn load_all_skips_unrelated_files() {
1806 let tmp = TempDir::new().unwrap();
1807 fs::write(tmp.path().join("README"), b"not json")
1809 .await
1810 .unwrap();
1811 fs::write(tmp.path().join("broken.json"), b"{ invalid")
1812 .await
1813 .unwrap();
1814 save(tmp.path(), &sample("good", 9_999)).await.unwrap();
1815 let all = load_all(tmp.path()).await;
1816 assert_eq!(all.len(), 1);
1817 assert_eq!(all[0].scan_id.as_str(), "good");
1818 }
1819
1820 #[tokio::test]
1821 async fn prune_keeps_only_newest_n() {
1822 let tmp = TempDir::new().unwrap();
1823 for i in 0u64..5 {
1824 save(tmp.path(), &sample(&format!("s{i}"), i * 1_000))
1825 .await
1826 .unwrap();
1827 }
1828 let removed = prune(tmp.path(), 2).await;
1829 assert_eq!(removed, 3);
1830 let remaining = load_all(tmp.path()).await;
1831 assert_eq!(remaining.len(), 2);
1832 assert_eq!(remaining[0].scan_id.as_str(), "s4");
1833 assert_eq!(remaining[1].scan_id.as_str(), "s3");
1834 }
1835}