1use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use adler_core::{
17 CheckOutcome, HistoricalScanRef, IdentityCluster, InvestigationReport, MatchKind,
18 ProfileEvidence, ReportDisabledSite, ReportTimelineEvent, ReportTimelineEventKind, Site,
19};
20use serde::{Deserialize, Serialize};
21use tokio::fs;
22
23use crate::error::{Error, Result};
24use crate::scan::{FinishedScan, ScanId, Summary};
25
/// Cap on the number of persisted scans.
///
/// NOTE(review): no use of this constant is visible in this part of the file; presumably it is
/// the `keep_newest` argument handed to `prune` — confirm at the call site.
pub(crate) const MAX_PERSISTED_SCANS: usize = 200;
/// Schema version stamped on every `PersistedScan` built by `PersistedScan::from_finished`, and
/// the serde default used when a stored document has no `schema_version` field.
pub(crate) const PERSISTED_SCAN_SCHEMA_VERSION: u16 = 3;
32
/// A scan result in the form that is serialized to `{scan_id}.json` by `save` and read back by
/// `load` / `load_all`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistedScan {
    /// Schema version of the record; `default_schema_version()` fills it in when the serialized
    /// form has no such field.
    #[serde(default = "default_schema_version")]
    pub schema_version: u16,
    /// Identifier of the scan; also the stem of the file name used by `save` and `load`.
    pub scan_id: ScanId,
    /// Username associated with the scan; passed to `adler_core::build_identity_clusters`.
    pub username: String,
    /// Optional request parameters; left out of the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_context: Option<ScanRequestContext>,
    /// Site count as supplied by the caller; `refresh_derived_fields` does not recompute it.
    pub site_count: usize,
    /// Creation timestamp (milliseconds, per the field name); `load_all` and
    /// `build_scan_timeline` order scans by it.
    pub created_at_ms: u64,
    /// Derived field: recomputed from `outcomes` by `refresh_derived_fields`.
    pub summary: Summary,
    /// Per-site check outcomes.
    pub outcomes: Vec<CheckOutcome>,
    /// Derived field: rebuilt from `username` and `outcomes` by `refresh_derived_fields`;
    /// omitted from the serialized form when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub identity_clusters: Vec<IdentityCluster>,
    /// Elapsed time of the scan (presumably milliseconds, per the field name).
    pub elapsed_ms: u64,
}
64
65impl PersistedScan {
66 #[must_use]
68 pub fn from_finished(
69 scan_id: ScanId,
70 username: String,
71 site_count: usize,
72 created_at_ms: u64,
73 finished: FinishedScan,
74 ) -> Self {
75 let mut scan = Self {
76 schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
77 scan_id,
78 username,
79 request_context: None,
80 site_count,
81 created_at_ms,
82 summary: finished.summary,
83 outcomes: finished.outcomes,
84 identity_clusters: finished.identity_clusters,
85 elapsed_ms: finished.elapsed_ms,
86 };
87 scan.refresh_derived_fields();
88 scan
89 }
90
91 #[must_use]
93 pub fn with_request_context(mut self, context: ScanRequestContext) -> Self {
94 self.request_context = Some(context);
95 self
96 }
97
98 pub(crate) fn refresh_derived_fields(&mut self) {
99 for outcome in &mut self.outcomes {
100 outcome.refresh_confidence();
101 }
102 self.summary = Summary::from_outcomes(&self.outcomes);
103 self.identity_clusters =
104 adler_core::build_identity_clusters(&self.username, &self.outcomes);
105 }
106}
107
108pub fn apply_historical_confidence_overlay(
115 current: &mut PersistedScan,
116 related_scans: &[PersistedScan],
117) {
118 current.refresh_derived_fields();
119 let history_counts = historical_consistency_counts(current, related_scans);
120
121 for outcome in &mut current.outcomes {
122 let count = history_counts.get(&outcome.site).copied().unwrap_or(0);
123 outcome.refresh_confidence_with_history(count);
124 }
125
126 current.identity_clusters =
127 adler_core::build_identity_clusters(¤t.username, ¤t.outcomes);
128}
129
130#[must_use]
135pub fn build_investigation_report(
136 mut scan: PersistedScan,
137 related_scans: &[PersistedScan],
138) -> InvestigationReport {
139 apply_historical_confidence_overlay(&mut scan, related_scans);
140 let timeline = report_timeline_from_scans(related_scans, &scan);
141 let disabled_sites = scan
142 .request_context
143 .as_ref()
144 .map(|context| {
145 context
146 .disabled_matches
147 .iter()
148 .map(|site| ReportDisabledSite {
149 name: site.name.clone(),
150 url: site.url.clone(),
151 tags: site.tags.clone(),
152 disabled_reason: site.disabled_reason.clone(),
153 })
154 .collect()
155 })
156 .unwrap_or_default();
157
158 InvestigationReport::builder(scan.username, &scan.outcomes)
159 .identity_clusters(scan.identity_clusters)
160 .timeline(timeline)
161 .disabled_sites(disabled_sites)
162 .build()
163}
164
165fn report_timeline_from_scans(
166 related_scans: &[PersistedScan],
167 current: &PersistedScan,
168) -> Vec<ReportTimelineEvent> {
169 let mut scans = related_scans.to_vec();
170 if !scans.iter().any(|scan| scan.scan_id == current.scan_id) {
171 scans.push(current.clone());
172 }
173 build_scan_timeline(&scans)
174 .events
175 .into_iter()
176 .map(report_timeline_event)
177 .collect()
178}
179
180fn report_timeline_event(event: TimelineEvent) -> ReportTimelineEvent {
181 ReportTimelineEvent {
182 kind: match event.kind {
183 TimelineEventKind::FirstSeen => ReportTimelineEventKind::AddedFound,
184 TimelineEventKind::Disappeared => ReportTimelineEventKind::RemovedFound,
185 TimelineEventKind::Reappeared => ReportTimelineEventKind::Reappeared,
186 TimelineEventKind::EvidenceChanged => ReportTimelineEventKind::EvidenceChanged,
187 },
188 site: Some(event.site),
189 scan_id: Some(event.scan_id.to_string()),
190 observed_at_ms: Some(event.at_ms),
191 detail: Some(timeline_detail(event.before, event.after)),
192 }
193}
194
195fn timeline_detail(before: Option<MatchKind>, after: Option<MatchKind>) -> String {
196 match (before, after) {
197 (Some(before), Some(after)) => format!("{} -> {}", kind_label(before), kind_label(after)),
198 (None, Some(after)) => format!("new {}", kind_label(after)),
199 (Some(before), None) => format!("after {}", kind_label(before)),
200 (None, None) => "changed".to_owned(),
201 }
202}
203
/// Returns the fixed snake_case text used for a verdict inside timeline detail strings.
fn kind_label(kind: MatchKind) -> &'static str {
    // No catch-all arm: a new `MatchKind` variant must be given a label here to compile.
    match kind {
        MatchKind::Found => "found",
        MatchKind::NotFound => "not_found",
        MatchKind::Uncertain => "uncertain",
    }
}
211
212fn historical_consistency_counts(
213 current: &PersistedScan,
214 related_scans: &[PersistedScan],
215) -> BTreeMap<String, usize> {
216 let current_ref = HistoricalScanRef {
217 scan_id: current.scan_id.as_str(),
218 username: ¤t.username,
219 created_at_ms: current.created_at_ms,
220 outcomes: ¤t.outcomes,
221 };
222 let related_refs = related_scans.iter().map(|scan| HistoricalScanRef {
223 scan_id: scan.scan_id.as_str(),
224 username: &scan.username,
225 created_at_ms: scan.created_at_ms,
226 outcomes: &scan.outcomes,
227 });
228 adler_core::historical_consistency_counts(current_ref, related_refs)
229}
230
/// Serde default for `PersistedScan::schema_version`: a document without the field is given
/// the current [`PERSISTED_SCAN_SCHEMA_VERSION`].
const fn default_schema_version() -> u16 {
    PERSISTED_SCAN_SCHEMA_VERSION
}
234
/// Request parameters stored alongside a persisted scan.
///
/// In this part of the file only `disabled_matches` is read (by `build_investigation_report`);
/// the meaning of the remaining fields is defined by whoever constructs this value.
/// `username` and `nsfw` carry no serde default and are therefore required when deserializing;
/// every other field is optional and skipped when empty / `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScanRequestContext {
    /// Username of the request (required in the serialized form).
    pub username: String,
    /// Id of another scan this request refers to — presumably the scan it was derived from;
    /// confirm with the producer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub derived_from: Option<ScanId>,
    /// NOTE(review): presumably a site allow-list — confirm with the producer.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub only: Vec<String>,
    /// NOTE(review): presumably a site deny-list — confirm with the producer.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude: Vec<String>,
    /// NOTE(review): presumably tags to include — confirm with the producer.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tag: Vec<String>,
    /// NOTE(review): presumably tags to exclude — confirm with the producer.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude_tag: Vec<String>,
    /// NOTE(review): presumably a "top N sites" limit — confirm with the producer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub top: Option<u32>,
    /// Boolean request flag (required in the serialized form).
    pub nsfw: bool,
    /// Optional concurrency setting of the request.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrency: Option<usize>,
    /// Optional deadline (seconds, per the field name).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub deadline_secs: Option<u64>,
    /// Names of egress entries associated with the request.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub egress_names: Vec<String>,
    /// Disabled sites recorded for the request; copied into the report's disabled-site list.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub disabled_matches: Vec<PersistedDisabledMatch>,
}
276
/// Persisted snapshot of a disabled site, built from a [`Site`] via `From<&Site>`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PersistedDisabledMatch {
    /// Site name.
    pub name: String,
    /// Site URL, stored as a plain string.
    pub url: String,
    /// Site tags; omitted from the serialized form when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Why the site is disabled; `"disabled in registry"` when the site carries no reason.
    pub disabled_reason: String,
}
290
291impl From<&Site> for PersistedDisabledMatch {
292 fn from(site: &Site) -> Self {
293 Self {
294 name: site.name.clone(),
295 url: site.url.as_str().to_owned(),
296 tags: site.tags.clone(),
297 disabled_reason: site
298 .disabled_reason
299 .clone()
300 .unwrap_or_else(|| "disabled in registry".to_owned()),
301 }
302 }
303}
304
/// Difference between two scans as computed by `diff_scans`. Empty lists are omitted from the
/// serialized form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDiff {
    /// Id of the scan used as the "previous" side.
    pub from_scan_id: ScanId,
    /// Id of the scan used as the "current" side.
    pub to_scan_id: ScanId,
    /// Outcomes that are `Found` in the current scan but were absent or not `Found` before.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub added_found: Vec<CheckOutcome>,
    /// Outcomes that were `Found` in the previous scan but are absent or not `Found` now.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub removed_found: Vec<CheckOutcome>,
    /// Sites present in both scans whose verdict differs.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub verdict_changes: Vec<VerdictChange>,
    /// Sites `Found` in both scans whose enrichment or profile evidence differs.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub evidence_changes: Vec<EvidenceChange>,
}
326
/// A verdict transition for one site between two scans.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerdictChange {
    /// Site name.
    pub site: String,
    /// Verdict in the previous scan.
    pub before: MatchKind,
    /// Verdict in the current scan.
    pub after: MatchKind,
}
337
/// Before/after snapshot of a site's enrichment map and profile evidence. Empty collections
/// are omitted from the serialized form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceChange {
    /// Site name.
    pub site: String,
    /// Enrichment map of the earlier observation.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub before_enrichment: BTreeMap<String, String>,
    /// Enrichment map of the later observation.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub after_enrichment: BTreeMap<String, String>,
    /// Profile evidence of the earlier observation.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub before_profile_evidence: Vec<ProfileEvidence>,
    /// Profile evidence of the later observation.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub after_profile_evidence: Vec<ProfileEvidence>,
}
356
/// Result of `build_scan_timeline`: per-site profile state plus the events observed while
/// walking the scans in chronological order.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanTimeline {
    /// Username of the oldest scan in the input (empty when there are no scans).
    pub username: String,
    /// Number of scans the timeline was built from.
    pub scan_count: usize,
    /// `created_at_ms` of the oldest scan, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub from_ms: Option<u64>,
    /// `created_at_ms` of the newest scan, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub to_ms: Option<u64>,
    /// One entry per site that was `Found` at least once, ordered by site name.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub profiles: Vec<TimelineProfile>,
    /// Events in the order they were emitted (scan by scan, sites sorted within a scan).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub events: Vec<TimelineEvent>,
}
377
/// Final per-site state of a timeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineProfile {
    /// Site name.
    pub site: String,
    /// Most recently recorded URL for the site.
    pub url: String,
    /// Timestamp of the first scan in which the site was `Found`.
    pub first_seen_ms: u64,
    /// Timestamp of the latest scan in which the site was `Found`.
    pub last_seen_ms: u64,
    /// `true` after a `Found` observation, reset to `false` once the site disappears.
    pub present_in_latest: bool,
    /// Last verdict recorded for the site; `None` when the site was missing from the scan in
    /// which it disappeared.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_verdict: Option<MatchKind>,
}
395
/// Kind of change recorded in a timeline; serialized in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TimelineEventKind {
    /// The site is `Found` and had no timeline state yet.
    FirstSeen,
    /// The site was present and the scan does not report it as `Found`.
    Disappeared,
    /// The site is `Found` again after having disappeared.
    Reappeared,
    /// The site stays `Found` but its enrichment or profile evidence differs.
    EvidenceChanged,
}
409
/// One change observed for a site while building a timeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineEvent {
    /// Scan in which the change was observed.
    pub scan_id: ScanId,
    /// `created_at_ms` of that scan.
    pub at_ms: u64,
    /// Site name.
    pub site: String,
    /// URL associated with the site at the time of the event.
    pub url: String,
    /// What happened.
    pub kind: TimelineEventKind,
    /// Verdict before the change, when one was known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub before: Option<MatchKind>,
    /// Verdict after the change, when the scan contains the site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub after: Option<MatchKind>,
    /// Evidence snapshot; set only for `EvidenceChanged` events.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub evidence_change: Option<EvidenceChange>,
}
433
434#[must_use]
441pub fn diff_scans(previous: &PersistedScan, current: &PersistedScan) -> ScanDiff {
442 let previous_by_site = outcomes_by_site(&previous.outcomes);
443 let current_by_site = outcomes_by_site(¤t.outcomes);
444
445 let mut added_found = Vec::new();
446 let mut removed_found = Vec::new();
447 let mut verdict_changes = Vec::new();
448 let mut evidence_changes = Vec::new();
449
450 for (site, current_outcome) in ¤t_by_site {
451 let previous_outcome = previous_by_site.get(site);
452 if current_outcome.kind == MatchKind::Found
453 && previous_outcome.is_none_or(|o| o.kind != MatchKind::Found)
454 {
455 added_found.push((*current_outcome).clone());
456 }
457 if let Some(previous_outcome) = previous_outcome {
458 if previous_outcome.kind != current_outcome.kind {
459 verdict_changes.push(VerdictChange {
460 site: site.clone(),
461 before: previous_outcome.kind,
462 after: current_outcome.kind,
463 });
464 }
465 if previous_outcome.kind == MatchKind::Found
466 && current_outcome.kind == MatchKind::Found
467 && profile_evidence_changed(previous_outcome, current_outcome)
468 {
469 evidence_changes.push(EvidenceChange {
470 site: site.clone(),
471 before_enrichment: previous_outcome.enrichment.clone(),
472 after_enrichment: current_outcome.enrichment.clone(),
473 before_profile_evidence: previous_outcome.profile_evidence.clone(),
474 after_profile_evidence: current_outcome.profile_evidence.clone(),
475 });
476 }
477 }
478 }
479
480 for (site, previous_outcome) in &previous_by_site {
481 if previous_outcome.kind == MatchKind::Found
482 && current_by_site
483 .get(site)
484 .is_none_or(|o| o.kind != MatchKind::Found)
485 {
486 removed_found.push((*previous_outcome).clone());
487 }
488 }
489
490 ScanDiff {
491 from_scan_id: previous.scan_id.clone(),
492 to_scan_id: current.scan_id.clone(),
493 added_found,
494 removed_found,
495 verdict_changes,
496 evidence_changes,
497 }
498}
499
500#[must_use]
506pub fn build_scan_timeline(scans: &[PersistedScan]) -> ScanTimeline {
507 let mut ordered: Vec<&PersistedScan> = scans.iter().collect();
508 ordered.sort_by(|left, right| {
509 left.created_at_ms
510 .cmp(&right.created_at_ms)
511 .then_with(|| left.scan_id.as_str().cmp(right.scan_id.as_str()))
512 });
513
514 let username = ordered
515 .first()
516 .map(|scan| scan.username.clone())
517 .unwrap_or_default();
518 let from_ms = ordered.first().map(|scan| scan.created_at_ms);
519 let to_ms = ordered.last().map(|scan| scan.created_at_ms);
520 let mut states: BTreeMap<String, TimelineProfileState> = BTreeMap::new();
521 let mut events = Vec::new();
522
523 for scan in &ordered {
524 let current_by_site = outcomes_by_site(&scan.outcomes);
525 let sites = timeline_site_names(&states, ¤t_by_site);
526
527 for site in sites {
528 apply_timeline_site(
529 scan,
530 &site,
531 current_by_site.get(&site).copied(),
532 &mut states,
533 &mut events,
534 );
535 }
536 }
537
538 let profiles = states
539 .into_iter()
540 .map(|(site, state)| TimelineProfile {
541 site,
542 url: state.url,
543 first_seen_ms: state.first_seen_ms,
544 last_seen_ms: state.last_seen_ms,
545 present_in_latest: state.present_in_latest,
546 last_verdict: state.last_verdict,
547 })
548 .collect();
549
550 ScanTimeline {
551 username,
552 scan_count: ordered.len(),
553 from_ms,
554 to_ms,
555 profiles,
556 events,
557 }
558}
559
560fn timeline_site_names(
561 states: &BTreeMap<String, TimelineProfileState>,
562 current_by_site: &BTreeMap<String, &CheckOutcome>,
563) -> Vec<String> {
564 let mut sites: Vec<String> = states.keys().cloned().collect();
565 for site in current_by_site.keys() {
566 if !states.contains_key(site.as_str()) {
567 sites.push((*site).clone());
568 }
569 }
570 sites.sort();
571 sites.dedup();
572 sites
573}
574
575fn apply_timeline_site(
576 scan: &PersistedScan,
577 site: &str,
578 current: Option<&CheckOutcome>,
579 states: &mut BTreeMap<String, TimelineProfileState>,
580 events: &mut Vec<TimelineEvent>,
581) {
582 let current_kind = current.map(|outcome| outcome.kind);
583 let was_present = states
584 .get(site)
585 .is_some_and(|state| state.present_in_latest);
586
587 if current_kind == Some(MatchKind::Found) {
588 apply_found_timeline_site(scan, site, current.expect("found outcome"), states, events);
589 } else if was_present {
590 apply_disappeared_timeline_site(scan, site, current, current_kind, states, events);
591 } else if let (Some(state), Some(outcome)) = (states.get_mut(site), current) {
592 state.last_verdict = Some(outcome.kind);
593 state.url.clone_from(&outcome.url);
594 }
595}
596
597fn apply_found_timeline_site(
598 scan: &PersistedScan,
599 site: &str,
600 outcome: &CheckOutcome,
601 states: &mut BTreeMap<String, TimelineProfileState>,
602 events: &mut Vec<TimelineEvent>,
603) {
604 let current_kind = Some(outcome.kind);
605 let had_state = states.contains_key(site);
606 let was_present = states
607 .get(site)
608 .is_some_and(|state| state.present_in_latest);
609 let state = states
610 .entry(site.to_owned())
611 .or_insert_with(|| TimelineProfileState::new(outcome, scan.created_at_ms));
612
613 if !had_state {
614 events.push(timeline_event(
615 scan,
616 site,
617 &outcome.url,
618 TimelineEventKind::FirstSeen,
619 None,
620 current_kind,
621 None,
622 ));
623 } else if !was_present {
624 events.push(timeline_event(
625 scan,
626 site,
627 &outcome.url,
628 TimelineEventKind::Reappeared,
629 state.last_verdict,
630 current_kind,
631 None,
632 ));
633 } else if state.profile_evidence_changed(outcome) {
634 events.push(timeline_event(
635 scan,
636 site,
637 &outcome.url,
638 TimelineEventKind::EvidenceChanged,
639 Some(MatchKind::Found),
640 current_kind,
641 Some(EvidenceChange {
642 site: site.to_owned(),
643 before_enrichment: state.last_found_enrichment.clone(),
644 after_enrichment: outcome.enrichment.clone(),
645 before_profile_evidence: state.last_found_profile_evidence.clone(),
646 after_profile_evidence: outcome.profile_evidence.clone(),
647 }),
648 ));
649 }
650
651 states
652 .get_mut(site)
653 .expect("state inserted before found update")
654 .update_found(outcome, scan.created_at_ms);
655}
656
657fn apply_disappeared_timeline_site(
658 scan: &PersistedScan,
659 site: &str,
660 current: Option<&CheckOutcome>,
661 current_kind: Option<MatchKind>,
662 states: &mut BTreeMap<String, TimelineProfileState>,
663 events: &mut Vec<TimelineEvent>,
664) {
665 let state = states
666 .get_mut(site)
667 .expect("present state exists before disappearance");
668 let url = current.map_or_else(|| state.url.clone(), |outcome| outcome.url.clone());
669 events.push(timeline_event(
670 scan,
671 site,
672 &url,
673 TimelineEventKind::Disappeared,
674 state.last_verdict,
675 current_kind,
676 None,
677 ));
678 state.present_in_latest = false;
679 state.last_verdict = current_kind;
680 if let Some(outcome) = current {
681 state.url.clone_from(&outcome.url);
682 }
683}
684
685fn timeline_event(
686 scan: &PersistedScan,
687 site: &str,
688 url: &str,
689 kind: TimelineEventKind,
690 before: Option<MatchKind>,
691 after: Option<MatchKind>,
692 evidence_change: Option<EvidenceChange>,
693) -> TimelineEvent {
694 TimelineEvent {
695 scan_id: scan.scan_id.clone(),
696 at_ms: scan.created_at_ms,
697 site: site.to_owned(),
698 url: url.to_owned(),
699 kind,
700 before,
701 after,
702 evidence_change,
703 }
704}
705
/// Mutable per-site bookkeeping used while `build_scan_timeline` walks the scans.
#[derive(Debug, Clone)]
struct TimelineProfileState {
    /// Most recently recorded URL.
    url: String,
    /// Timestamp of the scan that created this state (the first `Found` observation).
    first_seen_ms: u64,
    /// Timestamp of the latest `Found` observation.
    last_seen_ms: u64,
    /// `true` after a `Found` observation, `false` after a disappearance.
    present_in_latest: bool,
    /// Last verdict recorded for the site.
    last_verdict: Option<MatchKind>,
    /// Enrichment map of the latest `Found` observation.
    last_found_enrichment: BTreeMap<String, String>,
    /// Profile evidence of the latest `Found` observation.
    last_found_profile_evidence: Vec<ProfileEvidence>,
}
716
717impl TimelineProfileState {
718 fn new(outcome: &CheckOutcome, at_ms: u64) -> Self {
719 Self {
720 url: outcome.url.clone(),
721 first_seen_ms: at_ms,
722 last_seen_ms: at_ms,
723 present_in_latest: true,
724 last_verdict: Some(outcome.kind),
725 last_found_enrichment: outcome.enrichment.clone(),
726 last_found_profile_evidence: outcome.profile_evidence.clone(),
727 }
728 }
729
730 fn update_found(&mut self, outcome: &CheckOutcome, at_ms: u64) {
731 self.url.clone_from(&outcome.url);
732 self.last_seen_ms = at_ms;
733 self.present_in_latest = true;
734 self.last_verdict = Some(outcome.kind);
735 self.last_found_enrichment = outcome.enrichment.clone();
736 self.last_found_profile_evidence
737 .clone_from(&outcome.profile_evidence);
738 }
739
740 fn profile_evidence_changed(&self, outcome: &CheckOutcome) -> bool {
741 self.last_found_enrichment != outcome.enrichment
742 || self.last_found_profile_evidence != outcome.profile_evidence
743 }
744}
745
746fn outcomes_by_site(outcomes: &[CheckOutcome]) -> BTreeMap<String, &CheckOutcome> {
747 outcomes
748 .iter()
749 .map(|outcome| (outcome.site.clone(), outcome))
750 .collect()
751}
752
753fn profile_evidence_changed(previous: &CheckOutcome, current: &CheckOutcome) -> bool {
754 previous.enrichment != current.enrichment
755 || previous.profile_evidence != current.profile_evidence
756}
757
/// Returns the default directory for persisted scans.
///
/// Resolution order:
/// 1. `$XDG_CACHE_HOME/adler/scans`
/// 2. `$HOME/.cache/adler/scans`
/// 3. the relative path `adler-scans`
///
/// A variable that is set but empty is treated as unset, as the XDG Base Directory
/// specification requires; otherwise an empty value would silently produce a path relative to
/// the current working directory.
#[must_use]
pub fn default_dir() -> PathBuf {
    let non_empty_var = |key: &str| std::env::var_os(key).filter(|value| !value.is_empty());

    if let Some(xdg) = non_empty_var("XDG_CACHE_HOME") {
        return PathBuf::from(xdg).join("adler").join("scans");
    }
    if let Some(home) = non_empty_var("HOME") {
        return PathBuf::from(home)
            .join(".cache")
            .join("adler")
            .join("scans");
    }
    PathBuf::from("adler-scans")
}
776
777pub(crate) async fn save(dir: &Path, scan: &PersistedScan) -> Result<()> {
779 fs::create_dir_all(dir).await.map_err(Error::Persist)?;
780 let path = dir.join(format!("{}.json", scan.scan_id));
781 let tmp = dir.join(format!("{}.json.tmp", scan.scan_id));
782 let mut scan = scan.clone();
783 scan.refresh_derived_fields();
784 let body = serde_json::to_vec_pretty(&scan).map_err(Error::PersistEncode)?;
785 fs::write(&tmp, &body).await.map_err(Error::Persist)?;
786 fs::rename(&tmp, &path).await.map_err(Error::Persist)?;
787 Ok(())
788}
789
790pub(crate) async fn load(dir: &Path, scan_id: &ScanId) -> Option<PersistedScan> {
794 let path = dir.join(format!("{scan_id}.json"));
795 let bytes = fs::read(&path).await.ok()?;
796 serde_json::from_slice(&bytes)
797 .ok()
798 .map(refresh_derived_fields)
799}
800
801pub(crate) async fn load_all(dir: &Path) -> Vec<PersistedScan> {
805 let Ok(mut entries) = fs::read_dir(dir).await else {
806 return Vec::new();
807 };
808 let mut out = Vec::new();
809 while let Ok(Some(entry)) = entries.next_entry().await {
810 let path = entry.path();
811 if path.extension().and_then(|s| s.to_str()) != Some("json") {
812 continue;
813 }
814 let Ok(bytes) = fs::read(&path).await else {
815 continue;
816 };
817 let Ok(scan) = serde_json::from_slice::<PersistedScan>(&bytes) else {
818 continue;
819 };
820 out.push(refresh_derived_fields(scan));
821 }
822 out.sort_by_key(|s| std::cmp::Reverse(s.created_at_ms));
823 out
824}
825
/// By-value adapter around `PersistedScan::refresh_derived_fields`, so the refresh can be used
/// in expression position (e.g. `Option::map`, `Vec::push`).
fn refresh_derived_fields(mut scan: PersistedScan) -> PersistedScan {
    scan.refresh_derived_fields();
    scan
}
830
831pub(crate) async fn prune(dir: &Path, keep_newest: usize) -> usize {
834 let scans = load_all(dir).await;
835 if scans.len() <= keep_newest {
836 return 0;
837 }
838 let mut removed = 0;
839 for s in &scans[keep_newest..] {
840 let path = dir.join(format!("{}.json", s.scan_id));
841 if fs::remove_file(&path).await.is_ok() {
842 removed += 1;
843 }
844 }
845 removed
846}
847
848#[cfg(test)]
849mod tests {
850 use super::*;
851 use adler_core::{
852 ConfidenceLabel, ConfidenceReason, EvidenceAccessPath, MatchKind, ProfileEvidence,
853 TransportTier, UncertainReason,
854 };
855 use std::collections::BTreeMap;
856 use tempfile::TempDir;
857
858 fn sample(scan_id: &str, ts: u64) -> PersistedScan {
859 PersistedScan {
860 schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
861 scan_id: ScanId::from(scan_id.to_owned()),
862 username: "alice".into(),
863 request_context: None,
864 site_count: 2,
865 created_at_ms: ts,
866 summary: Summary {
867 found: 1,
868 not_found: 1,
869 uncertain: 0,
870 },
871 outcomes: vec![
872 CheckOutcome {
873 site: "GitHub".into(),
874 url: "https://github.com/alice".into(),
875 kind: MatchKind::Found,
876 reason: None,
877 elapsed_ms: 120,
878 enrichment: BTreeMap::new(),
879 evidence: vec!["HTTP 200 (status_found)".into()],
880 profile_evidence: Vec::new(),
881 confidence: adler_core::ConfidenceScore::default(),
882 transport: None,
883 escalations: 0,
884 },
885 CheckOutcome {
886 site: "GitLab".into(),
887 url: "https://gitlab.com/alice".into(),
888 kind: MatchKind::NotFound,
889 reason: None,
890 elapsed_ms: 90,
891 enrichment: BTreeMap::new(),
892 evidence: vec!["HTTP 404 (status_not_found)".into()],
893 profile_evidence: Vec::new(),
894 confidence: adler_core::ConfidenceScore::default(),
895 transport: None,
896 escalations: 0,
897 },
898 ],
899 identity_clusters: Vec::new(),
900 elapsed_ms: 210,
901 }
902 }
903
904 fn outcome(site: &str, kind: MatchKind) -> CheckOutcome {
905 CheckOutcome {
906 site: site.into(),
907 url: format!("https://{site}.example/alice"),
908 kind,
909 reason: None,
910 elapsed_ms: 10,
911 enrichment: BTreeMap::new(),
912 evidence: Vec::new(),
913 profile_evidence: Vec::new(),
914 confidence: adler_core::ConfidenceScore::default(),
915 transport: None,
916 escalations: 0,
917 }
918 }
919
920 fn scan_with_outcomes(
921 scan_id: &str,
922 username: &str,
923 ts: u64,
924 outcomes: Vec<CheckOutcome>,
925 ) -> PersistedScan {
926 PersistedScan {
927 schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
928 scan_id: ScanId::from(scan_id.to_owned()),
929 username: username.to_owned(),
930 request_context: None,
931 site_count: outcomes.len(),
932 created_at_ms: ts,
933 summary: Summary::from_outcomes(&outcomes),
934 outcomes,
935 identity_clusters: Vec::new(),
936 elapsed_ms: 10,
937 }
938 }
939
    /// Fixture: a `Found` outcome carrying one `website` profile-evidence entry that has no
    /// observation timestamp.
    fn found_with_website(site: &str, website: &str) -> CheckOutcome {
        found_with_website_at(site, website, None)
    }
943
944 fn found_with_website_at(
945 site: &str,
946 website: &str,
947 observed_at_ms: Option<u64>,
948 ) -> CheckOutcome {
949 let mut outcome = outcome(site, MatchKind::Found);
950 outcome
951 .profile_evidence
952 .push(ProfileEvidence::from_enrichment_with_source(
953 site,
954 &outcome.url,
955 "website",
956 website,
957 observed_at_ms,
958 None,
959 ));
960 outcome
961 }
962
963 fn has_historical_reason(outcome: &CheckOutcome, count: usize) -> bool {
964 outcome.confidence.reasons.iter().any(|reason| {
965 matches!(
966 reason,
967 ConfidenceReason::HistoricalConsistency { count: actual } if *actual == count
968 )
969 })
970 }
971
972 fn large_outcomes(count: usize, generation: usize) -> Vec<CheckOutcome> {
973 (0..count)
974 .map(|idx| large_outcome(idx, generation))
975 .collect()
976 }
977
    /// Fixture: deterministically generates the outcome for site number `idx` in scan
    /// `generation`.
    ///
    /// Verdicts follow `idx % 20`: 0 and 1 are `Found`, 3 is `Uncertain`, everything else is
    /// `NotFound`. From generation 1 on, bucket 0 flips to `NotFound` and bucket 2 flips to
    /// `Found`, and bucket 1 keeps its verdict but gets a different `bio`, so successive
    /// generations differ in verdicts and in evidence.
    fn large_outcome(idx: usize, generation: usize) -> CheckOutcome {
        let site = format!("LargeSite{idx:04}");
        let url = format!("https://large{idx:04}.example/alice");
        // Generation-0 verdict.
        let mut kind = match idx % 20 {
            0 | 1 => MatchKind::Found,
            3 => MatchKind::Uncertain,
            _ => MatchKind::NotFound,
        };
        // Later generations: bucket 0 is no longer found, bucket 2 becomes found.
        if generation > 0 && idx % 20 == 0 {
            kind = MatchKind::NotFound;
        } else if generation > 0 && idx % 20 == 2 {
            kind = MatchKind::Found;
        }

        let mut outcome = CheckOutcome {
            site: site.clone(),
            url: url.clone(),
            kind,
            // Uncertain outcomes always carry the rate-limited reason.
            reason: (kind == MatchKind::Uncertain).then_some(UncertainReason::RateLimited),
            elapsed_ms: 10 + (idx % 75) as u64,
            enrichment: BTreeMap::new(),
            evidence: Vec::new(),
            profile_evidence: Vec::new(),
            confidence: adler_core::ConfidenceScore::default(),
            // Every seventh site uses the browser transport and records one escalation.
            transport: Some(if idx % 7 == 0 {
                TransportTier::Browser
            } else {
                TransportTier::Http
            }),
            escalations: u8::from(idx % 7 == 0),
        };

        match kind {
            MatchKind::Found => {
                let observed_at_ms = 1_781_192_451_000 + generation as u64 * 1_000 + idx as u64;
                // Websites repeat every 25 sites and names every 50, so values are shared
                // across many sites.
                let website = format!("https://identity-{:02}.example", idx % 25);
                let name = format!("Alice Group {:02}", idx % 50);
                // Only bucket 1 changes its bio in later generations.
                let bio = if generation > 0 && idx % 20 == 1 {
                    format!("updated profile generation {generation} for {idx}")
                } else {
                    format!("stable profile generation 0 for {idx}")
                };
                // Each field is stored both as enrichment and as profile evidence.
                for (field, value) in [
                    ("website", website.as_str()),
                    ("name", name.as_str()),
                    ("bio", bio.as_str()),
                ] {
                    outcome
                        .enrichment
                        .insert(field.to_owned(), value.to_owned());
                    outcome
                        .profile_evidence
                        .push(ProfileEvidence::from_enrichment_with_source(
                            &site,
                            &url,
                            field,
                            value,
                            Some(observed_at_ms),
                            Some(EvidenceAccessPath::new(
                                outcome.transport.unwrap_or(TransportTier::Http),
                                outcome.escalations,
                                idx % 11 == 0,
                            )),
                        ));
                }
                outcome.evidence = vec![
                    "HTTP 200 (status_found)".to_owned(),
                    "body matched profile marker".to_owned(),
                ];
            }
            MatchKind::NotFound => {
                outcome.evidence = vec!["HTTP 404 (status_not_found)".to_owned()];
            }
            MatchKind::Uncertain => {}
        }
        // Confidence is computed last, after all evidence has been attached.
        outcome.refresh_confidence();
        outcome
    }
1056
1057 fn large_persisted_scan(scan_id: &str, generation: usize) -> PersistedScan {
1058 let outcomes = large_outcomes(2_500, generation);
1059 let finished = FinishedScan {
1060 summary: Summary::from_outcomes(&outcomes),
1061 identity_clusters: adler_core::build_identity_clusters("alice", &outcomes),
1062 elapsed_ms: 30_000 + generation as u64,
1063 outcomes,
1064 };
1065 PersistedScan::from_finished(
1066 ScanId::from(scan_id.to_owned()),
1067 "alice".to_owned(),
1068 2_500,
1069 1_781_192_451_000 + generation as u64 * 10_000,
1070 finished,
1071 )
1072 }
1073
1074 #[tokio::test]
1075 async fn save_then_load_roundtrips() {
1076 let tmp = TempDir::new().unwrap();
1077 let s = sample("abc123", 1_700_000_000_000);
1078 save(tmp.path(), &s).await.unwrap();
1079
1080 let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1081 assert_eq!(loaded.scan_id, s.scan_id);
1082 assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1083 assert_eq!(loaded.username, "alice");
1084 assert_eq!(loaded.outcomes.len(), 2);
1085 assert_eq!(loaded.outcomes[0].site, "GitHub");
1086 assert_eq!(loaded.summary.found, 1);
1087 }
1088
1089 #[test]
1090 fn historical_overlay_adds_reason_after_two_prior_stable_found_observations() {
1091 let mut current = scan_with_outcomes(
1092 "current",
1093 "alice",
1094 30,
1095 vec![found_with_website("GitHub", "https://alice.dev")],
1096 );
1097 let previous = scan_with_outcomes(
1098 "previous",
1099 "alice",
1100 20,
1101 vec![found_with_website("GitHub", "https://alice.dev")],
1102 );
1103 let older = scan_with_outcomes(
1104 "older",
1105 "alice",
1106 10,
1107 vec![found_with_website("GitHub", "https://alice.dev")],
1108 );
1109
1110 apply_historical_confidence_overlay(&mut current, &[previous, older]);
1111
1112 assert!(has_historical_reason(¤t.outcomes[0], 2));
1113 assert_eq!(current.outcomes[0].confidence.score, 79);
1114 }
1115
1116 #[test]
1117 fn historical_overlay_ignores_single_prior_found() {
1118 let mut current = scan_with_outcomes(
1119 "current",
1120 "alice",
1121 20,
1122 vec![found_with_website("GitHub", "https://alice.dev")],
1123 );
1124 let previous = scan_with_outcomes(
1125 "previous",
1126 "alice",
1127 10,
1128 vec![found_with_website("GitHub", "https://alice.dev")],
1129 );
1130
1131 apply_historical_confidence_overlay(&mut current, &[previous]);
1132
1133 assert!(!has_historical_reason(¤t.outcomes[0], 1));
1134 assert_eq!(current.outcomes[0].confidence.score, 75);
1135 }
1136
1137 #[test]
1138 fn historical_overlay_resets_on_explicit_non_found() {
1139 let mut current = scan_with_outcomes(
1140 "current",
1141 "alice",
1142 40,
1143 vec![found_with_website("GitHub", "https://alice.dev")],
1144 );
1145 let previous = scan_with_outcomes(
1146 "previous",
1147 "alice",
1148 30,
1149 vec![outcome("GitHub", MatchKind::NotFound)],
1150 );
1151 let older = scan_with_outcomes(
1152 "older",
1153 "alice",
1154 20,
1155 vec![found_with_website("GitHub", "https://alice.dev")],
1156 );
1157 let oldest = scan_with_outcomes(
1158 "oldest",
1159 "alice",
1160 10,
1161 vec![found_with_website("GitHub", "https://alice.dev")],
1162 );
1163
1164 apply_historical_confidence_overlay(&mut current, &[previous, older, oldest]);
1165
1166 assert!(!has_historical_reason(¤t.outcomes[0], 2));
1167 assert_eq!(current.outcomes[0].confidence.score, 75);
1168 }
1169
1170 #[test]
1171 fn historical_overlay_ignores_source_timestamp_changes() {
1172 let mut current = scan_with_outcomes(
1173 "current",
1174 "alice",
1175 30,
1176 vec![found_with_website_at(
1177 "GitHub",
1178 "https://alice.dev",
1179 Some(30),
1180 )],
1181 );
1182 let previous = scan_with_outcomes(
1183 "previous",
1184 "alice",
1185 20,
1186 vec![found_with_website_at(
1187 "GitHub",
1188 "https://alice.dev",
1189 Some(20),
1190 )],
1191 );
1192 let older = scan_with_outcomes(
1193 "older",
1194 "alice",
1195 10,
1196 vec![found_with_website_at(
1197 "GitHub",
1198 "https://alice.dev",
1199 Some(10),
1200 )],
1201 );
1202
1203 apply_historical_confidence_overlay(&mut current, &[previous, older]);
1204
1205 assert!(has_historical_reason(¤t.outcomes[0], 2));
1206 }
1207
/// A found result whose only evidence is the status line keeps the `Medium`
/// label and a score of 70 even after the historical reason is attached.
#[test]
fn weak_status_only_result_remains_medium_capped_with_history() {
    // All three scans share the same single piece of evidence.
    let mut current_outcome = outcome("GitHub", MatchKind::Found);
    current_outcome.evidence = vec!["HTTP 200 (status_found)".to_owned()];
    let mut previous_outcome = outcome("GitHub", MatchKind::Found);
    previous_outcome.evidence = current_outcome.evidence.clone();
    let mut older_outcome = outcome("GitHub", MatchKind::Found);
    older_outcome.evidence = current_outcome.evidence.clone();

    let mut current = scan_with_outcomes("current", "alice", 30, vec![current_outcome]);
    let previous = scan_with_outcomes("previous", "alice", 20, vec![previous_outcome]);
    let older = scan_with_outcomes("older", "alice", 10, vec![older_outcome]);

    apply_historical_confidence_overlay(&mut current, &[previous, older]);

    // History is recorded, but label and score are pinned to Medium / 70.
    assert!(has_historical_reason(&current.outcomes[0], 2));
    assert_eq!(
        current.outcomes[0].confidence.label,
        ConfidenceLabel::Medium
    );
    assert_eq!(current.outcomes[0].confidence.score, 70);
}
1230
1231 #[tokio::test]
1232 async fn historical_overlay_does_not_rewrite_persisted_json() {
1233 let tmp = TempDir::new().unwrap();
1234 let current = scan_with_outcomes(
1235 "current",
1236 "alice",
1237 30,
1238 vec![found_with_website("GitHub", "https://alice.dev")],
1239 );
1240 let previous = scan_with_outcomes(
1241 "previous",
1242 "alice",
1243 20,
1244 vec![found_with_website("GitHub", "https://alice.dev")],
1245 );
1246 let older = scan_with_outcomes(
1247 "older",
1248 "alice",
1249 10,
1250 vec![found_with_website("GitHub", "https://alice.dev")],
1251 );
1252 save(tmp.path(), ¤t).await.unwrap();
1253 save(tmp.path(), &previous).await.unwrap();
1254 save(tmp.path(), &older).await.unwrap();
1255
1256 let current_path = tmp.path().join("current.json");
1257 let before = fs::read(¤t_path).await.unwrap();
1258 let related = load_all(tmp.path()).await;
1259 let mut loaded = load(tmp.path(), &ScanId::from("current".to_owned()))
1260 .await
1261 .unwrap();
1262
1263 apply_historical_confidence_overlay(&mut loaded, &related);
1264
1265 let after = fs::read(¤t_path).await.unwrap();
1266 assert_eq!(before, after);
1267 assert!(has_historical_reason(&loaded.outcomes[0], 2));
1268 }
1269
1270 #[tokio::test]
1271 async fn save_writes_schema_version() {
1272 let tmp = TempDir::new().unwrap();
1273 let s = sample("abc123", 1_700_000_000_000);
1274 save(tmp.path(), &s).await.unwrap();
1275
1276 let raw = fs::read_to_string(tmp.path().join("abc123.json"))
1277 .await
1278 .unwrap();
1279 let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1280 assert_eq!(
1281 value["schema_version"],
1282 serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
1283 );
1284 }
1285
1286 #[tokio::test]
1287 async fn save_skips_empty_identity_clusters() {
1288 let tmp = TempDir::new().unwrap();
1289 let s = sample("empty-clusters", 1_700_000_000_000);
1290 save(tmp.path(), &s).await.unwrap();
1291
1292 let raw = fs::read_to_string(tmp.path().join("empty-clusters.json"))
1293 .await
1294 .unwrap();
1295 let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1296 assert_eq!(
1297 value["schema_version"],
1298 serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
1299 );
1300 assert!(
1301 value.get("identity_clusters").is_none(),
1302 "empty cluster cache should stay absent from persisted JSON"
1303 );
1304 }
1305
1306 #[tokio::test]
1307 async fn save_writes_derived_identity_clusters() {
1308 let tmp = TempDir::new().unwrap();
1309 let mut s = sample("clusters", 1_700_000_000_000);
1310 s.outcomes = vec![
1311 found_with_website("GitHub", "https://alice.dev"),
1312 found_with_website("GitLab", "https://alice.dev"),
1313 ];
1314
1315 save(tmp.path(), &s).await.unwrap();
1316
1317 let raw = fs::read_to_string(tmp.path().join("clusters.json"))
1318 .await
1319 .unwrap();
1320 let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1321 assert_eq!(value["identity_clusters"].as_array().unwrap().len(), 1);
1322 assert_eq!(
1323 value["identity_clusters"][0]["members"]
1324 .as_array()
1325 .unwrap()
1326 .len(),
1327 2
1328 );
1329 }
1330
1331 #[tokio::test]
1332 async fn save_roundtrips_request_context() {
1333 let tmp = TempDir::new().unwrap();
1334 let context = ScanRequestContext {
1335 username: "alice".into(),
1336 derived_from: Some(ScanId::from("previous".to_owned())),
1337 only: vec!["Git".into()],
1338 exclude: Vec::new(),
1339 tag: vec!["coding".into()],
1340 exclude_tag: vec!["nsfw".into()],
1341 top: Some(100),
1342 nsfw: false,
1343 concurrency: Some(8),
1344 deadline_secs: Some(30),
1345 egress_names: vec!["us-resi".into()],
1346 disabled_matches: vec![PersistedDisabledMatch {
1347 name: "TikTok".into(),
1348 url: "https://www.tiktok.com/@{username}".into(),
1349 tags: vec!["social".into()],
1350 disabled_reason: "Honest Limits: JS hydration".into(),
1351 }],
1352 };
1353 let s = sample("ctx", 1_700_000_000_000).with_request_context(context.clone());
1354 save(tmp.path(), &s).await.unwrap();
1355
1356 let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1357 assert_eq!(loaded.request_context, Some(context));
1358 }
1359
/// `diff_scans` must report newly found sites, sites no longer found, and a
/// verdict change for every site whose `MatchKind` differs between the scans.
#[test]
fn diff_scans_reports_added_removed_and_verdict_changes() {
    let mut previous = sample("old", 1_000);
    previous.outcomes = vec![
        outcome("GitHub", MatchKind::Found),
        outcome("Reddit", MatchKind::Found),
        outcome("Mastodon", MatchKind::NotFound),
    ];
    // GitHub is unchanged; Reddit and Mastodon swap verdicts.
    let mut current = sample("new", 2_000);
    current.outcomes = vec![
        outcome("GitHub", MatchKind::Found),
        outcome("Reddit", MatchKind::NotFound),
        outcome("Mastodon", MatchKind::Found),
    ];

    let diff = diff_scans(&previous, &current);

    assert_eq!(diff.from_scan_id.as_str(), "old");
    assert_eq!(diff.to_scan_id.as_str(), "new");
    assert_eq!(
        diff.added_found
            .iter()
            .map(|entry| entry.site.as_str())
            .collect::<Vec<_>>(),
        ["Mastodon"]
    );
    assert_eq!(
        diff.removed_found
            .iter()
            .map(|entry| entry.site.as_str())
            .collect::<Vec<_>>(),
        ["Reddit"]
    );

    // Both flipped sites are reported, Mastodon first and Reddit second.
    assert_eq!(diff.verdict_changes.len(), 2);
    assert_eq!(diff.verdict_changes[0].site, "Mastodon");
    assert_eq!(diff.verdict_changes[0].before, MatchKind::NotFound);
    assert_eq!(diff.verdict_changes[0].after, MatchKind::Found);
    assert_eq!(diff.verdict_changes[1].site, "Reddit");
    assert_eq!(diff.verdict_changes[1].before, MatchKind::Found);
    assert_eq!(diff.verdict_changes[1].after, MatchKind::NotFound);
    assert!(diff.evidence_changes.is_empty());
}
1400
/// A site found in both scans whose enrichment differs must show up only as
/// an evidence change, carrying the before and after enrichment values.
#[test]
fn diff_scans_reports_profile_evidence_changes_for_still_found_sites() {
    let mut previous = sample("old", 1_000);
    let mut old_github = outcome("GitHub", MatchKind::Found);
    old_github.enrichment.insert("name".into(), "Alice".into());
    old_github
        .profile_evidence
        .push(adler_core::ProfileEvidence::from_enrichment(
            "GitHub",
            "https://github.example/alice",
            "name",
            "Alice",
        ));
    previous.outcomes = vec![old_github];

    // Same site and verdict; only the `name` value changes.
    let mut current = sample("new", 2_000);
    let mut new_github = outcome("GitHub", MatchKind::Found);
    new_github
        .enrichment
        .insert("name".into(), "Alice Liddell".into());
    new_github
        .profile_evidence
        .push(adler_core::ProfileEvidence::from_enrichment(
            "GitHub",
            "https://github.example/alice",
            "name",
            "Alice Liddell",
        ));
    current.outcomes = vec![new_github];

    let diff = diff_scans(&previous, &current);

    assert!(diff.added_found.is_empty());
    assert!(diff.removed_found.is_empty());
    assert!(diff.verdict_changes.is_empty());
    assert_eq!(diff.evidence_changes.len(), 1);
    assert_eq!(diff.evidence_changes[0].site, "GitHub");
    assert_eq!(
        diff.evidence_changes[0]
            .before_enrichment
            .get("name")
            .unwrap(),
        "Alice"
    );
    assert_eq!(
        diff.evidence_changes[0]
            .after_enrichment
            .get("name")
            .unwrap(),
        "Alice Liddell"
    );
}
1453
/// A profile that is found, then not found, then found again must yield
/// FirstSeen, Disappeared and Reappeared events in that order.
#[test]
fn timeline_tracks_first_seen_disappeared_and_reappeared() {
    // Builds a sample scan whose single outcome is GitHub with `kind`.
    let github_scan = |id: &str, at_ms: u64, kind: MatchKind| {
        let mut scan = sample(id, at_ms);
        scan.outcomes = vec![outcome("GitHub", kind)];
        scan
    };
    let first = github_scan("first", 1_000, MatchKind::Found);
    let second = github_scan("second", 2_000, MatchKind::NotFound);
    let third = github_scan("third", 3_000, MatchKind::Found);

    // The scans are handed over out of chronological order.
    let timeline = build_scan_timeline(&[third, first, second]);

    assert_eq!(timeline.username, "alice");
    assert_eq!(timeline.scan_count, 3);
    assert_eq!(timeline.from_ms, Some(1_000));
    assert_eq!(timeline.to_ms, Some(3_000));

    assert_eq!(timeline.profiles.len(), 1);
    let profile = &timeline.profiles[0];
    assert_eq!(profile.site, "GitHub");
    assert_eq!(profile.first_seen_ms, 1_000);
    assert_eq!(profile.last_seen_ms, 3_000);
    assert!(profile.present_in_latest);

    let kinds: Vec<_> = timeline.events.iter().map(|event| event.kind).collect();
    assert_eq!(
        kinds,
        [
            TimelineEventKind::FirstSeen,
            TimelineEventKind::Disappeared,
            TimelineEventKind::Reappeared
        ]
    );

    let disappeared = &timeline.events[1];
    assert_eq!(disappeared.before, Some(MatchKind::Found));
    assert_eq!(disappeared.after, Some(MatchKind::NotFound));
}
1489
/// A site found in one scan and entirely absent from the next must be
/// reported as Disappeared with no `after` verdict.
#[test]
fn timeline_treats_missing_site_as_disappeared() {
    let mut earlier = sample("first", 1_000);
    earlier.outcomes = vec![outcome("GitHub", MatchKind::Found)];
    // The later scan has no GitHub outcome at all.
    let mut later = sample("second", 2_000);
    later.outcomes = vec![outcome("GitLab", MatchKind::NotFound)];

    let timeline = build_scan_timeline(&[earlier, later]);

    assert_eq!(timeline.profiles.len(), 1);
    let profile = &timeline.profiles[0];
    assert!(!profile.present_in_latest);

    assert_eq!(timeline.events.len(), 2);
    let last_event = &timeline.events[1];
    assert_eq!(last_event.kind, TimelineEventKind::Disappeared);
    assert_eq!(last_event.site, "GitHub");
    assert_eq!(last_event.after, None);
}
1506
/// A profile found in both scans with a changed `name` enrichment must yield
/// FirstSeen followed by EvidenceChanged, with both enrichment values kept.
#[test]
fn timeline_tracks_evidence_changes_for_still_found_profile() {
    // Builds a found GitHub outcome whose `name` enrichment and matching
    // profile evidence both carry `display_name`.
    let github_named = |display_name: &str| {
        let mut found = outcome("GitHub", MatchKind::Found);
        found.enrichment.insert("name".into(), display_name.into());
        found
            .profile_evidence
            .push(adler_core::ProfileEvidence::from_enrichment(
                "GitHub",
                "https://github.example/alice",
                "name",
                display_name,
            ));
        found
    };

    let mut first = sample("first", 1_000);
    first.outcomes = vec![github_named("Alice")];
    let mut second = sample("second", 2_000);
    second.outcomes = vec![github_named("Alice Liddell")];

    let timeline = build_scan_timeline(&[first, second]);

    let kinds: Vec<_> = timeline.events.iter().map(|event| event.kind).collect();
    assert_eq!(
        kinds,
        [
            TimelineEventKind::FirstSeen,
            TimelineEventKind::EvidenceChanged
        ]
    );

    let change = timeline.events[1].evidence_change.as_ref().unwrap();
    let name_before = change.before_enrichment.get("name").unwrap();
    let name_after = change.after_enrichment.get("name").unwrap();
    assert_eq!(name_before, "Alice");
    assert_eq!(name_after, "Alice Liddell");
}
1560
1561 #[tokio::test]
1562 async fn load_all_returns_newest_first() {
1563 let tmp = TempDir::new().unwrap();
1564 save(tmp.path(), &sample("old", 1_000)).await.unwrap();
1565 save(tmp.path(), &sample("mid", 2_000)).await.unwrap();
1566 save(tmp.path(), &sample("new", 3_000)).await.unwrap();
1567 let all = load_all(tmp.path()).await;
1568 assert_eq!(all.len(), 3);
1569 assert_eq!(all[0].scan_id.as_str(), "new");
1570 assert_eq!(all[1].scan_id.as_str(), "mid");
1571 assert_eq!(all[2].scan_id.as_str(), "old");
1572 }
1573
1574 #[tokio::test]
1575 async fn load_returns_none_for_missing() {
1576 let tmp = TempDir::new().unwrap();
1577 let missing = load(tmp.path(), &ScanId::from("nope".to_owned())).await;
1578 assert!(missing.is_none());
1579 }
1580
1581 #[tokio::test]
1582 async fn load_defaults_schema_version_for_legacy_scan_json() {
1583 let tmp = TempDir::new().unwrap();
1584 let path = tmp.path().join("legacy.json");
1585 fs::write(
1586 &path,
1587 br#"{
1588 "scan_id": "legacy",
1589 "username": "alice",
1590 "site_count": 0,
1591 "created_at_ms": 1700000000000,
1592 "summary": { "found": 0, "not_found": 0, "uncertain": 0 },
1593 "outcomes": [],
1594 "elapsed_ms": 0
1595 }"#,
1596 )
1597 .await
1598 .unwrap();
1599
1600 let loaded = load(tmp.path(), &ScanId::from("legacy".to_owned()))
1601 .await
1602 .expect("legacy scan loads");
1603 assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1604 }
1605
1606 #[tokio::test]
1607 async fn load_accepts_v2_scan_json_after_schema_bump() {
1608 let tmp = TempDir::new().unwrap();
1609 let path = tmp.path().join("v2.json");
1610 fs::write(
1611 &path,
1612 br#"{
1613 "schema_version": 2,
1614 "scan_id": "v2",
1615 "username": "alice",
1616 "site_count": 1,
1617 "created_at_ms": 1700000000000,
1618 "summary": { "found": 1, "not_found": 0, "uncertain": 0 },
1619 "outcomes": [
1620 {
1621 "site": "GitHub",
1622 "url": "https://github.example/alice",
1623 "kind": "found",
1624 "elapsed_ms": 10,
1625 "evidence": ["HTTP 200 (status_found)"]
1626 }
1627 ],
1628 "elapsed_ms": 10
1629 }"#,
1630 )
1631 .await
1632 .unwrap();
1633
1634 let loaded = load(tmp.path(), &ScanId::from("v2".to_owned()))
1635 .await
1636 .expect("v2 scan loads");
1637
1638 assert_eq!(loaded.schema_version, 2);
1639 assert_eq!(loaded.summary.found, 1);
1640 assert_eq!(
1641 loaded.outcomes[0].confidence.label,
1642 adler_core::ConfidenceLabel::Medium
1643 );
1644 }
1645
1646 #[tokio::test]
1647 async fn load_derives_identity_clusters_for_legacy_scan_json() {
1648 let tmp = TempDir::new().unwrap();
1649 let path = tmp.path().join("legacy-clusters.json");
1650 fs::write(
1651 &path,
1652 br#"{
1653 "schema_version": 1,
1654 "scan_id": "legacy-clusters",
1655 "username": "alice",
1656 "site_count": 2,
1657 "created_at_ms": 1700000000000,
1658 "summary": { "found": 2, "not_found": 0, "uncertain": 0 },
1659 "outcomes": [
1660 {
1661 "site": "GitHub",
1662 "url": "https://github.example/alice",
1663 "kind": "found",
1664 "elapsed_ms": 10,
1665 "profile_evidence": [
1666 {
1667 "kind": "external_link",
1668 "field": "website",
1669 "value": "https://alice.dev",
1670 "source": {
1671 "site": "GitHub",
1672 "url": "https://github.example/alice",
1673 "origin": "extractor"
1674 }
1675 }
1676 ]
1677 },
1678 {
1679 "site": "GitLab",
1680 "url": "https://gitlab.example/alice",
1681 "kind": "found",
1682 "elapsed_ms": 10,
1683 "profile_evidence": [
1684 {
1685 "kind": "external_link",
1686 "field": "website",
1687 "value": "https://alice.dev/",
1688 "source": {
1689 "site": "GitLab",
1690 "url": "https://gitlab.example/alice",
1691 "origin": "extractor"
1692 }
1693 }
1694 ]
1695 }
1696 ],
1697 "elapsed_ms": 20
1698 }"#,
1699 )
1700 .await
1701 .unwrap();
1702
1703 let loaded = load(tmp.path(), &ScanId::from("legacy-clusters".to_owned()))
1704 .await
1705 .expect("legacy scan loads");
1706
1707 assert_eq!(loaded.identity_clusters.len(), 1);
1708 assert_eq!(loaded.identity_clusters[0].members.len(), 2);
1709 assert!(!loaded.identity_clusters[0].uncertain);
1710 }
1711
/// Runs the serde round trip, `diff_scans` and `build_scan_timeline` over two
/// 2,500-outcome scans built by `large_persisted_scan`.
#[test]
fn large_scan_artifact_paths_handle_identity_graph_payloads() {
    let previous = large_persisted_scan("large-old", 0);
    let current = large_persisted_scan("large-new", 1);

    // Sanity-check the fixture: counts agree and clusters are present.
    assert_eq!(previous.outcomes.len(), 2_500);
    assert_eq!(previous.site_count, 2_500);
    assert_eq!(
        previous.summary.found + previous.summary.not_found + previous.summary.uncertain,
        2_500
    );
    assert!(!previous.identity_clusters.is_empty());

    // A JSON round trip keeps every outcome and every cluster.
    let raw = serde_json::to_string(&previous).unwrap();
    let decoded: PersistedScan = serde_json::from_str(&raw).unwrap();
    assert_eq!(decoded.outcomes.len(), 2_500);
    assert_eq!(
        decoded.identity_clusters.len(),
        previous.identity_clusters.len()
    );

    // The two fixtures differ in every category the diff reports.
    let diff = diff_scans(&previous, &current);
    assert!(!diff.added_found.is_empty());
    assert!(!diff.removed_found.is_empty());
    assert!(!diff.verdict_changes.is_empty());
    assert!(!diff.evidence_changes.is_empty());

    let timeline = build_scan_timeline(&[previous, current]);
    assert_eq!(timeline.scan_count, 2);
    assert_eq!(timeline.profiles.len(), 375);
    assert!(timeline.events.len() > timeline.profiles.len());
}
1744
1745 #[tokio::test]
1746 async fn load_all_skips_unrelated_files() {
1747 let tmp = TempDir::new().unwrap();
1748 fs::write(tmp.path().join("README"), b"not json")
1750 .await
1751 .unwrap();
1752 fs::write(tmp.path().join("broken.json"), b"{ invalid")
1753 .await
1754 .unwrap();
1755 save(tmp.path(), &sample("good", 9_999)).await.unwrap();
1756 let all = load_all(tmp.path()).await;
1757 assert_eq!(all.len(), 1);
1758 assert_eq!(all[0].scan_id.as_str(), "good");
1759 }
1760
1761 #[tokio::test]
1762 async fn prune_keeps_only_newest_n() {
1763 let tmp = TempDir::new().unwrap();
1764 for i in 0u64..5 {
1765 save(tmp.path(), &sample(&format!("s{i}"), i * 1_000))
1766 .await
1767 .unwrap();
1768 }
1769 let removed = prune(tmp.path(), 2).await;
1770 assert_eq!(removed, 3);
1771 let remaining = load_all(tmp.path()).await;
1772 assert_eq!(remaining.len(), 2);
1773 assert_eq!(remaining[0].scan_id.as_str(), "s4");
1774 assert_eq!(remaining[1].scan_id.as_str(), "s3");
1775 }
1776}