Skip to main content

adler_server/
persist.rs

1//! On-disk persistence for finished scans.
2//!
3//! Each scan is serialised as a single JSON file under [`default_dir`]
4//! (`$XDG_CACHE_HOME/adler/scans/`, falling back to
5//! `$HOME/.cache/adler/scans/`). The on-disk format is the full
6//! [`PersistedScan`] — enough for the history listing AND for replaying
7//! the scan into the UI without a fresh probe.
8//!
9//! Writes are atomic: serialise to `<id>.json.tmp`, then rename onto
10//! the final path. A crashed process leaves at most one orphan `.tmp`
11//! file behind, never a half-written `<id>.json`.
12
13use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use adler_core::{CheckOutcome, IdentityCluster, MatchKind, ProfileEvidence, Site};
17use serde::{Deserialize, Serialize};
18use tokio::fs;
19
20use crate::error::{Error, Result};
21use crate::scan::{FinishedScan, ScanId, Summary};
22
/// Hard cap on how many scans we keep on disk. Beyond this, oldest
/// (by `created_at_ms`) get [`prune`]d on the next save. Picked to be
/// large enough for any plausible human-driven OSINT session.
///
/// NOTE(review): [`save`] itself never calls [`prune`]; the caller is
/// presumably responsible for pruning after a save — confirm at the call site.
pub(crate) const MAX_PERSISTED_SCANS: usize = 200;
/// Current on-disk schema version for [`PersistedScan`].
///
/// Stamped onto every snapshot built by [`PersistedScan::from_finished`], and
/// also used as the serde default when a file has no `schema_version` field.
pub(crate) const PERSISTED_SCAN_SCHEMA_VERSION: u16 = 3;
29
/// Self-contained snapshot of a completed scan. Round-trips losslessly
/// through JSON; tests assert that.
///
/// `summary` and `identity_clusters` are derived data: they are recomputed
/// from `outcomes` by `refresh_derived_fields` when a snapshot is built,
/// saved, or loaded, so the values stored in the file act as a cache.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistedScan {
    /// Version of this persisted scan artifact.
    ///
    /// A file without this field deserialises with the current
    /// [`PERSISTED_SCAN_SCHEMA_VERSION`], not with a legacy marker.
    #[serde(default = "default_schema_version")]
    pub schema_version: u16,
    /// Stable identifier — same value as in-memory [`ScanId`].
    pub scan_id: ScanId,
    /// Username that was scanned.
    pub username: String,
    /// Request scope and parked-site diagnostics that explain how this
    /// artifact was produced. Missing on scans saved before v1 context
    /// support landed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_context: Option<ScanRequestContext>,
    /// Total number of sites probed in this scan.
    pub site_count: usize,
    /// Unix epoch milliseconds when the scan was started.
    pub created_at_ms: u64,
    /// Per-verdict tally over [`Self::outcomes`].
    pub summary: Summary,
    /// All outcomes, in completion order.
    pub outcomes: Vec<CheckOutcome>,
    /// Deterministic identity candidates derived from found outcomes
    /// with structured profile evidence.
    ///
    /// Omitted from the JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub identity_clusters: Vec<IdentityCluster>,
    /// Wall-clock duration, milliseconds.
    pub elapsed_ms: u64,
}
61
62impl PersistedScan {
63    /// Build a snapshot from a freshly-completed in-memory scan.
64    #[must_use]
65    pub fn from_finished(
66        scan_id: ScanId,
67        username: String,
68        site_count: usize,
69        created_at_ms: u64,
70        finished: FinishedScan,
71    ) -> Self {
72        let mut scan = Self {
73            schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
74            scan_id,
75            username,
76            request_context: None,
77            site_count,
78            created_at_ms,
79            summary: finished.summary,
80            outcomes: finished.outcomes,
81            identity_clusters: finished.identity_clusters,
82            elapsed_ms: finished.elapsed_ms,
83        };
84        scan.refresh_derived_fields();
85        scan
86    }
87
88    /// Attach request-scope metadata to this persisted scan.
89    #[must_use]
90    pub fn with_request_context(mut self, context: ScanRequestContext) -> Self {
91        self.request_context = Some(context);
92        self
93    }
94
95    pub(crate) fn refresh_derived_fields(&mut self) {
96        for outcome in &mut self.outcomes {
97            outcome.refresh_confidence();
98        }
99        self.summary = Summary::from_outcomes(&self.outcomes);
100        self.identity_clusters =
101            adler_core::build_identity_clusters(&self.username, &self.outcomes);
102    }
103}
104
/// Serde default for [`PersistedScan::schema_version`]: a file that omits the
/// field is read as if it were written with the current schema version.
const fn default_schema_version() -> u16 {
    PERSISTED_SCAN_SCHEMA_VERSION
}
108
/// Request scope persisted with a finished scan so future timelines and
/// reports can explain what was scanned and what was intentionally out of
/// scope.
///
/// Optional and list-valued fields are omitted from the JSON when unset or
/// empty and default on read; `username` and `nsfw` carry no serde default
/// and must be present in the serialised form.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScanRequestContext {
    /// Username supplied by the operator.
    pub username: String,
    /// Previous scan id when this scan was created by refiltering.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub derived_from: Option<ScanId>,
    /// Site name include filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub only: Vec<String>,
    /// Site name exclude filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude: Vec<String>,
    /// Tag include filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tag: Vec<String>,
    /// Tag exclude filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude_tag: Vec<String>,
    /// Popularity ceiling, when supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub top: Option<u32>,
    /// Whether NSFW-tagged entries were included.
    pub nsfw: bool,
    /// Per-scan concurrency override, when supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrency: Option<usize>,
    /// Per-scan deadline override, seconds.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub deadline_secs: Option<u64>,
    /// Egress subset requested for this scan.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub egress_names: Vec<String>,
    /// Disabled/parked sites that matched the same filter and were not
    /// included in the enabled scan set.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub disabled_matches: Vec<PersistedDisabledMatch>,
}
150
/// Compact disabled-site diagnostic persisted with scan context.
///
/// Built from a registry [`Site`] via `From<&Site>`; only the fields below
/// are copied.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PersistedDisabledMatch {
    /// Site name.
    pub name: String,
    /// Profile URL template.
    pub url: String,
    /// Registry tags.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Human-readable reason the site is parked. Never absent: the
    /// conversion from [`Site`] substitutes a generic message when the
    /// registry gives no reason.
    pub disabled_reason: String,
}
164
165impl From<&Site> for PersistedDisabledMatch {
166    fn from(site: &Site) -> Self {
167        Self {
168            name: site.name.clone(),
169            url: site.url.as_str().to_owned(),
170            tags: site.tags.clone(),
171            disabled_reason: site
172                .disabled_reason
173                .clone()
174                .unwrap_or_else(|| "disabled in registry".to_owned()),
175        }
176    }
177}
178
/// Deterministic scan-to-scan diff used as the basis for timelines and
/// watchlists.
///
/// Produced by [`diff_scans`]. Every list is ordered by site name, because
/// the diff walks name-keyed `BTreeMap`s. A site that goes from Found to
/// another verdict appears in both `removed_found` and `verdict_changes`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDiff {
    /// Previous scan id.
    pub from_scan_id: ScanId,
    /// Current scan id.
    pub to_scan_id: ScanId,
    /// Found accounts that were not Found in the previous scan.
    /// Entries are copies of the outcome from the current scan.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub added_found: Vec<CheckOutcome>,
    /// Accounts that were Found previously but are no longer Found.
    /// Entries are copies of the outcome from the previous scan.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub removed_found: Vec<CheckOutcome>,
    /// Sites present in both scans whose verdict changed.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub verdict_changes: Vec<VerdictChange>,
    /// Found sites whose normalized profile evidence changed.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub evidence_changes: Vec<EvidenceChange>,
}
200
/// A verdict transition for one site.
///
/// [`diff_scans`] only emits one when the site appears in both scans and
/// `before != after`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerdictChange {
    /// Site name.
    pub site: String,
    /// Previous verdict.
    pub before: MatchKind,
    /// Current verdict.
    pub after: MatchKind,
}
211
/// Profile evidence transition for one still-found site.
///
/// Carries the full before/after values rather than a field-level delta.
/// Used both in [`ScanDiff::evidence_changes`] and as the payload of
/// [`TimelineEventKind::EvidenceChanged`] events. Empty maps and lists are
/// omitted from the JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceChange {
    /// Site name.
    pub site: String,
    /// Previous legacy enrichment fields.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub before_enrichment: BTreeMap<String, String>,
    /// Current legacy enrichment fields.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub after_enrichment: BTreeMap<String, String>,
    /// Previous normalized profile evidence.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub before_profile_evidence: Vec<ProfileEvidence>,
    /// Current normalized profile evidence.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub after_profile_evidence: Vec<ProfileEvidence>,
}
230
/// Historical view derived from a sequence of persisted scans.
///
/// Built by [`build_scan_timeline`], which copies `username` from the oldest
/// scan and does not verify that the remaining scans share it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanTimeline {
    /// Username shared by the scans used to build this timeline.
    /// Empty when no scans were supplied.
    pub username: String,
    /// Number of scans considered.
    pub scan_count: usize,
    /// Oldest scan timestamp, when at least one scan was supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub from_ms: Option<u64>,
    /// Newest scan timestamp, when at least one scan was supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub to_ms: Option<u64>,
    /// Per-site lifecycle summary, ordered by site name.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub profiles: Vec<TimelineProfile>,
    /// Chronological lifecycle events.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub events: Vec<TimelineEvent>,
}
251
/// Per-site lifecycle state in a scan timeline.
///
/// A profile exists only for sites that were Found in at least one of the
/// supplied scans.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineProfile {
    /// Site name.
    pub site: String,
    /// Last known profile URL for the site.
    pub url: String,
    /// First scan timestamp where the profile was Found.
    pub first_seen_ms: u64,
    /// Most recent scan timestamp where the profile was Found.
    pub last_seen_ms: u64,
    /// Whether the profile is Found in the newest scan that mentioned it.
    pub present_in_latest: bool,
    /// Last verdict observed for this site, if the newest scan mentioned it.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_verdict: Option<MatchKind>,
}
269
/// Timeline event category.
///
/// Serialised in `snake_case` (e.g. `first_seen`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TimelineEventKind {
    /// Site was Found for the first time in the supplied scan sequence.
    FirstSeen,
    /// Site was Found before, then no longer Found.
    Disappeared,
    /// Site was absent/not found after a previous hit, then Found again.
    Reappeared,
    /// Site stayed Found but normalized profile evidence changed.
    EvidenceChanged,
}
283
/// One lifecycle event for a profile across scans.
///
/// `scan_id` and `at_ms` are copied from the scan in which the transition
/// was observed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineEvent {
    /// Scan id where the event was observed.
    pub scan_id: ScanId,
    /// Scan start timestamp.
    pub at_ms: u64,
    /// Site name.
    pub site: String,
    /// Best URL known for the site at this point in the timeline.
    pub url: String,
    /// Event category.
    pub kind: TimelineEventKind,
    /// Previous verdict, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub before: Option<MatchKind>,
    /// Current verdict, when the current scan mentioned the site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub after: Option<MatchKind>,
    /// Evidence transition for [`TimelineEventKind::EvidenceChanged`].
    /// `None` for every other event kind.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub evidence_change: Option<EvidenceChange>,
}
307
308/// Compare two persisted scans.
309///
310/// The diff is intentionally conservative: `added_found` and
311/// `removed_found` are based only on the `Found` verdict, while
312/// `evidence_changes` are reported only for sites that are Found in both
313/// scans.
314#[must_use]
315pub fn diff_scans(previous: &PersistedScan, current: &PersistedScan) -> ScanDiff {
316    let previous_by_site = outcomes_by_site(&previous.outcomes);
317    let current_by_site = outcomes_by_site(&current.outcomes);
318
319    let mut added_found = Vec::new();
320    let mut removed_found = Vec::new();
321    let mut verdict_changes = Vec::new();
322    let mut evidence_changes = Vec::new();
323
324    for (site, current_outcome) in &current_by_site {
325        let previous_outcome = previous_by_site.get(site);
326        if current_outcome.kind == MatchKind::Found
327            && previous_outcome.is_none_or(|o| o.kind != MatchKind::Found)
328        {
329            added_found.push((*current_outcome).clone());
330        }
331        if let Some(previous_outcome) = previous_outcome {
332            if previous_outcome.kind != current_outcome.kind {
333                verdict_changes.push(VerdictChange {
334                    site: site.clone(),
335                    before: previous_outcome.kind,
336                    after: current_outcome.kind,
337                });
338            }
339            if previous_outcome.kind == MatchKind::Found
340                && current_outcome.kind == MatchKind::Found
341                && profile_evidence_changed(previous_outcome, current_outcome)
342            {
343                evidence_changes.push(EvidenceChange {
344                    site: site.clone(),
345                    before_enrichment: previous_outcome.enrichment.clone(),
346                    after_enrichment: current_outcome.enrichment.clone(),
347                    before_profile_evidence: previous_outcome.profile_evidence.clone(),
348                    after_profile_evidence: current_outcome.profile_evidence.clone(),
349                });
350            }
351        }
352    }
353
354    for (site, previous_outcome) in &previous_by_site {
355        if previous_outcome.kind == MatchKind::Found
356            && current_by_site
357                .get(site)
358                .is_none_or(|o| o.kind != MatchKind::Found)
359        {
360            removed_found.push((*previous_outcome).clone());
361        }
362    }
363
364    ScanDiff {
365        from_scan_id: previous.scan_id.clone(),
366        to_scan_id: current.scan_id.clone(),
367        added_found,
368        removed_found,
369        verdict_changes,
370        evidence_changes,
371    }
372}
373
374/// Build a chronological timeline from persisted scans.
375///
376/// Scans may be supplied in any order; the builder sorts them oldest-first.
377/// Only `Found` outcomes create profiles. A later non-Found or missing site
378/// creates a disappearance event if the profile was previously present.
379#[must_use]
380pub fn build_scan_timeline(scans: &[PersistedScan]) -> ScanTimeline {
381    let mut ordered: Vec<&PersistedScan> = scans.iter().collect();
382    ordered.sort_by(|left, right| {
383        left.created_at_ms
384            .cmp(&right.created_at_ms)
385            .then_with(|| left.scan_id.as_str().cmp(right.scan_id.as_str()))
386    });
387
388    let username = ordered
389        .first()
390        .map(|scan| scan.username.clone())
391        .unwrap_or_default();
392    let from_ms = ordered.first().map(|scan| scan.created_at_ms);
393    let to_ms = ordered.last().map(|scan| scan.created_at_ms);
394    let mut states: BTreeMap<String, TimelineProfileState> = BTreeMap::new();
395    let mut events = Vec::new();
396
397    for scan in &ordered {
398        let current_by_site = outcomes_by_site(&scan.outcomes);
399        let sites = timeline_site_names(&states, &current_by_site);
400
401        for site in sites {
402            apply_timeline_site(
403                scan,
404                &site,
405                current_by_site.get(&site).copied(),
406                &mut states,
407                &mut events,
408            );
409        }
410    }
411
412    let profiles = states
413        .into_iter()
414        .map(|(site, state)| TimelineProfile {
415            site,
416            url: state.url,
417            first_seen_ms: state.first_seen_ms,
418            last_seen_ms: state.last_seen_ms,
419            present_in_latest: state.present_in_latest,
420            last_verdict: state.last_verdict,
421        })
422        .collect();
423
424    ScanTimeline {
425        username,
426        scan_count: ordered.len(),
427        from_ms,
428        to_ms,
429        profiles,
430        events,
431    }
432}
433
434fn timeline_site_names(
435    states: &BTreeMap<String, TimelineProfileState>,
436    current_by_site: &BTreeMap<String, &CheckOutcome>,
437) -> Vec<String> {
438    let mut sites: Vec<String> = states.keys().cloned().collect();
439    for site in current_by_site.keys() {
440        if !states.contains_key(site.as_str()) {
441            sites.push((*site).clone());
442        }
443    }
444    sites.sort();
445    sites.dedup();
446    sites
447}
448
449fn apply_timeline_site(
450    scan: &PersistedScan,
451    site: &str,
452    current: Option<&CheckOutcome>,
453    states: &mut BTreeMap<String, TimelineProfileState>,
454    events: &mut Vec<TimelineEvent>,
455) {
456    let current_kind = current.map(|outcome| outcome.kind);
457    let was_present = states
458        .get(site)
459        .is_some_and(|state| state.present_in_latest);
460
461    if current_kind == Some(MatchKind::Found) {
462        apply_found_timeline_site(scan, site, current.expect("found outcome"), states, events);
463    } else if was_present {
464        apply_disappeared_timeline_site(scan, site, current, current_kind, states, events);
465    } else if let (Some(state), Some(outcome)) = (states.get_mut(site), current) {
466        state.last_verdict = Some(outcome.kind);
467        state.url.clone_from(&outcome.url);
468    }
469}
470
471fn apply_found_timeline_site(
472    scan: &PersistedScan,
473    site: &str,
474    outcome: &CheckOutcome,
475    states: &mut BTreeMap<String, TimelineProfileState>,
476    events: &mut Vec<TimelineEvent>,
477) {
478    let current_kind = Some(outcome.kind);
479    let had_state = states.contains_key(site);
480    let was_present = states
481        .get(site)
482        .is_some_and(|state| state.present_in_latest);
483    let state = states
484        .entry(site.to_owned())
485        .or_insert_with(|| TimelineProfileState::new(outcome, scan.created_at_ms));
486
487    if !had_state {
488        events.push(timeline_event(
489            scan,
490            site,
491            &outcome.url,
492            TimelineEventKind::FirstSeen,
493            None,
494            current_kind,
495            None,
496        ));
497    } else if !was_present {
498        events.push(timeline_event(
499            scan,
500            site,
501            &outcome.url,
502            TimelineEventKind::Reappeared,
503            state.last_verdict,
504            current_kind,
505            None,
506        ));
507    } else if state.profile_evidence_changed(outcome) {
508        events.push(timeline_event(
509            scan,
510            site,
511            &outcome.url,
512            TimelineEventKind::EvidenceChanged,
513            Some(MatchKind::Found),
514            current_kind,
515            Some(EvidenceChange {
516                site: site.to_owned(),
517                before_enrichment: state.last_found_enrichment.clone(),
518                after_enrichment: outcome.enrichment.clone(),
519                before_profile_evidence: state.last_found_profile_evidence.clone(),
520                after_profile_evidence: outcome.profile_evidence.clone(),
521            }),
522        ));
523    }
524
525    states
526        .get_mut(site)
527        .expect("state inserted before found update")
528        .update_found(outcome, scan.created_at_ms);
529}
530
531fn apply_disappeared_timeline_site(
532    scan: &PersistedScan,
533    site: &str,
534    current: Option<&CheckOutcome>,
535    current_kind: Option<MatchKind>,
536    states: &mut BTreeMap<String, TimelineProfileState>,
537    events: &mut Vec<TimelineEvent>,
538) {
539    let state = states
540        .get_mut(site)
541        .expect("present state exists before disappearance");
542    let url = current.map_or_else(|| state.url.clone(), |outcome| outcome.url.clone());
543    events.push(timeline_event(
544        scan,
545        site,
546        &url,
547        TimelineEventKind::Disappeared,
548        state.last_verdict,
549        current_kind,
550        None,
551    ));
552    state.present_in_latest = false;
553    state.last_verdict = current_kind;
554    if let Some(outcome) = current {
555        state.url.clone_from(&outcome.url);
556    }
557}
558
/// Assemble a [`TimelineEvent`] for `site`, stamping it with the id and
/// `created_at_ms` of the scan in which the transition was observed.
///
/// `before` / `after` are the verdicts on either side of the transition and
/// `evidence_change` is the optional evidence payload; all three are stored
/// as given.
fn timeline_event(
    scan: &PersistedScan,
    site: &str,
    url: &str,
    kind: TimelineEventKind,
    before: Option<MatchKind>,
    after: Option<MatchKind>,
    evidence_change: Option<EvidenceChange>,
) -> TimelineEvent {
    TimelineEvent {
        scan_id: scan.scan_id.clone(),
        at_ms: scan.created_at_ms,
        site: site.to_owned(),
        url: url.to_owned(),
        kind,
        before,
        after,
        evidence_change,
    }
}
579
/// Mutable per-site accumulator used while building a timeline; converted
/// into a public [`TimelineProfile`] once all scans have been processed.
#[derive(Debug, Clone)]
struct TimelineProfileState {
    /// Most recent URL recorded for the site.
    url: String,
    /// Timestamp of the scan that created this state (first `Found`).
    first_seen_ms: u64,
    /// Timestamp of the most recent scan in which the site was `Found`.
    last_seen_ms: u64,
    /// Whether the site was `Found` the last time its state was updated.
    present_in_latest: bool,
    /// Last verdict recorded for the site; `None` after a disappearance in a
    /// scan that did not mention the site.
    last_verdict: Option<MatchKind>,
    /// Enrichment captured at the most recent `Found` outcome; not exported
    /// into [`TimelineProfile`], only used to detect evidence changes.
    last_found_enrichment: BTreeMap<String, String>,
    /// Profile evidence captured at the most recent `Found` outcome; same
    /// purpose as `last_found_enrichment`.
    last_found_profile_evidence: Vec<ProfileEvidence>,
}
590
591impl TimelineProfileState {
592    fn new(outcome: &CheckOutcome, at_ms: u64) -> Self {
593        Self {
594            url: outcome.url.clone(),
595            first_seen_ms: at_ms,
596            last_seen_ms: at_ms,
597            present_in_latest: true,
598            last_verdict: Some(outcome.kind),
599            last_found_enrichment: outcome.enrichment.clone(),
600            last_found_profile_evidence: outcome.profile_evidence.clone(),
601        }
602    }
603
604    fn update_found(&mut self, outcome: &CheckOutcome, at_ms: u64) {
605        self.url.clone_from(&outcome.url);
606        self.last_seen_ms = at_ms;
607        self.present_in_latest = true;
608        self.last_verdict = Some(outcome.kind);
609        self.last_found_enrichment = outcome.enrichment.clone();
610        self.last_found_profile_evidence
611            .clone_from(&outcome.profile_evidence);
612    }
613
614    fn profile_evidence_changed(&self, outcome: &CheckOutcome) -> bool {
615        self.last_found_enrichment != outcome.enrichment
616            || self.last_found_profile_evidence != outcome.profile_evidence
617    }
618}
619
620fn outcomes_by_site(outcomes: &[CheckOutcome]) -> BTreeMap<String, &CheckOutcome> {
621    outcomes
622        .iter()
623        .map(|outcome| (outcome.site.clone(), outcome))
624        .collect()
625}
626
627fn profile_evidence_changed(previous: &CheckOutcome, current: &CheckOutcome) -> bool {
628    previous.enrichment != current.enrichment
629        || previous.profile_evidence != current.profile_evidence
630}
631
/// Default directory for persisted scans.
///
/// Resolution order:
/// 1. `$XDG_CACHE_HOME/adler/scans/`
/// 2. `$HOME/.cache/adler/scans/`
/// 3. the relative path `adler-scans`
///
/// A variable that is set but empty is treated as unset, as the XDG Base
/// Directory specification requires. Without that, `XDG_CACHE_HOME=""` would
/// resolve to the relative path `adler/scans` instead of falling back to
/// `$HOME`.
///
/// Intended to mirror [`adler_core::Cache::default_path`]'s discovery rules.
/// NOTE(review): confirm that function also skips empty values.
///
/// The directory is not created here; [`save`] creates it on first write.
#[must_use]
pub fn default_dir() -> PathBuf {
    // Read a variable, discarding values that are present but empty.
    let non_empty = |name: &str| std::env::var_os(name).filter(|value| !value.is_empty());

    if let Some(xdg) = non_empty("XDG_CACHE_HOME") {
        return PathBuf::from(xdg).join("adler").join("scans");
    }
    if let Some(home) = non_empty("HOME") {
        return PathBuf::from(home)
            .join(".cache")
            .join("adler")
            .join("scans");
    }
    PathBuf::from("adler-scans")
}
650
651/// Save `scan` to `<dir>/<id>.json` atomically. Creates `dir` if missing.
652pub(crate) async fn save(dir: &Path, scan: &PersistedScan) -> Result<()> {
653    fs::create_dir_all(dir).await.map_err(Error::Persist)?;
654    let path = dir.join(format!("{}.json", scan.scan_id));
655    let tmp = dir.join(format!("{}.json.tmp", scan.scan_id));
656    let mut scan = scan.clone();
657    scan.refresh_derived_fields();
658    let body = serde_json::to_vec_pretty(&scan).map_err(Error::PersistEncode)?;
659    fs::write(&tmp, &body).await.map_err(Error::Persist)?;
660    fs::rename(&tmp, &path).await.map_err(Error::Persist)?;
661    Ok(())
662}
663
664/// Read one scan from disk by id. Returns `None` on any I/O or parse
665/// error — callers should treat a missing scan as not-found rather
666/// than propagate the underlying cause.
667pub(crate) async fn load(dir: &Path, scan_id: &ScanId) -> Option<PersistedScan> {
668    let path = dir.join(format!("{scan_id}.json"));
669    let bytes = fs::read(&path).await.ok()?;
670    serde_json::from_slice(&bytes)
671        .ok()
672        .map(refresh_derived_fields)
673}
674
675/// Enumerate every persisted scan, newest first. Files that fail to
676/// parse are silently skipped — a corrupted file shouldn't break the
677/// whole listing.
678pub(crate) async fn load_all(dir: &Path) -> Vec<PersistedScan> {
679    let Ok(mut entries) = fs::read_dir(dir).await else {
680        return Vec::new();
681    };
682    let mut out = Vec::new();
683    while let Ok(Some(entry)) = entries.next_entry().await {
684        let path = entry.path();
685        if path.extension().and_then(|s| s.to_str()) != Some("json") {
686            continue;
687        }
688        let Ok(bytes) = fs::read(&path).await else {
689            continue;
690        };
691        let Ok(scan) = serde_json::from_slice::<PersistedScan>(&bytes) else {
692            continue;
693        };
694        out.push(refresh_derived_fields(scan));
695    }
696    out.sort_by_key(|s| std::cmp::Reverse(s.created_at_ms));
697    out
698}
699
/// By-value adapter around [`PersistedScan::refresh_derived_fields`], so the
/// refresh can be applied in `Option::map` and similar call positions right
/// after deserialising.
fn refresh_derived_fields(mut scan: PersistedScan) -> PersistedScan {
    scan.refresh_derived_fields();
    scan
}
704
705/// Delete scans beyond `keep_newest`. Newest-by-`created_at_ms` wins.
706/// Returns the number of files actually removed.
707pub(crate) async fn prune(dir: &Path, keep_newest: usize) -> usize {
708    let scans = load_all(dir).await;
709    if scans.len() <= keep_newest {
710        return 0;
711    }
712    let mut removed = 0;
713    for s in &scans[keep_newest..] {
714        let path = dir.join(format!("{}.json", s.scan_id));
715        if fs::remove_file(&path).await.is_ok() {
716            removed += 1;
717        }
718    }
719    removed
720}
721
722#[cfg(test)]
723mod tests {
724    use super::*;
725    use adler_core::{
726        EvidenceAccessPath, MatchKind, ProfileEvidence, TransportTier, UncertainReason,
727    };
728    use std::collections::BTreeMap;
729    use tempfile::TempDir;
730
    /// Fixture: a two-site scan for user `alice` with the given id and
    /// start timestamp — one `Found` outcome (GitHub) and one `NotFound`
    /// outcome (GitLab). The hard-coded summary matches those outcomes and
    /// no identity clusters are attached.
    fn sample(scan_id: &str, ts: u64) -> PersistedScan {
        PersistedScan {
            schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
            scan_id: ScanId::from(scan_id.to_owned()),
            username: "alice".into(),
            request_context: None,
            site_count: 2,
            created_at_ms: ts,
            summary: Summary {
                found: 1,
                not_found: 1,
                uncertain: 0,
            },
            outcomes: vec![
                CheckOutcome {
                    site: "GitHub".into(),
                    url: "https://github.com/alice".into(),
                    kind: MatchKind::Found,
                    reason: None,
                    elapsed_ms: 120,
                    enrichment: BTreeMap::new(),
                    evidence: vec!["HTTP 200 (status_found)".into()],
                    profile_evidence: Vec::new(),
                    confidence: adler_core::ConfidenceScore::default(),
                    transport: None,
                    escalations: 0,
                },
                CheckOutcome {
                    site: "GitLab".into(),
                    url: "https://gitlab.com/alice".into(),
                    kind: MatchKind::NotFound,
                    reason: None,
                    elapsed_ms: 90,
                    enrichment: BTreeMap::new(),
                    evidence: vec!["HTTP 404 (status_not_found)".into()],
                    profile_evidence: Vec::new(),
                    confidence: adler_core::ConfidenceScore::default(),
                    transport: None,
                    escalations: 0,
                },
            ],
            identity_clusters: Vec::new(),
            elapsed_ms: 210,
        }
    }
776
    /// Fixture: a bare outcome for `site` with verdict `kind`, a URL of the
    /// form `https://<site>.example/alice`, and no enrichment, evidence,
    /// transport or escalations.
    fn outcome(site: &str, kind: MatchKind) -> CheckOutcome {
        CheckOutcome {
            site: site.into(),
            url: format!("https://{site}.example/alice"),
            kind,
            reason: None,
            elapsed_ms: 10,
            enrichment: BTreeMap::new(),
            evidence: Vec::new(),
            profile_evidence: Vec::new(),
            confidence: adler_core::ConfidenceScore::default(),
            transport: None,
            escalations: 0,
        }
    }
792
793    fn found_with_website(site: &str, website: &str) -> CheckOutcome {
794        let mut outcome = outcome(site, MatchKind::Found);
795        outcome
796            .profile_evidence
797            .push(ProfileEvidence::from_enrichment(
798                site,
799                &outcome.url,
800                "website",
801                website,
802            ));
803        outcome
804    }
805
806    fn large_outcomes(count: usize, generation: usize) -> Vec<CheckOutcome> {
807        (0..count)
808            .map(|idx| large_outcome(idx, generation))
809            .collect()
810    }
811
812    fn large_outcome(idx: usize, generation: usize) -> CheckOutcome {
813        let site = format!("LargeSite{idx:04}");
814        let url = format!("https://large{idx:04}.example/alice");
815        let mut kind = match idx % 20 {
816            0 | 1 => MatchKind::Found,
817            3 => MatchKind::Uncertain,
818            _ => MatchKind::NotFound,
819        };
820        if generation > 0 && idx % 20 == 0 {
821            kind = MatchKind::NotFound;
822        } else if generation > 0 && idx % 20 == 2 {
823            kind = MatchKind::Found;
824        }
825
826        let mut outcome = CheckOutcome {
827            site: site.clone(),
828            url: url.clone(),
829            kind,
830            reason: (kind == MatchKind::Uncertain).then_some(UncertainReason::RateLimited),
831            elapsed_ms: 10 + (idx % 75) as u64,
832            enrichment: BTreeMap::new(),
833            evidence: Vec::new(),
834            profile_evidence: Vec::new(),
835            confidence: adler_core::ConfidenceScore::default(),
836            transport: Some(if idx % 7 == 0 {
837                TransportTier::Browser
838            } else {
839                TransportTier::Http
840            }),
841            escalations: u8::from(idx % 7 == 0),
842        };
843
844        match kind {
845            MatchKind::Found => {
846                let observed_at_ms = 1_781_192_451_000 + generation as u64 * 1_000 + idx as u64;
847                let website = format!("https://identity-{:02}.example", idx % 25);
848                let name = format!("Alice Group {:02}", idx % 50);
849                let bio = if generation > 0 && idx % 20 == 1 {
850                    format!("updated profile generation {generation} for {idx}")
851                } else {
852                    format!("stable profile generation 0 for {idx}")
853                };
854                for (field, value) in [
855                    ("website", website.as_str()),
856                    ("name", name.as_str()),
857                    ("bio", bio.as_str()),
858                ] {
859                    outcome
860                        .enrichment
861                        .insert(field.to_owned(), value.to_owned());
862                    outcome
863                        .profile_evidence
864                        .push(ProfileEvidence::from_enrichment_with_source(
865                            &site,
866                            &url,
867                            field,
868                            value,
869                            Some(observed_at_ms),
870                            Some(EvidenceAccessPath::new(
871                                outcome.transport.unwrap_or(TransportTier::Http),
872                                outcome.escalations,
873                                idx % 11 == 0,
874                            )),
875                        ));
876                }
877                outcome.evidence = vec![
878                    "HTTP 200 (status_found)".to_owned(),
879                    "body matched profile marker".to_owned(),
880                ];
881            }
882            MatchKind::NotFound => {
883                outcome.evidence = vec!["HTTP 404 (status_not_found)".to_owned()];
884            }
885            MatchKind::Uncertain => {}
886        }
887        outcome.refresh_confidence();
888        outcome
889    }
890
891    fn large_persisted_scan(scan_id: &str, generation: usize) -> PersistedScan {
892        let outcomes = large_outcomes(2_500, generation);
893        let finished = FinishedScan {
894            summary: Summary::from_outcomes(&outcomes),
895            identity_clusters: adler_core::build_identity_clusters("alice", &outcomes),
896            elapsed_ms: 30_000 + generation as u64,
897            outcomes,
898        };
899        PersistedScan::from_finished(
900            ScanId::from(scan_id.to_owned()),
901            "alice".to_owned(),
902            2_500,
903            1_781_192_451_000 + generation as u64 * 10_000,
904            finished,
905        )
906    }
907
908    #[tokio::test]
909    async fn save_then_load_roundtrips() {
910        let tmp = TempDir::new().unwrap();
911        let s = sample("abc123", 1_700_000_000_000);
912        save(tmp.path(), &s).await.unwrap();
913
914        let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
915        assert_eq!(loaded.scan_id, s.scan_id);
916        assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
917        assert_eq!(loaded.username, "alice");
918        assert_eq!(loaded.outcomes.len(), 2);
919        assert_eq!(loaded.outcomes[0].site, "GitHub");
920        assert_eq!(loaded.summary.found, 1);
921    }
922
923    #[tokio::test]
924    async fn save_writes_schema_version() {
925        let tmp = TempDir::new().unwrap();
926        let s = sample("abc123", 1_700_000_000_000);
927        save(tmp.path(), &s).await.unwrap();
928
929        let raw = fs::read_to_string(tmp.path().join("abc123.json"))
930            .await
931            .unwrap();
932        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
933        assert_eq!(
934            value["schema_version"],
935            serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
936        );
937    }
938
939    #[tokio::test]
940    async fn save_skips_empty_identity_clusters() {
941        let tmp = TempDir::new().unwrap();
942        let s = sample("empty-clusters", 1_700_000_000_000);
943        save(tmp.path(), &s).await.unwrap();
944
945        let raw = fs::read_to_string(tmp.path().join("empty-clusters.json"))
946            .await
947            .unwrap();
948        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
949        assert_eq!(
950            value["schema_version"],
951            serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
952        );
953        assert!(
954            value.get("identity_clusters").is_none(),
955            "empty cluster cache should stay absent from persisted JSON"
956        );
957    }
958
959    #[tokio::test]
960    async fn save_writes_derived_identity_clusters() {
961        let tmp = TempDir::new().unwrap();
962        let mut s = sample("clusters", 1_700_000_000_000);
963        s.outcomes = vec![
964            found_with_website("GitHub", "https://alice.dev"),
965            found_with_website("GitLab", "https://alice.dev"),
966        ];
967
968        save(tmp.path(), &s).await.unwrap();
969
970        let raw = fs::read_to_string(tmp.path().join("clusters.json"))
971            .await
972            .unwrap();
973        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
974        assert_eq!(value["identity_clusters"].as_array().unwrap().len(), 1);
975        assert_eq!(
976            value["identity_clusters"][0]["members"]
977                .as_array()
978                .unwrap()
979                .len(),
980            2
981        );
982    }
983
984    #[tokio::test]
985    async fn save_roundtrips_request_context() {
986        let tmp = TempDir::new().unwrap();
987        let context = ScanRequestContext {
988            username: "alice".into(),
989            derived_from: Some(ScanId::from("previous".to_owned())),
990            only: vec!["Git".into()],
991            exclude: Vec::new(),
992            tag: vec!["coding".into()],
993            exclude_tag: vec!["nsfw".into()],
994            top: Some(100),
995            nsfw: false,
996            concurrency: Some(8),
997            deadline_secs: Some(30),
998            egress_names: vec!["us-resi".into()],
999            disabled_matches: vec![PersistedDisabledMatch {
1000                name: "TikTok".into(),
1001                url: "https://www.tiktok.com/@{username}".into(),
1002                tags: vec!["social".into()],
1003                disabled_reason: "Honest Limits: JS hydration".into(),
1004            }],
1005        };
1006        let s = sample("ctx", 1_700_000_000_000).with_request_context(context.clone());
1007        save(tmp.path(), &s).await.unwrap();
1008
1009        let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1010        assert_eq!(loaded.request_context, Some(context));
1011    }
1012
1013    #[test]
1014    fn diff_scans_reports_added_removed_and_verdict_changes() {
1015        let mut previous = sample("old", 1_000);
1016        previous.outcomes = vec![
1017            outcome("GitHub", MatchKind::Found),
1018            outcome("Reddit", MatchKind::Found),
1019            outcome("Mastodon", MatchKind::NotFound),
1020        ];
1021        let mut current = sample("new", 2_000);
1022        current.outcomes = vec![
1023            outcome("GitHub", MatchKind::Found),
1024            outcome("Reddit", MatchKind::NotFound),
1025            outcome("Mastodon", MatchKind::Found),
1026        ];
1027
1028        let diff = diff_scans(&previous, &current);
1029
1030        assert_eq!(diff.from_scan_id.as_str(), "old");
1031        assert_eq!(diff.to_scan_id.as_str(), "new");
1032        assert_eq!(
1033            diff.added_found
1034                .iter()
1035                .map(|outcome| outcome.site.as_str())
1036                .collect::<Vec<_>>(),
1037            ["Mastodon"]
1038        );
1039        assert_eq!(
1040            diff.removed_found
1041                .iter()
1042                .map(|outcome| outcome.site.as_str())
1043                .collect::<Vec<_>>(),
1044            ["Reddit"]
1045        );
1046        assert_eq!(diff.verdict_changes.len(), 2);
1047        assert_eq!(diff.verdict_changes[0].site, "Mastodon");
1048        assert_eq!(diff.verdict_changes[0].before, MatchKind::NotFound);
1049        assert_eq!(diff.verdict_changes[0].after, MatchKind::Found);
1050        assert_eq!(diff.verdict_changes[1].site, "Reddit");
1051        assert!(diff.evidence_changes.is_empty());
1052    }
1053
1054    #[test]
1055    fn diff_scans_reports_profile_evidence_changes_for_still_found_sites() {
1056        let mut previous = sample("old", 1_000);
1057        let mut old_github = outcome("GitHub", MatchKind::Found);
1058        old_github.enrichment.insert("name".into(), "Alice".into());
1059        old_github
1060            .profile_evidence
1061            .push(adler_core::ProfileEvidence::from_enrichment(
1062                "GitHub",
1063                "https://github.example/alice",
1064                "name",
1065                "Alice",
1066            ));
1067        previous.outcomes = vec![old_github];
1068
1069        let mut current = sample("new", 2_000);
1070        let mut new_github = outcome("GitHub", MatchKind::Found);
1071        new_github
1072            .enrichment
1073            .insert("name".into(), "Alice Liddell".into());
1074        new_github
1075            .profile_evidence
1076            .push(adler_core::ProfileEvidence::from_enrichment(
1077                "GitHub",
1078                "https://github.example/alice",
1079                "name",
1080                "Alice Liddell",
1081            ));
1082        current.outcomes = vec![new_github];
1083
1084        let diff = diff_scans(&previous, &current);
1085
1086        assert!(diff.added_found.is_empty());
1087        assert!(diff.removed_found.is_empty());
1088        assert!(diff.verdict_changes.is_empty());
1089        assert_eq!(diff.evidence_changes.len(), 1);
1090        assert_eq!(diff.evidence_changes[0].site, "GitHub");
1091        assert_eq!(
1092            diff.evidence_changes[0]
1093                .before_enrichment
1094                .get("name")
1095                .unwrap(),
1096            "Alice"
1097        );
1098        assert_eq!(
1099            diff.evidence_changes[0]
1100                .after_enrichment
1101                .get("name")
1102                .unwrap(),
1103            "Alice Liddell"
1104        );
1105    }
1106
1107    #[test]
1108    fn timeline_tracks_first_seen_disappeared_and_reappeared() {
1109        let mut first = sample("first", 1_000);
1110        first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1111        let mut second = sample("second", 2_000);
1112        second.outcomes = vec![outcome("GitHub", MatchKind::NotFound)];
1113        let mut third = sample("third", 3_000);
1114        third.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1115
1116        let timeline = build_scan_timeline(&[third, first, second]);
1117
1118        assert_eq!(timeline.username, "alice");
1119        assert_eq!(timeline.scan_count, 3);
1120        assert_eq!(timeline.from_ms, Some(1_000));
1121        assert_eq!(timeline.to_ms, Some(3_000));
1122        assert_eq!(timeline.profiles.len(), 1);
1123        assert_eq!(timeline.profiles[0].site, "GitHub");
1124        assert_eq!(timeline.profiles[0].first_seen_ms, 1_000);
1125        assert_eq!(timeline.profiles[0].last_seen_ms, 3_000);
1126        assert!(timeline.profiles[0].present_in_latest);
1127        assert_eq!(
1128            timeline
1129                .events
1130                .iter()
1131                .map(|event| event.kind)
1132                .collect::<Vec<_>>(),
1133            [
1134                TimelineEventKind::FirstSeen,
1135                TimelineEventKind::Disappeared,
1136                TimelineEventKind::Reappeared
1137            ]
1138        );
1139        assert_eq!(timeline.events[1].before, Some(MatchKind::Found));
1140        assert_eq!(timeline.events[1].after, Some(MatchKind::NotFound));
1141    }
1142
1143    #[test]
1144    fn timeline_treats_missing_site_as_disappeared() {
1145        let mut first = sample("first", 1_000);
1146        first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1147        let mut second = sample("second", 2_000);
1148        second.outcomes = vec![outcome("GitLab", MatchKind::NotFound)];
1149
1150        let timeline = build_scan_timeline(&[first, second]);
1151
1152        assert_eq!(timeline.profiles.len(), 1);
1153        assert!(!timeline.profiles[0].present_in_latest);
1154        assert_eq!(timeline.events.len(), 2);
1155        assert_eq!(timeline.events[1].kind, TimelineEventKind::Disappeared);
1156        assert_eq!(timeline.events[1].site, "GitHub");
1157        assert_eq!(timeline.events[1].after, None);
1158    }
1159
1160    #[test]
1161    fn timeline_tracks_evidence_changes_for_still_found_profile() {
1162        let mut first = sample("first", 1_000);
1163        let mut old_github = outcome("GitHub", MatchKind::Found);
1164        old_github.enrichment.insert("name".into(), "Alice".into());
1165        old_github
1166            .profile_evidence
1167            .push(adler_core::ProfileEvidence::from_enrichment(
1168                "GitHub",
1169                "https://github.example/alice",
1170                "name",
1171                "Alice",
1172            ));
1173        first.outcomes = vec![old_github];
1174
1175        let mut second = sample("second", 2_000);
1176        let mut new_github = outcome("GitHub", MatchKind::Found);
1177        new_github
1178            .enrichment
1179            .insert("name".into(), "Alice Liddell".into());
1180        new_github
1181            .profile_evidence
1182            .push(adler_core::ProfileEvidence::from_enrichment(
1183                "GitHub",
1184                "https://github.example/alice",
1185                "name",
1186                "Alice Liddell",
1187            ));
1188        second.outcomes = vec![new_github];
1189
1190        let timeline = build_scan_timeline(&[first, second]);
1191
1192        assert_eq!(
1193            timeline
1194                .events
1195                .iter()
1196                .map(|event| event.kind)
1197                .collect::<Vec<_>>(),
1198            [
1199                TimelineEventKind::FirstSeen,
1200                TimelineEventKind::EvidenceChanged
1201            ]
1202        );
1203        let evidence_change = timeline.events[1].evidence_change.as_ref().unwrap();
1204        assert_eq!(
1205            evidence_change.before_enrichment.get("name").unwrap(),
1206            "Alice"
1207        );
1208        assert_eq!(
1209            evidence_change.after_enrichment.get("name").unwrap(),
1210            "Alice Liddell"
1211        );
1212    }
1213
1214    #[tokio::test]
1215    async fn load_all_returns_newest_first() {
1216        let tmp = TempDir::new().unwrap();
1217        save(tmp.path(), &sample("old", 1_000)).await.unwrap();
1218        save(tmp.path(), &sample("mid", 2_000)).await.unwrap();
1219        save(tmp.path(), &sample("new", 3_000)).await.unwrap();
1220        let all = load_all(tmp.path()).await;
1221        assert_eq!(all.len(), 3);
1222        assert_eq!(all[0].scan_id.as_str(), "new");
1223        assert_eq!(all[1].scan_id.as_str(), "mid");
1224        assert_eq!(all[2].scan_id.as_str(), "old");
1225    }
1226
1227    #[tokio::test]
1228    async fn load_returns_none_for_missing() {
1229        let tmp = TempDir::new().unwrap();
1230        let missing = load(tmp.path(), &ScanId::from("nope".to_owned())).await;
1231        assert!(missing.is_none());
1232    }
1233
1234    #[tokio::test]
1235    async fn load_defaults_schema_version_for_legacy_scan_json() {
1236        let tmp = TempDir::new().unwrap();
1237        let path = tmp.path().join("legacy.json");
1238        fs::write(
1239            &path,
1240            br#"{
1241                "scan_id": "legacy",
1242                "username": "alice",
1243                "site_count": 0,
1244                "created_at_ms": 1700000000000,
1245                "summary": { "found": 0, "not_found": 0, "uncertain": 0 },
1246                "outcomes": [],
1247                "elapsed_ms": 0
1248            }"#,
1249        )
1250        .await
1251        .unwrap();
1252
1253        let loaded = load(tmp.path(), &ScanId::from("legacy".to_owned()))
1254            .await
1255            .expect("legacy scan loads");
1256        assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1257    }
1258
1259    #[tokio::test]
1260    async fn load_accepts_v2_scan_json_after_schema_bump() {
1261        let tmp = TempDir::new().unwrap();
1262        let path = tmp.path().join("v2.json");
1263        fs::write(
1264            &path,
1265            br#"{
1266                "schema_version": 2,
1267                "scan_id": "v2",
1268                "username": "alice",
1269                "site_count": 1,
1270                "created_at_ms": 1700000000000,
1271                "summary": { "found": 1, "not_found": 0, "uncertain": 0 },
1272                "outcomes": [
1273                    {
1274                        "site": "GitHub",
1275                        "url": "https://github.example/alice",
1276                        "kind": "found",
1277                        "elapsed_ms": 10,
1278                        "evidence": ["HTTP 200 (status_found)"]
1279                    }
1280                ],
1281                "elapsed_ms": 10
1282            }"#,
1283        )
1284        .await
1285        .unwrap();
1286
1287        let loaded = load(tmp.path(), &ScanId::from("v2".to_owned()))
1288            .await
1289            .expect("v2 scan loads");
1290
1291        assert_eq!(loaded.schema_version, 2);
1292        assert_eq!(loaded.summary.found, 1);
1293        assert_eq!(
1294            loaded.outcomes[0].confidence.label,
1295            adler_core::ConfidenceLabel::Medium
1296        );
1297    }
1298
1299    #[tokio::test]
1300    async fn load_derives_identity_clusters_for_legacy_scan_json() {
1301        let tmp = TempDir::new().unwrap();
1302        let path = tmp.path().join("legacy-clusters.json");
1303        fs::write(
1304            &path,
1305            br#"{
1306                "schema_version": 1,
1307                "scan_id": "legacy-clusters",
1308                "username": "alice",
1309                "site_count": 2,
1310                "created_at_ms": 1700000000000,
1311                "summary": { "found": 2, "not_found": 0, "uncertain": 0 },
1312                "outcomes": [
1313                    {
1314                        "site": "GitHub",
1315                        "url": "https://github.example/alice",
1316                        "kind": "found",
1317                        "elapsed_ms": 10,
1318                        "profile_evidence": [
1319                            {
1320                                "kind": "external_link",
1321                                "field": "website",
1322                                "value": "https://alice.dev",
1323                                "source": {
1324                                    "site": "GitHub",
1325                                    "url": "https://github.example/alice",
1326                                    "origin": "extractor"
1327                                }
1328                            }
1329                        ]
1330                    },
1331                    {
1332                        "site": "GitLab",
1333                        "url": "https://gitlab.example/alice",
1334                        "kind": "found",
1335                        "elapsed_ms": 10,
1336                        "profile_evidence": [
1337                            {
1338                                "kind": "external_link",
1339                                "field": "website",
1340                                "value": "https://alice.dev/",
1341                                "source": {
1342                                    "site": "GitLab",
1343                                    "url": "https://gitlab.example/alice",
1344                                    "origin": "extractor"
1345                                }
1346                            }
1347                        ]
1348                    }
1349                ],
1350                "elapsed_ms": 20
1351            }"#,
1352        )
1353        .await
1354        .unwrap();
1355
1356        let loaded = load(tmp.path(), &ScanId::from("legacy-clusters".to_owned()))
1357            .await
1358            .expect("legacy scan loads");
1359
1360        assert_eq!(loaded.identity_clusters.len(), 1);
1361        assert_eq!(loaded.identity_clusters[0].members.len(), 2);
1362        assert!(!loaded.identity_clusters[0].uncertain);
1363    }
1364
1365    #[test]
1366    fn large_scan_artifact_paths_handle_identity_graph_payloads() {
1367        let previous = large_persisted_scan("large-old", 0);
1368        let current = large_persisted_scan("large-new", 1);
1369
1370        assert_eq!(previous.outcomes.len(), 2_500);
1371        assert_eq!(previous.site_count, 2_500);
1372        assert_eq!(
1373            previous.summary.found + previous.summary.not_found + previous.summary.uncertain,
1374            2_500
1375        );
1376        assert!(!previous.identity_clusters.is_empty());
1377
1378        let raw = serde_json::to_string(&previous).unwrap();
1379        let decoded: PersistedScan = serde_json::from_str(&raw).unwrap();
1380        assert_eq!(decoded.outcomes.len(), 2_500);
1381        assert_eq!(
1382            decoded.identity_clusters.len(),
1383            previous.identity_clusters.len()
1384        );
1385
1386        let diff = diff_scans(&previous, &current);
1387        assert!(!diff.added_found.is_empty());
1388        assert!(!diff.removed_found.is_empty());
1389        assert!(!diff.verdict_changes.is_empty());
1390        assert!(!diff.evidence_changes.is_empty());
1391
1392        let timeline = build_scan_timeline(&[previous, current]);
1393        assert_eq!(timeline.scan_count, 2);
1394        assert_eq!(timeline.profiles.len(), 375);
1395        assert!(timeline.events.len() > timeline.profiles.len());
1396    }
1397
1398    #[tokio::test]
1399    async fn load_all_skips_unrelated_files() {
1400        let tmp = TempDir::new().unwrap();
1401        // Drop a non-JSON file and a malformed JSON file alongside.
1402        fs::write(tmp.path().join("README"), b"not json")
1403            .await
1404            .unwrap();
1405        fs::write(tmp.path().join("broken.json"), b"{ invalid")
1406            .await
1407            .unwrap();
1408        save(tmp.path(), &sample("good", 9_999)).await.unwrap();
1409        let all = load_all(tmp.path()).await;
1410        assert_eq!(all.len(), 1);
1411        assert_eq!(all[0].scan_id.as_str(), "good");
1412    }
1413
1414    #[tokio::test]
1415    async fn prune_keeps_only_newest_n() {
1416        let tmp = TempDir::new().unwrap();
1417        for i in 0u64..5 {
1418            save(tmp.path(), &sample(&format!("s{i}"), i * 1_000))
1419                .await
1420                .unwrap();
1421        }
1422        let removed = prune(tmp.path(), 2).await;
1423        assert_eq!(removed, 3);
1424        let remaining = load_all(tmp.path()).await;
1425        assert_eq!(remaining.len(), 2);
1426        assert_eq!(remaining[0].scan_id.as_str(), "s4");
1427        assert_eq!(remaining[1].scan_id.as_str(), "s3");
1428    }
1429}