Skip to main content

adler_server/
persist.rs

1//! On-disk persistence for finished scans.
2//!
3//! Each scan is serialised as a single JSON file under [`default_dir`]
4//! (`$XDG_CACHE_HOME/adler/scans/`, falling back to
5//! `$HOME/.cache/adler/scans/`). The on-disk format is the full
6//! [`PersistedScan`] — enough for the history listing AND for replaying
7//! the scan into the UI without a fresh probe.
8//!
9//! Writes are atomic: serialise to `<id>.json.tmp`, then rename onto
10//! the final path. A crashed process leaves at most one orphan `.tmp`
11//! file behind, never a half-written `<id>.json`.
12
13use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use adler_core::{
17    CheckOutcome, HistoricalScanRef, IdentityCluster, InvestigationReport, MatchKind,
18    ProfileEvidence, ReportDisabledSite, ReportTimelineEvent, ReportTimelineEventKind, Site,
19};
20use serde::{Deserialize, Serialize};
21use tokio::fs;
22
23use crate::error::{Error, Result};
24use crate::scan::{FinishedScan, ScanId, Summary};
25
/// Hard cap on how many scans we keep on disk. Beyond this, oldest
/// (by `created_at_ms`) get [`prune`]d on the next save. Picked to be
/// large enough for any plausible human-driven OSINT session.
///
/// NOTE(review): [`save`] in this module does not call [`prune`] itself, so
/// the cap is presumably enforced by the caller — confirm.
pub(crate) const MAX_PERSISTED_SCANS: usize = 200;
/// Current on-disk schema version for [`PersistedScan`].
///
/// Stamped onto new snapshots by [`PersistedScan::from_finished`] and also
/// used as the serde default when a file has no `schema_version` field.
pub(crate) const PERSISTED_SCAN_SCHEMA_VERSION: u16 = 3;
32
/// Self-contained snapshot of a completed scan. Round-trips losslessly
/// through JSON; tests assert that.
///
/// `summary` and `identity_clusters` are derived data: they are recomputed
/// from `outcomes` by `refresh_derived_fields`, which this module runs when a
/// snapshot is built, saved, and loaded.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistedScan {
    /// Version of this persisted scan artifact. Files that lack the field
    /// deserialise with the current [`PERSISTED_SCAN_SCHEMA_VERSION`].
    #[serde(default = "default_schema_version")]
    pub schema_version: u16,
    /// Stable identifier — same value as in-memory [`ScanId`].
    pub scan_id: ScanId,
    /// Username that was scanned.
    pub username: String,
    /// Request scope and parked-site diagnostics that explain how this
    /// artifact was produced. Missing on scans saved before v1 context
    /// support landed. Omitted from the JSON when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_context: Option<ScanRequestContext>,
    /// Total number of sites probed in this scan.
    pub site_count: usize,
    /// Unix epoch milliseconds when the scan was started.
    pub created_at_ms: u64,
    /// Per-verdict tally over [`Self::outcomes`].
    pub summary: Summary,
    /// All outcomes, in completion order.
    pub outcomes: Vec<CheckOutcome>,
    /// Deterministic identity candidates derived from found outcomes
    /// with structured profile evidence. Omitted from the JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub identity_clusters: Vec<IdentityCluster>,
    /// Wall-clock duration, milliseconds.
    pub elapsed_ms: u64,
}
64
65impl PersistedScan {
66    /// Build a snapshot from a freshly-completed in-memory scan.
67    #[must_use]
68    pub fn from_finished(
69        scan_id: ScanId,
70        username: String,
71        site_count: usize,
72        created_at_ms: u64,
73        finished: FinishedScan,
74    ) -> Self {
75        let mut scan = Self {
76            schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
77            scan_id,
78            username,
79            request_context: None,
80            site_count,
81            created_at_ms,
82            summary: finished.summary,
83            outcomes: finished.outcomes,
84            identity_clusters: finished.identity_clusters,
85            elapsed_ms: finished.elapsed_ms,
86        };
87        scan.refresh_derived_fields();
88        scan
89    }
90
91    /// Attach request-scope metadata to this persisted scan.
92    #[must_use]
93    pub fn with_request_context(mut self, context: ScanRequestContext) -> Self {
94        self.request_context = Some(context);
95        self
96    }
97
98    pub(crate) fn refresh_derived_fields(&mut self) {
99        for outcome in &mut self.outcomes {
100            outcome.refresh_confidence();
101        }
102        self.summary = Summary::from_outcomes(&self.outcomes);
103        self.identity_clusters =
104            adler_core::build_identity_clusters(&self.username, &self.outcomes);
105    }
106}
107
108/// Apply a non-persisted confidence overlay from previous scans of the same
109/// username.
110///
111/// The on-disk artifact remains the stateless source of truth. This helper is
112/// intended for history-aware read surfaces such as reports and persisted scan
113/// API/resource views.
114pub fn apply_historical_confidence_overlay(
115    current: &mut PersistedScan,
116    related_scans: &[PersistedScan],
117) {
118    current.refresh_derived_fields();
119    let history_counts = historical_consistency_counts(current, related_scans);
120
121    for outcome in &mut current.outcomes {
122        let count = history_counts.get(&outcome.site).copied().unwrap_or(0);
123        outcome.refresh_confidence_with_history(count);
124    }
125
126    current.identity_clusters =
127        adler_core::build_identity_clusters(&current.username, &current.outcomes);
128}
129
130/// Build a history-aware investigation report from a scan artifact.
131///
132/// The input scan is consumed and enriched in memory only. Persisted JSON files
133/// are never rewritten by this helper.
134#[must_use]
135pub fn build_investigation_report(
136    mut scan: PersistedScan,
137    related_scans: &[PersistedScan],
138) -> InvestigationReport {
139    apply_historical_confidence_overlay(&mut scan, related_scans);
140    let timeline = report_timeline_from_scans(related_scans, &scan);
141    let disabled_sites = scan
142        .request_context
143        .as_ref()
144        .map(|context| {
145            context
146                .disabled_matches
147                .iter()
148                .map(|site| ReportDisabledSite {
149                    name: site.name.clone(),
150                    url: site.url.clone(),
151                    tags: site.tags.clone(),
152                    disabled_reason: site.disabled_reason.clone(),
153                })
154                .collect()
155        })
156        .unwrap_or_default();
157
158    InvestigationReport::builder(scan.username, &scan.outcomes)
159        .identity_clusters(scan.identity_clusters)
160        .timeline(timeline)
161        .disabled_sites(disabled_sites)
162        .build()
163}
164
165fn report_timeline_from_scans(
166    related_scans: &[PersistedScan],
167    current: &PersistedScan,
168) -> Vec<ReportTimelineEvent> {
169    let mut scans = related_scans.to_vec();
170    if !scans.iter().any(|scan| scan.scan_id == current.scan_id) {
171        scans.push(current.clone());
172    }
173    build_scan_timeline(&scans)
174        .events
175        .into_iter()
176        .map(report_timeline_event)
177        .collect()
178}
179
180fn report_timeline_event(event: TimelineEvent) -> ReportTimelineEvent {
181    ReportTimelineEvent {
182        kind: match event.kind {
183            TimelineEventKind::FirstSeen => ReportTimelineEventKind::AddedFound,
184            TimelineEventKind::Disappeared => ReportTimelineEventKind::RemovedFound,
185            TimelineEventKind::Reappeared => ReportTimelineEventKind::Reappeared,
186            TimelineEventKind::EvidenceChanged => ReportTimelineEventKind::EvidenceChanged,
187        },
188        site: Some(event.site),
189        scan_id: Some(event.scan_id.to_string()),
190        observed_at_ms: Some(event.at_ms),
191        detail: Some(timeline_detail(event.before, event.after)),
192    }
193}
194
195fn timeline_detail(before: Option<MatchKind>, after: Option<MatchKind>) -> String {
196    match (before, after) {
197        (Some(before), Some(after)) => format!("{} -> {}", kind_label(before), kind_label(after)),
198        (None, Some(after)) => format!("new {}", kind_label(after)),
199        (Some(before), None) => format!("after {}", kind_label(before)),
200        (None, None) => "changed".to_owned(),
201    }
202}
203
/// Lowercase label for a verdict, as embedded in the detail strings built by
/// `timeline_detail`.
fn kind_label(kind: MatchKind) -> &'static str {
    match kind {
        MatchKind::Found => "found",
        MatchKind::NotFound => "not_found",
        MatchKind::Uncertain => "uncertain",
    }
}
211
212fn historical_consistency_counts(
213    current: &PersistedScan,
214    related_scans: &[PersistedScan],
215) -> BTreeMap<String, usize> {
216    let current_ref = HistoricalScanRef {
217        scan_id: current.scan_id.as_str(),
218        username: &current.username,
219        created_at_ms: current.created_at_ms,
220        outcomes: &current.outcomes,
221    };
222    let related_refs = related_scans.iter().map(|scan| HistoricalScanRef {
223        scan_id: scan.scan_id.as_str(),
224        username: &scan.username,
225        created_at_ms: scan.created_at_ms,
226        outcomes: &scan.outcomes,
227    });
228    adler_core::historical_consistency_counts(current_ref, related_refs)
229}
230
/// Serde default for `PersistedScan::schema_version`: the current version.
///
/// NOTE(review): this labels files written without a version field as the
/// current schema rather than as an older one — confirm that is intended.
const fn default_schema_version() -> u16 {
    PERSISTED_SCAN_SCHEMA_VERSION
}
234
/// Request scope persisted with a finished scan so future timelines and
/// reports can explain what was scanned and what was intentionally out of
/// scope.
///
/// Optional and list-valued fields are left out of the JSON when unset or
/// empty and fall back to their defaults when absent on read. `username` and
/// `nsfw` carry no serde default, so they must be present.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScanRequestContext {
    /// Username supplied by the operator.
    pub username: String,
    /// Previous scan id when this scan was created by refiltering.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub derived_from: Option<ScanId>,
    /// Site name include filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub only: Vec<String>,
    /// Site name exclude filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude: Vec<String>,
    /// Tag include filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tag: Vec<String>,
    /// Tag exclude filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude_tag: Vec<String>,
    /// Popularity ceiling, when supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub top: Option<u32>,
    /// Whether NSFW-tagged entries were included.
    pub nsfw: bool,
    /// Per-scan concurrency override, when supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrency: Option<usize>,
    /// Per-scan deadline override, seconds.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub deadline_secs: Option<u64>,
    /// Egress subset requested for this scan.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub egress_names: Vec<String>,
    /// Disabled/parked sites that matched the same filter and were not
    /// included in the enabled scan set. Surfaced as the report's disabled
    /// sites by [`build_investigation_report`].
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub disabled_matches: Vec<PersistedDisabledMatch>,
}
276
/// Compact disabled-site diagnostic persisted with scan context.
///
/// Built from a registry [`Site`] through the `From<&Site>` impl below.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PersistedDisabledMatch {
    /// Site name.
    pub name: String,
    /// Profile URL template.
    pub url: String,
    /// Registry tags. Omitted from the JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Human-readable reason the site is parked.
    pub disabled_reason: String,
}
290
291impl From<&Site> for PersistedDisabledMatch {
292    fn from(site: &Site) -> Self {
293        Self {
294            name: site.name.clone(),
295            url: site.url.as_str().to_owned(),
296            tags: site.tags.clone(),
297            disabled_reason: site
298                .disabled_reason
299                .clone()
300                .unwrap_or_else(|| "disabled in registry".to_owned()),
301        }
302    }
303}
304
/// Deterministic scan-to-scan diff used as the basis for timelines and
/// watchlists.
///
/// Produced by [`diff_scans`]. Every list is ordered by site name and is
/// omitted from the JSON when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDiff {
    /// Previous scan id.
    pub from_scan_id: ScanId,
    /// Current scan id.
    pub to_scan_id: ScanId,
    /// Found accounts that were not Found in the previous scan.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub added_found: Vec<CheckOutcome>,
    /// Accounts that were Found previously but are no longer Found.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub removed_found: Vec<CheckOutcome>,
    /// Sites present in both scans whose verdict changed.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub verdict_changes: Vec<VerdictChange>,
    /// Found sites whose normalized profile evidence changed.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub evidence_changes: Vec<EvidenceChange>,
}
326
/// A verdict transition for one site.
///
/// [`diff_scans`] emits one of these only for sites present in both scans
/// whose verdict differs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerdictChange {
    /// Site name.
    pub site: String,
    /// Previous verdict.
    pub before: MatchKind,
    /// Current verdict.
    pub after: MatchKind,
}
337
/// Profile evidence transition for one still-found site.
///
/// Carries the full before/after values of both evidence stores; a change in
/// either the enrichment map or the profile evidence list triggers it. Empty
/// collections are omitted from the JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceChange {
    /// Site name.
    pub site: String,
    /// Previous legacy enrichment fields.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub before_enrichment: BTreeMap<String, String>,
    /// Current legacy enrichment fields.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub after_enrichment: BTreeMap<String, String>,
    /// Previous normalized profile evidence.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub before_profile_evidence: Vec<ProfileEvidence>,
    /// Current normalized profile evidence.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub after_profile_evidence: Vec<ProfileEvidence>,
}
356
/// Historical view derived from a sequence of persisted scans.
///
/// Produced by [`build_scan_timeline`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanTimeline {
    /// Username shared by the scans used to build this timeline. Taken from
    /// the oldest scan; empty when no scans were supplied.
    pub username: String,
    /// Number of scans considered.
    pub scan_count: usize,
    /// Oldest scan timestamp, when at least one scan was supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub from_ms: Option<u64>,
    /// Newest scan timestamp, when at least one scan was supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub to_ms: Option<u64>,
    /// Per-site lifecycle summary, ordered by site name.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub profiles: Vec<TimelineProfile>,
    /// Chronological lifecycle events.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub events: Vec<TimelineEvent>,
}
377
/// Per-site lifecycle state in a scan timeline.
///
/// A profile exists only for sites that were Found in at least one of the
/// supplied scans.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineProfile {
    /// Site name.
    pub site: String,
    /// Last known profile URL for the site.
    pub url: String,
    /// First scan timestamp where the profile was Found.
    pub first_seen_ms: u64,
    /// Most recent scan timestamp where the profile was Found.
    pub last_seen_ms: u64,
    /// Whether the profile is Found in the newest scan that mentioned it.
    pub present_in_latest: bool,
    /// Last verdict observed for this site, if the newest scan mentioned it.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_verdict: Option<MatchKind>,
}
395
/// Timeline event category. Serialised in `snake_case`
/// (for example `first_seen`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TimelineEventKind {
    /// Site was Found for the first time in the supplied scan sequence.
    FirstSeen,
    /// Site was Found before, then no longer Found.
    Disappeared,
    /// Site was absent/not found after a previous hit, then Found again.
    Reappeared,
    /// Site stayed Found but normalized profile evidence changed.
    EvidenceChanged,
}
409
/// One lifecycle event for a profile across scans.
///
/// The optional fields are omitted from the JSON when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineEvent {
    /// Scan id where the event was observed.
    pub scan_id: ScanId,
    /// Scan start timestamp.
    pub at_ms: u64,
    /// Site name.
    pub site: String,
    /// Best URL known for the site at this point in the timeline.
    pub url: String,
    /// Event category.
    pub kind: TimelineEventKind,
    /// Previous verdict, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub before: Option<MatchKind>,
    /// Current verdict, when the current scan mentioned the site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub after: Option<MatchKind>,
    /// Evidence transition for [`TimelineEventKind::EvidenceChanged`].
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub evidence_change: Option<EvidenceChange>,
}
433
434/// Compare two persisted scans.
435///
436/// The diff is intentionally conservative: `added_found` and
437/// `removed_found` are based only on the `Found` verdict, while
438/// `evidence_changes` are reported only for sites that are Found in both
439/// scans.
440#[must_use]
441pub fn diff_scans(previous: &PersistedScan, current: &PersistedScan) -> ScanDiff {
442    let previous_by_site = outcomes_by_site(&previous.outcomes);
443    let current_by_site = outcomes_by_site(&current.outcomes);
444
445    let mut added_found = Vec::new();
446    let mut removed_found = Vec::new();
447    let mut verdict_changes = Vec::new();
448    let mut evidence_changes = Vec::new();
449
450    for (site, current_outcome) in &current_by_site {
451        let previous_outcome = previous_by_site.get(site);
452        if current_outcome.kind == MatchKind::Found
453            && previous_outcome.is_none_or(|o| o.kind != MatchKind::Found)
454        {
455            added_found.push((*current_outcome).clone());
456        }
457        if let Some(previous_outcome) = previous_outcome {
458            if previous_outcome.kind != current_outcome.kind {
459                verdict_changes.push(VerdictChange {
460                    site: site.clone(),
461                    before: previous_outcome.kind,
462                    after: current_outcome.kind,
463                });
464            }
465            if previous_outcome.kind == MatchKind::Found
466                && current_outcome.kind == MatchKind::Found
467                && profile_evidence_changed(previous_outcome, current_outcome)
468            {
469                evidence_changes.push(EvidenceChange {
470                    site: site.clone(),
471                    before_enrichment: previous_outcome.enrichment.clone(),
472                    after_enrichment: current_outcome.enrichment.clone(),
473                    before_profile_evidence: previous_outcome.profile_evidence.clone(),
474                    after_profile_evidence: current_outcome.profile_evidence.clone(),
475                });
476            }
477        }
478    }
479
480    for (site, previous_outcome) in &previous_by_site {
481        if previous_outcome.kind == MatchKind::Found
482            && current_by_site
483                .get(site)
484                .is_none_or(|o| o.kind != MatchKind::Found)
485        {
486            removed_found.push((*previous_outcome).clone());
487        }
488    }
489
490    ScanDiff {
491        from_scan_id: previous.scan_id.clone(),
492        to_scan_id: current.scan_id.clone(),
493        added_found,
494        removed_found,
495        verdict_changes,
496        evidence_changes,
497    }
498}
499
500/// Build a chronological timeline from persisted scans.
501///
502/// Scans may be supplied in any order; the builder sorts them oldest-first.
503/// Only `Found` outcomes create profiles. A later non-Found or missing site
504/// creates a disappearance event if the profile was previously present.
505#[must_use]
506pub fn build_scan_timeline(scans: &[PersistedScan]) -> ScanTimeline {
507    let mut ordered: Vec<&PersistedScan> = scans.iter().collect();
508    ordered.sort_by(|left, right| {
509        left.created_at_ms
510            .cmp(&right.created_at_ms)
511            .then_with(|| left.scan_id.as_str().cmp(right.scan_id.as_str()))
512    });
513
514    let username = ordered
515        .first()
516        .map(|scan| scan.username.clone())
517        .unwrap_or_default();
518    let from_ms = ordered.first().map(|scan| scan.created_at_ms);
519    let to_ms = ordered.last().map(|scan| scan.created_at_ms);
520    let mut states: BTreeMap<String, TimelineProfileState> = BTreeMap::new();
521    let mut events = Vec::new();
522
523    for scan in &ordered {
524        let current_by_site = outcomes_by_site(&scan.outcomes);
525        let sites = timeline_site_names(&states, &current_by_site);
526
527        for site in sites {
528            apply_timeline_site(
529                scan,
530                &site,
531                current_by_site.get(&site).copied(),
532                &mut states,
533                &mut events,
534            );
535        }
536    }
537
538    let profiles = states
539        .into_iter()
540        .map(|(site, state)| TimelineProfile {
541            site,
542            url: state.url,
543            first_seen_ms: state.first_seen_ms,
544            last_seen_ms: state.last_seen_ms,
545            present_in_latest: state.present_in_latest,
546            last_verdict: state.last_verdict,
547        })
548        .collect();
549
550    ScanTimeline {
551        username,
552        scan_count: ordered.len(),
553        from_ms,
554        to_ms,
555        profiles,
556        events,
557    }
558}
559
560fn timeline_site_names(
561    states: &BTreeMap<String, TimelineProfileState>,
562    current_by_site: &BTreeMap<String, &CheckOutcome>,
563) -> Vec<String> {
564    let mut sites: Vec<String> = states.keys().cloned().collect();
565    for site in current_by_site.keys() {
566        if !states.contains_key(site.as_str()) {
567            sites.push((*site).clone());
568        }
569    }
570    sites.sort();
571    sites.dedup();
572    sites
573}
574
575fn apply_timeline_site(
576    scan: &PersistedScan,
577    site: &str,
578    current: Option<&CheckOutcome>,
579    states: &mut BTreeMap<String, TimelineProfileState>,
580    events: &mut Vec<TimelineEvent>,
581) {
582    let current_kind = current.map(|outcome| outcome.kind);
583    let was_present = states
584        .get(site)
585        .is_some_and(|state| state.present_in_latest);
586
587    if current_kind == Some(MatchKind::Found) {
588        apply_found_timeline_site(scan, site, current.expect("found outcome"), states, events);
589    } else if was_present {
590        apply_disappeared_timeline_site(scan, site, current, current_kind, states, events);
591    } else if let (Some(state), Some(outcome)) = (states.get_mut(site), current) {
592        state.last_verdict = Some(outcome.kind);
593        state.url.clone_from(&outcome.url);
594    }
595}
596
597fn apply_found_timeline_site(
598    scan: &PersistedScan,
599    site: &str,
600    outcome: &CheckOutcome,
601    states: &mut BTreeMap<String, TimelineProfileState>,
602    events: &mut Vec<TimelineEvent>,
603) {
604    let current_kind = Some(outcome.kind);
605    let had_state = states.contains_key(site);
606    let was_present = states
607        .get(site)
608        .is_some_and(|state| state.present_in_latest);
609    let state = states
610        .entry(site.to_owned())
611        .or_insert_with(|| TimelineProfileState::new(outcome, scan.created_at_ms));
612
613    if !had_state {
614        events.push(timeline_event(
615            scan,
616            site,
617            &outcome.url,
618            TimelineEventKind::FirstSeen,
619            None,
620            current_kind,
621            None,
622        ));
623    } else if !was_present {
624        events.push(timeline_event(
625            scan,
626            site,
627            &outcome.url,
628            TimelineEventKind::Reappeared,
629            state.last_verdict,
630            current_kind,
631            None,
632        ));
633    } else if state.profile_evidence_changed(outcome) {
634        events.push(timeline_event(
635            scan,
636            site,
637            &outcome.url,
638            TimelineEventKind::EvidenceChanged,
639            Some(MatchKind::Found),
640            current_kind,
641            Some(EvidenceChange {
642                site: site.to_owned(),
643                before_enrichment: state.last_found_enrichment.clone(),
644                after_enrichment: outcome.enrichment.clone(),
645                before_profile_evidence: state.last_found_profile_evidence.clone(),
646                after_profile_evidence: outcome.profile_evidence.clone(),
647            }),
648        ));
649    }
650
651    states
652        .get_mut(site)
653        .expect("state inserted before found update")
654        .update_found(outcome, scan.created_at_ms);
655}
656
657fn apply_disappeared_timeline_site(
658    scan: &PersistedScan,
659    site: &str,
660    current: Option<&CheckOutcome>,
661    current_kind: Option<MatchKind>,
662    states: &mut BTreeMap<String, TimelineProfileState>,
663    events: &mut Vec<TimelineEvent>,
664) {
665    let state = states
666        .get_mut(site)
667        .expect("present state exists before disappearance");
668    let url = current.map_or_else(|| state.url.clone(), |outcome| outcome.url.clone());
669    events.push(timeline_event(
670        scan,
671        site,
672        &url,
673        TimelineEventKind::Disappeared,
674        state.last_verdict,
675        current_kind,
676        None,
677    ));
678    state.present_in_latest = false;
679    state.last_verdict = current_kind;
680    if let Some(outcome) = current {
681        state.url.clone_from(&outcome.url);
682    }
683}
684
/// Assemble a [`TimelineEvent`] for `site`, taking the scan id and timestamp
/// from `scan` and copying `site` and `url` into owned strings.
fn timeline_event(
    scan: &PersistedScan,
    site: &str,
    url: &str,
    kind: TimelineEventKind,
    before: Option<MatchKind>,
    after: Option<MatchKind>,
    evidence_change: Option<EvidenceChange>,
) -> TimelineEvent {
    TimelineEvent {
        scan_id: scan.scan_id.clone(),
        at_ms: scan.created_at_ms,
        site: site.to_owned(),
        url: url.to_owned(),
        kind,
        before,
        after,
        evidence_change,
    }
}
705
/// Mutable per-site accumulator used while a timeline is being built; it is
/// turned into a [`TimelineProfile`] once all scans are applied.
#[derive(Debug, Clone)]
struct TimelineProfileState {
    /// Most recent URL recorded for the site.
    url: String,
    /// Timestamp of the scan that created this state; never updated after.
    first_seen_ms: u64,
    /// Timestamp of the latest scan in which the site was Found.
    last_seen_ms: u64,
    /// True while the most recently applied step had the site as Found.
    present_in_latest: bool,
    /// Verdict from the last applied step, or `None` when that scan did not
    /// mention the site.
    last_verdict: Option<MatchKind>,
    /// Enrichment captured at the last Found outcome, for change detection.
    last_found_enrichment: BTreeMap<String, String>,
    /// Profile evidence captured at the last Found outcome.
    last_found_profile_evidence: Vec<ProfileEvidence>,
}
716
717impl TimelineProfileState {
718    fn new(outcome: &CheckOutcome, at_ms: u64) -> Self {
719        Self {
720            url: outcome.url.clone(),
721            first_seen_ms: at_ms,
722            last_seen_ms: at_ms,
723            present_in_latest: true,
724            last_verdict: Some(outcome.kind),
725            last_found_enrichment: outcome.enrichment.clone(),
726            last_found_profile_evidence: outcome.profile_evidence.clone(),
727        }
728    }
729
730    fn update_found(&mut self, outcome: &CheckOutcome, at_ms: u64) {
731        self.url.clone_from(&outcome.url);
732        self.last_seen_ms = at_ms;
733        self.present_in_latest = true;
734        self.last_verdict = Some(outcome.kind);
735        self.last_found_enrichment = outcome.enrichment.clone();
736        self.last_found_profile_evidence
737            .clone_from(&outcome.profile_evidence);
738    }
739
740    fn profile_evidence_changed(&self, outcome: &CheckOutcome) -> bool {
741        self.last_found_enrichment != outcome.enrichment
742            || self.last_found_profile_evidence != outcome.profile_evidence
743    }
744}
745
746fn outcomes_by_site(outcomes: &[CheckOutcome]) -> BTreeMap<String, &CheckOutcome> {
747    outcomes
748        .iter()
749        .map(|outcome| (outcome.site.clone(), outcome))
750        .collect()
751}
752
753fn profile_evidence_changed(previous: &CheckOutcome, current: &CheckOutcome) -> bool {
754    previous.enrichment != current.enrichment
755        || previous.profile_evidence != current.profile_evidence
756}
757
/// Default directory for persisted scans.
///
/// Mirrors [`adler_core::Cache::default_path`]'s discovery rules:
/// `$XDG_CACHE_HOME/adler/scans/` → `$HOME/.cache/adler/scans/` →
/// a relative fallback (`adler-scans`). The directory is created lazily on
/// first save.
///
/// A variable that is set but empty is treated as unset, as the XDG Base
/// Directory Specification requires, so an empty `XDG_CACHE_HOME` no longer
/// yields a path relative to the working directory.
///
/// NOTE(review): confirm `Cache::default_path` applies the same empty-value
/// rule so the two locations stay in step.
#[must_use]
pub fn default_dir() -> PathBuf {
    // Read a variable, discarding values that are present but empty.
    let non_empty = |name: &str| std::env::var_os(name).filter(|value| !value.is_empty());

    if let Some(xdg) = non_empty("XDG_CACHE_HOME") {
        return PathBuf::from(xdg).join("adler").join("scans");
    }
    if let Some(home) = non_empty("HOME") {
        return PathBuf::from(home)
            .join(".cache")
            .join("adler")
            .join("scans");
    }
    PathBuf::from("adler-scans")
}
776
777/// Save `scan` to `<dir>/<id>.json` atomically. Creates `dir` if missing.
778pub(crate) async fn save(dir: &Path, scan: &PersistedScan) -> Result<()> {
779    fs::create_dir_all(dir).await.map_err(Error::Persist)?;
780    let path = dir.join(format!("{}.json", scan.scan_id));
781    let tmp = dir.join(format!("{}.json.tmp", scan.scan_id));
782    let mut scan = scan.clone();
783    scan.refresh_derived_fields();
784    let body = serde_json::to_vec_pretty(&scan).map_err(Error::PersistEncode)?;
785    fs::write(&tmp, &body).await.map_err(Error::Persist)?;
786    fs::rename(&tmp, &path).await.map_err(Error::Persist)?;
787    Ok(())
788}
789
790/// Read one scan from disk by id. Returns `None` on any I/O or parse
791/// error — callers should treat a missing scan as not-found rather
792/// than propagate the underlying cause.
793pub(crate) async fn load(dir: &Path, scan_id: &ScanId) -> Option<PersistedScan> {
794    let path = dir.join(format!("{scan_id}.json"));
795    let bytes = fs::read(&path).await.ok()?;
796    serde_json::from_slice(&bytes)
797        .ok()
798        .map(refresh_derived_fields)
799}
800
801/// Enumerate every persisted scan, newest first. Files that fail to
802/// parse are silently skipped — a corrupted file shouldn't break the
803/// whole listing.
804pub(crate) async fn load_all(dir: &Path) -> Vec<PersistedScan> {
805    let Ok(mut entries) = fs::read_dir(dir).await else {
806        return Vec::new();
807    };
808    let mut out = Vec::new();
809    while let Ok(Some(entry)) = entries.next_entry().await {
810        let path = entry.path();
811        if path.extension().and_then(|s| s.to_str()) != Some("json") {
812            continue;
813        }
814        let Ok(bytes) = fs::read(&path).await else {
815            continue;
816        };
817        let Ok(scan) = serde_json::from_slice::<PersistedScan>(&bytes) else {
818            continue;
819        };
820        out.push(refresh_derived_fields(scan));
821    }
822    out.sort_by_key(|s| std::cmp::Reverse(s.created_at_ms));
823    out
824}
825
/// By-value adapter around `PersistedScan::refresh_derived_fields`, so the
/// loaders can refresh a freshly parsed scan inline.
fn refresh_derived_fields(mut scan: PersistedScan) -> PersistedScan {
    scan.refresh_derived_fields();
    scan
}
830
831/// Delete scans beyond `keep_newest`. Newest-by-`created_at_ms` wins.
832/// Returns the number of files actually removed.
833pub(crate) async fn prune(dir: &Path, keep_newest: usize) -> usize {
834    let scans = load_all(dir).await;
835    if scans.len() <= keep_newest {
836        return 0;
837    }
838    let mut removed = 0;
839    for s in &scans[keep_newest..] {
840        let path = dir.join(format!("{}.json", s.scan_id));
841        if fs::remove_file(&path).await.is_ok() {
842            removed += 1;
843        }
844    }
845    removed
846}
847
848#[cfg(test)]
849mod tests {
850    use super::*;
851    use adler_core::{
852        ConfidenceLabel, ConfidenceReason, EvidenceAccessPath, MatchKind, ProfileEvidence,
853        TransportTier, UncertainReason,
854    };
855    use std::collections::BTreeMap;
856    use tempfile::TempDir;
857
858    fn sample(scan_id: &str, ts: u64) -> PersistedScan {
859        PersistedScan {
860            schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
861            scan_id: ScanId::from(scan_id.to_owned()),
862            username: "alice".into(),
863            request_context: None,
864            site_count: 2,
865            created_at_ms: ts,
866            summary: Summary {
867                found: 1,
868                not_found: 1,
869                uncertain: 0,
870            },
871            outcomes: vec![
872                CheckOutcome {
873                    site: "GitHub".into(),
874                    url: "https://github.com/alice".into(),
875                    kind: MatchKind::Found,
876                    reason: None,
877                    elapsed_ms: 120,
878                    enrichment: BTreeMap::new(),
879                    evidence: vec!["HTTP 200 (status_found)".into()],
880                    profile_evidence: Vec::new(),
881                    confidence: adler_core::ConfidenceScore::default(),
882                    transport: None,
883                    escalations: 0,
884                },
885                CheckOutcome {
886                    site: "GitLab".into(),
887                    url: "https://gitlab.com/alice".into(),
888                    kind: MatchKind::NotFound,
889                    reason: None,
890                    elapsed_ms: 90,
891                    enrichment: BTreeMap::new(),
892                    evidence: vec!["HTTP 404 (status_not_found)".into()],
893                    profile_evidence: Vec::new(),
894                    confidence: adler_core::ConfidenceScore::default(),
895                    transport: None,
896                    escalations: 0,
897                },
898            ],
899            identity_clusters: Vec::new(),
900            elapsed_ms: 210,
901        }
902    }
903
904    fn outcome(site: &str, kind: MatchKind) -> CheckOutcome {
905        CheckOutcome {
906            site: site.into(),
907            url: format!("https://{site}.example/alice"),
908            kind,
909            reason: None,
910            elapsed_ms: 10,
911            enrichment: BTreeMap::new(),
912            evidence: Vec::new(),
913            profile_evidence: Vec::new(),
914            confidence: adler_core::ConfidenceScore::default(),
915            transport: None,
916            escalations: 0,
917        }
918    }
919
920    fn scan_with_outcomes(
921        scan_id: &str,
922        username: &str,
923        ts: u64,
924        outcomes: Vec<CheckOutcome>,
925    ) -> PersistedScan {
926        PersistedScan {
927            schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
928            scan_id: ScanId::from(scan_id.to_owned()),
929            username: username.to_owned(),
930            request_context: None,
931            site_count: outcomes.len(),
932            created_at_ms: ts,
933            summary: Summary::from_outcomes(&outcomes),
934            outcomes,
935            identity_clusters: Vec::new(),
936            elapsed_ms: 10,
937        }
938    }
939
940    fn found_with_website(site: &str, website: &str) -> CheckOutcome {
941        found_with_website_at(site, website, None)
942    }
943
944    fn found_with_website_at(
945        site: &str,
946        website: &str,
947        observed_at_ms: Option<u64>,
948    ) -> CheckOutcome {
949        let mut outcome = outcome(site, MatchKind::Found);
950        outcome
951            .profile_evidence
952            .push(ProfileEvidence::from_enrichment_with_source(
953                site,
954                &outcome.url,
955                "website",
956                website,
957                observed_at_ms,
958                None,
959            ));
960        outcome
961    }
962
963    fn has_historical_reason(outcome: &CheckOutcome, count: usize) -> bool {
964        outcome.confidence.reasons.iter().any(|reason| {
965            matches!(
966                reason,
967                ConfidenceReason::HistoricalConsistency { count: actual } if *actual == count
968            )
969        })
970    }
971
972    fn large_outcomes(count: usize, generation: usize) -> Vec<CheckOutcome> {
973        (0..count)
974            .map(|idx| large_outcome(idx, generation))
975            .collect()
976    }
977
978    fn large_outcome(idx: usize, generation: usize) -> CheckOutcome {
979        let site = format!("LargeSite{idx:04}");
980        let url = format!("https://large{idx:04}.example/alice");
981        let mut kind = match idx % 20 {
982            0 | 1 => MatchKind::Found,
983            3 => MatchKind::Uncertain,
984            _ => MatchKind::NotFound,
985        };
986        if generation > 0 && idx % 20 == 0 {
987            kind = MatchKind::NotFound;
988        } else if generation > 0 && idx % 20 == 2 {
989            kind = MatchKind::Found;
990        }
991
992        let mut outcome = CheckOutcome {
993            site: site.clone(),
994            url: url.clone(),
995            kind,
996            reason: (kind == MatchKind::Uncertain).then_some(UncertainReason::RateLimited),
997            elapsed_ms: 10 + (idx % 75) as u64,
998            enrichment: BTreeMap::new(),
999            evidence: Vec::new(),
1000            profile_evidence: Vec::new(),
1001            confidence: adler_core::ConfidenceScore::default(),
1002            transport: Some(if idx % 7 == 0 {
1003                TransportTier::Browser
1004            } else {
1005                TransportTier::Http
1006            }),
1007            escalations: u8::from(idx % 7 == 0),
1008        };
1009
1010        match kind {
1011            MatchKind::Found => {
1012                let observed_at_ms = 1_781_192_451_000 + generation as u64 * 1_000 + idx as u64;
1013                let website = format!("https://identity-{:02}.example", idx % 25);
1014                let name = format!("Alice Group {:02}", idx % 50);
1015                let bio = if generation > 0 && idx % 20 == 1 {
1016                    format!("updated profile generation {generation} for {idx}")
1017                } else {
1018                    format!("stable profile generation 0 for {idx}")
1019                };
1020                for (field, value) in [
1021                    ("website", website.as_str()),
1022                    ("name", name.as_str()),
1023                    ("bio", bio.as_str()),
1024                ] {
1025                    outcome
1026                        .enrichment
1027                        .insert(field.to_owned(), value.to_owned());
1028                    outcome
1029                        .profile_evidence
1030                        .push(ProfileEvidence::from_enrichment_with_source(
1031                            &site,
1032                            &url,
1033                            field,
1034                            value,
1035                            Some(observed_at_ms),
1036                            Some(EvidenceAccessPath::new(
1037                                outcome.transport.unwrap_or(TransportTier::Http),
1038                                outcome.escalations,
1039                                idx % 11 == 0,
1040                            )),
1041                        ));
1042                }
1043                outcome.evidence = vec![
1044                    "HTTP 200 (status_found)".to_owned(),
1045                    "body matched profile marker".to_owned(),
1046                ];
1047            }
1048            MatchKind::NotFound => {
1049                outcome.evidence = vec!["HTTP 404 (status_not_found)".to_owned()];
1050            }
1051            MatchKind::Uncertain => {}
1052        }
1053        outcome.refresh_confidence();
1054        outcome
1055    }
1056
1057    fn large_persisted_scan(scan_id: &str, generation: usize) -> PersistedScan {
1058        let outcomes = large_outcomes(2_500, generation);
1059        let finished = FinishedScan {
1060            summary: Summary::from_outcomes(&outcomes),
1061            identity_clusters: adler_core::build_identity_clusters("alice", &outcomes),
1062            elapsed_ms: 30_000 + generation as u64,
1063            outcomes,
1064        };
1065        PersistedScan::from_finished(
1066            ScanId::from(scan_id.to_owned()),
1067            "alice".to_owned(),
1068            2_500,
1069            1_781_192_451_000 + generation as u64 * 10_000,
1070            finished,
1071        )
1072    }
1073
1074    #[tokio::test]
1075    async fn save_then_load_roundtrips() {
1076        let tmp = TempDir::new().unwrap();
1077        let s = sample("abc123", 1_700_000_000_000);
1078        save(tmp.path(), &s).await.unwrap();
1079
1080        let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1081        assert_eq!(loaded.scan_id, s.scan_id);
1082        assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1083        assert_eq!(loaded.username, "alice");
1084        assert_eq!(loaded.outcomes.len(), 2);
1085        assert_eq!(loaded.outcomes[0].site, "GitHub");
1086        assert_eq!(loaded.summary.found, 1);
1087    }
1088
1089    #[test]
1090    fn historical_overlay_adds_reason_after_two_prior_stable_found_observations() {
1091        let mut current = scan_with_outcomes(
1092            "current",
1093            "alice",
1094            30,
1095            vec![found_with_website("GitHub", "https://alice.dev")],
1096        );
1097        let previous = scan_with_outcomes(
1098            "previous",
1099            "alice",
1100            20,
1101            vec![found_with_website("GitHub", "https://alice.dev")],
1102        );
1103        let older = scan_with_outcomes(
1104            "older",
1105            "alice",
1106            10,
1107            vec![found_with_website("GitHub", "https://alice.dev")],
1108        );
1109
1110        apply_historical_confidence_overlay(&mut current, &[previous, older]);
1111
1112        assert!(has_historical_reason(&current.outcomes[0], 2));
1113        assert_eq!(current.outcomes[0].confidence.score, 79);
1114    }
1115
1116    #[test]
1117    fn historical_overlay_ignores_single_prior_found() {
1118        let mut current = scan_with_outcomes(
1119            "current",
1120            "alice",
1121            20,
1122            vec![found_with_website("GitHub", "https://alice.dev")],
1123        );
1124        let previous = scan_with_outcomes(
1125            "previous",
1126            "alice",
1127            10,
1128            vec![found_with_website("GitHub", "https://alice.dev")],
1129        );
1130
1131        apply_historical_confidence_overlay(&mut current, &[previous]);
1132
1133        assert!(!has_historical_reason(&current.outcomes[0], 1));
1134        assert_eq!(current.outcomes[0].confidence.score, 75);
1135    }
1136
1137    #[test]
1138    fn historical_overlay_resets_on_explicit_non_found() {
1139        let mut current = scan_with_outcomes(
1140            "current",
1141            "alice",
1142            40,
1143            vec![found_with_website("GitHub", "https://alice.dev")],
1144        );
1145        let previous = scan_with_outcomes(
1146            "previous",
1147            "alice",
1148            30,
1149            vec![outcome("GitHub", MatchKind::NotFound)],
1150        );
1151        let older = scan_with_outcomes(
1152            "older",
1153            "alice",
1154            20,
1155            vec![found_with_website("GitHub", "https://alice.dev")],
1156        );
1157        let oldest = scan_with_outcomes(
1158            "oldest",
1159            "alice",
1160            10,
1161            vec![found_with_website("GitHub", "https://alice.dev")],
1162        );
1163
1164        apply_historical_confidence_overlay(&mut current, &[previous, older, oldest]);
1165
1166        assert!(!has_historical_reason(&current.outcomes[0], 2));
1167        assert_eq!(current.outcomes[0].confidence.score, 75);
1168    }
1169
1170    #[test]
1171    fn historical_overlay_ignores_source_timestamp_changes() {
1172        let mut current = scan_with_outcomes(
1173            "current",
1174            "alice",
1175            30,
1176            vec![found_with_website_at(
1177                "GitHub",
1178                "https://alice.dev",
1179                Some(30),
1180            )],
1181        );
1182        let previous = scan_with_outcomes(
1183            "previous",
1184            "alice",
1185            20,
1186            vec![found_with_website_at(
1187                "GitHub",
1188                "https://alice.dev",
1189                Some(20),
1190            )],
1191        );
1192        let older = scan_with_outcomes(
1193            "older",
1194            "alice",
1195            10,
1196            vec![found_with_website_at(
1197                "GitHub",
1198                "https://alice.dev",
1199                Some(10),
1200            )],
1201        );
1202
1203        apply_historical_confidence_overlay(&mut current, &[previous, older]);
1204
1205        assert!(has_historical_reason(&current.outcomes[0], 2));
1206    }
1207
1208    #[test]
1209    fn weak_status_only_result_remains_medium_capped_with_history() {
1210        let mut current_outcome = outcome("GitHub", MatchKind::Found);
1211        current_outcome.evidence = vec!["HTTP 200 (status_found)".to_owned()];
1212        let mut previous_outcome = outcome("GitHub", MatchKind::Found);
1213        previous_outcome.evidence = current_outcome.evidence.clone();
1214        let mut older_outcome = outcome("GitHub", MatchKind::Found);
1215        older_outcome.evidence = current_outcome.evidence.clone();
1216
1217        let mut current = scan_with_outcomes("current", "alice", 30, vec![current_outcome]);
1218        let previous = scan_with_outcomes("previous", "alice", 20, vec![previous_outcome]);
1219        let older = scan_with_outcomes("older", "alice", 10, vec![older_outcome]);
1220
1221        apply_historical_confidence_overlay(&mut current, &[previous, older]);
1222
1223        assert!(has_historical_reason(&current.outcomes[0], 2));
1224        assert_eq!(
1225            current.outcomes[0].confidence.label,
1226            ConfidenceLabel::Medium
1227        );
1228        assert_eq!(current.outcomes[0].confidence.score, 70);
1229    }
1230
1231    #[tokio::test]
1232    async fn historical_overlay_does_not_rewrite_persisted_json() {
1233        let tmp = TempDir::new().unwrap();
1234        let current = scan_with_outcomes(
1235            "current",
1236            "alice",
1237            30,
1238            vec![found_with_website("GitHub", "https://alice.dev")],
1239        );
1240        let previous = scan_with_outcomes(
1241            "previous",
1242            "alice",
1243            20,
1244            vec![found_with_website("GitHub", "https://alice.dev")],
1245        );
1246        let older = scan_with_outcomes(
1247            "older",
1248            "alice",
1249            10,
1250            vec![found_with_website("GitHub", "https://alice.dev")],
1251        );
1252        save(tmp.path(), &current).await.unwrap();
1253        save(tmp.path(), &previous).await.unwrap();
1254        save(tmp.path(), &older).await.unwrap();
1255
1256        let current_path = tmp.path().join("current.json");
1257        let before = fs::read(&current_path).await.unwrap();
1258        let related = load_all(tmp.path()).await;
1259        let mut loaded = load(tmp.path(), &ScanId::from("current".to_owned()))
1260            .await
1261            .unwrap();
1262
1263        apply_historical_confidence_overlay(&mut loaded, &related);
1264
1265        let after = fs::read(&current_path).await.unwrap();
1266        assert_eq!(before, after);
1267        assert!(has_historical_reason(&loaded.outcomes[0], 2));
1268    }
1269
1270    #[tokio::test]
1271    async fn save_writes_schema_version() {
1272        let tmp = TempDir::new().unwrap();
1273        let s = sample("abc123", 1_700_000_000_000);
1274        save(tmp.path(), &s).await.unwrap();
1275
1276        let raw = fs::read_to_string(tmp.path().join("abc123.json"))
1277            .await
1278            .unwrap();
1279        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1280        assert_eq!(
1281            value["schema_version"],
1282            serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
1283        );
1284    }
1285
1286    #[tokio::test]
1287    async fn save_skips_empty_identity_clusters() {
1288        let tmp = TempDir::new().unwrap();
1289        let s = sample("empty-clusters", 1_700_000_000_000);
1290        save(tmp.path(), &s).await.unwrap();
1291
1292        let raw = fs::read_to_string(tmp.path().join("empty-clusters.json"))
1293            .await
1294            .unwrap();
1295        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1296        assert_eq!(
1297            value["schema_version"],
1298            serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
1299        );
1300        assert!(
1301            value.get("identity_clusters").is_none(),
1302            "empty cluster cache should stay absent from persisted JSON"
1303        );
1304    }
1305
1306    #[tokio::test]
1307    async fn save_writes_derived_identity_clusters() {
1308        let tmp = TempDir::new().unwrap();
1309        let mut s = sample("clusters", 1_700_000_000_000);
1310        s.outcomes = vec![
1311            found_with_website("GitHub", "https://alice.dev"),
1312            found_with_website("GitLab", "https://alice.dev"),
1313        ];
1314
1315        save(tmp.path(), &s).await.unwrap();
1316
1317        let raw = fs::read_to_string(tmp.path().join("clusters.json"))
1318            .await
1319            .unwrap();
1320        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1321        assert_eq!(value["identity_clusters"].as_array().unwrap().len(), 1);
1322        assert_eq!(
1323            value["identity_clusters"][0]["members"]
1324                .as_array()
1325                .unwrap()
1326                .len(),
1327            2
1328        );
1329    }
1330
1331    #[tokio::test]
1332    async fn save_roundtrips_request_context() {
1333        let tmp = TempDir::new().unwrap();
1334        let context = ScanRequestContext {
1335            username: "alice".into(),
1336            derived_from: Some(ScanId::from("previous".to_owned())),
1337            only: vec!["Git".into()],
1338            exclude: Vec::new(),
1339            tag: vec!["coding".into()],
1340            exclude_tag: vec!["nsfw".into()],
1341            top: Some(100),
1342            nsfw: false,
1343            concurrency: Some(8),
1344            deadline_secs: Some(30),
1345            egress_names: vec!["us-resi".into()],
1346            disabled_matches: vec![PersistedDisabledMatch {
1347                name: "TikTok".into(),
1348                url: "https://www.tiktok.com/@{username}".into(),
1349                tags: vec!["social".into()],
1350                disabled_reason: "Honest Limits: JS hydration".into(),
1351            }],
1352        };
1353        let s = sample("ctx", 1_700_000_000_000).with_request_context(context.clone());
1354        save(tmp.path(), &s).await.unwrap();
1355
1356        let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1357        assert_eq!(loaded.request_context, Some(context));
1358    }
1359
1360    #[test]
1361    fn diff_scans_reports_added_removed_and_verdict_changes() {
1362        let mut previous = sample("old", 1_000);
1363        previous.outcomes = vec![
1364            outcome("GitHub", MatchKind::Found),
1365            outcome("Reddit", MatchKind::Found),
1366            outcome("Mastodon", MatchKind::NotFound),
1367        ];
1368        let mut current = sample("new", 2_000);
1369        current.outcomes = vec![
1370            outcome("GitHub", MatchKind::Found),
1371            outcome("Reddit", MatchKind::NotFound),
1372            outcome("Mastodon", MatchKind::Found),
1373        ];
1374
1375        let diff = diff_scans(&previous, &current);
1376
1377        assert_eq!(diff.from_scan_id.as_str(), "old");
1378        assert_eq!(diff.to_scan_id.as_str(), "new");
1379        assert_eq!(
1380            diff.added_found
1381                .iter()
1382                .map(|outcome| outcome.site.as_str())
1383                .collect::<Vec<_>>(),
1384            ["Mastodon"]
1385        );
1386        assert_eq!(
1387            diff.removed_found
1388                .iter()
1389                .map(|outcome| outcome.site.as_str())
1390                .collect::<Vec<_>>(),
1391            ["Reddit"]
1392        );
1393        assert_eq!(diff.verdict_changes.len(), 2);
1394        assert_eq!(diff.verdict_changes[0].site, "Mastodon");
1395        assert_eq!(diff.verdict_changes[0].before, MatchKind::NotFound);
1396        assert_eq!(diff.verdict_changes[0].after, MatchKind::Found);
1397        assert_eq!(diff.verdict_changes[1].site, "Reddit");
1398        assert!(diff.evidence_changes.is_empty());
1399    }
1400
1401    #[test]
1402    fn diff_scans_reports_profile_evidence_changes_for_still_found_sites() {
1403        let mut previous = sample("old", 1_000);
1404        let mut old_github = outcome("GitHub", MatchKind::Found);
1405        old_github.enrichment.insert("name".into(), "Alice".into());
1406        old_github
1407            .profile_evidence
1408            .push(adler_core::ProfileEvidence::from_enrichment(
1409                "GitHub",
1410                "https://github.example/alice",
1411                "name",
1412                "Alice",
1413            ));
1414        previous.outcomes = vec![old_github];
1415
1416        let mut current = sample("new", 2_000);
1417        let mut new_github = outcome("GitHub", MatchKind::Found);
1418        new_github
1419            .enrichment
1420            .insert("name".into(), "Alice Liddell".into());
1421        new_github
1422            .profile_evidence
1423            .push(adler_core::ProfileEvidence::from_enrichment(
1424                "GitHub",
1425                "https://github.example/alice",
1426                "name",
1427                "Alice Liddell",
1428            ));
1429        current.outcomes = vec![new_github];
1430
1431        let diff = diff_scans(&previous, &current);
1432
1433        assert!(diff.added_found.is_empty());
1434        assert!(diff.removed_found.is_empty());
1435        assert!(diff.verdict_changes.is_empty());
1436        assert_eq!(diff.evidence_changes.len(), 1);
1437        assert_eq!(diff.evidence_changes[0].site, "GitHub");
1438        assert_eq!(
1439            diff.evidence_changes[0]
1440                .before_enrichment
1441                .get("name")
1442                .unwrap(),
1443            "Alice"
1444        );
1445        assert_eq!(
1446            diff.evidence_changes[0]
1447                .after_enrichment
1448                .get("name")
1449                .unwrap(),
1450            "Alice Liddell"
1451        );
1452    }
1453
1454    #[test]
1455    fn timeline_tracks_first_seen_disappeared_and_reappeared() {
1456        let mut first = sample("first", 1_000);
1457        first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1458        let mut second = sample("second", 2_000);
1459        second.outcomes = vec![outcome("GitHub", MatchKind::NotFound)];
1460        let mut third = sample("third", 3_000);
1461        third.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1462
1463        let timeline = build_scan_timeline(&[third, first, second]);
1464
1465        assert_eq!(timeline.username, "alice");
1466        assert_eq!(timeline.scan_count, 3);
1467        assert_eq!(timeline.from_ms, Some(1_000));
1468        assert_eq!(timeline.to_ms, Some(3_000));
1469        assert_eq!(timeline.profiles.len(), 1);
1470        assert_eq!(timeline.profiles[0].site, "GitHub");
1471        assert_eq!(timeline.profiles[0].first_seen_ms, 1_000);
1472        assert_eq!(timeline.profiles[0].last_seen_ms, 3_000);
1473        assert!(timeline.profiles[0].present_in_latest);
1474        assert_eq!(
1475            timeline
1476                .events
1477                .iter()
1478                .map(|event| event.kind)
1479                .collect::<Vec<_>>(),
1480            [
1481                TimelineEventKind::FirstSeen,
1482                TimelineEventKind::Disappeared,
1483                TimelineEventKind::Reappeared
1484            ]
1485        );
1486        assert_eq!(timeline.events[1].before, Some(MatchKind::Found));
1487        assert_eq!(timeline.events[1].after, Some(MatchKind::NotFound));
1488    }
1489
1490    #[test]
1491    fn timeline_treats_missing_site_as_disappeared() {
1492        let mut first = sample("first", 1_000);
1493        first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1494        let mut second = sample("second", 2_000);
1495        second.outcomes = vec![outcome("GitLab", MatchKind::NotFound)];
1496
1497        let timeline = build_scan_timeline(&[first, second]);
1498
1499        assert_eq!(timeline.profiles.len(), 1);
1500        assert!(!timeline.profiles[0].present_in_latest);
1501        assert_eq!(timeline.events.len(), 2);
1502        assert_eq!(timeline.events[1].kind, TimelineEventKind::Disappeared);
1503        assert_eq!(timeline.events[1].site, "GitHub");
1504        assert_eq!(timeline.events[1].after, None);
1505    }
1506
1507    #[test]
1508    fn timeline_tracks_evidence_changes_for_still_found_profile() {
1509        let mut first = sample("first", 1_000);
1510        let mut old_github = outcome("GitHub", MatchKind::Found);
1511        old_github.enrichment.insert("name".into(), "Alice".into());
1512        old_github
1513            .profile_evidence
1514            .push(adler_core::ProfileEvidence::from_enrichment(
1515                "GitHub",
1516                "https://github.example/alice",
1517                "name",
1518                "Alice",
1519            ));
1520        first.outcomes = vec![old_github];
1521
1522        let mut second = sample("second", 2_000);
1523        let mut new_github = outcome("GitHub", MatchKind::Found);
1524        new_github
1525            .enrichment
1526            .insert("name".into(), "Alice Liddell".into());
1527        new_github
1528            .profile_evidence
1529            .push(adler_core::ProfileEvidence::from_enrichment(
1530                "GitHub",
1531                "https://github.example/alice",
1532                "name",
1533                "Alice Liddell",
1534            ));
1535        second.outcomes = vec![new_github];
1536
1537        let timeline = build_scan_timeline(&[first, second]);
1538
1539        assert_eq!(
1540            timeline
1541                .events
1542                .iter()
1543                .map(|event| event.kind)
1544                .collect::<Vec<_>>(),
1545            [
1546                TimelineEventKind::FirstSeen,
1547                TimelineEventKind::EvidenceChanged
1548            ]
1549        );
1550        let evidence_change = timeline.events[1].evidence_change.as_ref().unwrap();
1551        assert_eq!(
1552            evidence_change.before_enrichment.get("name").unwrap(),
1553            "Alice"
1554        );
1555        assert_eq!(
1556            evidence_change.after_enrichment.get("name").unwrap(),
1557            "Alice Liddell"
1558        );
1559    }
1560
1561    #[tokio::test]
1562    async fn load_all_returns_newest_first() {
1563        let tmp = TempDir::new().unwrap();
1564        save(tmp.path(), &sample("old", 1_000)).await.unwrap();
1565        save(tmp.path(), &sample("mid", 2_000)).await.unwrap();
1566        save(tmp.path(), &sample("new", 3_000)).await.unwrap();
1567        let all = load_all(tmp.path()).await;
1568        assert_eq!(all.len(), 3);
1569        assert_eq!(all[0].scan_id.as_str(), "new");
1570        assert_eq!(all[1].scan_id.as_str(), "mid");
1571        assert_eq!(all[2].scan_id.as_str(), "old");
1572    }
1573
1574    #[tokio::test]
1575    async fn load_returns_none_for_missing() {
1576        let tmp = TempDir::new().unwrap();
1577        let missing = load(tmp.path(), &ScanId::from("nope".to_owned())).await;
1578        assert!(missing.is_none());
1579    }
1580
1581    #[tokio::test]
1582    async fn load_defaults_schema_version_for_legacy_scan_json() {
1583        let tmp = TempDir::new().unwrap();
1584        let path = tmp.path().join("legacy.json");
1585        fs::write(
1586            &path,
1587            br#"{
1588                "scan_id": "legacy",
1589                "username": "alice",
1590                "site_count": 0,
1591                "created_at_ms": 1700000000000,
1592                "summary": { "found": 0, "not_found": 0, "uncertain": 0 },
1593                "outcomes": [],
1594                "elapsed_ms": 0
1595            }"#,
1596        )
1597        .await
1598        .unwrap();
1599
1600        let loaded = load(tmp.path(), &ScanId::from("legacy".to_owned()))
1601            .await
1602            .expect("legacy scan loads");
1603        assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1604    }
1605
1606    #[tokio::test]
1607    async fn load_accepts_v2_scan_json_after_schema_bump() {
1608        let tmp = TempDir::new().unwrap();
1609        let path = tmp.path().join("v2.json");
1610        fs::write(
1611            &path,
1612            br#"{
1613                "schema_version": 2,
1614                "scan_id": "v2",
1615                "username": "alice",
1616                "site_count": 1,
1617                "created_at_ms": 1700000000000,
1618                "summary": { "found": 1, "not_found": 0, "uncertain": 0 },
1619                "outcomes": [
1620                    {
1621                        "site": "GitHub",
1622                        "url": "https://github.example/alice",
1623                        "kind": "found",
1624                        "elapsed_ms": 10,
1625                        "evidence": ["HTTP 200 (status_found)"]
1626                    }
1627                ],
1628                "elapsed_ms": 10
1629            }"#,
1630        )
1631        .await
1632        .unwrap();
1633
1634        let loaded = load(tmp.path(), &ScanId::from("v2".to_owned()))
1635            .await
1636            .expect("v2 scan loads");
1637
1638        assert_eq!(loaded.schema_version, 2);
1639        assert_eq!(loaded.summary.found, 1);
1640        assert_eq!(
1641            loaded.outcomes[0].confidence.label,
1642            adler_core::ConfidenceLabel::Medium
1643        );
1644    }
1645
1646    #[tokio::test]
1647    async fn load_derives_identity_clusters_for_legacy_scan_json() {
1648        let tmp = TempDir::new().unwrap();
1649        let path = tmp.path().join("legacy-clusters.json");
1650        fs::write(
1651            &path,
1652            br#"{
1653                "schema_version": 1,
1654                "scan_id": "legacy-clusters",
1655                "username": "alice",
1656                "site_count": 2,
1657                "created_at_ms": 1700000000000,
1658                "summary": { "found": 2, "not_found": 0, "uncertain": 0 },
1659                "outcomes": [
1660                    {
1661                        "site": "GitHub",
1662                        "url": "https://github.example/alice",
1663                        "kind": "found",
1664                        "elapsed_ms": 10,
1665                        "profile_evidence": [
1666                            {
1667                                "kind": "external_link",
1668                                "field": "website",
1669                                "value": "https://alice.dev",
1670                                "source": {
1671                                    "site": "GitHub",
1672                                    "url": "https://github.example/alice",
1673                                    "origin": "extractor"
1674                                }
1675                            }
1676                        ]
1677                    },
1678                    {
1679                        "site": "GitLab",
1680                        "url": "https://gitlab.example/alice",
1681                        "kind": "found",
1682                        "elapsed_ms": 10,
1683                        "profile_evidence": [
1684                            {
1685                                "kind": "external_link",
1686                                "field": "website",
1687                                "value": "https://alice.dev/",
1688                                "source": {
1689                                    "site": "GitLab",
1690                                    "url": "https://gitlab.example/alice",
1691                                    "origin": "extractor"
1692                                }
1693                            }
1694                        ]
1695                    }
1696                ],
1697                "elapsed_ms": 20
1698            }"#,
1699        )
1700        .await
1701        .unwrap();
1702
1703        let loaded = load(tmp.path(), &ScanId::from("legacy-clusters".to_owned()))
1704            .await
1705            .expect("legacy scan loads");
1706
1707        assert_eq!(loaded.identity_clusters.len(), 1);
1708        assert_eq!(loaded.identity_clusters[0].members.len(), 2);
1709        assert!(!loaded.identity_clusters[0].uncertain);
1710    }
1711
/// Runs the JSON round trip, `diff_scans` and `build_scan_timeline` against
/// two 2,500-outcome scans produced by `large_persisted_scan`, checking that
/// each stage yields non-trivial output at that size.
#[test]
fn large_scan_artifact_paths_handle_identity_graph_payloads() {
    let baseline = large_persisted_scan("large-old", 0);
    let latest = large_persisted_scan("large-new", 1);

    // Sanity-check the generated fixture before exercising anything else.
    assert_eq!(baseline.outcomes.len(), 2_500);
    assert_eq!(baseline.site_count, 2_500);
    let tally = &baseline.summary;
    assert_eq!(tally.found + tally.not_found + tally.uncertain, 2_500);
    assert!(!baseline.identity_clusters.is_empty());

    // Serialise and decode again; outcome and cluster counts must survive.
    let encoded = serde_json::to_string(&baseline).unwrap();
    let roundtrip = serde_json::from_str::<PersistedScan>(&encoded).unwrap();
    assert_eq!(roundtrip.outcomes.len(), 2_500);
    assert_eq!(
        roundtrip.identity_clusters.len(),
        baseline.identity_clusters.len()
    );

    // Every diff category is expected to be populated between the two scans.
    let delta = diff_scans(&baseline, &latest);
    assert!(!delta.added_found.is_empty());
    assert!(!delta.removed_found.is_empty());
    assert!(!delta.verdict_changes.is_empty());
    assert!(!delta.evidence_changes.is_empty());

    // The timeline consumes both scans by value, so it is built last.
    let history = build_scan_timeline(&[baseline, latest]);
    assert_eq!(history.scan_count, 2);
    let profile_total = history.profiles.len();
    assert_eq!(profile_total, 375);
    assert!(history.events.len() > profile_total);
}
1744
1745    #[tokio::test]
1746    async fn load_all_skips_unrelated_files() {
1747        let tmp = TempDir::new().unwrap();
1748        // Drop a non-JSON file and a malformed JSON file alongside.
1749        fs::write(tmp.path().join("README"), b"not json")
1750            .await
1751            .unwrap();
1752        fs::write(tmp.path().join("broken.json"), b"{ invalid")
1753            .await
1754            .unwrap();
1755        save(tmp.path(), &sample("good", 9_999)).await.unwrap();
1756        let all = load_all(tmp.path()).await;
1757        assert_eq!(all.len(), 1);
1758        assert_eq!(all[0].scan_id.as_str(), "good");
1759    }
1760
1761    #[tokio::test]
1762    async fn prune_keeps_only_newest_n() {
1763        let tmp = TempDir::new().unwrap();
1764        for i in 0u64..5 {
1765            save(tmp.path(), &sample(&format!("s{i}"), i * 1_000))
1766                .await
1767                .unwrap();
1768        }
1769        let removed = prune(tmp.path(), 2).await;
1770        assert_eq!(removed, 3);
1771        let remaining = load_all(tmp.path()).await;
1772        assert_eq!(remaining.len(), 2);
1773        assert_eq!(remaining[0].scan_id.as_str(), "s4");
1774        assert_eq!(remaining[1].scan_id.as_str(), "s3");
1775    }
1776}