Skip to main content

adler_server/
persist.rs

1//! On-disk persistence for finished scans.
2//!
3//! Each scan is serialised as a single JSON file under [`default_dir`]
4//! (`$XDG_CACHE_HOME/adler/scans/`, falling back to
5//! `$HOME/.cache/adler/scans/`). The on-disk format is the full
6//! [`PersistedScan`] — enough for the history listing AND for replaying
7//! the scan into the UI without a fresh probe.
8//!
9//! Writes are atomic: serialise to `<id>.json.tmp`, then rename onto
10//! the final path. A crashed process leaves at most one orphan `.tmp`
11//! file behind, never a half-written `<id>.json`.
12
13use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use adler_core::{
17    CheckOutcome, HistoricalScanRef, IdentityCluster, InvestigationReport, MatchKind,
18    ProfileEvidence, ReportDisabledSite, ReportTimelineEvent, ReportTimelineEventKind, Site,
19};
20use serde::{Deserialize, Serialize};
21use tokio::fs;
22
23use crate::error::{Error, Result};
24use crate::scan::{FinishedScan, ScanId, Summary};
25
/// Hard cap on how many scans we keep on disk. Beyond this, oldest
/// (by `created_at_ms`) get [`prune`]d on the next save. Picked to be
/// large enough for any plausible human-driven OSINT session.
///
/// NOTE(review): `save` in this module does not call [`prune`] itself;
/// presumably the caller passes this constant to `prune` — confirm.
pub(crate) const MAX_PERSISTED_SCANS: usize = 200;
/// Current on-disk schema version for [`PersistedScan`].
///
/// Stamped onto every freshly built artifact and also used as the serde
/// default when a stored file has no `schema_version` field.
pub(crate) const PERSISTED_SCAN_SCHEMA_VERSION: u16 = 3;
32
/// Self-contained snapshot of a completed scan. Round-trips losslessly
/// through JSON; tests assert that.
///
/// `summary` and `identity_clusters` are derived from `outcomes` and are
/// recomputed by `refresh_derived_fields` whenever a scan is built, saved
/// or loaded through this module.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistedScan {
    /// Version of this persisted scan artifact.
    ///
    /// A file without this field deserialises with the *current*
    /// [`PERSISTED_SCAN_SCHEMA_VERSION`], not with an older version.
    #[serde(default = "default_schema_version")]
    pub schema_version: u16,
    /// Stable identifier — same value as in-memory [`ScanId`].
    pub scan_id: ScanId,
    /// Username that was scanned.
    pub username: String,
    /// Request scope and parked-site diagnostics that explain how this
    /// artifact was produced. Missing on scans saved before v1 context
    /// support landed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_context: Option<ScanRequestContext>,
    /// Total number of sites probed in this scan.
    pub site_count: usize,
    /// Unix epoch milliseconds when the scan was started.
    pub created_at_ms: u64,
    /// Per-verdict tally over [`Self::outcomes`].
    pub summary: Summary,
    /// All outcomes, in completion order.
    pub outcomes: Vec<CheckOutcome>,
    /// Deterministic identity candidates derived from found outcomes
    /// with structured profile evidence. Omitted from the JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub identity_clusters: Vec<IdentityCluster>,
    /// Wall-clock duration, milliseconds.
    pub elapsed_ms: u64,
}
64
65impl PersistedScan {
66    /// Build a snapshot from a freshly-completed in-memory scan.
67    #[must_use]
68    pub fn from_finished(
69        scan_id: ScanId,
70        username: String,
71        site_count: usize,
72        created_at_ms: u64,
73        finished: FinishedScan,
74    ) -> Self {
75        let mut scan = Self {
76            schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
77            scan_id,
78            username,
79            request_context: None,
80            site_count,
81            created_at_ms,
82            summary: finished.summary,
83            outcomes: finished.outcomes,
84            identity_clusters: finished.identity_clusters,
85            elapsed_ms: finished.elapsed_ms,
86        };
87        scan.refresh_derived_fields();
88        scan
89    }
90
91    /// Attach request-scope metadata to this persisted scan.
92    #[must_use]
93    pub fn with_request_context(mut self, context: ScanRequestContext) -> Self {
94        self.request_context = Some(context);
95        self
96    }
97
98    pub(crate) fn refresh_derived_fields(&mut self) {
99        for outcome in &mut self.outcomes {
100            outcome.refresh_confidence();
101        }
102        self.summary = Summary::from_outcomes(&self.outcomes);
103        self.identity_clusters =
104            adler_core::build_identity_clusters(&self.username, &self.outcomes);
105    }
106}
107
108/// Apply a non-persisted confidence overlay from previous scans of the same
109/// username.
110///
111/// The on-disk artifact remains the stateless source of truth. This helper is
112/// intended for history-aware read surfaces such as reports and persisted scan
113/// API/resource views.
114pub fn apply_historical_confidence_overlay(
115    current: &mut PersistedScan,
116    related_scans: &[PersistedScan],
117) {
118    current.refresh_derived_fields();
119    let history_counts = historical_consistency_counts(current, related_scans);
120
121    for outcome in &mut current.outcomes {
122        let count = history_counts.get(&outcome.site).copied().unwrap_or(0);
123        outcome.refresh_confidence_with_history(count);
124    }
125
126    current.identity_clusters = historical_identity_clusters(current, related_scans);
127}
128
129/// Build a history-aware investigation report from a scan artifact.
130///
131/// The input scan is consumed and enriched in memory only. Persisted JSON files
132/// are never rewritten by this helper.
133#[must_use]
134pub fn build_investigation_report(
135    mut scan: PersistedScan,
136    related_scans: &[PersistedScan],
137) -> InvestigationReport {
138    apply_historical_confidence_overlay(&mut scan, related_scans);
139    let timeline = report_timeline_from_scans(related_scans, &scan);
140    let disabled_sites = scan
141        .request_context
142        .as_ref()
143        .map(|context| {
144            context
145                .disabled_matches
146                .iter()
147                .map(|site| ReportDisabledSite {
148                    name: site.name.clone(),
149                    url: site.url.clone(),
150                    tags: site.tags.clone(),
151                    disabled_reason: site.disabled_reason.clone(),
152                })
153                .collect()
154        })
155        .unwrap_or_default();
156
157    InvestigationReport::builder(scan.username, &scan.outcomes)
158        .identity_clusters(scan.identity_clusters)
159        .timeline(timeline)
160        .disabled_sites(disabled_sites)
161        .build()
162}
163
164fn report_timeline_from_scans(
165    related_scans: &[PersistedScan],
166    current: &PersistedScan,
167) -> Vec<ReportTimelineEvent> {
168    let mut scans = related_scans.to_vec();
169    if !scans.iter().any(|scan| scan.scan_id == current.scan_id) {
170        scans.push(current.clone());
171    }
172    build_scan_timeline(&scans)
173        .events
174        .into_iter()
175        .map(report_timeline_event)
176        .collect()
177}
178
179fn report_timeline_event(event: TimelineEvent) -> ReportTimelineEvent {
180    ReportTimelineEvent {
181        kind: match event.kind {
182            TimelineEventKind::FirstSeen => ReportTimelineEventKind::AddedFound,
183            TimelineEventKind::Disappeared => ReportTimelineEventKind::RemovedFound,
184            TimelineEventKind::Reappeared => ReportTimelineEventKind::Reappeared,
185            TimelineEventKind::EvidenceChanged => ReportTimelineEventKind::EvidenceChanged,
186        },
187        site: Some(event.site),
188        scan_id: Some(event.scan_id.to_string()),
189        observed_at_ms: Some(event.at_ms),
190        detail: Some(timeline_detail(event.before, event.after)),
191    }
192}
193
194fn timeline_detail(before: Option<MatchKind>, after: Option<MatchKind>) -> String {
195    match (before, after) {
196        (Some(before), Some(after)) => format!("{} -> {}", kind_label(before), kind_label(after)),
197        (None, Some(after)) => format!("new {}", kind_label(after)),
198        (Some(before), None) => format!("after {}", kind_label(before)),
199        (None, None) => "changed".to_owned(),
200    }
201}
202
203fn kind_label(kind: MatchKind) -> &'static str {
204    match kind {
205        MatchKind::Found => "found",
206        MatchKind::NotFound => "not_found",
207        MatchKind::Uncertain => "uncertain",
208    }
209}
210
211fn historical_consistency_counts(
212    current: &PersistedScan,
213    related_scans: &[PersistedScan],
214) -> BTreeMap<String, usize> {
215    let current_ref = HistoricalScanRef {
216        scan_id: current.scan_id.as_str(),
217        username: &current.username,
218        created_at_ms: current.created_at_ms,
219        outcomes: &current.outcomes,
220    };
221    let related_refs = related_scans.iter().map(|scan| HistoricalScanRef {
222        scan_id: scan.scan_id.as_str(),
223        username: &scan.username,
224        created_at_ms: scan.created_at_ms,
225        outcomes: &scan.outcomes,
226    });
227    adler_core::historical_consistency_counts(current_ref, related_refs)
228}
229
230fn historical_identity_clusters(
231    current: &PersistedScan,
232    related_scans: &[PersistedScan],
233) -> Vec<IdentityCluster> {
234    let current_ref = HistoricalScanRef {
235        scan_id: current.scan_id.as_str(),
236        username: &current.username,
237        created_at_ms: current.created_at_ms,
238        outcomes: &current.outcomes,
239    };
240    let related_refs = related_scans.iter().map(|scan| HistoricalScanRef {
241        scan_id: scan.scan_id.as_str(),
242        username: &scan.username,
243        created_at_ms: scan.created_at_ms,
244        outcomes: &scan.outcomes,
245    });
246    adler_core::build_identity_clusters_with_history(current_ref, related_refs)
247}
248
/// Serde default for `PersistedScan::schema_version`: a stored scan that
/// lacks the field deserialises as the current
/// [`PERSISTED_SCAN_SCHEMA_VERSION`].
const fn default_schema_version() -> u16 {
    PERSISTED_SCAN_SCHEMA_VERSION
}
252
/// Request scope persisted with a finished scan so future timelines and
/// reports can explain what was scanned and what was intentionally out of
/// scope.
///
/// Optional and list-valued fields are omitted from the JSON when unset or
/// empty, and default to unset/empty when absent on load.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScanRequestContext {
    /// Username supplied by the operator.
    pub username: String,
    /// Previous scan id when this scan was created by refiltering.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub derived_from: Option<ScanId>,
    /// Site name include filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub only: Vec<String>,
    /// Site name exclude filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude: Vec<String>,
    /// Tag include filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tag: Vec<String>,
    /// Tag exclude filters.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude_tag: Vec<String>,
    /// Popularity ceiling, when supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub top: Option<u32>,
    /// Whether NSFW-tagged entries were included.
    ///
    /// Unlike the other fields this one has no serde default, so it must
    /// be present in the stored JSON.
    pub nsfw: bool,
    /// Per-scan concurrency override, when supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrency: Option<usize>,
    /// Per-scan deadline override, seconds.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub deadline_secs: Option<u64>,
    /// Egress subset requested for this scan.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub egress_names: Vec<String>,
    /// Disabled/parked sites that matched the same filter and were not
    /// included in the enabled scan set. Surfaced as the report's
    /// disabled-site list by `build_investigation_report`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub disabled_matches: Vec<PersistedDisabledMatch>,
}
294
/// Compact disabled-site diagnostic persisted with scan context.
///
/// Built from a registry [`Site`] via `From<&Site>`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PersistedDisabledMatch {
    /// Site name.
    pub name: String,
    /// Profile URL template.
    pub url: String,
    /// Registry tags. Omitted from the JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Human-readable reason the site is parked.
    pub disabled_reason: String,
}
308
309impl From<&Site> for PersistedDisabledMatch {
310    fn from(site: &Site) -> Self {
311        Self {
312            name: site.name.clone(),
313            url: site.url.as_str().to_owned(),
314            tags: site.tags.clone(),
315            disabled_reason: site
316                .disabled_reason
317                .clone()
318                .unwrap_or_else(|| "disabled in registry".to_owned()),
319        }
320    }
321}
322
/// Deterministic scan-to-scan diff used as the basis for timelines and
/// watchlists.
///
/// Produced by [`diff_scans`]. Empty lists are omitted from the JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDiff {
    /// Previous scan id.
    pub from_scan_id: ScanId,
    /// Current scan id.
    pub to_scan_id: ScanId,
    /// Found accounts that were not Found in the previous scan
    /// (including sites the previous scan did not mention at all).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub added_found: Vec<CheckOutcome>,
    /// Accounts that were Found previously but are no longer Found
    /// (including sites the current scan does not mention at all).
    /// Holds the *previous* scan's outcome for each such site.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub removed_found: Vec<CheckOutcome>,
    /// Sites present in both scans whose verdict changed.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub verdict_changes: Vec<VerdictChange>,
    /// Found sites whose normalized profile evidence changed.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub evidence_changes: Vec<EvidenceChange>,
}
344
/// A verdict transition for one site.
///
/// Only recorded for sites that appear in both scans being compared.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerdictChange {
    /// Site name.
    pub site: String,
    /// Previous verdict.
    pub before: MatchKind,
    /// Current verdict.
    pub after: MatchKind,
}
355
/// Profile evidence transition for one still-found site.
///
/// Recorded when either the legacy enrichment map or the normalized
/// profile evidence differs between two Found observations of the site.
/// Empty maps and lists are omitted from the JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceChange {
    /// Site name.
    pub site: String,
    /// Previous legacy enrichment fields.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub before_enrichment: BTreeMap<String, String>,
    /// Current legacy enrichment fields.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub after_enrichment: BTreeMap<String, String>,
    /// Previous normalized profile evidence.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub before_profile_evidence: Vec<ProfileEvidence>,
    /// Current normalized profile evidence.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub after_profile_evidence: Vec<ProfileEvidence>,
}
374
/// Historical view derived from a sequence of persisted scans.
///
/// Produced by [`build_scan_timeline`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanTimeline {
    /// Username shared by the scans used to build this timeline.
    ///
    /// Taken from the oldest scan; empty when no scans were supplied. The
    /// builder does not check that all scans share the same username.
    pub username: String,
    /// Number of scans considered.
    pub scan_count: usize,
    /// Oldest scan timestamp, when at least one scan was supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub from_ms: Option<u64>,
    /// Newest scan timestamp, when at least one scan was supplied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub to_ms: Option<u64>,
    /// Per-site lifecycle summary, ordered by site name.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub profiles: Vec<TimelineProfile>,
    /// Chronological lifecycle events.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub events: Vec<TimelineEvent>,
}
395
/// Per-site lifecycle state in a scan timeline.
///
/// A profile exists only for sites that were Found in at least one of the
/// supplied scans.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineProfile {
    /// Site name.
    pub site: String,
    /// Last known profile URL for the site.
    pub url: String,
    /// First scan timestamp where the profile was Found.
    pub first_seen_ms: u64,
    /// Most recent scan timestamp where the profile was Found.
    pub last_seen_ms: u64,
    /// Whether the profile was Found in the newest supplied scan. A scan
    /// that does not mention the site counts as not Found.
    pub present_in_latest: bool,
    /// Most recently recorded verdict for this site. Cleared to `None`
    /// when a scan that omits the site makes a present profile disappear.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_verdict: Option<MatchKind>,
}
413
/// Timeline event category.
///
/// Serialised in `snake_case` (e.g. `first_seen`, `evidence_changed`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TimelineEventKind {
    /// Site was Found for the first time in the supplied scan sequence.
    FirstSeen,
    /// Site was Found before, then no longer Found (either a non-Found
    /// verdict or the site missing from the scan).
    Disappeared,
    /// Site was absent/not found after a previous hit, then Found again.
    Reappeared,
    /// Site stayed Found but normalized profile evidence changed.
    EvidenceChanged,
}
427
/// One lifecycle event for a profile across scans.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineEvent {
    /// Scan id where the event was observed.
    pub scan_id: ScanId,
    /// Scan start timestamp (the observing scan's `created_at_ms`).
    pub at_ms: u64,
    /// Site name.
    pub site: String,
    /// Best URL known for the site at this point in the timeline.
    pub url: String,
    /// Event category.
    pub kind: TimelineEventKind,
    /// Previous verdict, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub before: Option<MatchKind>,
    /// Current verdict, when the current scan mentioned the site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub after: Option<MatchKind>,
    /// Evidence transition for [`TimelineEventKind::EvidenceChanged`];
    /// `None` for every other event kind.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub evidence_change: Option<EvidenceChange>,
}
451
452/// Compare two persisted scans.
453///
454/// The diff is intentionally conservative: `added_found` and
455/// `removed_found` are based only on the `Found` verdict, while
456/// `evidence_changes` are reported only for sites that are Found in both
457/// scans.
458#[must_use]
459pub fn diff_scans(previous: &PersistedScan, current: &PersistedScan) -> ScanDiff {
460    let previous_by_site = outcomes_by_site(&previous.outcomes);
461    let current_by_site = outcomes_by_site(&current.outcomes);
462
463    let mut added_found = Vec::new();
464    let mut removed_found = Vec::new();
465    let mut verdict_changes = Vec::new();
466    let mut evidence_changes = Vec::new();
467
468    for (site, current_outcome) in &current_by_site {
469        let previous_outcome = previous_by_site.get(site);
470        if current_outcome.kind == MatchKind::Found
471            && previous_outcome.is_none_or(|o| o.kind != MatchKind::Found)
472        {
473            added_found.push((*current_outcome).clone());
474        }
475        if let Some(previous_outcome) = previous_outcome {
476            if previous_outcome.kind != current_outcome.kind {
477                verdict_changes.push(VerdictChange {
478                    site: site.clone(),
479                    before: previous_outcome.kind,
480                    after: current_outcome.kind,
481                });
482            }
483            if previous_outcome.kind == MatchKind::Found
484                && current_outcome.kind == MatchKind::Found
485                && profile_evidence_changed(previous_outcome, current_outcome)
486            {
487                evidence_changes.push(EvidenceChange {
488                    site: site.clone(),
489                    before_enrichment: previous_outcome.enrichment.clone(),
490                    after_enrichment: current_outcome.enrichment.clone(),
491                    before_profile_evidence: previous_outcome.profile_evidence.clone(),
492                    after_profile_evidence: current_outcome.profile_evidence.clone(),
493                });
494            }
495        }
496    }
497
498    for (site, previous_outcome) in &previous_by_site {
499        if previous_outcome.kind == MatchKind::Found
500            && current_by_site
501                .get(site)
502                .is_none_or(|o| o.kind != MatchKind::Found)
503        {
504            removed_found.push((*previous_outcome).clone());
505        }
506    }
507
508    ScanDiff {
509        from_scan_id: previous.scan_id.clone(),
510        to_scan_id: current.scan_id.clone(),
511        added_found,
512        removed_found,
513        verdict_changes,
514        evidence_changes,
515    }
516}
517
518/// Build a chronological timeline from persisted scans.
519///
520/// Scans may be supplied in any order; the builder sorts them oldest-first.
521/// Only `Found` outcomes create profiles. A later non-Found or missing site
522/// creates a disappearance event if the profile was previously present.
523#[must_use]
524pub fn build_scan_timeline(scans: &[PersistedScan]) -> ScanTimeline {
525    let mut ordered: Vec<&PersistedScan> = scans.iter().collect();
526    ordered.sort_by(|left, right| {
527        left.created_at_ms
528            .cmp(&right.created_at_ms)
529            .then_with(|| left.scan_id.as_str().cmp(right.scan_id.as_str()))
530    });
531
532    let username = ordered
533        .first()
534        .map(|scan| scan.username.clone())
535        .unwrap_or_default();
536    let from_ms = ordered.first().map(|scan| scan.created_at_ms);
537    let to_ms = ordered.last().map(|scan| scan.created_at_ms);
538    let mut states: BTreeMap<String, TimelineProfileState> = BTreeMap::new();
539    let mut events = Vec::new();
540
541    for scan in &ordered {
542        let current_by_site = outcomes_by_site(&scan.outcomes);
543        let sites = timeline_site_names(&states, &current_by_site);
544
545        for site in sites {
546            apply_timeline_site(
547                scan,
548                &site,
549                current_by_site.get(&site).copied(),
550                &mut states,
551                &mut events,
552            );
553        }
554    }
555
556    let profiles = states
557        .into_iter()
558        .map(|(site, state)| TimelineProfile {
559            site,
560            url: state.url,
561            first_seen_ms: state.first_seen_ms,
562            last_seen_ms: state.last_seen_ms,
563            present_in_latest: state.present_in_latest,
564            last_verdict: state.last_verdict,
565        })
566        .collect();
567
568    ScanTimeline {
569        username,
570        scan_count: ordered.len(),
571        from_ms,
572        to_ms,
573        profiles,
574        events,
575    }
576}
577
578fn timeline_site_names(
579    states: &BTreeMap<String, TimelineProfileState>,
580    current_by_site: &BTreeMap<String, &CheckOutcome>,
581) -> Vec<String> {
582    let mut sites: Vec<String> = states.keys().cloned().collect();
583    for site in current_by_site.keys() {
584        if !states.contains_key(site.as_str()) {
585            sites.push((*site).clone());
586        }
587    }
588    sites.sort();
589    sites.dedup();
590    sites
591}
592
593fn apply_timeline_site(
594    scan: &PersistedScan,
595    site: &str,
596    current: Option<&CheckOutcome>,
597    states: &mut BTreeMap<String, TimelineProfileState>,
598    events: &mut Vec<TimelineEvent>,
599) {
600    let current_kind = current.map(|outcome| outcome.kind);
601    let was_present = states
602        .get(site)
603        .is_some_and(|state| state.present_in_latest);
604
605    if current_kind == Some(MatchKind::Found) {
606        apply_found_timeline_site(scan, site, current.expect("found outcome"), states, events);
607    } else if was_present {
608        apply_disappeared_timeline_site(scan, site, current, current_kind, states, events);
609    } else if let (Some(state), Some(outcome)) = (states.get_mut(site), current) {
610        state.last_verdict = Some(outcome.kind);
611        state.url.clone_from(&outcome.url);
612    }
613}
614
615fn apply_found_timeline_site(
616    scan: &PersistedScan,
617    site: &str,
618    outcome: &CheckOutcome,
619    states: &mut BTreeMap<String, TimelineProfileState>,
620    events: &mut Vec<TimelineEvent>,
621) {
622    let current_kind = Some(outcome.kind);
623    let had_state = states.contains_key(site);
624    let was_present = states
625        .get(site)
626        .is_some_and(|state| state.present_in_latest);
627    let state = states
628        .entry(site.to_owned())
629        .or_insert_with(|| TimelineProfileState::new(outcome, scan.created_at_ms));
630
631    if !had_state {
632        events.push(timeline_event(
633            scan,
634            site,
635            &outcome.url,
636            TimelineEventKind::FirstSeen,
637            None,
638            current_kind,
639            None,
640        ));
641    } else if !was_present {
642        events.push(timeline_event(
643            scan,
644            site,
645            &outcome.url,
646            TimelineEventKind::Reappeared,
647            state.last_verdict,
648            current_kind,
649            None,
650        ));
651    } else if state.profile_evidence_changed(outcome) {
652        events.push(timeline_event(
653            scan,
654            site,
655            &outcome.url,
656            TimelineEventKind::EvidenceChanged,
657            Some(MatchKind::Found),
658            current_kind,
659            Some(EvidenceChange {
660                site: site.to_owned(),
661                before_enrichment: state.last_found_enrichment.clone(),
662                after_enrichment: outcome.enrichment.clone(),
663                before_profile_evidence: state.last_found_profile_evidence.clone(),
664                after_profile_evidence: outcome.profile_evidence.clone(),
665            }),
666        ));
667    }
668
669    states
670        .get_mut(site)
671        .expect("state inserted before found update")
672        .update_found(outcome, scan.created_at_ms);
673}
674
675fn apply_disappeared_timeline_site(
676    scan: &PersistedScan,
677    site: &str,
678    current: Option<&CheckOutcome>,
679    current_kind: Option<MatchKind>,
680    states: &mut BTreeMap<String, TimelineProfileState>,
681    events: &mut Vec<TimelineEvent>,
682) {
683    let state = states
684        .get_mut(site)
685        .expect("present state exists before disappearance");
686    let url = current.map_or_else(|| state.url.clone(), |outcome| outcome.url.clone());
687    events.push(timeline_event(
688        scan,
689        site,
690        &url,
691        TimelineEventKind::Disappeared,
692        state.last_verdict,
693        current_kind,
694        None,
695    ));
696    state.present_in_latest = false;
697    state.last_verdict = current_kind;
698    if let Some(outcome) = current {
699        state.url.clone_from(&outcome.url);
700    }
701}
702
703fn timeline_event(
704    scan: &PersistedScan,
705    site: &str,
706    url: &str,
707    kind: TimelineEventKind,
708    before: Option<MatchKind>,
709    after: Option<MatchKind>,
710    evidence_change: Option<EvidenceChange>,
711) -> TimelineEvent {
712    TimelineEvent {
713        scan_id: scan.scan_id.clone(),
714        at_ms: scan.created_at_ms,
715        site: site.to_owned(),
716        url: url.to_owned(),
717        kind,
718        before,
719        after,
720        evidence_change,
721    }
722}
723
/// Mutable per-site accumulator used while building a scan timeline.
///
/// Created the first time a site is Found and converted into a
/// [`TimelineProfile`] once all scans have been processed.
#[derive(Debug, Clone)]
struct TimelineProfileState {
    /// Last known profile URL for the site.
    url: String,
    /// Timestamp of the scan that first Found the profile.
    first_seen_ms: u64,
    /// Timestamp of the most recent scan that Found the profile.
    last_seen_ms: u64,
    /// Whether the profile was Found in the most recently processed scan.
    present_in_latest: bool,
    /// Most recently recorded verdict, if any.
    last_verdict: Option<MatchKind>,
    /// Enrichment captured at the last Found observation; baseline for
    /// detecting evidence changes.
    last_found_enrichment: BTreeMap<String, String>,
    /// Profile evidence captured at the last Found observation; baseline
    /// for detecting evidence changes.
    last_found_profile_evidence: Vec<ProfileEvidence>,
}
734
735impl TimelineProfileState {
736    fn new(outcome: &CheckOutcome, at_ms: u64) -> Self {
737        Self {
738            url: outcome.url.clone(),
739            first_seen_ms: at_ms,
740            last_seen_ms: at_ms,
741            present_in_latest: true,
742            last_verdict: Some(outcome.kind),
743            last_found_enrichment: outcome.enrichment.clone(),
744            last_found_profile_evidence: outcome.profile_evidence.clone(),
745        }
746    }
747
748    fn update_found(&mut self, outcome: &CheckOutcome, at_ms: u64) {
749        self.url.clone_from(&outcome.url);
750        self.last_seen_ms = at_ms;
751        self.present_in_latest = true;
752        self.last_verdict = Some(outcome.kind);
753        self.last_found_enrichment = outcome.enrichment.clone();
754        self.last_found_profile_evidence
755            .clone_from(&outcome.profile_evidence);
756    }
757
758    fn profile_evidence_changed(&self, outcome: &CheckOutcome) -> bool {
759        self.last_found_enrichment != outcome.enrichment
760            || self.last_found_profile_evidence != outcome.profile_evidence
761    }
762}
763
764fn outcomes_by_site(outcomes: &[CheckOutcome]) -> BTreeMap<String, &CheckOutcome> {
765    outcomes
766        .iter()
767        .map(|outcome| (outcome.site.clone(), outcome))
768        .collect()
769}
770
771fn profile_evidence_changed(previous: &CheckOutcome, current: &CheckOutcome) -> bool {
772    previous.enrichment != current.enrichment
773        || previous.profile_evidence != current.profile_evidence
774}
775
/// Default directory for persisted scans.
///
/// Mirrors [`adler_core::Cache::default_path`]'s discovery rules:
/// `$XDG_CACHE_HOME/adler/scans/` → `$HOME/.cache/adler/scans/` →
/// a relative fallback (`adler-scans`). The directory is created lazily
/// on first save.
///
/// A variable that is set but empty is treated as unset, as the XDG Base
/// Directory specification requires. Otherwise an empty `XDG_CACHE_HOME`
/// would silently resolve to the relative path `adler/scans`.
#[must_use]
pub fn default_dir() -> PathBuf {
    let non_empty_var = |name: &str| std::env::var_os(name).filter(|value| !value.is_empty());

    if let Some(xdg) = non_empty_var("XDG_CACHE_HOME") {
        return PathBuf::from(xdg).join("adler").join("scans");
    }
    if let Some(home) = non_empty_var("HOME") {
        return PathBuf::from(home)
            .join(".cache")
            .join("adler")
            .join("scans");
    }
    PathBuf::from("adler-scans")
}
794
795/// Save `scan` to `<dir>/<id>.json` atomically. Creates `dir` if missing.
796pub(crate) async fn save(dir: &Path, scan: &PersistedScan) -> Result<()> {
797    fs::create_dir_all(dir).await.map_err(Error::Persist)?;
798    let path = dir.join(format!("{}.json", scan.scan_id));
799    let tmp = dir.join(format!("{}.json.tmp", scan.scan_id));
800    let mut scan = scan.clone();
801    scan.refresh_derived_fields();
802    let body = serde_json::to_vec_pretty(&scan).map_err(Error::PersistEncode)?;
803    fs::write(&tmp, &body).await.map_err(Error::Persist)?;
804    fs::rename(&tmp, &path).await.map_err(Error::Persist)?;
805    Ok(())
806}
807
808/// Read one scan from disk by id. Returns `None` on any I/O or parse
809/// error — callers should treat a missing scan as not-found rather
810/// than propagate the underlying cause.
811pub(crate) async fn load(dir: &Path, scan_id: &ScanId) -> Option<PersistedScan> {
812    let path = dir.join(format!("{scan_id}.json"));
813    let bytes = fs::read(&path).await.ok()?;
814    serde_json::from_slice(&bytes)
815        .ok()
816        .map(refresh_derived_fields)
817}
818
819/// Enumerate every persisted scan, newest first. Files that fail to
820/// parse are silently skipped — a corrupted file shouldn't break the
821/// whole listing.
822pub(crate) async fn load_all(dir: &Path) -> Vec<PersistedScan> {
823    let Ok(mut entries) = fs::read_dir(dir).await else {
824        return Vec::new();
825    };
826    let mut out = Vec::new();
827    while let Ok(Some(entry)) = entries.next_entry().await {
828        let path = entry.path();
829        if path.extension().and_then(|s| s.to_str()) != Some("json") {
830            continue;
831        }
832        let Ok(bytes) = fs::read(&path).await else {
833            continue;
834        };
835        let Ok(scan) = serde_json::from_slice::<PersistedScan>(&bytes) else {
836            continue;
837        };
838        out.push(refresh_derived_fields(scan));
839    }
840    out.sort_by_key(|s| std::cmp::Reverse(s.created_at_ms));
841    out
842}
843
844fn refresh_derived_fields(mut scan: PersistedScan) -> PersistedScan {
845    scan.refresh_derived_fields();
846    scan
847}
848
849/// Delete scans beyond `keep_newest`. Newest-by-`created_at_ms` wins.
850/// Returns the number of files actually removed.
851pub(crate) async fn prune(dir: &Path, keep_newest: usize) -> usize {
852    let scans = load_all(dir).await;
853    if scans.len() <= keep_newest {
854        return 0;
855    }
856    let mut removed = 0;
857    for s in &scans[keep_newest..] {
858        let path = dir.join(format!("{}.json", s.scan_id));
859        if fs::remove_file(&path).await.is_ok() {
860            removed += 1;
861        }
862    }
863    removed
864}
865
866#[cfg(test)]
867mod tests {
868    use super::*;
869    use adler_core::{
870        ConfidenceLabel, ConfidenceReason, EvidenceAccessPath, MatchKind, ProfileEvidence,
871        TransportTier, UncertainReason,
872    };
873    use std::collections::BTreeMap;
874    use tempfile::TempDir;
875
876    fn sample(scan_id: &str, ts: u64) -> PersistedScan {
877        PersistedScan {
878            schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
879            scan_id: ScanId::from(scan_id.to_owned()),
880            username: "alice".into(),
881            request_context: None,
882            site_count: 2,
883            created_at_ms: ts,
884            summary: Summary {
885                found: 1,
886                not_found: 1,
887                uncertain: 0,
888            },
889            outcomes: vec![
890                CheckOutcome {
891                    site: "GitHub".into(),
892                    url: "https://github.com/alice".into(),
893                    kind: MatchKind::Found,
894                    reason: None,
895                    elapsed_ms: 120,
896                    enrichment: BTreeMap::new(),
897                    evidence: vec!["HTTP 200 (status_found)".into()],
898                    profile_evidence: Vec::new(),
899                    confidence: adler_core::ConfidenceScore::default(),
900                    transport: None,
901                    escalations: 0,
902                },
903                CheckOutcome {
904                    site: "GitLab".into(),
905                    url: "https://gitlab.com/alice".into(),
906                    kind: MatchKind::NotFound,
907                    reason: None,
908                    elapsed_ms: 90,
909                    enrichment: BTreeMap::new(),
910                    evidence: vec!["HTTP 404 (status_not_found)".into()],
911                    profile_evidence: Vec::new(),
912                    confidence: adler_core::ConfidenceScore::default(),
913                    transport: None,
914                    escalations: 0,
915                },
916            ],
917            identity_clusters: Vec::new(),
918            elapsed_ms: 210,
919        }
920    }
921
922    fn outcome(site: &str, kind: MatchKind) -> CheckOutcome {
923        CheckOutcome {
924            site: site.into(),
925            url: format!("https://{site}.example/alice"),
926            kind,
927            reason: None,
928            elapsed_ms: 10,
929            enrichment: BTreeMap::new(),
930            evidence: Vec::new(),
931            profile_evidence: Vec::new(),
932            confidence: adler_core::ConfidenceScore::default(),
933            transport: None,
934            escalations: 0,
935        }
936    }
937
938    fn scan_with_outcomes(
939        scan_id: &str,
940        username: &str,
941        ts: u64,
942        outcomes: Vec<CheckOutcome>,
943    ) -> PersistedScan {
944        PersistedScan {
945            schema_version: PERSISTED_SCAN_SCHEMA_VERSION,
946            scan_id: ScanId::from(scan_id.to_owned()),
947            username: username.to_owned(),
948            request_context: None,
949            site_count: outcomes.len(),
950            created_at_ms: ts,
951            summary: Summary::from_outcomes(&outcomes),
952            outcomes,
953            identity_clusters: Vec::new(),
954            elapsed_ms: 10,
955        }
956    }
957
958    fn found_with_website(site: &str, website: &str) -> CheckOutcome {
959        found_with_website_at(site, website, None)
960    }
961
962    fn found_with_website_at(
963        site: &str,
964        website: &str,
965        observed_at_ms: Option<u64>,
966    ) -> CheckOutcome {
967        let mut outcome = outcome(site, MatchKind::Found);
968        outcome
969            .profile_evidence
970            .push(ProfileEvidence::from_enrichment_with_source(
971                site,
972                &outcome.url,
973                "website",
974                website,
975                observed_at_ms,
976                None,
977            ));
978        outcome
979    }
980
981    fn has_historical_reason(outcome: &CheckOutcome, count: usize) -> bool {
982        outcome.confidence.reasons.iter().any(|reason| {
983            matches!(
984                reason,
985                ConfidenceReason::HistoricalConsistency { count: actual } if *actual == count
986            )
987        })
988    }
989
990    fn large_outcomes(count: usize, generation: usize) -> Vec<CheckOutcome> {
991        (0..count)
992            .map(|idx| large_outcome(idx, generation))
993            .collect()
994    }
995
996    fn large_outcome(idx: usize, generation: usize) -> CheckOutcome {
997        let site = format!("LargeSite{idx:04}");
998        let url = format!("https://large{idx:04}.example/alice");
999        let mut kind = match idx % 20 {
1000            0 | 1 => MatchKind::Found,
1001            3 => MatchKind::Uncertain,
1002            _ => MatchKind::NotFound,
1003        };
1004        if generation > 0 && idx % 20 == 0 {
1005            kind = MatchKind::NotFound;
1006        } else if generation > 0 && idx % 20 == 2 {
1007            kind = MatchKind::Found;
1008        }
1009
1010        let mut outcome = CheckOutcome {
1011            site: site.clone(),
1012            url: url.clone(),
1013            kind,
1014            reason: (kind == MatchKind::Uncertain).then_some(UncertainReason::RateLimited),
1015            elapsed_ms: 10 + (idx % 75) as u64,
1016            enrichment: BTreeMap::new(),
1017            evidence: Vec::new(),
1018            profile_evidence: Vec::new(),
1019            confidence: adler_core::ConfidenceScore::default(),
1020            transport: Some(if idx % 7 == 0 {
1021                TransportTier::Browser
1022            } else {
1023                TransportTier::Http
1024            }),
1025            escalations: u8::from(idx % 7 == 0),
1026        };
1027
1028        match kind {
1029            MatchKind::Found => {
1030                let observed_at_ms = 1_781_192_451_000 + generation as u64 * 1_000 + idx as u64;
1031                let website = format!("https://identity-{:02}.example", idx % 25);
1032                let name = format!("Alice Group {:02}", idx % 50);
1033                let bio = if generation > 0 && idx % 20 == 1 {
1034                    format!("updated profile generation {generation} for {idx}")
1035                } else {
1036                    format!("stable profile generation 0 for {idx}")
1037                };
1038                for (field, value) in [
1039                    ("website", website.as_str()),
1040                    ("name", name.as_str()),
1041                    ("bio", bio.as_str()),
1042                ] {
1043                    outcome
1044                        .enrichment
1045                        .insert(field.to_owned(), value.to_owned());
1046                    outcome
1047                        .profile_evidence
1048                        .push(ProfileEvidence::from_enrichment_with_source(
1049                            &site,
1050                            &url,
1051                            field,
1052                            value,
1053                            Some(observed_at_ms),
1054                            Some(EvidenceAccessPath::new(
1055                                outcome.transport.unwrap_or(TransportTier::Http),
1056                                outcome.escalations,
1057                                idx % 11 == 0,
1058                            )),
1059                        ));
1060                }
1061                outcome.evidence = vec![
1062                    "HTTP 200 (status_found)".to_owned(),
1063                    "body matched profile marker".to_owned(),
1064                ];
1065            }
1066            MatchKind::NotFound => {
1067                outcome.evidence = vec!["HTTP 404 (status_not_found)".to_owned()];
1068            }
1069            MatchKind::Uncertain => {}
1070        }
1071        outcome.refresh_confidence();
1072        outcome
1073    }
1074
1075    fn large_persisted_scan(scan_id: &str, generation: usize) -> PersistedScan {
1076        let outcomes = large_outcomes(2_500, generation);
1077        let finished = FinishedScan {
1078            summary: Summary::from_outcomes(&outcomes),
1079            identity_clusters: adler_core::build_identity_clusters("alice", &outcomes),
1080            elapsed_ms: 30_000 + generation as u64,
1081            outcomes,
1082        };
1083        PersistedScan::from_finished(
1084            ScanId::from(scan_id.to_owned()),
1085            "alice".to_owned(),
1086            2_500,
1087            1_781_192_451_000 + generation as u64 * 10_000,
1088            finished,
1089        )
1090    }
1091
1092    #[tokio::test]
1093    async fn save_then_load_roundtrips() {
1094        let tmp = TempDir::new().unwrap();
1095        let s = sample("abc123", 1_700_000_000_000);
1096        save(tmp.path(), &s).await.unwrap();
1097
1098        let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1099        assert_eq!(loaded.scan_id, s.scan_id);
1100        assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1101        assert_eq!(loaded.username, "alice");
1102        assert_eq!(loaded.outcomes.len(), 2);
1103        assert_eq!(loaded.outcomes[0].site, "GitHub");
1104        assert_eq!(loaded.summary.found, 1);
1105    }
1106
1107    #[test]
1108    fn historical_overlay_adds_reason_after_two_prior_stable_found_observations() {
1109        let mut current = scan_with_outcomes(
1110            "current",
1111            "alice",
1112            30,
1113            vec![found_with_website("GitHub", "https://alice.dev")],
1114        );
1115        let previous = scan_with_outcomes(
1116            "previous",
1117            "alice",
1118            20,
1119            vec![found_with_website("GitHub", "https://alice.dev")],
1120        );
1121        let older = scan_with_outcomes(
1122            "older",
1123            "alice",
1124            10,
1125            vec![found_with_website("GitHub", "https://alice.dev")],
1126        );
1127
1128        apply_historical_confidence_overlay(&mut current, &[previous, older]);
1129
1130        assert!(has_historical_reason(&current.outcomes[0], 2));
1131        assert_eq!(current.outcomes[0].confidence.score, 79);
1132    }
1133
1134    #[test]
1135    fn historical_overlay_ignores_single_prior_found() {
1136        let mut current = scan_with_outcomes(
1137            "current",
1138            "alice",
1139            20,
1140            vec![found_with_website("GitHub", "https://alice.dev")],
1141        );
1142        let previous = scan_with_outcomes(
1143            "previous",
1144            "alice",
1145            10,
1146            vec![found_with_website("GitHub", "https://alice.dev")],
1147        );
1148
1149        apply_historical_confidence_overlay(&mut current, &[previous]);
1150
1151        assert!(!has_historical_reason(&current.outcomes[0], 1));
1152        assert_eq!(current.outcomes[0].confidence.score, 75);
1153    }
1154
1155    #[test]
1156    fn historical_overlay_resets_on_explicit_non_found() {
1157        let mut current = scan_with_outcomes(
1158            "current",
1159            "alice",
1160            40,
1161            vec![found_with_website("GitHub", "https://alice.dev")],
1162        );
1163        let previous = scan_with_outcomes(
1164            "previous",
1165            "alice",
1166            30,
1167            vec![outcome("GitHub", MatchKind::NotFound)],
1168        );
1169        let older = scan_with_outcomes(
1170            "older",
1171            "alice",
1172            20,
1173            vec![found_with_website("GitHub", "https://alice.dev")],
1174        );
1175        let oldest = scan_with_outcomes(
1176            "oldest",
1177            "alice",
1178            10,
1179            vec![found_with_website("GitHub", "https://alice.dev")],
1180        );
1181
1182        apply_historical_confidence_overlay(&mut current, &[previous, older, oldest]);
1183
1184        assert!(!has_historical_reason(&current.outcomes[0], 2));
1185        assert_eq!(current.outcomes[0].confidence.score, 75);
1186    }
1187
1188    #[test]
1189    fn historical_overlay_ignores_source_timestamp_changes() {
1190        let mut current = scan_with_outcomes(
1191            "current",
1192            "alice",
1193            30,
1194            vec![found_with_website_at(
1195                "GitHub",
1196                "https://alice.dev",
1197                Some(30),
1198            )],
1199        );
1200        let previous = scan_with_outcomes(
1201            "previous",
1202            "alice",
1203            20,
1204            vec![found_with_website_at(
1205                "GitHub",
1206                "https://alice.dev",
1207                Some(20),
1208            )],
1209        );
1210        let older = scan_with_outcomes(
1211            "older",
1212            "alice",
1213            10,
1214            vec![found_with_website_at(
1215                "GitHub",
1216                "https://alice.dev",
1217                Some(10),
1218            )],
1219        );
1220
1221        apply_historical_confidence_overlay(&mut current, &[previous, older]);
1222
1223        assert!(has_historical_reason(&current.outcomes[0], 2));
1224    }
1225
1226    #[test]
1227    fn historical_overlay_adds_cluster_co_occurrence_reason() {
1228        let mut current = scan_with_outcomes(
1229            "current",
1230            "alice",
1231            30,
1232            vec![
1233                found_with_website("GitHub", "https://alice.dev"),
1234                found_with_website("GitLab", "https://alice.dev"),
1235            ],
1236        );
1237        let previous = scan_with_outcomes(
1238            "previous",
1239            "alice",
1240            20,
1241            vec![
1242                found_with_website("GitHub", "https://alice.dev"),
1243                found_with_website("GitLab", "https://alice.dev"),
1244            ],
1245        );
1246        let older = scan_with_outcomes(
1247            "older",
1248            "alice",
1249            10,
1250            vec![
1251                found_with_website("GitHub", "https://alice.dev"),
1252                found_with_website("GitLab", "https://alice.dev"),
1253            ],
1254        );
1255
1256        apply_historical_confidence_overlay(&mut current, &[previous, older]);
1257
1258        assert_eq!(current.identity_clusters.len(), 1);
1259        assert_eq!(current.identity_clusters[0].confidence, 95);
1260        assert!(
1261            current.identity_clusters[0]
1262                .reasons
1263                .contains(&adler_core::ClusterReason::HistoricalCoOccurrence)
1264        );
1265    }
1266
1267    #[test]
1268    fn weak_status_only_result_remains_medium_capped_with_history() {
1269        let mut current_outcome = outcome("GitHub", MatchKind::Found);
1270        current_outcome.evidence = vec!["HTTP 200 (status_found)".to_owned()];
1271        let mut previous_outcome = outcome("GitHub", MatchKind::Found);
1272        previous_outcome.evidence = current_outcome.evidence.clone();
1273        let mut older_outcome = outcome("GitHub", MatchKind::Found);
1274        older_outcome.evidence = current_outcome.evidence.clone();
1275
1276        let mut current = scan_with_outcomes("current", "alice", 30, vec![current_outcome]);
1277        let previous = scan_with_outcomes("previous", "alice", 20, vec![previous_outcome]);
1278        let older = scan_with_outcomes("older", "alice", 10, vec![older_outcome]);
1279
1280        apply_historical_confidence_overlay(&mut current, &[previous, older]);
1281
1282        assert!(has_historical_reason(&current.outcomes[0], 2));
1283        assert_eq!(
1284            current.outcomes[0].confidence.label,
1285            ConfidenceLabel::Medium
1286        );
1287        assert_eq!(current.outcomes[0].confidence.score, 70);
1288    }
1289
1290    #[tokio::test]
1291    async fn historical_overlay_does_not_rewrite_persisted_json() {
1292        let tmp = TempDir::new().unwrap();
1293        let current = scan_with_outcomes(
1294            "current",
1295            "alice",
1296            30,
1297            vec![found_with_website("GitHub", "https://alice.dev")],
1298        );
1299        let previous = scan_with_outcomes(
1300            "previous",
1301            "alice",
1302            20,
1303            vec![found_with_website("GitHub", "https://alice.dev")],
1304        );
1305        let older = scan_with_outcomes(
1306            "older",
1307            "alice",
1308            10,
1309            vec![found_with_website("GitHub", "https://alice.dev")],
1310        );
1311        save(tmp.path(), &current).await.unwrap();
1312        save(tmp.path(), &previous).await.unwrap();
1313        save(tmp.path(), &older).await.unwrap();
1314
1315        let current_path = tmp.path().join("current.json");
1316        let before = fs::read(&current_path).await.unwrap();
1317        let related = load_all(tmp.path()).await;
1318        let mut loaded = load(tmp.path(), &ScanId::from("current".to_owned()))
1319            .await
1320            .unwrap();
1321
1322        apply_historical_confidence_overlay(&mut loaded, &related);
1323
1324        let after = fs::read(&current_path).await.unwrap();
1325        assert_eq!(before, after);
1326        assert!(has_historical_reason(&loaded.outcomes[0], 2));
1327    }
1328
1329    #[tokio::test]
1330    async fn save_writes_schema_version() {
1331        let tmp = TempDir::new().unwrap();
1332        let s = sample("abc123", 1_700_000_000_000);
1333        save(tmp.path(), &s).await.unwrap();
1334
1335        let raw = fs::read_to_string(tmp.path().join("abc123.json"))
1336            .await
1337            .unwrap();
1338        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1339        assert_eq!(
1340            value["schema_version"],
1341            serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
1342        );
1343    }
1344
1345    #[tokio::test]
1346    async fn save_skips_empty_identity_clusters() {
1347        let tmp = TempDir::new().unwrap();
1348        let s = sample("empty-clusters", 1_700_000_000_000);
1349        save(tmp.path(), &s).await.unwrap();
1350
1351        let raw = fs::read_to_string(tmp.path().join("empty-clusters.json"))
1352            .await
1353            .unwrap();
1354        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1355        assert_eq!(
1356            value["schema_version"],
1357            serde_json::json!(PERSISTED_SCAN_SCHEMA_VERSION)
1358        );
1359        assert!(
1360            value.get("identity_clusters").is_none(),
1361            "empty cluster cache should stay absent from persisted JSON"
1362        );
1363    }
1364
1365    #[tokio::test]
1366    async fn save_writes_derived_identity_clusters() {
1367        let tmp = TempDir::new().unwrap();
1368        let mut s = sample("clusters", 1_700_000_000_000);
1369        s.outcomes = vec![
1370            found_with_website("GitHub", "https://alice.dev"),
1371            found_with_website("GitLab", "https://alice.dev"),
1372        ];
1373
1374        save(tmp.path(), &s).await.unwrap();
1375
1376        let raw = fs::read_to_string(tmp.path().join("clusters.json"))
1377            .await
1378            .unwrap();
1379        let value: serde_json::Value = serde_json::from_str(&raw).unwrap();
1380        assert_eq!(value["identity_clusters"].as_array().unwrap().len(), 1);
1381        assert_eq!(
1382            value["identity_clusters"][0]["members"]
1383                .as_array()
1384                .unwrap()
1385                .len(),
1386            2
1387        );
1388    }
1389
1390    #[tokio::test]
1391    async fn save_roundtrips_request_context() {
1392        let tmp = TempDir::new().unwrap();
1393        let context = ScanRequestContext {
1394            username: "alice".into(),
1395            derived_from: Some(ScanId::from("previous".to_owned())),
1396            only: vec!["Git".into()],
1397            exclude: Vec::new(),
1398            tag: vec!["coding".into()],
1399            exclude_tag: vec!["nsfw".into()],
1400            top: Some(100),
1401            nsfw: false,
1402            concurrency: Some(8),
1403            deadline_secs: Some(30),
1404            egress_names: vec!["us-resi".into()],
1405            disabled_matches: vec![PersistedDisabledMatch {
1406                name: "Threads".into(),
1407                url: "https://www.threads.net/@{username}".into(),
1408                tags: vec!["social".into()],
1409                disabled_reason: "Honest Limits: login wall".into(),
1410            }],
1411        };
1412        let s = sample("ctx", 1_700_000_000_000).with_request_context(context.clone());
1413        save(tmp.path(), &s).await.unwrap();
1414
1415        let loaded = load(tmp.path(), &s.scan_id).await.expect("loaded");
1416        assert_eq!(loaded.request_context, Some(context));
1417    }
1418
1419    #[test]
1420    fn diff_scans_reports_added_removed_and_verdict_changes() {
1421        let mut previous = sample("old", 1_000);
1422        previous.outcomes = vec![
1423            outcome("GitHub", MatchKind::Found),
1424            outcome("Reddit", MatchKind::Found),
1425            outcome("Mastodon", MatchKind::NotFound),
1426        ];
1427        let mut current = sample("new", 2_000);
1428        current.outcomes = vec![
1429            outcome("GitHub", MatchKind::Found),
1430            outcome("Reddit", MatchKind::NotFound),
1431            outcome("Mastodon", MatchKind::Found),
1432        ];
1433
1434        let diff = diff_scans(&previous, &current);
1435
1436        assert_eq!(diff.from_scan_id.as_str(), "old");
1437        assert_eq!(diff.to_scan_id.as_str(), "new");
1438        assert_eq!(
1439            diff.added_found
1440                .iter()
1441                .map(|outcome| outcome.site.as_str())
1442                .collect::<Vec<_>>(),
1443            ["Mastodon"]
1444        );
1445        assert_eq!(
1446            diff.removed_found
1447                .iter()
1448                .map(|outcome| outcome.site.as_str())
1449                .collect::<Vec<_>>(),
1450            ["Reddit"]
1451        );
1452        assert_eq!(diff.verdict_changes.len(), 2);
1453        assert_eq!(diff.verdict_changes[0].site, "Mastodon");
1454        assert_eq!(diff.verdict_changes[0].before, MatchKind::NotFound);
1455        assert_eq!(diff.verdict_changes[0].after, MatchKind::Found);
1456        assert_eq!(diff.verdict_changes[1].site, "Reddit");
1457        assert!(diff.evidence_changes.is_empty());
1458    }
1459
1460    #[test]
1461    fn diff_scans_reports_profile_evidence_changes_for_still_found_sites() {
1462        let mut previous = sample("old", 1_000);
1463        let mut old_github = outcome("GitHub", MatchKind::Found);
1464        old_github.enrichment.insert("name".into(), "Alice".into());
1465        old_github
1466            .profile_evidence
1467            .push(adler_core::ProfileEvidence::from_enrichment(
1468                "GitHub",
1469                "https://github.example/alice",
1470                "name",
1471                "Alice",
1472            ));
1473        previous.outcomes = vec![old_github];
1474
1475        let mut current = sample("new", 2_000);
1476        let mut new_github = outcome("GitHub", MatchKind::Found);
1477        new_github
1478            .enrichment
1479            .insert("name".into(), "Alice Liddell".into());
1480        new_github
1481            .profile_evidence
1482            .push(adler_core::ProfileEvidence::from_enrichment(
1483                "GitHub",
1484                "https://github.example/alice",
1485                "name",
1486                "Alice Liddell",
1487            ));
1488        current.outcomes = vec![new_github];
1489
1490        let diff = diff_scans(&previous, &current);
1491
1492        assert!(diff.added_found.is_empty());
1493        assert!(diff.removed_found.is_empty());
1494        assert!(diff.verdict_changes.is_empty());
1495        assert_eq!(diff.evidence_changes.len(), 1);
1496        assert_eq!(diff.evidence_changes[0].site, "GitHub");
1497        assert_eq!(
1498            diff.evidence_changes[0]
1499                .before_enrichment
1500                .get("name")
1501                .unwrap(),
1502            "Alice"
1503        );
1504        assert_eq!(
1505            diff.evidence_changes[0]
1506                .after_enrichment
1507                .get("name")
1508                .unwrap(),
1509            "Alice Liddell"
1510        );
1511    }
1512
1513    #[test]
1514    fn timeline_tracks_first_seen_disappeared_and_reappeared() {
1515        let mut first = sample("first", 1_000);
1516        first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1517        let mut second = sample("second", 2_000);
1518        second.outcomes = vec![outcome("GitHub", MatchKind::NotFound)];
1519        let mut third = sample("third", 3_000);
1520        third.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1521
1522        let timeline = build_scan_timeline(&[third, first, second]);
1523
1524        assert_eq!(timeline.username, "alice");
1525        assert_eq!(timeline.scan_count, 3);
1526        assert_eq!(timeline.from_ms, Some(1_000));
1527        assert_eq!(timeline.to_ms, Some(3_000));
1528        assert_eq!(timeline.profiles.len(), 1);
1529        assert_eq!(timeline.profiles[0].site, "GitHub");
1530        assert_eq!(timeline.profiles[0].first_seen_ms, 1_000);
1531        assert_eq!(timeline.profiles[0].last_seen_ms, 3_000);
1532        assert!(timeline.profiles[0].present_in_latest);
1533        assert_eq!(
1534            timeline
1535                .events
1536                .iter()
1537                .map(|event| event.kind)
1538                .collect::<Vec<_>>(),
1539            [
1540                TimelineEventKind::FirstSeen,
1541                TimelineEventKind::Disappeared,
1542                TimelineEventKind::Reappeared
1543            ]
1544        );
1545        assert_eq!(timeline.events[1].before, Some(MatchKind::Found));
1546        assert_eq!(timeline.events[1].after, Some(MatchKind::NotFound));
1547    }
1548
1549    #[test]
1550    fn timeline_treats_missing_site_as_disappeared() {
1551        let mut first = sample("first", 1_000);
1552        first.outcomes = vec![outcome("GitHub", MatchKind::Found)];
1553        let mut second = sample("second", 2_000);
1554        second.outcomes = vec![outcome("GitLab", MatchKind::NotFound)];
1555
1556        let timeline = build_scan_timeline(&[first, second]);
1557
1558        assert_eq!(timeline.profiles.len(), 1);
1559        assert!(!timeline.profiles[0].present_in_latest);
1560        assert_eq!(timeline.events.len(), 2);
1561        assert_eq!(timeline.events[1].kind, TimelineEventKind::Disappeared);
1562        assert_eq!(timeline.events[1].site, "GitHub");
1563        assert_eq!(timeline.events[1].after, None);
1564    }
1565
1566    #[test]
1567    fn timeline_tracks_evidence_changes_for_still_found_profile() {
1568        let mut first = sample("first", 1_000);
1569        let mut old_github = outcome("GitHub", MatchKind::Found);
1570        old_github.enrichment.insert("name".into(), "Alice".into());
1571        old_github
1572            .profile_evidence
1573            .push(adler_core::ProfileEvidence::from_enrichment(
1574                "GitHub",
1575                "https://github.example/alice",
1576                "name",
1577                "Alice",
1578            ));
1579        first.outcomes = vec![old_github];
1580
1581        let mut second = sample("second", 2_000);
1582        let mut new_github = outcome("GitHub", MatchKind::Found);
1583        new_github
1584            .enrichment
1585            .insert("name".into(), "Alice Liddell".into());
1586        new_github
1587            .profile_evidence
1588            .push(adler_core::ProfileEvidence::from_enrichment(
1589                "GitHub",
1590                "https://github.example/alice",
1591                "name",
1592                "Alice Liddell",
1593            ));
1594        second.outcomes = vec![new_github];
1595
1596        let timeline = build_scan_timeline(&[first, second]);
1597
1598        assert_eq!(
1599            timeline
1600                .events
1601                .iter()
1602                .map(|event| event.kind)
1603                .collect::<Vec<_>>(),
1604            [
1605                TimelineEventKind::FirstSeen,
1606                TimelineEventKind::EvidenceChanged
1607            ]
1608        );
1609        let evidence_change = timeline.events[1].evidence_change.as_ref().unwrap();
1610        assert_eq!(
1611            evidence_change.before_enrichment.get("name").unwrap(),
1612            "Alice"
1613        );
1614        assert_eq!(
1615            evidence_change.after_enrichment.get("name").unwrap(),
1616            "Alice Liddell"
1617        );
1618    }
1619
1620    #[tokio::test]
1621    async fn load_all_returns_newest_first() {
1622        let tmp = TempDir::new().unwrap();
1623        save(tmp.path(), &sample("old", 1_000)).await.unwrap();
1624        save(tmp.path(), &sample("mid", 2_000)).await.unwrap();
1625        save(tmp.path(), &sample("new", 3_000)).await.unwrap();
1626        let all = load_all(tmp.path()).await;
1627        assert_eq!(all.len(), 3);
1628        assert_eq!(all[0].scan_id.as_str(), "new");
1629        assert_eq!(all[1].scan_id.as_str(), "mid");
1630        assert_eq!(all[2].scan_id.as_str(), "old");
1631    }
1632
1633    #[tokio::test]
1634    async fn load_returns_none_for_missing() {
1635        let tmp = TempDir::new().unwrap();
1636        let missing = load(tmp.path(), &ScanId::from("nope".to_owned())).await;
1637        assert!(missing.is_none());
1638    }
1639
1640    #[tokio::test]
1641    async fn load_defaults_schema_version_for_legacy_scan_json() {
1642        let tmp = TempDir::new().unwrap();
1643        let path = tmp.path().join("legacy.json");
1644        fs::write(
1645            &path,
1646            br#"{
1647                "scan_id": "legacy",
1648                "username": "alice",
1649                "site_count": 0,
1650                "created_at_ms": 1700000000000,
1651                "summary": { "found": 0, "not_found": 0, "uncertain": 0 },
1652                "outcomes": [],
1653                "elapsed_ms": 0
1654            }"#,
1655        )
1656        .await
1657        .unwrap();
1658
1659        let loaded = load(tmp.path(), &ScanId::from("legacy".to_owned()))
1660            .await
1661            .expect("legacy scan loads");
1662        assert_eq!(loaded.schema_version, PERSISTED_SCAN_SCHEMA_VERSION);
1663    }
1664
1665    #[tokio::test]
1666    async fn load_accepts_v2_scan_json_after_schema_bump() {
1667        let tmp = TempDir::new().unwrap();
1668        let path = tmp.path().join("v2.json");
1669        fs::write(
1670            &path,
1671            br#"{
1672                "schema_version": 2,
1673                "scan_id": "v2",
1674                "username": "alice",
1675                "site_count": 1,
1676                "created_at_ms": 1700000000000,
1677                "summary": { "found": 1, "not_found": 0, "uncertain": 0 },
1678                "outcomes": [
1679                    {
1680                        "site": "GitHub",
1681                        "url": "https://github.example/alice",
1682                        "kind": "found",
1683                        "elapsed_ms": 10,
1684                        "evidence": ["HTTP 200 (status_found)"]
1685                    }
1686                ],
1687                "elapsed_ms": 10
1688            }"#,
1689        )
1690        .await
1691        .unwrap();
1692
1693        let loaded = load(tmp.path(), &ScanId::from("v2".to_owned()))
1694            .await
1695            .expect("v2 scan loads");
1696
1697        assert_eq!(loaded.schema_version, 2);
1698        assert_eq!(loaded.summary.found, 1);
1699        assert_eq!(
1700            loaded.outcomes[0].confidence.label,
1701            adler_core::ConfidenceLabel::Medium
1702        );
1703    }
1704
1705    #[tokio::test]
1706    async fn load_derives_identity_clusters_for_legacy_scan_json() {
1707        let tmp = TempDir::new().unwrap();
1708        let path = tmp.path().join("legacy-clusters.json");
1709        fs::write(
1710            &path,
1711            br#"{
1712                "schema_version": 1,
1713                "scan_id": "legacy-clusters",
1714                "username": "alice",
1715                "site_count": 2,
1716                "created_at_ms": 1700000000000,
1717                "summary": { "found": 2, "not_found": 0, "uncertain": 0 },
1718                "outcomes": [
1719                    {
1720                        "site": "GitHub",
1721                        "url": "https://github.example/alice",
1722                        "kind": "found",
1723                        "elapsed_ms": 10,
1724                        "profile_evidence": [
1725                            {
1726                                "kind": "external_link",
1727                                "field": "website",
1728                                "value": "https://alice.dev",
1729                                "source": {
1730                                    "site": "GitHub",
1731                                    "url": "https://github.example/alice",
1732                                    "origin": "extractor"
1733                                }
1734                            }
1735                        ]
1736                    },
1737                    {
1738                        "site": "GitLab",
1739                        "url": "https://gitlab.example/alice",
1740                        "kind": "found",
1741                        "elapsed_ms": 10,
1742                        "profile_evidence": [
1743                            {
1744                                "kind": "external_link",
1745                                "field": "website",
1746                                "value": "https://alice.dev/",
1747                                "source": {
1748                                    "site": "GitLab",
1749                                    "url": "https://gitlab.example/alice",
1750                                    "origin": "extractor"
1751                                }
1752                            }
1753                        ]
1754                    }
1755                ],
1756                "elapsed_ms": 20
1757            }"#,
1758        )
1759        .await
1760        .unwrap();
1761
1762        let loaded = load(tmp.path(), &ScanId::from("legacy-clusters".to_owned()))
1763            .await
1764            .expect("legacy scan loads");
1765
1766        assert_eq!(loaded.identity_clusters.len(), 1);
1767        assert_eq!(loaded.identity_clusters[0].members.len(), 2);
1768        assert!(!loaded.identity_clusters[0].uncertain);
1769    }
1770
/// Exercises the JSON round-trip, `diff_scans` and `build_scan_timeline`
/// on two 2,500-outcome scans produced by `large_persisted_scan`.
#[test]
fn large_scan_artifact_paths_handle_identity_graph_payloads() {
    let older = large_persisted_scan("large-old", 0);
    let newer = large_persisted_scan("large-new", 1);

    // Shape of the generated fixture.
    assert_eq!(older.outcomes.len(), 2_500);
    assert_eq!(older.site_count, 2_500);
    let tally = &older.summary;
    assert_eq!(tally.found + tally.not_found + tally.uncertain, 2_500);
    assert!(!older.identity_clusters.is_empty());

    // Serialise and parse back; outcome and cluster counts must survive.
    let encoded = serde_json::to_string(&older).unwrap();
    let round_tripped: PersistedScan = serde_json::from_str(&encoded).unwrap();
    assert_eq!(round_tripped.outcomes.len(), 2_500);
    let cluster_count = older.identity_clusters.len();
    assert_eq!(round_tripped.identity_clusters.len(), cluster_count);

    // Every diff category is expected to be populated between the two scans.
    let delta = diff_scans(&older, &newer);
    assert!(!delta.added_found.is_empty());
    assert!(!delta.removed_found.is_empty());
    assert!(!delta.verdict_changes.is_empty());
    assert!(!delta.evidence_changes.is_empty());

    // Both scans are moved into the timeline input, so this step comes last.
    let history = build_scan_timeline(&[older, newer]);
    assert_eq!(history.scan_count, 2);
    let profile_count = history.profiles.len();
    assert_eq!(profile_count, 375);
    assert!(history.events.len() > profile_count);
}
1803
1804    #[tokio::test]
1805    async fn load_all_skips_unrelated_files() {
1806        let tmp = TempDir::new().unwrap();
1807        // Drop a non-JSON file and a malformed JSON file alongside.
1808        fs::write(tmp.path().join("README"), b"not json")
1809            .await
1810            .unwrap();
1811        fs::write(tmp.path().join("broken.json"), b"{ invalid")
1812            .await
1813            .unwrap();
1814        save(tmp.path(), &sample("good", 9_999)).await.unwrap();
1815        let all = load_all(tmp.path()).await;
1816        assert_eq!(all.len(), 1);
1817        assert_eq!(all[0].scan_id.as_str(), "good");
1818    }
1819
1820    #[tokio::test]
1821    async fn prune_keeps_only_newest_n() {
1822        let tmp = TempDir::new().unwrap();
1823        for i in 0u64..5 {
1824            save(tmp.path(), &sample(&format!("s{i}"), i * 1_000))
1825                .await
1826                .unwrap();
1827        }
1828        let removed = prune(tmp.path(), 2).await;
1829        assert_eq!(removed, 3);
1830        let remaining = load_all(tmp.path()).await;
1831        assert_eq!(remaining.len(), 2);
1832        assert_eq!(remaining[0].scan_id.as_str(), "s4");
1833        assert_eq!(remaining[1].scan_id.as_str(), "s3");
1834    }
1835}