Skip to main content

modde_core/
scanner.rs

1use std::collections::{HashMap, HashSet};
2
3use crate::manifest::wabbajack::{
4    compute_manifest_hash, ArchiveEntry, ArchiveState, InstallDirective, WabbajackManifest,
5};
6use crate::profile::{EnabledMod, LoadOrderLock, LockReason, Profile};
7
8/// Canonical `mod_id` derivation for a Wabbajack archive entry.
9///
10/// Used by **both** the scanner and the Wabbajack installer so that a
11/// profile installed via `modde install wabbajack` and the same modlist
12/// re-scanned via `modde scan --manifest` produce identical `mod_id`
13/// strings — otherwise retroactive-lock flows would create duplicates
14/// rather than matching existing mods.
15///
16/// - Nexus-sourced archives: `nexus_{game_domain}_{mod_id}_{file_id}`
17/// - Everything else:        `wj_{archive_hash}`
18pub fn archive_mod_id(archive: &ArchiveEntry) -> String {
19    if let Some(ArchiveState::NexusDownloader {
20        game_name,
21        mod_id,
22        file_id,
23    }) = archive.state.as_ref()
24    {
25        format!("nexus_{game_name}_{mod_id}_{file_id}")
26    } else {
27        format!("wj_{}", archive.hash)
28    }
29}
30
/// A mod discovered by matching a Wabbajack manifest against files on disk.
///
/// Produced by [`match_wabbajack_manifest`], one value per archive whose
/// on-disk coverage met the requested threshold.
#[derive(Debug, Clone, PartialEq)]
pub struct ManifestMatch {
    /// Stable unique ID based on Nexus identity or archive hash.
    pub mod_id: String,
    /// Human-readable name (from archive filename, cleaned).
    pub display_name: String,
    /// Original archive filename.
    pub archive_name: String,
    /// Hash of the source archive, as recorded in the manifest.
    pub archive_hash: u64,
    /// Number of install-directive target paths attributed to this archive.
    pub total_files: usize,
    /// How many of those target paths were found on disk.
    pub present_files: usize,
    /// `present_files / total_files`, in the range `0.0..=1.0`.
    pub confidence: f32,
    /// Nexus mod ID, when the archive was sourced from Nexus.
    pub nexus_mod_id: Option<i64>,
    /// Nexus file ID, when the archive was sourced from Nexus.
    pub nexus_file_id: Option<i64>,
    /// Nexus game domain, when the archive was sourced from Nexus.
    pub nexus_game_domain: Option<String>,
    /// Game-relative file paths that this archive covers on disk (lowercased).
    /// Used for correlation with filesystem-discovered mods.
    pub covered_paths: Vec<String>,
}
50
51/// Match files on disk against a Wabbajack manifest.
52///
53/// Groups directives by their source `archive_hash`, then checks what
54/// fraction of each archive's `to` paths exist in `on_disk_files`.
55/// Archives where the fraction meets or exceeds `threshold` are returned.
56///
57/// `on_disk_files` should contain lowercased, forward-slash relative paths
58/// from the game install root.
59pub fn match_wabbajack_manifest(
60    manifest: &WabbajackManifest,
61    on_disk_files: &HashSet<String>,
62    threshold: f32,
63) -> Vec<ManifestMatch> {
64    let directives = manifest.install_directives();
65
66    // Group directives by archive_hash → list of game-relative paths.
67    // Also extract the MO2 mod name from the `mods/<Name>/...` prefix.
68    let mut archive_files: HashMap<u64, Vec<String>> = HashMap::new();
69    let mut archive_mod_names: HashMap<u64, String> = HashMap::new();
70
71    for d in &directives {
72        match d {
73            InstallDirective::FromArchive {
74                archive_hash, to, ..
75            }
76            | InstallDirective::PatchedFromArchive {
77                archive_hash, to, ..
78            } => {
79                let normalized = to.replace('\\', "/");
80
81                // Extract the MO2 mod name before lowercasing (preserves casing).
82                if archive_mod_names.get(archive_hash).is_none() {
83                    if let Some(name) = extract_mo2_mod_name(&normalized) {
84                        archive_mod_names.insert(*archive_hash, name);
85                    }
86                }
87
88                // Strip prefix and lowercase for matching.
89                let game_relative = strip_mo2_prefix(&normalized.to_lowercase());
90                archive_files
91                    .entry(*archive_hash)
92                    .or_default()
93                    .push(game_relative);
94            }
95            _ => {}
96        }
97    }
98
99    // Build archive hash → ArchiveEntry lookup for metadata.
100    let archive_map: HashMap<u64, &crate::manifest::wabbajack::ArchiveEntry> = manifest
101        .archives
102        .iter()
103        .map(|a| (a.hash, a))
104        .collect();
105
106    let mut results = Vec::new();
107
108    for (hash, files) in &archive_files {
109        let total = files.len();
110        if total == 0 {
111            continue;
112        }
113
114        let present_paths: Vec<String> = files
115            .iter()
116            .filter(|path| on_disk_files.contains(path.as_str()))
117            .cloned()
118            .collect();
119        let present = present_paths.len();
120
121        let fraction = present as f32 / total as f32;
122        if fraction < threshold {
123            continue;
124        }
125
126        let archive = archive_map.get(hash);
127        let archive_name = archive
128            .map(|a| a.name.clone())
129            .unwrap_or_else(|| format!("unknown_{hash}"));
130
131        // Display name: prefer cleaned archive filename (unique per archive).
132        let display_name = clean_archive_name(&archive_name);
133
134        let (nexus_mod_id, nexus_file_id, nexus_game_domain) = archive
135            .and_then(|a| a.state.as_ref())
136            .map(|state| match state {
137                ArchiveState::NexusDownloader {
138                    game_name,
139                    mod_id,
140                    file_id,
141                } => (
142                    Some(*mod_id as i64),
143                    Some(*file_id as i64),
144                    Some(game_name.clone()),
145                ),
146                _ => (None, None, None),
147            })
148            .unwrap_or((None, None, None));
149
150        // Canonical mod_id — must match `archive_mod_id` exactly so Wabbajack
151        // installs + retroactive scans dedup correctly.
152        let mod_id = match archive {
153            Some(a) => archive_mod_id(a),
154            None => format!("wj_{hash}"),
155        };
156
157        results.push(ManifestMatch {
158            mod_id,
159            display_name,
160            archive_name,
161            archive_hash: *hash,
162            total_files: total,
163            present_files: present,
164            confidence: fraction,
165            nexus_mod_id,
166            nexus_file_id,
167            nexus_game_domain,
168            covered_paths: present_paths,
169        });
170    }
171
172    // Sort by display_name for readability.
173    results.sort_by(|a, b| {
174        a.display_name
175            .to_lowercase()
176            .cmp(&b.display_name.to_lowercase())
177    });
178    results
179}
180
181/// Convert a `ManifestMatch` into an `EnabledMod` for database storage.
182pub fn manifest_match_to_enabled(m: &ManifestMatch) -> EnabledMod {
183    EnabledMod {
184        mod_id: m.mod_id.clone(),
185        display_name: Some(m.display_name.clone()),
186        enabled: true,
187        version: None,
188        fomod_config: None,
189        nexus_mod_id: m.nexus_mod_id,
190        nexus_file_id: m.nexus_file_id,
191        nexus_game_domain: m.nexus_game_domain.clone(),
192        installed_timestamp: Some(
193            std::time::SystemTime::now()
194                .duration_since(std::time::UNIX_EPOCH)
195                .unwrap_or_default()
196                .as_secs() as i64,
197        ),
198        ..Default::default()
199    }
200}
201
/// Pull the MO2 mod name out of a forward-slashed directive path.
///
/// `mods/Immersive Healing/archive/pc/mod/ImmersiveHealing.archive`
/// yields `Some("Immersive Healing")`.
///
/// Returns `None` when the path does not start with `mods/`, has no `/`
/// after the mod-name segment, or the mod-name segment is empty. The
/// prefix check is case-sensitive.
fn extract_mo2_mod_name(path: &str) -> Option<String> {
    let (name, _game_relative) = path.strip_prefix("mods/")?.split_once('/')?;
    (!name.is_empty()).then(|| name.to_string())
}
215
/// Remove the MO2 staging prefix from a forward-slashed path.
///
/// `mods/<mod_name>/<game_relative_path>` becomes `<game_relative_path>`.
/// Anything that is not of that shape (e.g. MO2 executables, or a bare
/// `mods/<name>` with no further segment) is returned unchanged.
fn strip_mo2_prefix(path: &str) -> String {
    let split = path
        .strip_prefix("mods/")
        .and_then(|after_mods| after_mods.split_once('/'));

    match split {
        Some((_mod_name, game_relative)) => game_relative.to_string(),
        None => path.to_string(),
    }
}
228
/// Clean an archive filename into a display name.
///
/// `ImmersiveHealing-26281-3-1-3-1772288704.zip` → `ImmersiveHealing`.
///
/// Steps:
///
/// 1. Drop the extension (everything from the last `.`).
/// 2. Nexus filenames look like `ModName-modid-version-timestamp`; cut at
///    the first `-` that is immediately followed by an ASCII digit. Every
///    hyphen is considered, so hyphenated mod names such as
///    `Let-There-Be-Flight-5208-1-2-3.zip` are handled too.
/// 3. Replace `_` with spaces.
///
/// A hyphen at the very start of the stem is never used as the cut point,
/// because that would produce an empty display name.
fn clean_archive_name(name: &str) -> String {
    // Strip extension.
    let stem = name.rsplit_once('.').map_or(name, |(stem, _ext)| stem);

    // First `-{digit}` boundary, skipping a leading hyphen. `-` is one byte
    // in UTF-8, so `i + 1` is always a valid char boundary.
    let cut = stem
        .match_indices('-')
        .map(|(i, _)| i)
        .find(|&i| i > 0 && stem[i + 1..].starts_with(|c: char| c.is_ascii_digit()));

    cut.map_or(stem, |i| &stem[..i]).replace('_', " ")
}
249
250/// Compute the canonical mod order from a Wabbajack manifest's install
251/// directives.
252///
253/// `WabbajackManifest.archives` is an unordered JSON array — not a load
254/// order. The *directive* list, however, is the sequence Wabbajack applies
255/// on install, so the first-appearance order of each archive in the
256/// directives is the closest reproducible approximation of "load order".
257///
258/// Returns a `Vec<String>` of canonical `mod_id`s (as produced by
259/// [`archive_mod_id`]) in the order the corresponding archives first
260/// appear in the install directives. Archives that never appear in a
261/// [`InstallDirective::FromArchive`] / [`InstallDirective::PatchedFromArchive`]
262/// are omitted.
263pub fn manifest_directive_order(manifest: &WabbajackManifest) -> Vec<String> {
264    let archive_by_hash: HashMap<u64, &ArchiveEntry> =
265        manifest.archives.iter().map(|a| (a.hash, a)).collect();
266
267    let mut seen: HashSet<u64> = HashSet::new();
268    let mut order: Vec<String> = Vec::new();
269    for d in manifest.install_directives() {
270        let hash = match d {
271            InstallDirective::FromArchive { archive_hash, .. }
272            | InstallDirective::PatchedFromArchive { archive_hash, .. } => archive_hash,
273            _ => continue,
274        };
275        if !seen.insert(hash) {
276            continue;
277        }
278        if let Some(archive) = archive_by_hash.get(&hash) {
279            order.push(archive_mod_id(archive));
280        }
281    }
282    order
283}
284
/// Summary returned by [`apply_wabbajack_lock`] describing what the
/// in-place reorder did to the profile.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WabbajackLockApplied {
    /// The `manifest_hash` stored on the newly written lock. It matches
    /// `ProfileSource::Wabbajack { manifest_hash }` on installs.
    pub manifest_hash: String,
    /// How many profile mods had a `mod_id` found in the manifest order.
    /// These are placed at the front of the mod list.
    pub matched: usize,
    /// How many pre-existing profile mods the manifest does not mention.
    /// These follow the matched mods, keeping their relative order.
    pub unmatched: usize,
    /// `true` when the profile already had a lock that was overwritten.
    pub replaced_existing_lock: bool,
}
300
301/// Reorder `profile.mods` to follow the manifest's install-directive
302/// order and stamp a `LockReason::Wabbajack` lock onto the profile.
303///
304/// This is the pure helper that powers `modde scan --manifest` and is
305/// the recommended way to retroactively lock an existing profile to a
306/// Wabbajack modlist. Extracted from `scan.rs` so it can be unit-tested
307/// without touching the filesystem scanner.
308///
309/// Invariants:
310///
311/// 1. **Mod count is preserved** — no mod is ever dropped. Matched mods
312///    move to the front in manifest order; unmatched mods retain their
313///    original relative order and are appended after.
314/// 2. **Matched mods are sorted by first-appearance in install
315///    directives** — see [`manifest_directive_order`] for the semantic.
316/// 3. **`profile.load_order_lock` is overwritten** — any prior lock
317///    (including a stale Wabbajack or Manual lock) is replaced. The
318///    return value's `replaced_existing_lock` field lets callers surface
319///    this to the user.
320pub fn apply_wabbajack_lock(
321    profile: &mut Profile,
322    manifest: &WabbajackManifest,
323) -> WabbajackLockApplied {
324    let manifest_order = manifest_directive_order(manifest);
325    let manifest_rank: HashMap<String, usize> = manifest_order
326        .iter()
327        .enumerate()
328        .map(|(i, mid)| (mid.clone(), i))
329        .collect();
330
331    // Stable partition: matched first (in manifest order), unmatched
332    // after (original relative order preserved).
333    let (mut matched, unmatched): (Vec<EnabledMod>, Vec<EnabledMod>) =
334        std::mem::take(&mut profile.mods)
335            .into_iter()
336            .partition(|m| manifest_rank.contains_key(&m.mod_id));
337
338    matched.sort_by_key(|m| manifest_rank.get(&m.mod_id).copied().unwrap_or(usize::MAX));
339
340    let matched_count = matched.len();
341    let unmatched_count = unmatched.len();
342    profile.mods = matched;
343    profile.mods.extend(unmatched);
344
345    let manifest_hash = compute_manifest_hash(manifest);
346    let replaced_existing_lock = profile.load_order_lock.is_some();
347    profile.load_order_lock = Some(LoadOrderLock::now(LockReason::Wabbajack {
348        manifest_hash: manifest_hash.clone(),
349    }));
350
351    WabbajackLockApplied {
352        manifest_hash,
353        matched: matched_count,
354        unmatched: unmatched_count,
355        replaced_existing_lock,
356    }
357}
358
/// Where on disk a mod found by a game-specific filesystem scanner lives.
///
/// Game scanners emit mod_ids in schemes such as `cet/<name>` or
/// `archive/<stem>`. To correlate those rows with a Wabbajack manifest's
/// install directives we need to know which part of the game directory
/// each mod owns; this enum expresses that.
///
/// - [`ModFootprint::Directory`] — the mod owns a whole subtree of the
///   game install (e.g. `bin/x64/plugins/cyber_engine_tweaks/mods/<name>/`).
/// - [`ModFootprint::File`] — the mod *is* one file (e.g. a loose
///   `.archive` under `archive/pc/mod/`).
///
/// Convention: paths are lowercased, use forward slashes, and (for
/// `Directory`) end with a trailing `/`. This is the same shape as the
/// covered-file and covered-directory sets that
/// [`detect_stale_duplicates`] builds from the manifest.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ModFootprint {
    /// A directory subtree owned by the mod, checked against the set of
    /// directories the manifest writes into.
    Directory(String),
    /// A single file owned by the mod, checked against the set of `To`
    /// paths in the manifest's install directives.
    File(String),
}
385
/// Output of [`detect_stale_duplicates`]: the profile's filesystem-scanner
/// rows split into "covered by the manifest" (leaked duplicates) and
/// "not covered" (genuine additions).
///
/// A mod_id for which the supplied `mod_id_to_footprint` closure returns
/// no footprint (typically `nexus_*`, `wj_*`, or any other
/// non-filesystem-scheme row) appears in **neither** list. Such rows are
/// skipped silently because they are not candidates for this kind of
/// dedup.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DuplicateReport {
    /// Filesystem-scanner mod_ids whose footprint the manifest covers.
    /// They are safe to delete from the profile: a manifest-authored row
    /// (usually `nexus_*`) already deploys the same files under a
    /// different ID.
    pub leaked: Vec<String>,
    /// Filesystem-scanner mod_ids whose footprint the manifest does
    /// **not** cover. These are additions the user made on top of the
    /// Wabbajack modlist and must be preserved.
    pub genuine: Vec<String>,
}
407
408/// Classify a profile's filesystem-scanner rows against a Wabbajack
409/// manifest into "leaked duplicates" and "genuine additions".
410///
411/// This is the pure helper that powers `modde profile dedup` and the
412/// `--prune-duplicates` flag on `modde scan`. See
413/// `/home/can/.claude/plans/greedy-shimmying-pine.md` and the companion
414/// discussion in `docs/` (if present) for the design rationale.
415///
416/// The `mod_id_to_footprint` closure is the game-specific bridge: it
417/// maps a filesystem-scanner mod_id (e.g. `cet/ImmersiveHealing`) back
418/// to the directory or file the mod owns in the game install. For
419/// Cyberpunk 2077 this is
420/// [`modde_games::cyberpunk::scanner::mod_id_footprint`]. Profiles
421/// spanning multiple games aren't supported — each profile is tied to
422/// a single game via `profile.game_id`, so callers wire up a
423/// per-game closure.
424///
425/// Classification rules:
426///
427/// 1. If the closure returns `None` for a mod_id, the row is **not a
428///    candidate** — it's skipped silently. `nexus_*` and `wj_*` rows
429///    are manifest-authored and shouldn't be classified as duplicates
430///    of themselves.
431/// 2. If the footprint is [`ModFootprint::Directory`] and the manifest
432///    writes any file under that directory → **LEAKED** (the nexus
433///    archive that deployed those files is already tracked under its
434///    `nexus_*` ID).
435/// 3. If the footprint is [`ModFootprint::File`] and the exact file
436///    path appears in the manifest's install directives → **LEAKED**.
437/// 4. Otherwise → **GENUINE**: the user added this mod on top of the
438///    Wabbajack and it must not be deleted.
439///
440/// Case and slash-normalization: paths are lowercased and
441/// forward-slashed internally, so callers don't need to pre-normalize.
442///
443/// Complexity: O(D × A + M) where D is manifest directive count,
444/// A is average path depth, and M is profile mod count. For a typical
445/// CP2077 modlist (≈7k directives, ≈700 mods) this runs in well under
446/// a millisecond.
447pub fn detect_stale_duplicates<F>(
448    profile: &Profile,
449    manifest: &WabbajackManifest,
450    mod_id_to_footprint: F,
451) -> DuplicateReport
452where
453    F: Fn(&str) -> Option<ModFootprint>,
454{
455    // Build the manifest's covered file set + covered directory set
456    // from its install directives. Only `FromArchive` /
457    // `PatchedFromArchive` directives are "physical" file placements
458    // we can compare against — `CreateBSA` and `InlineFile` don't map
459    // cleanly to a single on-disk file at scan time.
460    //
461    // Wabbajack `To` paths are MO2-staged: they look like
462    // `mods\<MO2 Mod Name>\<game-relative-path>`. We must strip the
463    // `mods/<name>/` prefix before comparing against game-relative
464    // footprints — this mirrors what `match_wabbajack_manifest` does
465    // via `strip_mo2_prefix`. Without this step, every directive path
466    // in a CP2077 modlist begins with `mods/<big mod name>/`, which
467    // never overlaps with a `bin/x64/...` or `archive/pc/mod/...`
468    // footprint, and `detect_stale_duplicates` silently classifies
469    // every row as GENUINE. See profile 3077 for the failure mode.
470    let mut covered_files: HashSet<String> = HashSet::new();
471    for d in manifest.install_directives() {
472        let to = match d {
473            InstallDirective::FromArchive { to, .. }
474            | InstallDirective::PatchedFromArchive { to, .. } => to,
475            _ => continue,
476        };
477        let normalized = to.replace('\\', "/").to_lowercase();
478        covered_files.insert(strip_mo2_prefix(&normalized));
479    }
480
481    // Expand each covered file into its ancestor-directory prefixes so
482    // the Directory footprint check becomes a single O(1) HashSet lookup.
483    let mut covered_dirs: HashSet<String> = HashSet::new();
484    for f in &covered_files {
485        let mut cur = f.as_str();
486        while let Some(idx) = cur.rfind('/') {
487            cur = &cur[..idx];
488            covered_dirs.insert(format!("{cur}/"));
489        }
490    }
491
492    let mut report = DuplicateReport::default();
493    for m in &profile.mods {
494        let footprint = match mod_id_to_footprint(&m.mod_id) {
495            Some(fp) => fp,
496            None => continue, // Not a filesystem-scanner row; skip.
497        };
498        let covered = match &footprint {
499            ModFootprint::Directory(d) => covered_dirs.contains(d),
500            ModFootprint::File(f) => covered_files.contains(f),
501        };
502        if covered {
503            report.leaked.push(m.mod_id.clone());
504        } else {
505            report.genuine.push(m.mod_id.clone());
506        }
507    }
508    report
509}
510
511/// Convert a filesystem-discovered mod into an `EnabledMod`.
512pub fn discovered_to_enabled(
513    mod_id: &str,
514    display_name: &str,
515    version: Option<&str>,
516    _confidence: f32,
517) -> EnabledMod {
518    EnabledMod {
519        mod_id: mod_id.to_string(),
520        display_name: Some(display_name.to_string()),
521        enabled: true,
522        version: version.map(String::from),
523        ..Default::default()
524    }
525}