modde_core/scanner.rs
1use std::collections::{HashMap, HashSet};
2
3use crate::manifest::wabbajack::{
4 compute_manifest_hash, ArchiveEntry, ArchiveState, InstallDirective, WabbajackManifest,
5};
6use crate::profile::{EnabledMod, LoadOrderLock, LockReason, Profile};
7
8/// Canonical `mod_id` derivation for a Wabbajack archive entry.
9///
10/// Used by **both** the scanner and the Wabbajack installer so that a
11/// profile installed via `modde install wabbajack` and the same modlist
12/// re-scanned via `modde scan --manifest` produce identical `mod_id`
13/// strings — otherwise retroactive-lock flows would create duplicates
14/// rather than matching existing mods.
15///
16/// - Nexus-sourced archives: `nexus_{game_domain}_{mod_id}_{file_id}`
17/// - Everything else: `wj_{archive_hash}`
18pub fn archive_mod_id(archive: &ArchiveEntry) -> String {
19 if let Some(ArchiveState::NexusDownloader {
20 game_name,
21 mod_id,
22 file_id,
23 }) = archive.state.as_ref()
24 {
25 format!("nexus_{game_name}_{mod_id}_{file_id}")
26 } else {
27 format!("wj_{}", archive.hash)
28 }
29}
30
/// A mod discovered by matching a Wabbajack manifest against files on disk.
///
/// One value is produced per source archive whose on-disk coverage meets the
/// caller's threshold (see `match_wabbajack_manifest`).
#[derive(Debug, Clone, PartialEq)]
pub struct ManifestMatch {
    /// Stable unique ID based on Nexus identity or archive hash.
    pub mod_id: String,
    /// Human-readable name (from archive filename, cleaned).
    pub display_name: String,
    /// Original archive filename.
    pub archive_name: String,
    /// Hash of the source archive, as referenced by the install directives.
    pub archive_hash: u64,
    /// Number of directive target paths attributed to this archive.
    pub total_files: usize,
    /// How many of those target paths were found on disk.
    pub present_files: usize,
    /// Fraction of target paths found on disk (`present_files / total_files`).
    pub confidence: f32,
    /// Nexus mod ID, when the archive was sourced from Nexus.
    pub nexus_mod_id: Option<i64>,
    /// Nexus file ID, when the archive was sourced from Nexus.
    pub nexus_file_id: Option<i64>,
    /// Nexus game name/domain, when the archive was sourced from Nexus.
    pub nexus_game_domain: Option<String>,
    /// Game-relative file paths that this archive covers on disk (lowercased).
    /// Used for correlation with filesystem-discovered mods.
    pub covered_paths: Vec<String>,
}
50
51/// Match files on disk against a Wabbajack manifest.
52///
53/// Groups directives by their source `archive_hash`, then checks what
54/// fraction of each archive's `to` paths exist in `on_disk_files`.
55/// Archives where the fraction meets or exceeds `threshold` are returned.
56///
57/// `on_disk_files` should contain lowercased, forward-slash relative paths
58/// from the game install root.
59pub fn match_wabbajack_manifest(
60 manifest: &WabbajackManifest,
61 on_disk_files: &HashSet<String>,
62 threshold: f32,
63) -> Vec<ManifestMatch> {
64 let directives = manifest.install_directives();
65
66 // Group directives by archive_hash → list of game-relative paths.
67 // Also extract the MO2 mod name from the `mods/<Name>/...` prefix.
68 let mut archive_files: HashMap<u64, Vec<String>> = HashMap::new();
69 let mut archive_mod_names: HashMap<u64, String> = HashMap::new();
70
71 for d in &directives {
72 match d {
73 InstallDirective::FromArchive {
74 archive_hash, to, ..
75 }
76 | InstallDirective::PatchedFromArchive {
77 archive_hash, to, ..
78 } => {
79 let normalized = to.replace('\\', "/");
80
81 // Extract the MO2 mod name before lowercasing (preserves casing).
82 if archive_mod_names.get(archive_hash).is_none() {
83 if let Some(name) = extract_mo2_mod_name(&normalized) {
84 archive_mod_names.insert(*archive_hash, name);
85 }
86 }
87
88 // Strip prefix and lowercase for matching.
89 let game_relative = strip_mo2_prefix(&normalized.to_lowercase());
90 archive_files
91 .entry(*archive_hash)
92 .or_default()
93 .push(game_relative);
94 }
95 _ => {}
96 }
97 }
98
99 // Build archive hash → ArchiveEntry lookup for metadata.
100 let archive_map: HashMap<u64, &crate::manifest::wabbajack::ArchiveEntry> = manifest
101 .archives
102 .iter()
103 .map(|a| (a.hash, a))
104 .collect();
105
106 let mut results = Vec::new();
107
108 for (hash, files) in &archive_files {
109 let total = files.len();
110 if total == 0 {
111 continue;
112 }
113
114 let present_paths: Vec<String> = files
115 .iter()
116 .filter(|path| on_disk_files.contains(path.as_str()))
117 .cloned()
118 .collect();
119 let present = present_paths.len();
120
121 let fraction = present as f32 / total as f32;
122 if fraction < threshold {
123 continue;
124 }
125
126 let archive = archive_map.get(hash);
127 let archive_name = archive
128 .map(|a| a.name.clone())
129 .unwrap_or_else(|| format!("unknown_{hash}"));
130
131 // Display name: prefer cleaned archive filename (unique per archive).
132 let display_name = clean_archive_name(&archive_name);
133
134 let (nexus_mod_id, nexus_file_id, nexus_game_domain) = archive
135 .and_then(|a| a.state.as_ref())
136 .map(|state| match state {
137 ArchiveState::NexusDownloader {
138 game_name,
139 mod_id,
140 file_id,
141 } => (
142 Some(*mod_id as i64),
143 Some(*file_id as i64),
144 Some(game_name.clone()),
145 ),
146 _ => (None, None, None),
147 })
148 .unwrap_or((None, None, None));
149
150 // Canonical mod_id — must match `archive_mod_id` exactly so Wabbajack
151 // installs + retroactive scans dedup correctly.
152 let mod_id = match archive {
153 Some(a) => archive_mod_id(a),
154 None => format!("wj_{hash}"),
155 };
156
157 results.push(ManifestMatch {
158 mod_id,
159 display_name,
160 archive_name,
161 archive_hash: *hash,
162 total_files: total,
163 present_files: present,
164 confidence: fraction,
165 nexus_mod_id,
166 nexus_file_id,
167 nexus_game_domain,
168 covered_paths: present_paths,
169 });
170 }
171
172 // Sort by display_name for readability.
173 results.sort_by(|a, b| {
174 a.display_name
175 .to_lowercase()
176 .cmp(&b.display_name.to_lowercase())
177 });
178 results
179}
180
181/// Convert a `ManifestMatch` into an `EnabledMod` for database storage.
182pub fn manifest_match_to_enabled(m: &ManifestMatch) -> EnabledMod {
183 EnabledMod {
184 mod_id: m.mod_id.clone(),
185 display_name: Some(m.display_name.clone()),
186 enabled: true,
187 version: None,
188 fomod_config: None,
189 nexus_mod_id: m.nexus_mod_id,
190 nexus_file_id: m.nexus_file_id,
191 nexus_game_domain: m.nexus_game_domain.clone(),
192 installed_timestamp: Some(
193 std::time::SystemTime::now()
194 .duration_since(std::time::UNIX_EPOCH)
195 .unwrap_or_default()
196 .as_secs() as i64,
197 ),
198 ..Default::default()
199 }
200}
201
/// Pulls the MO2 mod name out of a staged directive path.
///
/// For `mods/Immersive Healing/archive/pc/mod/ImmersiveHealing.archive` the
/// result is `Some("Immersive Healing")`. Returns `None` when the path does
/// not start with `mods/`, when nothing follows the mod-name segment (no
/// second `/`), or when the name segment is empty.
fn extract_mo2_mod_name(path: &str) -> Option<String> {
    match path.strip_prefix("mods/")?.split_once('/') {
        Some((mod_name, _)) if !mod_name.is_empty() => Some(mod_name.to_owned()),
        _ => None,
    }
}
215
/// Removes the MO2 staging prefix from a path.
///
/// `mods/<mod_name>/<game_relative_path>` becomes `<game_relative_path>`.
/// Anything that does not have both the `mods/` prefix and a mod-name
/// segment followed by `/` (e.g. MO2 executables) is returned unchanged.
fn strip_mo2_prefix(path: &str) -> String {
    path.strip_prefix("mods/")
        .and_then(|after_mods| after_mods.split_once('/'))
        .map(|(_mod_name, game_relative)| game_relative)
        .unwrap_or(path)
        .to_owned()
}
228
/// Clean an archive filename into a display name.
///
/// `ImmersiveHealing-26281-3-1-3-1772288704.zip` → `ImmersiveHealing`.
///
/// Steps:
///
/// 1. Drop the extension (everything after the last `.`), if any.
/// 2. Cut the Nexus suffix (`-modid-version-timestamp`) by truncating at the
///    first hyphen that is immediately followed by an ASCII digit. Hyphens
///    inside the mod name itself (e.g. `Cyber-Engine-Tweaks-107-...`) are
///    skipped, so hyphenated names are cleaned correctly too.
/// 3. Replace underscores with spaces.
fn clean_archive_name(name: &str) -> String {
    // Strip extension.
    let stem = name.rsplit_once('.').map_or(name, |(stem, _ext)| stem);

    // Scan every hyphen, not only the first one: a hyphenated mod name must
    // not prevent the numeric suffix from being found. `-` is ASCII, so
    // `i + 1` is always a valid char boundary.
    let suffix_start = stem
        .match_indices('-')
        .map(|(i, _)| i)
        .find(|&i| stem[i + 1..].starts_with(|c: char| c.is_ascii_digit()));

    let base = match suffix_start {
        Some(i) => &stem[..i],
        None => stem,
    };
    base.replace('_', " ")
}
249
250/// Compute the canonical mod order from a Wabbajack manifest's install
251/// directives.
252///
253/// `WabbajackManifest.archives` is an unordered JSON array — not a load
254/// order. The *directive* list, however, is the sequence Wabbajack applies
255/// on install, so the first-appearance order of each archive in the
256/// directives is the closest reproducible approximation of "load order".
257///
258/// Returns a `Vec<String>` of canonical `mod_id`s (as produced by
259/// [`archive_mod_id`]) in the order the corresponding archives first
260/// appear in the install directives. Archives that never appear in a
261/// [`InstallDirective::FromArchive`] / [`InstallDirective::PatchedFromArchive`]
262/// are omitted.
263pub fn manifest_directive_order(manifest: &WabbajackManifest) -> Vec<String> {
264 let archive_by_hash: HashMap<u64, &ArchiveEntry> =
265 manifest.archives.iter().map(|a| (a.hash, a)).collect();
266
267 let mut seen: HashSet<u64> = HashSet::new();
268 let mut order: Vec<String> = Vec::new();
269 for d in manifest.install_directives() {
270 let hash = match d {
271 InstallDirective::FromArchive { archive_hash, .. }
272 | InstallDirective::PatchedFromArchive { archive_hash, .. } => archive_hash,
273 _ => continue,
274 };
275 if !seen.insert(hash) {
276 continue;
277 }
278 if let Some(archive) = archive_by_hash.get(&hash) {
279 order.push(archive_mod_id(archive));
280 }
281 }
282 order
283}
284
/// Summary returned by [`apply_wabbajack_lock`], describing the effect of the
/// in-place reorder.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct WabbajackLockApplied {
    /// The `manifest_hash` stored on the freshly written lock. On installs
    /// this matches `ProfileSource::Wabbajack { manifest_hash }`.
    pub manifest_hash: String,
    /// How many profile mods have a `mod_id` that appears in the manifest
    /// order; these are placed at the front of the mod list.
    pub matched: usize,
    /// How many pre-existing profile mods the manifest does not mention;
    /// these follow the matched ones, keeping their relative order.
    pub unmatched: usize,
    /// `true` when the profile already had a lock and it was overwritten.
    pub replaced_existing_lock: bool,
}
300
301/// Reorder `profile.mods` to follow the manifest's install-directive
302/// order and stamp a `LockReason::Wabbajack` lock onto the profile.
303///
304/// This is the pure helper that powers `modde scan --manifest` and is
305/// the recommended way to retroactively lock an existing profile to a
306/// Wabbajack modlist. Extracted from `scan.rs` so it can be unit-tested
307/// without touching the filesystem scanner.
308///
309/// Invariants:
310///
311/// 1. **Mod count is preserved** — no mod is ever dropped. Matched mods
312/// move to the front in manifest order; unmatched mods retain their
313/// original relative order and are appended after.
314/// 2. **Matched mods are sorted by first-appearance in install
315/// directives** — see [`manifest_directive_order`] for the semantic.
316/// 3. **`profile.load_order_lock` is overwritten** — any prior lock
317/// (including a stale Wabbajack or Manual lock) is replaced. The
318/// return value's `replaced_existing_lock` field lets callers surface
319/// this to the user.
320pub fn apply_wabbajack_lock(
321 profile: &mut Profile,
322 manifest: &WabbajackManifest,
323) -> WabbajackLockApplied {
324 let manifest_order = manifest_directive_order(manifest);
325 let manifest_rank: HashMap<String, usize> = manifest_order
326 .iter()
327 .enumerate()
328 .map(|(i, mid)| (mid.clone(), i))
329 .collect();
330
331 // Stable partition: matched first (in manifest order), unmatched
332 // after (original relative order preserved).
333 let (mut matched, unmatched): (Vec<EnabledMod>, Vec<EnabledMod>) =
334 std::mem::take(&mut profile.mods)
335 .into_iter()
336 .partition(|m| manifest_rank.contains_key(&m.mod_id));
337
338 matched.sort_by_key(|m| manifest_rank.get(&m.mod_id).copied().unwrap_or(usize::MAX));
339
340 let matched_count = matched.len();
341 let unmatched_count = unmatched.len();
342 profile.mods = matched;
343 profile.mods.extend(unmatched);
344
345 let manifest_hash = compute_manifest_hash(manifest);
346 let replaced_existing_lock = profile.load_order_lock.is_some();
347 profile.load_order_lock = Some(LoadOrderLock::now(LockReason::Wabbajack {
348 manifest_hash: manifest_hash.clone(),
349 }));
350
351 WabbajackLockApplied {
352 manifest_hash,
353 matched: matched_count,
354 unmatched: unmatched_count,
355 replaced_existing_lock,
356 }
357}
358
/// Where a filesystem-scanner mod lives inside the game install.
///
/// Game-specific scanners emit mod_ids in schemes such as `cet/<name>` or
/// `archive/<stem>`. To line those rows up with a Wabbajack manifest's
/// install directives we need to know which part of the game directory each
/// mod owns, and this enum captures exactly that:
///
/// - [`ModFootprint::Directory`] — the mod owns a whole subtree of the game
///   install (e.g. `bin/x64/plugins/cyber_engine_tweaks/mods/<name>/`).
/// - [`ModFootprint::File`] — the mod *is* one file (e.g. a loose `.archive`
///   under `archive/pc/mod/`).
///
/// Path convention: lowercase, forward slashes, and a trailing `/` on
/// `Directory` values. This is the same shape as the covered-file and
/// covered-directory sets that [`detect_stale_duplicates`] derives from the
/// manifest.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ModFootprint {
    /// A directory subtree owned by the mod; checked against the set of
    /// directories the manifest writes into.
    Directory(String),
    /// A single file owned by the mod; checked against the `To` paths of
    /// the manifest's install directives.
    File(String),
}
385
/// Outcome of [`detect_stale_duplicates`]: the profile's filesystem-scanner
/// rows split into those the manifest already covers (leaked duplicates) and
/// those it does not (genuine additions).
///
/// A mod_id for which the caller's `mod_id_to_footprint` closure yields no
/// footprint (typically `nexus_*`, `wj_*`, or any other non-filesystem
/// scheme) appears in **neither** list. Such rows are not candidates for
/// this kind of dedup and are skipped without comment.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct DuplicateReport {
    /// Filesystem-scanner mod_ids whose footprint the manifest covers. These
    /// can be removed from the profile safely, because a manifest-authored
    /// row (usually `nexus_*`) already deploys the same files under another
    /// ID.
    pub leaked: Vec<String>,
    /// Filesystem-scanner mod_ids whose footprint the manifest does **not**
    /// cover. They are real additions made on top of the Wabbajack modlist
    /// and must be kept.
    pub genuine: Vec<String>,
}
407
408/// Classify a profile's filesystem-scanner rows against a Wabbajack
409/// manifest into "leaked duplicates" and "genuine additions".
410///
411/// This is the pure helper that powers `modde profile dedup` and the
412/// `--prune-duplicates` flag on `modde scan`. See
413/// `/home/can/.claude/plans/greedy-shimmying-pine.md` and the companion
414/// discussion in `docs/` (if present) for the design rationale.
415///
416/// The `mod_id_to_footprint` closure is the game-specific bridge: it
417/// maps a filesystem-scanner mod_id (e.g. `cet/ImmersiveHealing`) back
418/// to the directory or file the mod owns in the game install. For
419/// Cyberpunk 2077 this is
420/// [`modde_games::cyberpunk::scanner::mod_id_footprint`]. Profiles
421/// spanning multiple games aren't supported — each profile is tied to
422/// a single game via `profile.game_id`, so callers wire up a
423/// per-game closure.
424///
425/// Classification rules:
426///
427/// 1. If the closure returns `None` for a mod_id, the row is **not a
428/// candidate** — it's skipped silently. `nexus_*` and `wj_*` rows
429/// are manifest-authored and shouldn't be classified as duplicates
430/// of themselves.
431/// 2. If the footprint is [`ModFootprint::Directory`] and the manifest
432/// writes any file under that directory → **LEAKED** (the nexus
433/// archive that deployed those files is already tracked under its
434/// `nexus_*` ID).
435/// 3. If the footprint is [`ModFootprint::File`] and the exact file
436/// path appears in the manifest's install directives → **LEAKED**.
437/// 4. Otherwise → **GENUINE**: the user added this mod on top of the
438/// Wabbajack and it must not be deleted.
439///
440/// Case and slash-normalization: paths are lowercased and
441/// forward-slashed internally, so callers don't need to pre-normalize.
442///
443/// Complexity: O(D × A + M) where D is manifest directive count,
444/// A is average path depth, and M is profile mod count. For a typical
445/// CP2077 modlist (≈7k directives, ≈700 mods) this runs in well under
446/// a millisecond.
447pub fn detect_stale_duplicates<F>(
448 profile: &Profile,
449 manifest: &WabbajackManifest,
450 mod_id_to_footprint: F,
451) -> DuplicateReport
452where
453 F: Fn(&str) -> Option<ModFootprint>,
454{
455 // Build the manifest's covered file set + covered directory set
456 // from its install directives. Only `FromArchive` /
457 // `PatchedFromArchive` directives are "physical" file placements
458 // we can compare against — `CreateBSA` and `InlineFile` don't map
459 // cleanly to a single on-disk file at scan time.
460 //
461 // Wabbajack `To` paths are MO2-staged: they look like
462 // `mods\<MO2 Mod Name>\<game-relative-path>`. We must strip the
463 // `mods/<name>/` prefix before comparing against game-relative
464 // footprints — this mirrors what `match_wabbajack_manifest` does
465 // via `strip_mo2_prefix`. Without this step, every directive path
466 // in a CP2077 modlist begins with `mods/<big mod name>/`, which
467 // never overlaps with a `bin/x64/...` or `archive/pc/mod/...`
468 // footprint, and `detect_stale_duplicates` silently classifies
469 // every row as GENUINE. See profile 3077 for the failure mode.
470 let mut covered_files: HashSet<String> = HashSet::new();
471 for d in manifest.install_directives() {
472 let to = match d {
473 InstallDirective::FromArchive { to, .. }
474 | InstallDirective::PatchedFromArchive { to, .. } => to,
475 _ => continue,
476 };
477 let normalized = to.replace('\\', "/").to_lowercase();
478 covered_files.insert(strip_mo2_prefix(&normalized));
479 }
480
481 // Expand each covered file into its ancestor-directory prefixes so
482 // the Directory footprint check becomes a single O(1) HashSet lookup.
483 let mut covered_dirs: HashSet<String> = HashSet::new();
484 for f in &covered_files {
485 let mut cur = f.as_str();
486 while let Some(idx) = cur.rfind('/') {
487 cur = &cur[..idx];
488 covered_dirs.insert(format!("{cur}/"));
489 }
490 }
491
492 let mut report = DuplicateReport::default();
493 for m in &profile.mods {
494 let footprint = match mod_id_to_footprint(&m.mod_id) {
495 Some(fp) => fp,
496 None => continue, // Not a filesystem-scanner row; skip.
497 };
498 let covered = match &footprint {
499 ModFootprint::Directory(d) => covered_dirs.contains(d),
500 ModFootprint::File(f) => covered_files.contains(f),
501 };
502 if covered {
503 report.leaked.push(m.mod_id.clone());
504 } else {
505 report.genuine.push(m.mod_id.clone());
506 }
507 }
508 report
509}
510
511/// Convert a filesystem-discovered mod into an `EnabledMod`.
512pub fn discovered_to_enabled(
513 mod_id: &str,
514 display_name: &str,
515 version: Option<&str>,
516 _confidence: f32,
517) -> EnabledMod {
518 EnabledMod {
519 mod_id: mod_id.to_string(),
520 display_name: Some(display_name.to_string()),
521 enabled: true,
522 version: version.map(String::from),
523 ..Default::default()
524 }
525}