modde_core/scanner.rs
1use std::collections::{HashMap, HashSet};
2
3use crate::manifest::wabbajack::{
4 ArchiveEntry, ArchiveState, InstallDirective, WabbajackManifest, compute_manifest_hash,
5};
6use crate::nexus_id::{NexusFileId, NexusModId};
7use crate::profile::{EnabledMod, LoadOrderLock, LockReason, Profile};
8
9/// Canonical `mod_id` derivation for a Wabbajack archive entry.
10///
11/// Used by **both** the scanner and the Wabbajack installer so that a
12/// profile installed via `modde install wabbajack` and the same modlist
13/// re-scanned via `modde scan --manifest` produce identical `mod_id`
14/// strings — otherwise retroactive-lock flows would create duplicates
15/// rather than matching existing mods.
16///
17/// - Nexus-sourced archives: `nexus_{game_domain}_{mod_id}_{file_id}`
18/// - Everything else: `wj_{archive_hash}`
19#[must_use]
20pub fn archive_mod_id(archive: &ArchiveEntry) -> String {
21 if let Some(ArchiveState::NexusDownloader {
22 game_name,
23 mod_id,
24 file_id,
25 }) = archive.state.as_ref()
26 {
27 format!("nexus_{game_name}_{mod_id}_{file_id}")
28 } else {
29 format!("wj_{}", archive.hash)
30 }
31}
32
33/// A mod discovered by matching a Wabbajack manifest against files on disk.
34pub struct ManifestMatch {
35 /// Stable unique ID based on Nexus identity or archive hash.
36 pub mod_id: String,
37 /// Human-readable name (from archive filename, cleaned).
38 pub display_name: String,
39 /// Original archive filename.
40 pub archive_name: String,
41 pub archive_hash: u64,
42 pub total_files: usize,
43 pub present_files: usize,
44 pub confidence: f32,
45 pub nexus_mod_id: Option<NexusModId>,
46 pub nexus_file_id: Option<NexusFileId>,
47 pub nexus_game_domain: Option<String>,
48 /// Game-relative file paths that this archive covers on disk (lowercased).
49 /// Used for correlation with filesystem-discovered mods.
50 pub covered_paths: Vec<String>,
51}
52
53/// Match files on disk against a Wabbajack manifest.
54///
55/// Groups directives by their source `archive_hash`, then checks what
56/// fraction of each archive's `to` paths exist in `on_disk_files`.
57/// Archives where the fraction meets or exceeds `threshold` are returned.
58///
59/// `on_disk_files` should contain lowercased, forward-slash relative paths
60/// from the game install root.
61#[must_use]
62pub fn match_wabbajack_manifest(
63 manifest: &WabbajackManifest,
64 on_disk_files: &HashSet<String>,
65 threshold: f32,
66) -> Vec<ManifestMatch> {
67 let directives = manifest.install_directives();
68
69 // Group directives by archive_hash → list of game-relative paths.
70 // Also extract the MO2 mod name from the `mods/<Name>/...` prefix.
71 let mut archive_files: HashMap<u64, Vec<String>> = HashMap::new();
72 let mut archive_mod_names: HashMap<u64, String> = HashMap::new();
73
74 for d in &directives {
75 match d {
76 InstallDirective::FromArchive {
77 archive_hash, to, ..
78 }
79 | InstallDirective::PatchedFromArchive {
80 archive_hash, to, ..
81 } => {
82 let normalized = to.replace('\\', "/");
83
84 // Extract the MO2 mod name before lowercasing (preserves casing).
85 if archive_mod_names.get(archive_hash).is_none()
86 && let Some(name) = extract_mo2_mod_name(&normalized)
87 {
88 archive_mod_names.insert(*archive_hash, name);
89 }
90
91 // Strip prefix and lowercase for matching.
92 let game_relative = strip_mo2_prefix(&normalized.to_lowercase());
93 archive_files
94 .entry(*archive_hash)
95 .or_default()
96 .push(game_relative);
97 }
98 _ => {}
99 }
100 }
101
102 // Build archive hash → ArchiveEntry lookup for metadata.
103 let archive_map: HashMap<u64, &crate::manifest::wabbajack::ArchiveEntry> =
104 manifest.archives.iter().map(|a| (a.hash, a)).collect();
105
106 let mut results = Vec::new();
107
108 for (hash, files) in &archive_files {
109 let total = files.len();
110 if total == 0 {
111 continue;
112 }
113
114 let present_paths: Vec<String> = files
115 .iter()
116 .filter(|path| on_disk_files.contains(path.as_str()))
117 .cloned()
118 .collect();
119 let present = present_paths.len();
120
121 let fraction = present as f32 / total as f32;
122 if fraction < threshold {
123 continue;
124 }
125
126 let archive = archive_map.get(hash);
127 let archive_name = archive.map_or_else(|| format!("unknown_{hash}"), |a| a.name.clone());
128
129 // Display name: prefer cleaned archive filename (unique per archive).
130 let display_name = clean_archive_name(&archive_name);
131
132 let (nexus_mod_id, nexus_file_id, nexus_game_domain) = archive
133 .and_then(|a| a.state.as_ref())
134 .map_or((None, None, None), |state| match state {
135 ArchiveState::NexusDownloader {
136 game_name,
137 mod_id,
138 file_id,
139 } => (Some(*mod_id), Some(*file_id), Some(game_name.clone())),
140 _ => (None, None, None),
141 });
142
143 // Canonical mod_id — must match `archive_mod_id` exactly so Wabbajack
144 // installs + retroactive scans dedup correctly.
145 let mod_id = match archive {
146 Some(a) => archive_mod_id(a),
147 None => format!("wj_{hash}"),
148 };
149
150 results.push(ManifestMatch {
151 mod_id,
152 display_name,
153 archive_name,
154 archive_hash: *hash,
155 total_files: total,
156 present_files: present,
157 confidence: fraction,
158 nexus_mod_id,
159 nexus_file_id,
160 nexus_game_domain,
161 covered_paths: present_paths,
162 });
163 }
164
165 // Sort by display_name for readability.
166 results.sort_by(|a, b| {
167 a.display_name
168 .to_lowercase()
169 .cmp(&b.display_name.to_lowercase())
170 });
171 results
172}
173
174/// Convert a `ManifestMatch` into an `EnabledMod` for database storage.
175#[must_use]
176pub fn manifest_match_to_enabled(m: &ManifestMatch) -> EnabledMod {
177 EnabledMod {
178 mod_id: m.mod_id.clone(),
179 display_name: Some(m.display_name.clone()),
180 enabled: true,
181 version: None,
182 fomod_config: None,
183 nexus_mod_id: m.nexus_mod_id,
184 nexus_file_id: m.nexus_file_id,
185 nexus_game_domain: m.nexus_game_domain.clone(),
186 installed_timestamp: Some(
187 std::time::SystemTime::now()
188 .duration_since(std::time::UNIX_EPOCH)
189 .unwrap_or_default()
190 .as_secs() as i64,
191 ),
192 ..Default::default()
193 }
194}
195
/// Pull the MO2 mod name out of a directive path.
///
/// For `mods/Immersive Healing/archive/pc/mod/ImmersiveHealing.archive` the
/// result is `Some("Immersive Healing")`.
///
/// Returns `None` when the path does not start with the (case-sensitive)
/// `mods/` prefix, when nothing follows the mod-name segment (no second
/// `/`), or when the name segment is empty.
fn extract_mo2_mod_name(path: &str) -> Option<String> {
    let after_prefix = path.strip_prefix("mods/")?;
    // The name is everything up to the next separator.
    let (mod_name, _) = after_prefix.split_once('/')?;
    (!mod_name.is_empty()).then(|| mod_name.to_string())
}
209
/// Remove the MO2 staging prefix from a path.
///
/// `mods/<mod_name>/<game_relative_path>` becomes `<game_relative_path>`.
/// Paths that do not have that shape (e.g. MO2 executables, or `mods/x`
/// with no further segment) are returned unchanged.
fn strip_mo2_prefix(path: &str) -> String {
    path.strip_prefix("mods/")
        // Drop the mod-name segment: keep only what follows the next `/`.
        .and_then(|after_prefix| after_prefix.split_once('/'))
        .map_or_else(|| path.to_string(), |(_, game_relative)| game_relative.to_string())
}
222
/// Clean an archive filename into a display name.
///
/// `ImmersiveHealing-26281-3-1-3-1772288704.zip` → `ImmersiveHealing`.
///
/// Steps:
/// 1. Drop the extension (everything after the last `.`).
/// 2. Cut the Nexus suffix (`-modid-version-timestamp`) by truncating at the
///    first hyphen that is immediately followed by an ASCII digit. Hyphens
///    followed by anything else are part of the mod name and are kept, so
///    `Cyber-Engine-Tweaks-107-1-2.zip` → `Cyber-Engine-Tweaks`.
/// 3. Replace underscores with spaces.
///
/// Names without a `-{digit}` sequence only get steps 1 and 3.
fn clean_archive_name(name: &str) -> String {
    // Strip extension.
    let stem = name.rsplit_once('.').map_or(name, |(s, _)| s);

    // Scan every hyphen, not just the first one: mod names may themselves
    // contain hyphens ahead of the numeric Nexus suffix. `-` is ASCII, so
    // `i + 1` is always a valid char boundary.
    let base = stem
        .match_indices('-')
        .map(|(i, _)| i)
        .find(|&i| stem[i + 1..].starts_with(|c: char| c.is_ascii_digit()))
        .map_or(stem, |i| &stem[..i]);

    base.replace('_', " ")
}
240
241/// Compute the canonical mod order from a Wabbajack manifest's install
242/// directives.
243///
244/// `WabbajackManifest.archives` is an unordered JSON array — not a load
245/// order. The *directive* list, however, is the sequence Wabbajack applies
246/// on install, so the first-appearance order of each archive in the
247/// directives is the closest reproducible approximation of "load order".
248///
249/// Returns a `Vec<String>` of canonical `mod_id`s (as produced by
250/// [`archive_mod_id`]) in the order the corresponding archives first
251/// appear in the install directives. Archives that never appear in a
252/// [`InstallDirective::FromArchive`] / [`InstallDirective::PatchedFromArchive`]
253/// are omitted.
254#[must_use]
255pub fn manifest_directive_order(manifest: &WabbajackManifest) -> Vec<String> {
256 let archive_by_hash: HashMap<u64, &ArchiveEntry> =
257 manifest.archives.iter().map(|a| (a.hash, a)).collect();
258
259 let mut seen: HashSet<u64> = HashSet::new();
260 let mut order: Vec<String> = Vec::new();
261 for d in manifest.install_directives() {
262 let hash = match d {
263 InstallDirective::FromArchive { archive_hash, .. }
264 | InstallDirective::PatchedFromArchive { archive_hash, .. } => archive_hash,
265 _ => continue,
266 };
267 if !seen.insert(hash) {
268 continue;
269 }
270 if let Some(archive) = archive_by_hash.get(&hash) {
271 order.push(archive_mod_id(archive));
272 }
273 }
274 order
275}
276
/// Summary returned by [`apply_wabbajack_lock`] describing what the in-place
/// reorder did to the profile.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WabbajackLockApplied {
    /// The `manifest_hash` stored on the freshly written lock. Matches
    /// `ProfileSource::Wabbajack { manifest_hash }` on installs.
    pub manifest_hash: String,
    /// How many profile mods had a `mod_id` present in the manifest order.
    /// These now sit at the front of the mod list.
    pub matched: usize,
    /// How many pre-existing profile mods the manifest does not mention.
    /// These follow the matched mods, in their original relative order.
    pub unmatched: usize,
    /// `true` when the profile already had a lock that this call overwrote.
    pub replaced_existing_lock: bool,
}
292
293/// Reorder `profile.mods` to follow the manifest's install-directive
294/// order and stamp a `LockReason::Wabbajack` lock onto the profile.
295///
296/// This is the pure helper that powers `modde scan --manifest` and is
297/// the recommended way to retroactively lock an existing profile to a
298/// Wabbajack modlist. Extracted from `scan.rs` so it can be unit-tested
299/// without touching the filesystem scanner.
300///
301/// Invariants:
302///
303/// 1. **Mod count is preserved** — no mod is ever dropped. Matched mods
304/// move to the front in manifest order; unmatched mods retain their
305/// original relative order and are appended after.
306/// 2. **Matched mods are sorted by first-appearance in install
307/// directives** — see [`manifest_directive_order`] for the semantic.
308/// 3. **`profile.load_order_lock` is overwritten** — any prior lock
309/// (including a stale Wabbajack or Manual lock) is replaced. The
310/// return value's `replaced_existing_lock` field lets callers surface
311/// this to the user.
312pub fn apply_wabbajack_lock(
313 profile: &mut Profile,
314 manifest: &WabbajackManifest,
315) -> WabbajackLockApplied {
316 let manifest_order = manifest_directive_order(manifest);
317 let manifest_rank: HashMap<String, usize> = manifest_order
318 .iter()
319 .enumerate()
320 .map(|(i, mid)| (mid.clone(), i))
321 .collect();
322
323 // Stable partition: matched first (in manifest order), unmatched
324 // after (original relative order preserved).
325 let (mut matched, unmatched): (Vec<EnabledMod>, Vec<EnabledMod>) =
326 std::mem::take(&mut profile.mods)
327 .into_iter()
328 .partition(|m| manifest_rank.contains_key(&m.mod_id));
329
330 matched.sort_by_key(|m| manifest_rank.get(&m.mod_id).copied().unwrap_or(usize::MAX));
331
332 let matched_count = matched.len();
333 let unmatched_count = unmatched.len();
334 profile.mods = matched;
335 profile.mods.extend(unmatched);
336
337 let manifest_hash = compute_manifest_hash(manifest);
338 let replaced_existing_lock = profile.load_order_lock.is_some();
339 profile.load_order_lock = Some(LoadOrderLock::now(LockReason::Wabbajack {
340 manifest_hash: manifest_hash.clone(),
341 }));
342
343 WabbajackLockApplied {
344 manifest_hash,
345 matched: matched_count,
346 unmatched: unmatched_count,
347 replaced_existing_lock,
348 }
349}
350
/// The part of the game directory that a filesystem-discovered mod owns.
///
/// Game-specific scanners emit `mod_ids` in schemes such as `cet/<name>` or
/// `archive/<stem>`. To correlate those rows with a Wabbajack manifest's
/// install directives we need to know which files each mod is responsible
/// for; this enum carries that information.
///
/// - [`ModFootprint::Directory`] — the mod owns a whole subtree of the game
///   install (e.g. `bin/x64/plugins/cyber_engine_tweaks/mods/<name>/`).
/// - [`ModFootprint::File`] — the mod *is* one file (e.g. a loose
///   `.archive` under `archive/pc/mod/`).
///
/// Path convention: lowercase, forward slashes, and a trailing `/` on
/// `Directory` values. This is the same convention used by
/// [`dir_prefixes`](crate::scanner) and by the manifest-covered-dirs set
/// built in `modde-cli::commands::scan`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ModFootprint {
    /// A directory subtree owned by the mod; checked against the set of
    /// directories the manifest writes into.
    Directory(String),
    /// A single file owned by the mod; checked against the set of `To`
    /// paths in the manifest's install directives.
    File(String),
}
377
/// Outcome of [`detect_stale_duplicates`]: the profile's filesystem-scanner
/// rows split into those the manifest already covers (leaked duplicates)
/// and those it does not (genuine additions).
///
/// Rows for which the supplied `mod_id_to_footprint` closure returns no
/// footprint (typically `nexus_*`, `wj_*`, or any other
/// non-filesystem-scheme row) appear in **neither** list. They are skipped
/// silently because they are not candidates for this kind of dedup.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DuplicateReport {
    /// Filesystem-scanner `mod_ids` whose footprint the manifest covers.
    /// Safe to remove from the profile: a manifest-authored row (usually
    /// `nexus_*`) already deploys the same files under a different ID.
    pub leaked: Vec<String>,
    /// Filesystem-scanner `mod_ids` whose footprint the manifest does
    /// **not** cover. These are mods the user added on top of the Wabbajack
    /// modlist and must be preserved.
    pub genuine: Vec<String>,
}
399
400/// Classify a profile's filesystem-scanner rows against a Wabbajack
401/// manifest into "leaked duplicates" and "genuine additions".
402///
403/// This is the pure helper that powers `modde profile dedup` and the
404/// `--prune-duplicates` flag on `modde scan`. See
405/// `/home/can/.claude/plans/greedy-shimmying-pine.md` and the companion
406/// discussion in `docs/` (if present) for the design rationale.
407///
408/// The `mod_id_to_footprint` closure is the game-specific bridge: it
409/// maps a filesystem-scanner `mod_id` (e.g. `cet/ImmersiveHealing`) back
410/// to the directory or file the mod owns in the game install. For
411/// Cyberpunk 2077 this is `modde_games::cyberpunk::scanner::mod_id_footprint`.
412/// Profiles
413/// spanning multiple games aren't supported — each profile is tied to
414/// a single game via `profile.game_id`, so callers wire up a
415/// per-game closure.
416///
417/// Classification rules:
418///
419/// 1. If the closure returns `None` for a `mod_id`, the row is **not a
420/// candidate** — it's skipped silently. `nexus_*` and `wj_*` rows
421/// are manifest-authored and shouldn't be classified as duplicates
422/// of themselves.
423/// 2. If the footprint is [`ModFootprint::Directory`] and the manifest
424/// writes any file under that directory → **LEAKED** (the nexus
425/// archive that deployed those files is already tracked under its
426/// `nexus_*` ID).
427/// 3. If the footprint is [`ModFootprint::File`] and the exact file
428/// path appears in the manifest's install directives → **LEAKED**.
429/// 4. Otherwise → **GENUINE**: the user added this mod on top of the
430/// Wabbajack and it must not be deleted.
431///
432/// Case and slash-normalization: paths are lowercased and
433/// forward-slashed internally, so callers don't need to pre-normalize.
434///
435/// Complexity: O(D × A + M) where D is manifest directive count,
436/// A is average path depth, and M is profile mod count. For a typical
437/// CP2077 modlist (≈7k directives, ≈700 mods) this runs in well under
438/// a millisecond.
439pub fn detect_stale_duplicates<F>(
440 profile: &Profile,
441 manifest: &WabbajackManifest,
442 mod_id_to_footprint: F,
443) -> DuplicateReport
444where
445 F: Fn(&str) -> Option<ModFootprint>,
446{
447 // Build the manifest's covered file set + covered directory set
448 // from its install directives. Only `FromArchive` /
449 // `PatchedFromArchive` directives are "physical" file placements
450 // we can compare against — `CreateBSA` and `InlineFile` don't map
451 // cleanly to a single on-disk file at scan time.
452 //
453 // Wabbajack `To` paths are MO2-staged: they look like
454 // `mods\<MO2 Mod Name>\<game-relative-path>`. We must strip the
455 // `mods/<name>/` prefix before comparing against game-relative
456 // footprints — this mirrors what `match_wabbajack_manifest` does
457 // via `strip_mo2_prefix`. Without this step, every directive path
458 // in a CP2077 modlist begins with `mods/<big mod name>/`, which
459 // never overlaps with a `bin/x64/...` or `archive/pc/mod/...`
460 // footprint, and `detect_stale_duplicates` silently classifies
461 // every row as GENUINE. See profile 3077 for the failure mode.
462 let mut covered_files: HashSet<String> = HashSet::new();
463 for d in manifest.install_directives() {
464 let to = match d {
465 InstallDirective::FromArchive { to, .. }
466 | InstallDirective::PatchedFromArchive { to, .. } => to,
467 _ => continue,
468 };
469 let normalized = to.replace('\\', "/").to_lowercase();
470 covered_files.insert(strip_mo2_prefix(&normalized));
471 }
472
473 // Expand each covered file into its ancestor-directory prefixes so
474 // the Directory footprint check becomes a single O(1) HashSet lookup.
475 let mut covered_dirs: HashSet<String> = HashSet::new();
476 for f in &covered_files {
477 let mut cur = f.as_str();
478 while let Some(idx) = cur.rfind('/') {
479 cur = &cur[..idx];
480 covered_dirs.insert(format!("{cur}/"));
481 }
482 }
483
484 let mut report = DuplicateReport::default();
485 for m in &profile.mods {
486 let footprint = match mod_id_to_footprint(&m.mod_id) {
487 Some(fp) => fp,
488 None => continue, // Not a filesystem-scanner row; skip.
489 };
490 let covered = match &footprint {
491 ModFootprint::Directory(d) => covered_dirs.contains(d),
492 ModFootprint::File(f) => covered_files.contains(f),
493 };
494 if covered {
495 report.leaked.push(m.mod_id.clone());
496 } else {
497 report.genuine.push(m.mod_id.clone());
498 }
499 }
500 report
501}
502
503/// Convert a filesystem-discovered mod into an `EnabledMod`.
504pub fn discovered_to_enabled(
505 mod_id: &str,
506 display_name: &str,
507 version: Option<&str>,
508 _confidence: f32,
509) -> EnabledMod {
510 EnabledMod {
511 mod_id: mod_id.to_string(),
512 display_name: Some(display_name.to_string()),
513 enabled: true,
514 version: version.map(String::from),
515 ..Default::default()
516 }
517}