keyhog_sources/filesystem.rs
1//! Filesystem source: recursively walks a directory tree, skips binary files,
2//! respects `.gitignore`, and yields chunks for scanning.
3
4use codewalk::{CodeWalker, WalkConfig};
5use keyhog_core::merkle_index::MerkleIndex;
6use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
7use std::collections::HashSet;
8use std::io::{Read, Seek, SeekFrom};
9use std::path::{Path, PathBuf};
10use std::sync::atomic::{AtomicUsize, Ordering};
11use std::sync::Arc;
12
13mod read;
14
/// Minimum file size to use memory mapping. The crossover point is
/// platform-specific:
///
/// * Linux / macOS: mmap setup is sub-microsecond and avoids the
///   `read(2)` copy from kernel page cache to userland buffer. Worth
///   it as soon as the file is at least one page (4 KiB) — pick
///   64 KiB to keep tiny-config-file scans on the buffered path
///   where the syscall floor dominates either way.
/// * Windows: `MapViewOfFile` has more setup cost (security tokens,
///   section-object routing) and the `ReadFile` path is already
///   well-optimised by the OS for buffered I/O. Keep the historical
///   1 MiB threshold here to avoid regressing typical source-tree
///   scans.
#[cfg(unix)]
const MMAP_THRESHOLD: u64 = 64 * 1024;
#[cfg(not(unix))]
const MMAP_THRESHOLD: u64 = 1024 * 1024;

/// Default window size for the windowed (big-file) scanning path:
/// files larger than this are scanned in overlapping windows instead
/// of one contiguous read. Overridable on a per-source basis (see
/// `with_window_config`) so tests can exercise the windowed flow
/// without writing 64 MiB+ fixtures.
const DEFAULT_WINDOW_SIZE: usize = 64 * 1024 * 1024;

/// Default overlap between consecutive windows. 4 KiB matches the
/// longest plausible secret span we want to catch across the cut, so
/// a secret straddling a window boundary still appears whole in one
/// of the two neighbouring windows.
const DEFAULT_WINDOW_OVERLAP: usize = 4 * 1024;
39
/// Scans files in a directory tree.
pub struct FilesystemSource {
    /// Walk root; canonicalized in `new` (falls back to the raw path
    /// when canonicalization fails).
    root: PathBuf,
    /// Per-file size cap in bytes (default 100 MB). Also forwarded to
    /// the walker via `walker_config` as its `max_file_size`.
    max_file_size: u64,
    /// Extra ignore patterns; `walker_config` negates each one (leading
    /// `!`) to turn it into a walker override pattern.
    ignore_paths: Vec<String>,
    /// When non-empty, only files whose canonical path appears in this
    /// list are scanned (compared against the absolute path of each
    /// discovered file).
    include_paths: Vec<PathBuf>,
    /// Whether to honor `.gitignore` / `.keyhogignore` files during the walk.
    /// `true` (default) is correct for normal scans. `keyhog scan-system`
    /// flips this to `false` because an attacker stashing a leaked key
    /// inside a project would `.gitignore` it.
    respect_gitignore: bool,
    /// Optional merkle-index handle. When set, the iterator consults the
    /// index per file BEFORE reading: if `(path, mtime_ns, size)` matches
    /// a stored entry the file is skipped without an open() / read() —
    /// the dominant cost on cold-cache disk. Doubles as an output sink:
    /// when `record_metadata` is true, the source records the live
    /// `(mtime, size)` of every chunk it does emit so the orchestrator
    /// only has to attach the BLAKE3 hash post-scan.
    merkle: Option<Arc<MerkleIndex>>,
    /// Counter incremented for every file the metadata fast-path skips.
    /// The orchestrator reads it after the scan to log how much I/O the
    /// cache saved. Atomic so rayon-driven walkers don't have to lock.
    skipped: Arc<AtomicUsize>,
    /// Window size for the big-file scan path. Tests override this via
    /// `with_window_config` to exercise the windowed flow without
    /// writing the 64 MiB fixtures the production threshold requires.
    window_size: usize,
    /// Bytes of overlap between consecutive windows. Same rationale.
    window_overlap: usize,
}
70
71impl FilesystemSource {
72 /// Create a filesystem source rooted at `root`.
73 pub fn new(root: PathBuf) -> Self {
74 // Canonicalize so that discovered file paths are absolute and match
75 // include_paths that are typically absolute (e.g. from git diff).
76 let root = root.canonicalize().unwrap_or(root);
77 Self {
78 root,
79 max_file_size: 100 * 1024 * 1024, // 100 MB default — large files use windowed scanning
80 ignore_paths: Vec::new(),
81 include_paths: Vec::new(),
82 respect_gitignore: true,
83 merkle: None,
84 skipped: Arc::new(AtomicUsize::new(0)),
85 window_size: DEFAULT_WINDOW_SIZE,
86 window_overlap: DEFAULT_WINDOW_OVERLAP,
87 }
88 }
89
90 /// Override the windowed-scan parameters. Production callers stick
91 /// with the defaults (64 MiB / 4 KiB); tests use this to exercise
92 /// the multi-window path on tiny fixtures. `window_size` must
93 /// strictly exceed `overlap` (the underlying slicer asserts this).
94 pub fn with_window_config(mut self, window_size: usize, overlap: usize) -> Self {
95 assert!(window_size > overlap, "window must exceed overlap");
96 self.window_size = window_size;
97 self.window_overlap = overlap;
98 self
99 }
100
101 /// Wire the source up to a merkle index so `(path, mtime, size)`
102 /// matches skip the file *before* it is read. The cache contents
103 /// themselves are loaded by the orchestrator (which also handles
104 /// detector-spec-hash invalidation) and shared via `Arc` so multiple
105 /// sources can consult one index.
106 pub fn with_merkle_skip(mut self, merkle: Arc<MerkleIndex>) -> Self {
107 self.merkle = Some(merkle);
108 self
109 }
110
111 /// Returns a counter that the source increments every time the
112 /// metadata fast-path skips a file. Cloned `Arc<AtomicUsize>`, safe
113 /// to read after the iterator drains.
114 pub fn skipped_counter(&self) -> Arc<AtomicUsize> {
115 self.skipped.clone()
116 }
117
118 /// Only include files whose paths match one of the given paths.
119 /// Paths are compared against the absolute path of each discovered file.
120 pub fn with_include_paths(mut self, paths: Vec<PathBuf>) -> Self {
121 self.include_paths = paths;
122 self
123 }
124
125 /// Override the maximum file size scanned from disk.
126 pub fn with_max_file_size(mut self, bytes: u64) -> Self {
127 self.max_file_size = bytes;
128 self
129 }
130
131 /// Add patterns to ignore during the walk.
132 pub fn with_ignore_paths(mut self, paths: Vec<String>) -> Self {
133 self.ignore_paths = paths;
134 self
135 }
136
137 /// Override whether the walk honors `.gitignore` / `.keyhogignore`.
138 /// `keyhog scan-system` flips this to `false` so a leaked key
139 /// stashed in `.gitignore` can't hide.
140 pub fn with_respect_gitignore(mut self, respect: bool) -> Self {
141 self.respect_gitignore = respect;
142 self
143 }
144}
145
/// File extensions to skip (binary, images, etc.). Also handed to the
/// walker as `exclude_extensions` (see `walker_config`), so a matching
/// file is normally filtered out before `chunks()` even sees it.
///
/// IMPORTANT: any extension with a dedicated extraction path in
/// `chunks()` must NOT appear here, or that path silently becomes dead
/// code:
/// * `gz` / `zst` / `lz4` / `sz` are routed to
///   `extract_compressed_chunks` (streaming-decompression scan).
/// * `zip` (alongside `apk` / `ipa` / `crx` / `jar`, which were never
///   listed) is routed to the openpack archive-extraction branch.
///   `zip` used to sit in this list, which skipped every zip file
///   before the archive branch could run — the same historical bug the
///   compressed formats had.
const SKIP_EXTENSIONS: &[&str] = &[
    // Images
    "png",
    "jpg",
    "jpeg",
    "gif",
    "bmp",
    "ico",
    "cur",
    "icns",
    "webp",
    "svg",
    // Audio/Video
    "mp3",
    "mp4",
    "avi",
    "mov",
    "mkv",
    "flac",
    "wav",
    "ogg",
    "webm",
    // Archives with no in-filesystem extraction path (tar contents are
    // caught by the archive source, not this one).
    "tar",
    "tgz",
    "bz2",
    "xz",
    "rar",
    "7z",
    // Native binaries
    "exe",
    "dll",
    "so",
    "dylib",
    "o",
    "a",
    "lib",
    "obj",
    // Compiled/bytecode
    "class",
    "wasm",
    "pyc",
    "pyo",
    "elc",
    "beam",
    // Documents (binary formats)
    "pdf",
    "doc",
    "docx",
    "xls",
    "xlsx",
    "ppt",
    "pptx",
    // Fonts
    "ttf",
    "otf",
    "woff",
    "woff2",
    "eot",
    // Database files
    "db",
    "sqlite",
    "sqlite3",
    // Disk images / firmware
    "iso",
    "img",
    "bin",
    "rom",
    // Serialized data (not human-authored)
    "pickle",
    "npy",
    "npz",
    "onnx",
    "pb",
    "tflite",
    "pt",
    "safetensors",
];
230
/// Directories to skip entirely. Handed to the walker as
/// `exclude_dirs` (see `walker_config`), so their contents are never
/// enumerated at all — these are dependency caches, build outputs, and
/// vendored trees that only produce noise.
const SKIP_DIRS: &[&str] = &[
    ".git",
    "node_modules",
    "target",       // Rust build output
    "__pycache__",
    ".venv",
    "venv",
    ".tox",
    "dist",
    "build",
    ".next",
    ".nuxt",
    "vendor",
    "swagger-ui",
    "swagger",
];
248
impl Source for FilesystemSource {
    fn name(&self) -> &str {
        "filesystem"
    }

    /// Walk the tree and yield scan chunks: one per ordinary file,
    /// several per archive or windowed big file. Per-entry walk errors
    /// are logged and skipped; every item this impl actually yields is
    /// `Ok` (the `Result` in the signature is for other sources).
    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
        let max_size = self.max_file_size;
        let mut config = walker_config(self.max_file_size, &self.ignore_paths);
        if !self.respect_gitignore {
            config = config.respect_gitignore(false);
        }
        // Use walk_iter (NOT walk()) so per-entry errors don't
        // collapse the entire scan. `walk()` collects into a Vec
        // via `.collect()` on a Result iterator — a single
        // permission-denied (chmod 000 sub-tree, EACCES on a
        // sibling) short-circuits the whole walk and the user
        // gets ZERO findings. Production-grade behaviour is to
        // log+skip the failed entry and keep walking everything
        // else.
        let walker = CodeWalker::new(&self.root, config);
        let mut entries: Vec<codewalk::FileEntry> = walker
            .walk_iter()
            .filter_map(|result| match result {
                Ok(entry) => Some(entry),
                Err(error) => {
                    tracing::warn!(
                        %error,
                        "skipping unreadable filesystem entry; scan continues"
                    );
                    None
                }
            })
            .collect();

        if !self.include_paths.is_empty() {
            // Canonicalize both sides for consistent comparison
            // (either side may be relative or contain symlinks).
            let allowed: HashSet<PathBuf> = self
                .include_paths
                .iter()
                .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()))
                .collect();
            entries.retain(|e| {
                let canonical = e.path.canonicalize().unwrap_or_else(|_| e.path.clone());
                allowed.contains(&canonical)
            });
        }

        // Clone/copy everything the closure needs so the iterator can
        // be `move` and outlive `&self`'s borrow of these fields.
        let merkle = self.merkle.clone();
        let skipped = self.skipped.clone();
        let window_size = self.window_size;
        let window_overlap = self.window_overlap;

        Box::new(entries.into_iter().flat_map(move |entry| {
            let path = entry.path;
            let file_size = entry.size;

            // Lowercased extension ("" when absent / non-UTF-8) drives
            // all routing below.
            let ext = path
                .extension()
                .and_then(|e| e.to_str())
                .unwrap_or("")
                .to_lowercase();

            if SKIP_EXTENSIONS.contains(&ext.as_str()) {
                return vec![];
            }

            // Fast-path skip: stat the file once, ask the cache "have I
            // seen this exact (path, mtime, size) tuple?" If yes, never
            // open() or read() — the dominant cost on cold-cache disk.
            // Stored alongside the chunk so the orchestrator can refresh
            // the index entry post-scan without a second stat.
            let live_mtime_ns = file_mtime_ns(&path);
            if let (Some(idx), Some(mtime_ns)) = (merkle.as_ref(), live_mtime_ns) {
                if idx.metadata_unchanged(&path, mtime_ns, file_size) {
                    skipped.fetch_add(1, Ordering::Relaxed);
                    return vec![];
                }
            }

            if ext == "zip" || ext == "apk" || ext == "ipa" || ext == "crx" || ext == "jar" {
                // Per-entry uncompressed-size cap to defeat zip-bomb DoS.
                // openpack's central directory exposes uncompressed_size; skip
                // any entry that exceeds max_size (per-file cap) and the total
                // uncompressed budget.
                //
                // NOTE(review): archive chunks carry no mtime_ns/size_bytes
                // metadata (only `source_type`/`path` are set below), so the
                // merkle fast-path never records archives and they are
                // re-read every scan — confirm that is intended.
                let mut archive_chunks = Vec::new();
                let mut total_uncompressed: u64 = 0;
                let total_budget: u64 = max_size.saturating_mul(4); // 4x file cap budget for archives
                if let Ok(pack) = openpack::OpenPack::open_default(&path) {
                    if let Ok(entries) = pack.entries() {
                        for archive_entry in entries {
                            if archive_entry.is_dir || is_default_excluded(&archive_entry.name) {
                                continue;
                            }
                            if archive_entry.uncompressed_size > max_size {
                                tracing::warn!(
                                    archive = %path.display(),
                                    entry = %archive_entry.name,
                                    size = archive_entry.uncompressed_size,
                                    "skipping archive entry: uncompressed size exceeds per-file cap"
                                );
                                continue;
                            }
                            // Budget is accumulated from declared sizes and
                            // checked BEFORE reading the entry, so a bomb is
                            // rejected without decompressing it.
                            total_uncompressed = total_uncompressed
                                .saturating_add(archive_entry.uncompressed_size);
                            if total_uncompressed > total_budget {
                                tracing::warn!(
                                    archive = %path.display(),
                                    "aborting archive extraction: total uncompressed size exceeds 4x file cap (zip-bomb guard)"
                                );
                                break;
                            }
                            if let Ok(content) = pack.read_entry(&archive_entry.name) {
                                // `content` is cloned because `String::from_utf8`
                                // consumes its input and the bytes are still
                                // needed for the binary-strings fallback.
                                if let Ok(s) = String::from_utf8(content.clone()) {
                                    archive_chunks.push(Ok(Chunk {
                                        data: s.into(),
                                        metadata: ChunkMetadata {
                                            source_type: "filesystem/archive".into(),
                                            // "outer//inner" so findings point at
                                            // the entry inside the archive.
                                            path: Some(format!(
                                                "{}//{}",
                                                path.display(),
                                                archive_entry.name
                                            )),
                                            ..Default::default()
                                        },
                                    }));
                                } else {
                                    // Non-UTF-8 entry: scan only its printable
                                    // runs (min length 8) instead of raw bytes.
                                    let strings =
                                        crate::strings::extract_printable_strings(&content, 8);
                                    if !strings.is_empty() {
                                        archive_chunks.push(Ok(Chunk {
                                            data: keyhog_core::SensitiveString::join(&strings, "\n"),
                                            metadata: ChunkMetadata {
                                                source_type: "filesystem/archive-binary".into(),
                                                path: Some(format!(
                                                    "{}//{}",
                                                    path.display(),
                                                    archive_entry.name
                                                )),
                                                ..Default::default()
                                            },
                                        }));
                                    }
                                }
                            }
                        }
                    }
                }
                return archive_chunks;
            } else if ext == "gz" || ext == "zst" || ext == "lz4" || ext == "sz" {
                // Single-stream compressed formats: decompress and scan.
                return extract_compressed_chunks(&path, max_size);
            }

            // Name-based exclusions (lockfiles, minified bundles, etc.)
            // run AFTER the archive/compressed dispatch so archive entry
            // names get the same filtering via is_default_excluded above.
            let filename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            if is_default_excluded(filename) {
                return vec![];
            }
            if filename.contains(".min.")
                || filename.contains(".bundle.")
                || filename.ends_with(".chunk.js")
            {
                return vec![];
            }

            if file_size > window_size as u64 {
                // Fast path: mmap once and slice zero-copy into
                // overlapping `window_size` views with `window_overlap`
                // shared bytes between neighbours. Replaces a 64 MiB
                // heap buffer + per-window `seek-back+re-read`
                // round-trip with a single mmap + madvise(SEQUENTIAL).
                if let Some(windows) =
                    read::read_file_windowed_mmap(&path, window_size, window_overlap)
                {
                    return windows
                        .into_iter()
                        .map(|w| {
                            Ok(Chunk {
                                data: w.text.into(),
                                metadata: ChunkMetadata {
                                    source_type: "filesystem/windowed".to_string(),
                                    path: Some(path.display().to_string()),
                                    // Absolute byte offset of this window so
                                    // findings map back to file positions.
                                    base_offset: w.offset,
                                    mtime_ns: live_mtime_ns,
                                    size_bytes: Some(file_size),
                                    ..Default::default()
                                },
                            })
                        })
                        .collect();
                }
                // Buffered fallback: mmap refused (locked writer,
                // unsupported filesystem). Same semantics as before —
                // working buffer + seek-back overlap. Sized to the
                // configured window so test overrides apply here too.
                //
                // NOTE(review): a short read (0 < n < window_size) is
                // treated as EOF below; true for regular files in
                // practice, but would truncate on readers that return
                // partial data — confirm acceptable. A window cut mid
                // UTF-8 codepoint yields replacement chars at the edges
                // (from_utf8_lossy); the overlap region compensates.
                let mut window_chunks = Vec::new();
                if let Ok(mut file) = std::fs::File::open(&path) {
                    let mut current_offset = 0;
                    let mut buffer = vec![0u8; window_size];
                    while let Ok(n) = file.read(&mut buffer) {
                        if n == 0 { break; }
                        let data = String::from_utf8_lossy(&buffer[..n]).into_owned();
                        window_chunks.push(Ok(Chunk {
                            data: data.into(),
                            metadata: ChunkMetadata {
                                source_type: "filesystem/windowed".to_string(),
                                path: Some(path.display().to_string()),
                                base_offset: current_offset,
                                mtime_ns: live_mtime_ns,
                                size_bytes: Some(file_size),
                                ..Default::default()
                            },
                        }));
                        if n < window_size { break; }
                        // Rewind by the overlap so the next window shares
                        // `window_overlap` bytes with this one. Reached only
                        // when n == window_size, and the builder asserts
                        // window_size > overlap, so the subtraction below
                        // cannot underflow.
                        let _ = file.seek(SeekFrom::Current(-(window_overlap as i64)));
                        current_offset += n - window_overlap;
                    }
                }
                return window_chunks;
            }
            // Normal-size file: choose mmap vs buffered read by the
            // platform-tuned threshold (see MMAP_THRESHOLD).
            let file_text = if file_size >= MMAP_THRESHOLD {
                read::read_file_mmap(&path)
            } else {
                read::read_file_buffered(&path)
            };

            // Text path when the read produced non-empty UTF-8;
            // otherwise fall back to printable-strings extraction over
            // the raw bytes (min run length 8).
            let (content, source_type) = match file_text {
                Some(text) if !text.is_empty() => (text.into(), "filesystem"),
                _ => {
                    if let Ok(bytes) = read::read_file_safe(&path) {
                        let strings = crate::strings::extract_printable_strings(&bytes, 8);
                        if strings.is_empty() {
                            return vec![];
                        }
                        (keyhog_core::SensitiveString::join(&strings, "\n"), "filesystem:binary-strings")
                    } else {
                        return vec![];
                    }
                }
            };

            vec![Ok(Chunk {
                data: content,
                metadata: ChunkMetadata {
                    source_type: source_type.to_string(),
                    path: Some(path.display().to_string()),
                    // Live (mtime, size) so the orchestrator can refresh
                    // the merkle entry post-scan without a second stat.
                    mtime_ns: live_mtime_ns,
                    size_bytes: Some(file_size),
                    ..Default::default()
                },
            })]
        }))
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}
505
506fn extract_compressed_chunks(path: &Path, max_size: u64) -> Vec<Result<Chunk, SourceError>> {
507 let ext = path
508 .extension()
509 .and_then(|e| e.to_str())
510 .unwrap_or("")
511 .to_lowercase();
512 let format = match ext.as_str() {
513 "gz" => ziftsieve::CompressionFormat::Gzip,
514 "zst" => ziftsieve::CompressionFormat::Zstd,
515 "lz4" => ziftsieve::CompressionFormat::Lz4,
516 _ => ziftsieve::CompressionFormat::Snappy,
517 };
518
519 // mmap the compressed file when possible — ziftsieve only takes a
520 // contiguous `&[u8]`, so a streaming decoder isn't on the menu, but
521 // mmap lets us hand it the whole file without a corresponding heap
522 // allocation. A 1 GiB `.zst` previously turned into a 1 GiB
523 // `Vec<u8>` before decompression even started; now it sits in the
524 // page cache backed by the file. Falls back to a buffered read
525 // when mmap is refused (locked writer, unsupported filesystem) so
526 // behaviour is identical to the prior implementation in that case.
527 //
528 // The per-source `max_size` doubles as the compressed-input cap:
529 // anything bigger is refused before mapping. The decompressed
530 // budget gate (4× max_size) still applies inside the loop below.
531 let file_bytes = match read::read_file_for_compressed_input(path, max_size) {
532 Some(b) => b,
533 None => return Vec::new(),
534 };
535 let bytes = file_bytes.as_slice();
536
537 // Decompression-bomb cap: a 4x compression-ratio multiplier on the
538 // per-file size budget bounds total expanded bytes. A 1 MB gzip bomb
539 // expanding to 4 GB hits this ceiling and aborts cleanly instead of
540 // OOMing. See audit release-2026-04-26 filesystem.rs:308-361.
541 let total_budget: usize = max_size.saturating_mul(4) as usize;
542
543 let mut chunks = Vec::new();
544
545 if let Ok(blocks) = ziftsieve::extract_from_bytes(format, bytes) {
546 let mut current_chunk_literals = String::new();
547 let mut total_decompressed: usize = 0;
548 for block in blocks {
549 if let Ok(s) = std::str::from_utf8(block.literals()) {
550 total_decompressed = total_decompressed.saturating_add(s.len());
551 if total_decompressed > total_budget {
552 tracing::warn!(
553 path = %path.display(),
554 bytes = total_decompressed,
555 cap = total_budget,
556 "aborting compressed extraction: total decompressed size exceeds 4x file cap (gzip-bomb guard)"
557 );
558 break;
559 }
560 current_chunk_literals.push_str(s);
561 current_chunk_literals.push('\n');
562 }
563
564 if current_chunk_literals.len() > 8 * 1024 * 1024 {
565 chunks.push(Ok(Chunk {
566 data: std::mem::take(&mut current_chunk_literals).into(),
567 metadata: ChunkMetadata {
568 source_type: "filesystem/compressed".into(),
569 path: Some(path.display().to_string()),
570 ..Default::default()
571 },
572 }));
573 }
574 }
575 if !current_chunk_literals.is_empty() {
576 chunks.push(Ok(Chunk {
577 data: current_chunk_literals.into(),
578 metadata: ChunkMetadata {
579 source_type: "filesystem/compressed".into(),
580 path: Some(path.display().to_string()),
581 ..Default::default()
582 },
583 }));
584 }
585 }
586 chunks
587}
588
/// Check if a path matches the built-in default exclusion patterns.
/// Mirrors the patterns in `crates/cli/src/sources.rs`.
///
/// Operates on raw bytes with ASCII case-insensitive comparisons and
/// splits on both `/` and `\`, so Windows-style paths get the same
/// treatment as POSIX ones — all without allocating a lowercased copy
/// of the path on the walker hot path.
fn is_default_excluded(path: &str) -> bool {
    // Excluded file suffixes (minified assets, editor droppings, maps).
    const SUFFIXES: &[&[u8]] = &[
        b".min.js",
        b".min.css",
        b".bak",
        b".swp",
        b".tmp",
        b".map",
        b".cache",
    ];
    // Path components that exclude the whole path when any one matches.
    const SKIP_SEGMENTS: &[&[u8]] = &[
        b"node_modules",
        b".git",
        b"__pycache__",
        b"vendor",
        b"dist",
        b"build",
        b"out",
    ];
    // Exact filename matches, checked against the final component only.
    const FILENAMES: &[&[u8]] = &[
        b"package-lock.json",
        b"yarn.lock",
        b"pnpm-lock.yaml",
        b"cache.json",
        b"cargo.lock",
        b"go.sum",
        b"gemfile.lock",
        b"angular.json",
    ];

    let raw = path.as_bytes();

    // Suffix test runs against the full path string.
    let has_excluded_suffix = SUFFIXES.iter().any(|suffix| {
        raw.len() >= suffix.len() && raw[raw.len() - suffix.len()..].eq_ignore_ascii_case(suffix)
    });
    if has_excluded_suffix {
        return true;
    }

    // One pass over the components: bail early on a skip segment, and
    // remember the last non-empty component as the filename.
    let mut last_component: &[u8] = raw;
    for component in path.split(['/', '\\']) {
        let component = component.as_bytes();
        if SKIP_SEGMENTS
            .iter()
            .any(|skip| component.eq_ignore_ascii_case(skip))
        {
            return true;
        }
        if !component.is_empty() {
            last_component = component;
        }
    }

    if FILENAMES
        .iter()
        .any(|name| last_component.eq_ignore_ascii_case(name))
    {
        return true;
    }

    // `tsconfig*.json` in any casing.
    const PREFIX: &[u8] = b"tsconfig";
    const JSON: &[u8] = b".json";
    last_component.len() >= PREFIX.len() + JSON.len()
        && last_component[..PREFIX.len()].eq_ignore_ascii_case(PREFIX)
        && last_component[last_component.len() - JSON.len()..].eq_ignore_ascii_case(JSON)
}
674
/// Read a file's mtime as nanoseconds since the UNIX epoch via one `stat`.
///
/// Yields `None` whenever the platform/filesystem fails to expose a
/// usable modified time (or the timestamp predates the epoch); the
/// merkle cache fast-path then simply never fires for that file, which
/// is strictly safer than a false skip. A far-future timestamp whose
/// nanosecond count overflows `u64` saturates to `u64::MAX` so the
/// numeric key stays stable (~584 years of ns fit in `u64`; the real
/// concern is filesystems reporting weird values).
fn file_mtime_ns(path: &Path) -> Option<u64> {
    let modified = std::fs::metadata(path).ok()?.modified().ok()?;
    let since_epoch = modified.duration_since(std::time::UNIX_EPOCH).ok()?;
    // `as_nanos` == secs * 1e9 + subsec_nanos, computed in u128.
    Some(u64::try_from(since_epoch.as_nanos()).unwrap_or(u64::MAX))
}
691
692fn walker_config(max_file_size: u64, ignore_paths: &[String]) -> WalkConfig {
693 let mut exclude_extensions = HashSet::new();
694 exclude_extensions.extend(SKIP_EXTENSIONS.iter().map(|ext| (*ext).to_string()));
695
696 let mut exclude_dirs = HashSet::new();
697 exclude_dirs.extend(SKIP_DIRS.iter().map(|dir| (*dir).to_string()));
698
699 let ignore_overrides = ignore_paths
700 .iter()
701 .map(|pattern| {
702 if pattern.starts_with('!') {
703 pattern.clone()
704 } else {
705 format!("!{pattern}")
706 }
707 })
708 .collect();
709
710 WalkConfig::default()
711 .max_file_size(max_file_size)
712 .follow_symlinks(false)
713 .respect_gitignore(true)
714 .skip_hidden(false)
715 .skip_binary(false)
716 .exclude_extensions(exclude_extensions)
717 .exclude_dirs(exclude_dirs)
718 .ignore_files(vec![".keyhogignore".to_string()])
719 .ignore_patterns(ignore_overrides)
720}