Skip to main content

keyhog_sources/
filesystem.rs

1//! Filesystem source: recursively walks a directory tree, skips binary files,
2//! respects `.gitignore`, and yields chunks for scanning.
3
4use codewalk::{CodeWalker, WalkConfig};
5use keyhog_core::merkle_index::MerkleIndex;
6use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
7use std::collections::HashSet;
8use std::io::{Read, Seek, SeekFrom};
9use std::path::{Path, PathBuf};
10use std::sync::atomic::{AtomicUsize, Ordering};
11use std::sync::Arc;
12
13mod read;
14
/// Files at or above this size are read via memory map; smaller files
/// take the buffered path. The crossover point is platform-specific:
///
///   * Linux / macOS: mapping is sub-microsecond to set up and skips
///     the `read(2)` copy out of the kernel page cache, so it pays off
///     from roughly one page (4 KiB) onward. 64 KiB keeps tiny config
///     files on the buffered path, where the syscall floor dominates
///     regardless of strategy.
///   * Windows: `MapViewOfFile` setup is heavier (security tokens,
///     section-object routing) and buffered `ReadFile` is already
///     well-optimised by the OS, so the historical 1 MiB threshold is
///     kept to avoid regressing typical source-tree scans.
#[cfg(unix)]
const MMAP_THRESHOLD: u64 = 64 * 1024;
#[cfg(not(unix))]
const MMAP_THRESHOLD: u64 = 1024 * 1024;
/// Window size for the large-file (>64 MiB) scanning path. Overridable
/// per source via `with_window_config`, which lets tests drive the
/// windowed flow without writing 64 MiB+ fixtures.
const DEFAULT_WINDOW_SIZE: usize = 64 * 1024 * 1024;
/// Overlap shared by consecutive windows. 4 KiB covers the longest
/// plausible secret that could straddle a window boundary.
const DEFAULT_WINDOW_OVERLAP: usize = 4 * 1024;
39
40/// Scans files in a directory tree.
41pub struct FilesystemSource {
42    root: PathBuf,
43    max_file_size: u64,
44    ignore_paths: Vec<String>,
45    include_paths: Vec<PathBuf>,
46    /// Whether to honor `.gitignore` / `.keyhogignore` files during the walk.
47    /// `true` (default) is correct for normal scans. `keyhog scan-system`
48    /// flips this to `false` because an attacker stashing a leaked key
49    /// inside a project would `.gitignore` it.
50    respect_gitignore: bool,
51    /// Optional merkle-index handle. When set, the iterator consults the
52    /// index per file BEFORE reading: if `(path, mtime_ns, size)` matches
53    /// a stored entry the file is skipped without an open() / read() —
54    /// the dominant cost on cold-cache disk. Doubles as an output sink:
55    /// when `record_metadata` is true, the source records the live
56    /// `(mtime, size)` of every chunk it does emit so the orchestrator
57    /// only has to attach the BLAKE3 hash post-scan.
58    merkle: Option<Arc<MerkleIndex>>,
59    /// Counter incremented for every file the metadata fast-path skips.
60    /// The orchestrator reads it after the scan to log how much I/O the
61    /// cache saved. Atomic so rayon-driven walkers don't have to lock.
62    skipped: Arc<AtomicUsize>,
63    /// Window size for the big-file scan path. Tests override this via
64    /// `with_window_config` to exercise the windowed flow without
65    /// writing the 64 MiB fixtures the production threshold requires.
66    window_size: usize,
67    /// Bytes of overlap between consecutive windows. Same rationale.
68    window_overlap: usize,
69}
70
71impl FilesystemSource {
72    /// Create a filesystem source rooted at `root`.
73    pub fn new(root: PathBuf) -> Self {
74        // Canonicalize so that discovered file paths are absolute and match
75        // include_paths that are typically absolute (e.g. from git diff).
76        let root = root.canonicalize().unwrap_or(root);
77        Self {
78            root,
79            max_file_size: 100 * 1024 * 1024, // 100 MB default — large files use windowed scanning
80            ignore_paths: Vec::new(),
81            include_paths: Vec::new(),
82            respect_gitignore: true,
83            merkle: None,
84            skipped: Arc::new(AtomicUsize::new(0)),
85            window_size: DEFAULT_WINDOW_SIZE,
86            window_overlap: DEFAULT_WINDOW_OVERLAP,
87        }
88    }
89
90    /// Override the windowed-scan parameters. Production callers stick
91    /// with the defaults (64 MiB / 4 KiB); tests use this to exercise
92    /// the multi-window path on tiny fixtures. `window_size` must
93    /// strictly exceed `overlap` (the underlying slicer asserts this).
94    pub fn with_window_config(mut self, window_size: usize, overlap: usize) -> Self {
95        assert!(window_size > overlap, "window must exceed overlap");
96        self.window_size = window_size;
97        self.window_overlap = overlap;
98        self
99    }
100
101    /// Wire the source up to a merkle index so `(path, mtime, size)`
102    /// matches skip the file *before* it is read. The cache contents
103    /// themselves are loaded by the orchestrator (which also handles
104    /// detector-spec-hash invalidation) and shared via `Arc` so multiple
105    /// sources can consult one index.
106    pub fn with_merkle_skip(mut self, merkle: Arc<MerkleIndex>) -> Self {
107        self.merkle = Some(merkle);
108        self
109    }
110
111    /// Returns a counter that the source increments every time the
112    /// metadata fast-path skips a file. Cloned `Arc<AtomicUsize>`, safe
113    /// to read after the iterator drains.
114    pub fn skipped_counter(&self) -> Arc<AtomicUsize> {
115        self.skipped.clone()
116    }
117
118    /// Only include files whose paths match one of the given paths.
119    /// Paths are compared against the absolute path of each discovered file.
120    pub fn with_include_paths(mut self, paths: Vec<PathBuf>) -> Self {
121        self.include_paths = paths;
122        self
123    }
124
125    /// Override the maximum file size scanned from disk.
126    pub fn with_max_file_size(mut self, bytes: u64) -> Self {
127        self.max_file_size = bytes;
128        self
129    }
130
131    /// Add patterns to ignore during the walk.
132    pub fn with_ignore_paths(mut self, paths: Vec<String>) -> Self {
133        self.ignore_paths = paths;
134        self
135    }
136
137    /// Override whether the walk honors `.gitignore` / `.keyhogignore`.
138    /// `keyhog scan-system` flips this to `false` so a leaked key
139    /// stashed in `.gitignore` can't hide.
140    pub fn with_respect_gitignore(mut self, respect: bool) -> Self {
141        self.respect_gitignore = respect;
142        self
143    }
144}
145
/// File extensions to skip (binary, images, etc.).
///
/// NOTE: container formats that `chunks()` knows how to open must NOT
/// be listed here, because both the walker's extension filter and the
/// per-entry `SKIP_EXTENSIONS` check fire *before* the dispatch in
/// `chunks()`:
///   * gz / zst / lz4 / sz route to `extract_compressed_chunks`;
///   * zip / jar / apk / ipa / crx route to the openpack archive
///     reader. An earlier revision listed "zip" here, which made the
///     zip-extraction branch — and its zip-bomb guards — unreachable
///     dead code while the other zip-container extensions still worked.
const SKIP_EXTENSIONS: &[&str] = &[
    // Images
    "png", "jpg", "jpeg", "gif", "bmp", "ico", "cur", "icns", "webp", "svg",
    // Audio/Video
    "mp3", "mp4", "avi", "mov", "mkv", "flac", "wav", "ogg", "webm",
    // Archives with no in-process reader (tar containers, rar, 7z).
    // Secrets inside these are not recoverable on this path.
    "tar", "tgz", "bz2", "xz", "rar", "7z",
    // Native binaries
    "exe", "dll", "so", "dylib", "o", "a", "lib", "obj",
    // Compiled/bytecode
    "class", "wasm", "pyc", "pyo", "elc", "beam",
    // Documents (binary formats)
    "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
    // Fonts
    "ttf", "otf", "woff", "woff2", "eot",
    // Database files
    "db", "sqlite", "sqlite3",
    // Disk images / firmware
    "iso", "img", "bin", "rom",
    // Serialized data (not human-authored)
    "pickle", "npy", "npz", "onnx", "pb", "tflite", "pt", "safetensors",
];
230
/// Directory names pruned from the walk entirely: VCS internals,
/// dependency caches, build output, and bundled third-party UI assets.
const SKIP_DIRS: &[&str] = &[
    ".git", "node_modules", "target", "__pycache__", ".venv", "venv", ".tox",
    "dist", "build", ".next", ".nuxt", "vendor", "swagger-ui", "swagger",
];
248
249impl Source for FilesystemSource {
250    fn name(&self) -> &str {
251        "filesystem"
252    }
253
254    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
255        let max_size = self.max_file_size;
256        let mut config = walker_config(self.max_file_size, &self.ignore_paths);
257        if !self.respect_gitignore {
258            config = config.respect_gitignore(false);
259        }
260        // Use walk_iter (NOT walk()) so per-entry errors don't
261        // collapse the entire scan. `walk()` collects into a Vec
262        // via `.collect()` on a Result iterator — a single
263        // permission-denied (chmod 000 sub-tree, EACCES on a
264        // sibling) short-circuits the whole walk and the user
265        // gets ZERO findings. Production-grade behaviour is to
266        // log+skip the failed entry and keep walking everything
267        // else.
268        let walker = CodeWalker::new(&self.root, config);
269        let mut entries: Vec<codewalk::FileEntry> = walker
270            .walk_iter()
271            .filter_map(|result| match result {
272                Ok(entry) => Some(entry),
273                Err(error) => {
274                    tracing::warn!(
275                        %error,
276                        "skipping unreadable filesystem entry; scan continues"
277                    );
278                    None
279                }
280            })
281            .collect();
282
283        if !self.include_paths.is_empty() {
284            // Canonicalize both sides for consistent comparison
285            let allowed: HashSet<PathBuf> = self
286                .include_paths
287                .iter()
288                .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()))
289                .collect();
290            entries.retain(|e| {
291                let canonical = e.path.canonicalize().unwrap_or_else(|_| e.path.clone());
292                allowed.contains(&canonical)
293            });
294        }
295
296        let merkle = self.merkle.clone();
297        let skipped = self.skipped.clone();
298        let window_size = self.window_size;
299        let window_overlap = self.window_overlap;
300
301        Box::new(entries.into_iter().flat_map(move |entry| {
302            let path = entry.path;
303            let file_size = entry.size;
304
305            let ext = path
306                .extension()
307                .and_then(|e| e.to_str())
308                .unwrap_or("")
309                .to_lowercase();
310
311            if SKIP_EXTENSIONS.contains(&ext.as_str()) {
312                return vec![];
313            }
314
315            // Fast-path skip: stat the file once, ask the cache "have I
316            // seen this exact (path, mtime, size) tuple?" If yes, never
317            // open() or read() — the dominant cost on cold-cache disk.
318            // Stored alongside the chunk so the orchestrator can refresh
319            // the index entry post-scan without a second stat.
320            let live_mtime_ns = file_mtime_ns(&path);
321            if let (Some(idx), Some(mtime_ns)) = (merkle.as_ref(), live_mtime_ns) {
322                if idx.metadata_unchanged(&path, mtime_ns, file_size) {
323                    skipped.fetch_add(1, Ordering::Relaxed);
324                    return vec![];
325                }
326            }
327
328            if ext == "zip" || ext == "apk" || ext == "ipa" || ext == "crx" || ext == "jar" {
329                // Per-entry uncompressed-size cap to defeat zip-bomb DoS.
330                // openpack's central directory exposes uncompressed_size; skip
331                // any entry that exceeds max_size (per-file cap) and the total
332                // uncompressed budget.
333                let mut archive_chunks = Vec::new();
334                let mut total_uncompressed: u64 = 0;
335                let total_budget: u64 = max_size.saturating_mul(4); // 4x file cap budget for archives
336                if let Ok(pack) = openpack::OpenPack::open_default(&path) {
337                    if let Ok(entries) = pack.entries() {
338                        for archive_entry in entries {
339                            if archive_entry.is_dir || is_default_excluded(&archive_entry.name) {
340                                continue;
341                            }
342                            if archive_entry.uncompressed_size > max_size {
343                                tracing::warn!(
344                                    archive = %path.display(),
345                                    entry = %archive_entry.name,
346                                    size = archive_entry.uncompressed_size,
347                                    "skipping archive entry: uncompressed size exceeds per-file cap"
348                                );
349                                continue;
350                            }
351                            total_uncompressed = total_uncompressed
352                                .saturating_add(archive_entry.uncompressed_size);
353                            if total_uncompressed > total_budget {
354                                tracing::warn!(
355                                    archive = %path.display(),
356                                    "aborting archive extraction: total uncompressed size exceeds 4x file cap (zip-bomb guard)"
357                                );
358                                break;
359                            }
360                            if let Ok(content) = pack.read_entry(&archive_entry.name) {
361                                if let Ok(s) = String::from_utf8(content.clone()) {
362                                    archive_chunks.push(Ok(Chunk {
363                                        data: s.into(),
364                                        metadata: ChunkMetadata {
365                                            source_type: "filesystem/archive".into(),
366                                            path: Some(format!(
367                                                "{}//{}",
368                                                path.display(),
369                                                archive_entry.name
370                                            )),
371                                            ..Default::default()
372                                        },
373                                    }));
374                                } else {
375                                    let strings =
376                                        crate::strings::extract_printable_strings(&content, 8);
377                                    if !strings.is_empty() {
378                                        archive_chunks.push(Ok(Chunk {
379                                            data: keyhog_core::SensitiveString::join(&strings, "\n"),
380                                            metadata: ChunkMetadata {
381                                                source_type: "filesystem/archive-binary".into(),
382                                                path: Some(format!(
383                                                    "{}//{}",
384                                                    path.display(),
385                                                    archive_entry.name
386                                                )),
387                                                ..Default::default()
388                                            },
389                                        }));
390                                    }
391                                }
392                            }
393                        }
394                    }
395                }
396                return archive_chunks;
397            } else if ext == "gz" || ext == "zst" || ext == "lz4" || ext == "sz" {
398                return extract_compressed_chunks(&path, max_size);
399            }
400
401            let filename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
402            if is_default_excluded(filename) {
403                return vec![];
404            }
405            if filename.contains(".min.")
406                || filename.contains(".bundle.")
407                || filename.ends_with(".chunk.js")
408            {
409                return vec![];
410            }
411
412            if file_size > window_size as u64 {
413                // Fast path: mmap once and slice zero-copy into
414                // overlapping `window_size` views with `window_overlap`
415                // shared bytes between neighbours. Replaces a 64 MiB
416                // heap buffer + per-window `seek-back+re-read`
417                // round-trip with a single mmap + madvise(SEQUENTIAL).
418                if let Some(windows) =
419                    read::read_file_windowed_mmap(&path, window_size, window_overlap)
420                {
421                    return windows
422                        .into_iter()
423                        .map(|w| {
424                            Ok(Chunk {
425                                data: w.text.into(),
426                                metadata: ChunkMetadata {
427                                    source_type: "filesystem/windowed".to_string(),
428                                    path: Some(path.display().to_string()),
429                                    base_offset: w.offset,
430                                    mtime_ns: live_mtime_ns,
431                                    size_bytes: Some(file_size),
432                                    ..Default::default()
433                                },
434                            })
435                        })
436                        .collect();
437                }
438                // Buffered fallback: mmap refused (locked writer,
439                // unsupported filesystem). Same semantics as before —
440                // working buffer + seek-back overlap. Sized to the
441                // configured window so test overrides apply here too.
442                let mut window_chunks = Vec::new();
443                if let Ok(mut file) = std::fs::File::open(&path) {
444                    let mut current_offset = 0;
445                    let mut buffer = vec![0u8; window_size];
446                    while let Ok(n) = file.read(&mut buffer) {
447                        if n == 0 { break; }
448                        let data = String::from_utf8_lossy(&buffer[..n]).into_owned();
449                        window_chunks.push(Ok(Chunk {
450                            data: data.into(),
451                            metadata: ChunkMetadata {
452                                source_type: "filesystem/windowed".to_string(),
453                                path: Some(path.display().to_string()),
454                                base_offset: current_offset,
455                                mtime_ns: live_mtime_ns,
456                                size_bytes: Some(file_size),
457                                ..Default::default()
458                            },
459                        }));
460                        if n < window_size { break; }
461                        let _ = file.seek(SeekFrom::Current(-(window_overlap as i64)));
462                        current_offset += n - window_overlap;
463                    }
464                }
465                return window_chunks;
466            }
467            let file_text = if file_size >= MMAP_THRESHOLD {
468                read::read_file_mmap(&path)
469            } else {
470                read::read_file_buffered(&path)
471            };
472
473            let (content, source_type) = match file_text {
474                Some(text) if !text.is_empty() => (text.into(), "filesystem"),
475                _ => {
476                    if let Ok(bytes) = read::read_file_safe(&path) {
477                        let strings = crate::strings::extract_printable_strings(&bytes, 8);
478                        if strings.is_empty() {
479                            return vec![];
480                        }
481                        (keyhog_core::SensitiveString::join(&strings, "\n"), "filesystem:binary-strings")
482                    } else {
483                        return vec![];
484                    }
485                }
486            };
487
488            vec![Ok(Chunk {
489                data: content,
490                metadata: ChunkMetadata {
491                    source_type: source_type.to_string(),
492                    path: Some(path.display().to_string()),
493                    mtime_ns: live_mtime_ns,
494                    size_bytes: Some(file_size),
495                    ..Default::default()
496                },
497            })]
498        }))
499    }
500
501    fn as_any(&self) -> &dyn std::any::Any {
502        self
503    }
504}
505
506fn extract_compressed_chunks(path: &Path, max_size: u64) -> Vec<Result<Chunk, SourceError>> {
507    let ext = path
508        .extension()
509        .and_then(|e| e.to_str())
510        .unwrap_or("")
511        .to_lowercase();
512    let format = match ext.as_str() {
513        "gz" => ziftsieve::CompressionFormat::Gzip,
514        "zst" => ziftsieve::CompressionFormat::Zstd,
515        "lz4" => ziftsieve::CompressionFormat::Lz4,
516        _ => ziftsieve::CompressionFormat::Snappy,
517    };
518
519    // mmap the compressed file when possible — ziftsieve only takes a
520    // contiguous `&[u8]`, so a streaming decoder isn't on the menu, but
521    // mmap lets us hand it the whole file without a corresponding heap
522    // allocation. A 1 GiB `.zst` previously turned into a 1 GiB
523    // `Vec<u8>` before decompression even started; now it sits in the
524    // page cache backed by the file. Falls back to a buffered read
525    // when mmap is refused (locked writer, unsupported filesystem) so
526    // behaviour is identical to the prior implementation in that case.
527    //
528    // The per-source `max_size` doubles as the compressed-input cap:
529    // anything bigger is refused before mapping. The decompressed
530    // budget gate (4× max_size) still applies inside the loop below.
531    let file_bytes = match read::read_file_for_compressed_input(path, max_size) {
532        Some(b) => b,
533        None => return Vec::new(),
534    };
535    let bytes = file_bytes.as_slice();
536
537    // Decompression-bomb cap: a 4x compression-ratio multiplier on the
538    // per-file size budget bounds total expanded bytes. A 1 MB gzip bomb
539    // expanding to 4 GB hits this ceiling and aborts cleanly instead of
540    // OOMing. See audit release-2026-04-26 filesystem.rs:308-361.
541    let total_budget: usize = max_size.saturating_mul(4) as usize;
542
543    let mut chunks = Vec::new();
544
545    if let Ok(blocks) = ziftsieve::extract_from_bytes(format, bytes) {
546        let mut current_chunk_literals = String::new();
547        let mut total_decompressed: usize = 0;
548        for block in blocks {
549            if let Ok(s) = std::str::from_utf8(block.literals()) {
550                total_decompressed = total_decompressed.saturating_add(s.len());
551                if total_decompressed > total_budget {
552                    tracing::warn!(
553                        path = %path.display(),
554                        bytes = total_decompressed,
555                        cap = total_budget,
556                        "aborting compressed extraction: total decompressed size exceeds 4x file cap (gzip-bomb guard)"
557                    );
558                    break;
559                }
560                current_chunk_literals.push_str(s);
561                current_chunk_literals.push('\n');
562            }
563
564            if current_chunk_literals.len() > 8 * 1024 * 1024 {
565                chunks.push(Ok(Chunk {
566                    data: std::mem::take(&mut current_chunk_literals).into(),
567                    metadata: ChunkMetadata {
568                        source_type: "filesystem/compressed".into(),
569                        path: Some(path.display().to_string()),
570                        ..Default::default()
571                    },
572                }));
573            }
574        }
575        if !current_chunk_literals.is_empty() {
576            chunks.push(Ok(Chunk {
577                data: current_chunk_literals.into(),
578                metadata: ChunkMetadata {
579                    source_type: "filesystem/compressed".into(),
580                    path: Some(path.display().to_string()),
581                    ..Default::default()
582                },
583            }));
584        }
585    }
586    chunks
587}
588
/// Check if a path matches the built-in default exclusion patterns.
/// Mirrors the patterns in `crates/cli/src/sources.rs`.
///
/// All comparisons are ASCII case-insensitive and never allocate, and
/// segments are split on both `/` and `\` so Windows paths get the
/// same treatment as POSIX ones. (The previous flow lowercased a copy
/// of the whole path per file and only matched `/`-separated
/// directories, missing `\node_modules\` etc. on Windows checkouts.)
fn is_default_excluded(path: &str) -> bool {
    // Suffixes marking generated or editor-junk files.
    const SUFFIXES: &[&[u8]] = &[
        b".min.js", b".min.css", b".bak", b".swp", b".tmp", b".map", b".cache",
    ];
    // Directory names excluded wherever they appear in the path.
    const SKIP_SEGMENTS: &[&str] = &[
        "node_modules", ".git", "__pycache__", "vendor", "dist", "build", "out",
    ];
    // Exact filename matches (trailing component only).
    const FILENAMES: &[&str] = &[
        "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "cache.json",
        "cargo.lock", "go.sum", "gemfile.lock", "angular.json",
    ];

    let raw = path.as_bytes();
    if SUFFIXES.iter().any(|suffix| {
        raw.len() >= suffix.len()
            && raw[raw.len() - suffix.len()..].eq_ignore_ascii_case(suffix)
    }) {
        return true;
    }

    if path
        .split(['/', '\\'])
        .any(|seg| SKIP_SEGMENTS.iter().any(|skip| seg.eq_ignore_ascii_case(skip)))
    {
        return true;
    }

    // Trailing path component: last non-empty segment, falling back to
    // the whole string when no non-empty segment exists.
    let leaf = path
        .split(['/', '\\'])
        .filter(|seg| !seg.is_empty())
        .last()
        .unwrap_or(path);

    if FILENAMES.iter().any(|name| leaf.eq_ignore_ascii_case(name)) {
        return true;
    }

    // tsconfig*.json (tsconfig.json, tsconfig.base.json, ...).
    let leaf_bytes = leaf.as_bytes();
    let prefix = b"tsconfig";
    let suffix = b".json";
    leaf_bytes.len() >= prefix.len() + suffix.len()
        && leaf_bytes[..prefix.len()].eq_ignore_ascii_case(prefix)
        && leaf_bytes[leaf_bytes.len() - suffix.len()..].eq_ignore_ascii_case(suffix)
}
674
/// Stat `path` once and return its modified time as nanoseconds since
/// the UNIX epoch.
///
/// Returns `None` when the platform/filesystem exposes no usable
/// modified time (or it predates the epoch) — the cache fast-path then
/// simply never fires, which is strictly better than a false skip.
/// Far-future values that overflow u64 nanoseconds saturate to
/// `u64::MAX` so the numeric key stays stable; ~584 years from the
/// epoch fits comfortably, the real concern is filesystems returning
/// weird values.
fn file_mtime_ns(path: &Path) -> Option<u64> {
    let elapsed = std::fs::metadata(path)
        .and_then(|meta| meta.modified())
        .ok()?
        .duration_since(std::time::UNIX_EPOCH)
        .ok()?;
    Some(u64::try_from(elapsed.as_nanos()).unwrap_or(u64::MAX))
}
691
692fn walker_config(max_file_size: u64, ignore_paths: &[String]) -> WalkConfig {
693    let mut exclude_extensions = HashSet::new();
694    exclude_extensions.extend(SKIP_EXTENSIONS.iter().map(|ext| (*ext).to_string()));
695
696    let mut exclude_dirs = HashSet::new();
697    exclude_dirs.extend(SKIP_DIRS.iter().map(|dir| (*dir).to_string()));
698
699    let ignore_overrides = ignore_paths
700        .iter()
701        .map(|pattern| {
702            if pattern.starts_with('!') {
703                pattern.clone()
704            } else {
705                format!("!{pattern}")
706            }
707        })
708        .collect();
709
710    WalkConfig::default()
711        .max_file_size(max_file_size)
712        .follow_symlinks(false)
713        .respect_gitignore(true)
714        .skip_hidden(false)
715        .skip_binary(false)
716        .exclude_extensions(exclude_extensions)
717        .exclude_dirs(exclude_dirs)
718        .ignore_files(vec![".keyhogignore".to_string()])
719        .ignore_patterns(ignore_overrides)
720}