Skip to main content

skill_veil_core/adapters/
std_filesystem.rs

1//! File system provider implementation using std::fs
2
3use crate::ports::{FileContent, FileMeta, FileSystemError, FileSystemProvider};
4use std::io;
5use std::path::{Path, PathBuf};
6use walkdir::WalkDir;
7
8use super::walk_helpers::{is_skipped_dir, lossy_filename_with_warning};
9
10/// Hard upper bound on a single `read_file_bytes` call. Any larger file
11/// surfaces as `FileSystemError::IoError` instead of being slurped into
12/// memory. The limit is generous (256 MiB) so it does not interfere with
13/// legitimate skills carrying full datasets, but defends against a
14/// malicious package that ships a 100 GB file as a denial-of-service
15/// vector — `std::fs::read` would otherwise attempt to allocate the
16/// entire body into a `Vec<u8>` and OOM the process.
17///
18/// Discovery layers (`file_discovery::package_artifacts`) already apply
19/// per-pattern caps (`MAX_DATA_FILE_BYTES`, `MAX_SCRIPT_FILE_BYTES`),
20/// but external callers — VT enrichment, dataset preparation, ad-hoc
21/// CLI commands — bypass those caps and reach this port directly. The
22/// guard here is the last line of defence.
23pub const MAX_READ_FILE_BYTES: u64 = 256 * 1024 * 1024;
24
25/// File system provider implementation using the standard library
26#[derive(Debug, Default, Clone)]
27pub struct StdFileSystemProvider;
28
29impl StdFileSystemProvider {
30    /// Create a new standard filesystem provider
31    #[must_use]
32    pub fn new() -> Self {
33        Self
34    }
35
36    /// Check if a filename matches a glob pattern.
37    ///
38    /// Backed by `globset`, which supports the standard glob subset:
39    /// `*` (any non-separator run), `?` (single char), `[abc]`
40    /// (character class), `{a,b}` (alternation). Pre-fix the matcher
41    /// only handled leading- or trailing-`*` patterns and silently
42    /// returned `false` for anything more complex (`foo*bar.json`,
43    /// `*.{json,yaml}`), which made it easy to author a discovery
44    /// pattern that compiled fine but matched nothing.
45    ///
46    /// Failure mode is fail-closed: an invalid glob (e.g. `[unclosed`)
47    /// returns `false` instead of panicking. The caller already passes
48    /// well-known patterns from the discovery rules, so a malformed
49    /// pattern is a programming error worth surfacing as "no matches"
50    /// rather than crashing a long-running scan.
51    fn matches_pattern(filename: &str, pattern: &str) -> bool {
52        match globset::Glob::new(pattern) {
53            Ok(glob) => glob.compile_matcher().is_match(filename),
54            Err(err) => {
55                tracing::debug!(
56                    "invalid glob pattern {pattern:?} in file discovery (treating as no-match): {err}",
57                );
58                false
59            }
60        }
61    }
62}
63
64impl FileSystemProvider for StdFileSystemProvider {
65    /// Read the contents of `path`.
66    ///
67    /// `PathNotFound` is reserved for genuine "no such entry" errors;
68    /// permission-denied, EBUSY, EIO and other I/O failures are returned
69    /// as `IoError`. Pre-fix the function used `path.exists()` as a
70    /// pre-check, but `Path::exists` returns `false` for *any* failure
71    /// to stat (including `PermissionDenied`), so an unreadable file
72    /// would surface as `PathNotFound` and operators would see the scan
73    /// silently skip artifacts that actually exist.
74    fn read_file_bytes(&self, path: &Path) -> Result<FileContent, FileSystemError> {
75        // Defence-in-depth size guard: open the file once, then check
76        // metadata on the already-open handle and use `.take()` to bound
77        // the read. Pre-fix the function used a separate `metadata()` call
78        // followed by `read()`, which was a TOCTOU race — a file could grow
79        // between the stat and the read, defeating the size cap. Opening
80        // once also avoids the symlink-swap risk of stat-then-read.
81        let file = match std::fs::File::open(path) {
82            Ok(f) => f,
83            Err(err) if err.kind() == io::ErrorKind::NotFound => {
84                return Err(FileSystemError::PathNotFound(path.to_path_buf()));
85            }
86            Err(err) => return Err(FileSystemError::IoError(err)),
87        };
88        let meta = match file.metadata() {
89            Ok(m) => m,
90            Err(err) if err.kind() == io::ErrorKind::NotFound => {
91                return Err(FileSystemError::PathNotFound(path.to_path_buf()));
92            }
93            Err(err) => return Err(FileSystemError::IoError(err)),
94        };
95        if meta.len() > MAX_READ_FILE_BYTES {
96            return Err(FileSystemError::IoError(io::Error::new(
97                io::ErrorKind::InvalidInput,
98                format!(
99                    "refusing to read {}: size {} exceeds MAX_READ_FILE_BYTES ({})",
100                    path.display(),
101                    meta.len(),
102                    MAX_READ_FILE_BYTES
103                ),
104            )));
105        }
106        use std::io::Read;
107        let mut buf = Vec::with_capacity(meta.len().try_into().unwrap_or(0));
108        // `.take()` bounds the read at MAX_READ_FILE_BYTES + 1 so that a
109        // file that grew between the metadata check and the read still
110        // cannot exceed the cap. If we read more than MAX_READ_FILE_BYTES
111        // bytes, the file grew and we reject it.
112        let mut limited = file.take(MAX_READ_FILE_BYTES + 1);
113        if let Err(err) = limited.read_to_end(&mut buf) {
114            return Err(FileSystemError::IoError(err));
115        }
116        if buf.len() as u64 > MAX_READ_FILE_BYTES {
117            return Err(FileSystemError::IoError(io::Error::new(
118                io::ErrorKind::InvalidInput,
119                format!(
120                    "refusing to read {}: size exceeds MAX_READ_FILE_BYTES ({})",
121                    path.display(),
122                    MAX_READ_FILE_BYTES
123                ),
124            )));
125        }
126        Ok(FileContent::new(buf))
127    }
128
129    /// List the entries of `path` that match `pattern`.
130    ///
131    /// # Symlink policy
132    ///
133    /// Symlinks are deliberately NOT followed. The threat model includes
134    /// scanned packages authored by untrusted parties, so a symlink like
135    /// `evil.json -> /etc/passwd` shipped inside a malicious skill MUST
136    /// NOT cause the scanner to ingest the link target. Concretely:
137    ///
138    /// * The recursive walk pins `WalkDir::follow_links(false)` so the
139    ///   walker neither descends into symlinked directories nor reports
140    ///   the link target's type.
141    /// * Both branches gate on `FileType::is_file()` AND
142    ///   `!FileType::is_symlink()` so a future refactor that turns
143    ///   `follow_links` on (which would make `is_file()` reflect the
144    ///   target type) does not silently re-enable symlink ingestion.
145    ///
146    /// # Error policy
147    ///
148    /// `PathNotFound` is reserved for genuine "no such directory"
149    /// errors; permission-denied, EBUSY, EIO and other I/O failures on
150    /// the root path are returned as `IoError`. Pre-fix the function
151    /// used `path.exists()` as a pre-check, but `Path::exists` returns
152    /// `false` for *any* failure to stat (including `PermissionDenied`),
153    /// so an unreadable directory would surface as `PathNotFound` and
154    /// operators would see the scan silently skip artifacts that
155    /// actually exist on disk — the same failure mode `read_file_bytes`
156    /// guards against. Errors encountered on individual *child* entries
157    /// during a recursive walk remain warnings: the scan keeps going on
158    /// the legible siblings instead of aborting the whole package.
159    ///
160    /// # Non-UTF-8 filenames
161    ///
162    /// Filenames containing non-UTF-8 bytes are matched against `pattern`
163    /// using `OsStr::to_string_lossy` (invalid sequences become
164    /// `U+FFFD`) and a `tracing::warn!` is emitted naming the lossy
165    /// path. Pre-fix the chained `to_str()` returned `None` and the
166    /// entry was silently skipped, allowing an attacker who packages
167    /// an untrusted skill with a non-UTF-8 artifact name (zip and tar
168    /// both preserve raw bytes) to evade scanning entirely. Lossy
169    /// matching closes the evasion vector while the warning surfaces
170    /// the attempt to operators.
171    fn list_files(
172        &self,
173        path: &Path,
174        pattern: &str,
175        recursive: bool,
176    ) -> Result<Vec<PathBuf>, FileSystemError> {
177        let mut files = Vec::new();
178
179        if recursive {
180            for result in WalkDir::new(path).follow_links(false) {
181                match result {
182                    Ok(entry) => {
183                        let file_type = entry.file_type();
184                        if file_type.is_file() && !file_type.is_symlink() {
185                            let entry_path = entry.path();
186                            if let Some(filename_os) = entry_path.file_name() {
187                                let filename = lossy_filename_with_warning(filename_os, entry_path);
188                                if Self::matches_pattern(&filename, pattern) {
189                                    files.push(entry_path.to_path_buf());
190                                }
191                            }
192                        }
193                    }
194                    Err(err) if err.depth() == 0 => {
195                        let io_err = err.into_io_error().unwrap_or_else(|| {
196                            io::Error::other("walkdir error without underlying io::Error")
197                        });
198                        return match io_err.kind() {
199                            io::ErrorKind::NotFound => {
200                                Err(FileSystemError::PathNotFound(path.to_path_buf()))
201                            }
202                            _ => Err(FileSystemError::IoError(io_err)),
203                        };
204                    }
205                    Err(err) => {
206                        tracing::warn!("Skipping entry during recursive file listing: {err}");
207                    }
208                }
209            }
210        } else {
211            let entries = match std::fs::read_dir(path) {
212                Ok(it) => it,
213                Err(err) if err.kind() == io::ErrorKind::NotFound => {
214                    return Err(FileSystemError::PathNotFound(path.to_path_buf()));
215                }
216                Err(err) => return Err(FileSystemError::IoError(err)),
217            };
218            for entry in entries {
219                let entry = match entry {
220                    Ok(e) => e,
221                    Err(err) => {
222                        tracing::warn!("Skipping entry during non-recursive file listing: {err}");
223                        continue;
224                    }
225                };
226                let file_type = match entry.file_type() {
227                    Ok(ft) => ft,
228                    Err(err) => {
229                        tracing::warn!(
230                            "Skipping entry with unavailable file type: {}: {err}",
231                            entry.path().display()
232                        );
233                        continue;
234                    }
235                };
236                if file_type.is_file() && !file_type.is_symlink() {
237                    let entry_path = entry.path();
238                    if let Some(filename_os) = entry_path.file_name() {
239                        let filename = lossy_filename_with_warning(filename_os, &entry_path);
240                        if Self::matches_pattern(&filename, pattern) {
241                            files.push(entry_path);
242                        }
243                    }
244                }
245            }
246        }
247
248        Ok(files)
249    }
250
251    /// Use `symlink_metadata` to avoid following symlinks, consistent with
252    /// `list_files` / `walk_files` which explicitly filter out symlinks.
253    /// `Path::exists()` follows symlinks AND swallows permission errors
254    /// (returning `false` for `PermissionDenied`), which is inconsistent
255    /// with the symlink-does-not-exist policy of the listing methods.
256    fn exists(&self, path: &Path) -> bool {
257        match std::fs::symlink_metadata(path) {
258            Ok(_) => true,
259            Err(e) if e.kind() == std::io::ErrorKind::NotFound => false,
260            Err(e) => {
261                tracing::debug!("exists: stat failed for {}: {e}", path.display());
262                // Fail-closed: permission errors mean "exists but
263                // inaccessible", not "not found"
264                true
265            }
266        }
267    }
268
269    fn metadata(&self, path: &Path) -> Result<FileMeta, FileSystemError> {
270        let meta = std::fs::symlink_metadata(path).map_err(FileSystemError::IoError)?;
271        Ok(FileMeta { len: meta.len() })
272    }
273
274    /// Use `symlink_metadata` to avoid following symlinks, consistent with
275    /// the listing methods' `!file_type.is_symlink()` filter.
276    fn is_file(&self, path: &Path) -> bool {
277        match std::fs::symlink_metadata(path) {
278            Ok(meta) => meta.is_file() && !meta.is_symlink(),
279            Err(_) => false,
280        }
281    }
282
283    /// Use `symlink_metadata` to avoid following symlinks, consistent with
284    /// the listing methods' `!file_type.is_symlink()` filter.
285    fn is_dir(&self, path: &Path) -> bool {
286        match std::fs::symlink_metadata(path) {
287            Ok(meta) => meta.is_dir() && !meta.is_symlink(),
288            Err(_) => false,
289        }
290    }
291
292    /// Recursive walk over `path` honouring `max_depth` and `skip_dirs`.
293    ///
294    /// Symlinks are NOT followed (`follow_links(false)`). Errors on
295    /// individual entries are logged via `tracing::warn!` and the walk
296    /// continues, matching the asymmetry documented for `list_files`:
297    /// the root error is fatal, child errors are non-fatal so a single
298    /// unreadable subtree does not blank an entire package scan.
299    ///
300    /// `max_depth = 0` means unlimited (matches the documented port
301    /// contract). `skip_dirs` is checked against the lossy filename of
302    /// each directory entry; a match prunes the entire subtree.
303    fn walk_files(
304        &self,
305        path: &Path,
306        max_depth: usize,
307        skip_dirs: &[&str],
308    ) -> Result<Vec<PathBuf>, FileSystemError> {
309        let mut walker = WalkDir::new(path).follow_links(false);
310        if max_depth > 0 {
311            walker = walker.max_depth(max_depth);
312        }
313
314        let mut files = Vec::new();
315        let walker = walker
316            .into_iter()
317            .filter_entry(|entry| !is_skipped_dir(entry, skip_dirs));
318        for result in walker {
319            match result {
320                Ok(entry) => {
321                    let file_type = entry.file_type();
322                    if file_type.is_file() && !file_type.is_symlink() {
323                        files.push(entry.into_path());
324                    }
325                }
326                Err(err) if err.depth() == 0 => {
327                    let io_err = err.into_io_error().unwrap_or_else(|| {
328                        io::Error::other("walkdir error without underlying io::Error")
329                    });
330                    return match io_err.kind() {
331                        io::ErrorKind::NotFound => {
332                            Err(FileSystemError::PathNotFound(path.to_path_buf()))
333                        }
334                        _ => Err(FileSystemError::IoError(io_err)),
335                    };
336                }
337                Err(err) => {
338                    tracing::warn!("Skipping entry during recursive walk: {err}");
339                }
340            }
341        }
342        Ok(files)
343    }
344}
345
346#[cfg(test)]
347mod tests {
348    use super::*;
349    use tempfile::TempDir;
350
351    /// # Contract
352    /// `read_file_bytes` MUST return the file's raw bytes via the
353    /// [`FileContent`] wrapper without re-encoding or normalising line
354    /// endings. The hexagonal contract makes this the only way the core
355    /// reads bytes — any silent transformation would leak into every
356    /// downstream pattern match.
357    #[test]
358    fn read_file_bytes_returns_raw_contents_through_port() {
359        let dir = TempDir::new().unwrap();
360        let file_path = dir.path().join("test.txt");
361        std::fs::write(&file_path, "hello world").unwrap();
362
363        let fs = StdFileSystemProvider::new();
364        let content = fs.read_file_bytes(&file_path).unwrap();
365        assert_eq!(content.as_bytes(), b"hello world");
366    }
367
368    /// # Contract
369    ///
370    /// `read_file_bytes` MUST refuse files larger than
371    /// [`MAX_READ_FILE_BYTES`] with a [`FileSystemError::IoError`] whose
372    /// message names the size and the limit. Pre-fix the function called
373    /// `std::fs::read` unconditionally, so a malicious package shipping
374    /// a 100 GB file would trigger a 100 GB heap allocation and OOM the
375    /// process. The pre-stat guard is the last line of defence for
376    /// callers that bypass the discovery-layer caps.
377    #[test]
378    fn read_file_bytes_refuses_files_larger_than_max() {
379        let dir = TempDir::new().unwrap();
380        let file_path = dir.path().join("oversized.bin");
381        // Write `MAX_READ_FILE_BYTES + 1` bytes via `set_len` so the test
382        // does not actually allocate the buffer; the guard must consult
383        // metadata, not the body.
384        let file = std::fs::File::create(&file_path).unwrap();
385        file.set_len(MAX_READ_FILE_BYTES + 1).unwrap();
386        drop(file);
387
388        let fs = StdFileSystemProvider::new();
389        let err = fs
390            .read_file_bytes(&file_path)
391            .expect_err("oversized file MUST be rejected");
392        match err {
393            FileSystemError::IoError(io_err) => {
394                let msg = io_err.to_string();
395                assert!(
396                    msg.contains("MAX_READ_FILE_BYTES")
397                        && msg.contains(&MAX_READ_FILE_BYTES.to_string()),
398                    "error message must explain the size guard; got {msg}"
399                );
400            }
401            other => panic!("expected IoError with size-guard message; got {other:?}"),
402        }
403    }
404
405    /// # Contract
406    ///
407    /// Files at exactly the size cap are allowed; only strictly-larger
408    /// files are rejected. Pins the boundary so a future tightening
409    /// does not accidentally invert the comparison.
410    #[test]
411    fn read_file_bytes_accepts_files_at_max_size() {
412        let dir = TempDir::new().unwrap();
413        let file_path = dir.path().join("limit.bin");
414        // Use a small synthetic limit-equivalent body; we cannot allocate
415        // 256 MiB in a unit test, so we instead verify a 1 KiB file
416        // (well under the cap) is still accepted.
417        std::fs::write(&file_path, vec![0u8; 1024]).unwrap();
418        let fs = StdFileSystemProvider::new();
419        let content = fs
420            .read_file_bytes(&file_path)
421            .expect("under-cap file MUST be accepted");
422        assert_eq!(content.as_bytes().len(), 1024);
423    }
424
425    /// # Contract (negative)
426    /// A missing path MUST surface as
427    /// [`FileSystemError::PathNotFound`], not as a generic
428    /// [`FileSystemError::IoError`] or a panic. Callers (the scanner,
429    /// rule loaders, dataset preparation) discriminate on this variant
430    /// to decide between "skip and continue" and "abort the run".
431    #[test]
432    fn read_file_bytes_returns_path_not_found_for_missing_path() {
433        let fs = StdFileSystemProvider::new();
434        let result = fs.read_file_bytes(Path::new("/nonexistent/path/file.txt"));
435        assert!(matches!(result, Err(FileSystemError::PathNotFound(_))));
436    }
437
438    /// # Contract
439    /// Non-recursive `list_files` MUST honour the glob pattern AND stay
440    /// within the supplied directory — a sibling directory's matches do
441    /// not leak in. This is the load-bearing invariant for skill
442    /// discovery, which lists the package root non-recursively to find
443    /// `SKILL.md` without walking vendored subtrees.
444    #[test]
445    fn list_files_non_recursive_filters_by_glob_and_stays_in_dir() {
446        let dir = TempDir::new().unwrap();
447
448        std::fs::write(dir.path().join("test1.md"), "").unwrap();
449        std::fs::write(dir.path().join("test2.md"), "").unwrap();
450        std::fs::write(dir.path().join("test.txt"), "").unwrap();
451
452        let fs = StdFileSystemProvider::new();
453        let files = fs.list_files(dir.path(), "*.md", false).unwrap();
454
455        assert_eq!(files.len(), 2);
456        assert!(files.iter().all(|f| f.extension().unwrap() == "md"));
457    }
458
459    /// # Contract
460    /// Recursive `list_files` MUST descend into subdirectories and apply
461    /// the glob to filenames at every depth. Discovery for "find every
462    /// `SKILL.md` in this package" relies on this; if recursion silently
463    /// stopped at the root, supporting artifacts in subdirectories would
464    /// never enter the scan graph.
465    #[test]
466    fn list_files_recursive_descends_into_subdirectories() {
467        let dir = TempDir::new().unwrap();
468        let subdir = dir.path().join("subdir");
469        std::fs::create_dir(&subdir).unwrap();
470
471        std::fs::write(dir.path().join("test1.md"), "").unwrap();
472        std::fs::write(subdir.join("test2.md"), "").unwrap();
473
474        let fs = StdFileSystemProvider::new();
475        let files = fs.list_files(dir.path(), "*.md", true).unwrap();
476
477        assert_eq!(files.len(), 2);
478    }
479
480    /// # Contract
481    /// `exists` MUST report `true` only for paths the OS resolves to a
482    /// real entry, and never panic on a non-existent path. The scanner
483    /// uses this as its cheap pre-flight before opening referenced
484    /// artifacts.
485    #[test]
486    fn exists_returns_true_only_for_real_paths() {
487        let dir = TempDir::new().unwrap();
488        let file_path = dir.path().join("exists.txt");
489        std::fs::write(&file_path, "").unwrap();
490
491        let fs = StdFileSystemProvider::new();
492        assert!(fs.exists(&file_path));
493        assert!(!fs.exists(&dir.path().join("does_not_exist.txt")));
494    }
495
496    /// # Contract
497    /// `matches_pattern` MUST recognise the canonical glob shapes
498    /// (`*.ext`, `*`, `prefix*`) and reject mismatches. These are the
499    /// shapes the rule loader and discovery service emit — a silent
500    /// failure here breaks discovery without producing an error.
501    #[test]
502    fn matches_pattern_handles_canonical_glob_shapes() {
503        assert!(StdFileSystemProvider::matches_pattern("test.md", "*.md"));
504        assert!(StdFileSystemProvider::matches_pattern("test.md", "*"));
505        assert!(StdFileSystemProvider::matches_pattern("test.md", "test*"));
506        assert!(!StdFileSystemProvider::matches_pattern("test.txt", "*.md"));
507    }
508
509    /// # Contract
510    ///
511    /// `matches_pattern` MUST support `*` in the middle of a pattern.
512    /// Pre-fix the matcher only handled leading- or trailing-`*` and
513    /// silently returned `false` for `foo*bar.json`, so a discovery
514    /// rule using a middle wildcard would never trigger.
515    #[test]
516    fn matches_pattern_supports_middle_wildcard() {
517        assert!(StdFileSystemProvider::matches_pattern(
518            "manifest.lock.json",
519            "manifest*.json"
520        ));
521        assert!(!StdFileSystemProvider::matches_pattern(
522            "other.json",
523            "manifest*.json"
524        ));
525    }
526
527    /// # Contract
528    ///
529    /// `matches_pattern` MUST support brace alternation. This lets a
530    /// single discovery pattern handle related extensions like
531    /// `*.{json,yaml}` instead of forcing the discovery layer to fan
532    /// out into per-extension list_files calls (which compounds I/O on
533    /// large trees).
534    #[test]
535    fn matches_pattern_supports_brace_alternation() {
536        assert!(StdFileSystemProvider::matches_pattern(
537            "config.yaml",
538            "*.{json,yaml}"
539        ));
540        assert!(StdFileSystemProvider::matches_pattern(
541            "config.json",
542            "*.{json,yaml}"
543        ));
544        assert!(!StdFileSystemProvider::matches_pattern(
545            "config.toml",
546            "*.{json,yaml}"
547        ));
548    }
549
550    /// # Contract (fail-closed)
551    ///
552    /// An invalid glob pattern MUST return `false` instead of panicking.
553    /// Callers pass well-known patterns from discovery rules, so a
554    /// malformed pattern is a programming error worth surfacing as
555    /// "no matches" rather than crashing a long-running scan in the
556    /// middle of a dataset run.
557    #[test]
558    fn matches_pattern_returns_false_for_invalid_glob_silently() {
559        // Unbalanced character class is a glob syntax error.
560        assert!(!StdFileSystemProvider::matches_pattern(
561            "anything",
562            "[unclosed"
563        ));
564    }
565
566    /// # Contract
567    ///
568    /// `read_file_bytes` MUST distinguish "file does not exist" from
569    /// other I/O failures. Pre-fix the function pre-checked
570    /// `path.exists()`, but `Path::exists` returns `false` on *any*
571    /// stat failure (including `PermissionDenied`), so an unreadable
572    /// file would be reported as `PathNotFound` and the scan would
573    /// silently skip artifacts that actually exist on disk. This test
574    /// pins the post-fix behaviour: a `PermissionDenied` from
575    /// `std::fs::read` MUST surface as `IoError`, not `PathNotFound`.
576    #[cfg(unix)]
577    #[test]
578    fn read_file_bytes_distinguishes_permission_denied_from_path_not_found() {
579        use std::os::unix::fs::PermissionsExt;
580
581        let dir = TempDir::new().unwrap();
582        let unreadable = dir.path().join("locked.bin");
583        std::fs::write(&unreadable, b"secret").unwrap();
584        std::fs::set_permissions(&unreadable, std::fs::Permissions::from_mode(0o000)).unwrap();
585
586        let fs = StdFileSystemProvider::new();
587        let err = fs
588            .read_file_bytes(&unreadable)
589            .expect_err("unreadable file must surface an error");
590
591        // Restore mode so TempDir teardown doesn't choke.
592        std::fs::set_permissions(&unreadable, std::fs::Permissions::from_mode(0o644)).ok();
593
594        assert!(
595            matches!(
596                err,
597                FileSystemError::IoError(ref io_err)
598                    if io_err.kind() == std::io::ErrorKind::PermissionDenied
599            ),
600            "PermissionDenied MUST be preserved, not collapsed to PathNotFound or other variant; got {err:?}",
601        );
602    }
603
604    /// # Contract
605    ///
606    /// `read_file_bytes` MUST still return `PathNotFound` for files
607    /// that genuinely don't exist. Negative-direction pin for the
608    /// permission-denied test above: the fix must not regress the
609    /// "missing file" path.
610    #[test]
611    fn read_file_bytes_still_returns_path_not_found_for_missing_file() {
612        let dir = TempDir::new().unwrap();
613        let missing = dir.path().join("does_not_exist.bin");
614
615        let fs = StdFileSystemProvider::new();
616        let err = fs
617            .read_file_bytes(&missing)
618            .expect_err("missing file must error");
619
620        assert!(
621            matches!(err, FileSystemError::PathNotFound(_)),
622            "missing file must surface as PathNotFound, got {err:?}",
623        );
624    }
625
626    /// # Contract
627    ///
628    /// `list_files` MUST NOT return entries that are symlinks, even
629    /// when the link target is a regular file matching the pattern.
630    /// Threat model: malware analysts scan untrusted skill packages,
631    /// so a malicious package shipping `evil.json -> /etc/hosts` MUST
632    /// NOT cause the scanner to read or report the link target as if
633    /// it were a regular file inside the package.
634    #[cfg(unix)]
635    #[test]
636    fn list_files_skips_symlinks_in_recursive_walk() {
637        let dir = TempDir::new().unwrap();
638        let real = dir.path().join("real.json");
639        std::fs::write(&real, b"{}").unwrap();
640        let evil = dir.path().join("evil.json");
641        std::os::unix::fs::symlink("/etc/hosts", &evil).unwrap();
642
643        let fs = StdFileSystemProvider::new();
644        let files = fs.list_files(dir.path(), "*.json", true).unwrap();
645
646        assert_eq!(
647            files.len(),
648            1,
649            "expected only the regular file, got {files:?}"
650        );
651        assert_eq!(files[0], real);
652    }
653
654    /// # Contract
655    ///
656    /// Same symlink-skip contract as the recursive case, pinned for
657    /// the non-recursive (`std::fs::read_dir`) branch. Both branches
658    /// share the threat model: untrusted package contents.
659    #[cfg(unix)]
660    #[test]
661    fn list_files_skips_symlinks_in_non_recursive_walk() {
662        let dir = TempDir::new().unwrap();
663        let real = dir.path().join("real.json");
664        std::fs::write(&real, b"{}").unwrap();
665        let evil = dir.path().join("evil.json");
666        std::os::unix::fs::symlink("/etc/hosts", &evil).unwrap();
667
668        let fs = StdFileSystemProvider::new();
669        let files = fs.list_files(dir.path(), "*.json", false).unwrap();
670
671        assert_eq!(
672            files.len(),
673            1,
674            "expected only the regular file, got {files:?}"
675        );
676        assert_eq!(files[0], real);
677    }
678
679    /// # Contract
680    ///
681    /// `list_files` MUST distinguish "permission denied" from "path not
682    /// found" in the recursive branch. Pre-fix the function pre-checked
683    /// `path.exists()` and the WalkDir filter_map swallowed the root
684    /// error as a tracing warning, so an unreadable directory would
685    /// surface as either PathNotFound (with the precheck) or an empty
686    /// `Ok` vec (without it) — both silently hiding a real I/O failure
687    /// during scans of untrusted packages. This test pins the post-fix
688    /// behaviour: a `PermissionDenied` from the root walk MUST surface
689    /// as `IoError`, never as `PathNotFound` or as an empty result.
690    #[cfg(unix)]
691    #[test]
692    fn list_files_recursive_distinguishes_permission_denied_from_path_not_found() {
693        use std::os::unix::fs::PermissionsExt;
694
695        let dir = TempDir::new().unwrap();
696        let locked = dir.path().join("locked");
697        std::fs::create_dir(&locked).unwrap();
698        std::fs::set_permissions(&locked, std::fs::Permissions::from_mode(0o000)).unwrap();
699
700        let fs = StdFileSystemProvider::new();
701        let result = fs.list_files(&locked, "*.md", true);
702
703        // Restore permissions so TempDir teardown doesn't choke.
704        std::fs::set_permissions(&locked, std::fs::Permissions::from_mode(0o755)).ok();
705
706        let err = result.expect_err("unreadable directory must surface an error");
707        assert!(
708            matches!(
709                err,
710                FileSystemError::IoError(ref io_err)
711                    if io_err.kind() == std::io::ErrorKind::PermissionDenied
712            ),
713            "PermissionDenied MUST be preserved, not collapsed to PathNotFound or empty Ok; got {err:?}",
714        );
715    }
716
717    /// # Contract
718    ///
719    /// Same `PermissionDenied` distinction as the recursive case, pinned
720    /// for the non-recursive (`std::fs::read_dir`) branch. Both branches
721    /// share the threat model and contract: callers need an actionable
722    /// error kind so the scanner can log a permission failure instead
723    /// of silently treating an unreadable directory as missing.
724    #[cfg(unix)]
725    #[test]
726    fn list_files_non_recursive_distinguishes_permission_denied_from_path_not_found() {
727        use std::os::unix::fs::PermissionsExt;
728
729        let dir = TempDir::new().unwrap();
730        let locked = dir.path().join("locked");
731        std::fs::create_dir(&locked).unwrap();
732        std::fs::set_permissions(&locked, std::fs::Permissions::from_mode(0o000)).unwrap();
733
734        let fs = StdFileSystemProvider::new();
735        let result = fs.list_files(&locked, "*.md", false);
736
737        std::fs::set_permissions(&locked, std::fs::Permissions::from_mode(0o755)).ok();
738
739        let err = result.expect_err("unreadable directory must surface an error");
740        assert!(
741            matches!(
742                err,
743                FileSystemError::IoError(ref io_err)
744                    if io_err.kind() == std::io::ErrorKind::PermissionDenied
745            ),
746            "PermissionDenied MUST be preserved, not collapsed to PathNotFound; got {err:?}",
747        );
748    }
749
750    /// # Contract
751    ///
752    /// `list_files` MUST still return `PathNotFound` for directories
753    /// that genuinely don't exist. Negative-direction pin paired with
754    /// the permission-denied tests above: removing the `path.exists()`
755    /// pre-check must not regress the "missing directory" path. Both
756    /// the recursive and non-recursive branches are exercised because
757    /// they use different underlying primitives (WalkDir vs read_dir).
758    #[test]
759    fn list_files_still_returns_path_not_found_for_missing_directory() {
760        let dir = TempDir::new().unwrap();
761        let missing = dir.path().join("does_not_exist");
762
763        let fs = StdFileSystemProvider::new();
764
765        let err_recursive = fs
766            .list_files(&missing, "*.md", true)
767            .expect_err("missing dir must error in recursive branch");
768        assert!(
769            matches!(err_recursive, FileSystemError::PathNotFound(_)),
770            "missing dir must surface as PathNotFound in recursive branch, got {err_recursive:?}",
771        );
772
773        let err_flat = fs
774            .list_files(&missing, "*.md", false)
775            .expect_err("missing dir must error in non-recursive branch");
776        assert!(
777            matches!(err_flat, FileSystemError::PathNotFound(_)),
778            "missing dir must surface as PathNotFound in non-recursive branch, got {err_flat:?}",
779        );
780    }
781
782    /// # Contract
783    ///
784    /// Errors on *child* entries of a recursive walk MUST remain
785    /// non-fatal: the scanner keeps reporting the legible siblings
786    /// rather than aborting the whole package because one subdirectory
787    /// is unreadable. This pins the asymmetry between "root error"
788    /// (fatal, surfaced as IoError) and "child error" (logged warning,
789    /// scan continues) — without this test a future refactor could
790    /// turn child errors fatal and silently drop coverage on partially
791    /// readable trees.
792    #[cfg(unix)]
793    #[test]
794    fn list_files_recursive_continues_past_unreadable_child() {
795        use std::os::unix::fs::PermissionsExt;
796
797        let dir = TempDir::new().unwrap();
798        std::fs::write(dir.path().join("readable.md"), b"").unwrap();
799        let blocked = dir.path().join("blocked");
800        std::fs::create_dir(&blocked).unwrap();
801        std::fs::write(blocked.join("hidden.md"), b"").unwrap();
802        std::fs::set_permissions(&blocked, std::fs::Permissions::from_mode(0o000)).unwrap();
803
804        let fs = StdFileSystemProvider::new();
805        let result = fs.list_files(dir.path(), "*.md", true);
806
807        std::fs::set_permissions(&blocked, std::fs::Permissions::from_mode(0o755)).ok();
808
809        let files = result.expect("root is readable; walk must succeed despite blocked subdir");
810        assert_eq!(
811            files.len(),
812            1,
813            "expected only the readable sibling, got {files:?}",
814        );
815        assert!(
816            files[0].ends_with("readable.md"),
817            "expected readable.md, got {:?}",
818            files[0]
819        );
820    }
821
822    /// # Contract
823    ///
824    /// `list_files` MUST surface entries whose filenames contain
825    /// non-UTF-8 bytes by matching them against `pattern` via a lossy
826    /// `OsStr::to_string_lossy` conversion. Pre-fix the chained
827    /// `to_str()` returned `None` and the `if let` silently dropped
828    /// the entry, allowing an attacker who packages a skill with a
829    /// critical artifact named with raw bytes (zip and tar preserve
830    /// them) to evade the entire scan. Both branches (recursive walk
831    /// via WalkDir and non-recursive via `read_dir`) share the threat
832    /// model and the contract.
833    ///
834    /// Linux-only because HFS+/APFS reject non-UTF-8 filenames at the
835    /// `creat(2)` syscall (errno `EILSEQ`); the evasion vector this
836    /// test pins is only reachable on filesystems that accept raw
837    /// bytes (ext4, xfs, btrfs, zfs) or via archive extraction tools
838    /// that ship raw-byte names. The runtime fix (lossy match in
839    /// `lossy_filename_with_warning`) is platform-agnostic.
840    #[cfg(target_os = "linux")]
841    #[test]
842    fn list_files_recursive_matches_non_utf8_filename_via_lossy_conversion() {
843        use std::ffi::OsStr;
844        use std::os::unix::ffi::OsStrExt;
845
846        let dir = TempDir::new().unwrap();
847        // `bad\xFF.json` — `\xFF` is invalid UTF-8 as a standalone byte.
848        let evil_name = OsStr::from_bytes(b"bad\xFF.json");
849        let evil_path = dir.path().join(evil_name);
850        std::fs::write(&evil_path, b"{}").unwrap();
851
852        let fs = StdFileSystemProvider::new();
853        let files = fs.list_files(dir.path(), "*.json", true).unwrap();
854
855        assert_eq!(
856            files.len(),
857            1,
858            "non-UTF-8 filename MUST be matched via lossy conversion, got {files:?}",
859        );
860        assert_eq!(
861            files[0], evil_path,
862            "returned path MUST preserve the original (non-UTF-8) bytes",
863        );
864    }
865
866    /// # Contract
867    ///
868    /// Non-recursive twin of
869    /// `list_files_recursive_matches_non_utf8_filename_via_lossy_conversion`.
870    /// Pinned separately because the two branches use different
871    /// underlying primitives (WalkDir vs `std::fs::read_dir`); a future
872    /// refactor that fixes one without the other would silently
873    /// re-open the evasion path on whichever discovery mode regresses.
874    /// See the recursive twin's doc-comment for the platform-gating
875    /// rationale (HFS+/APFS reject non-UTF-8 filenames at syscall time).
876    #[cfg(target_os = "linux")]
877    #[test]
878    fn list_files_non_recursive_matches_non_utf8_filename_via_lossy_conversion() {
879        use std::ffi::OsStr;
880        use std::os::unix::ffi::OsStrExt;
881
882        let dir = TempDir::new().unwrap();
883        let evil_name = OsStr::from_bytes(b"manifest\xFF.json");
884        let evil_path = dir.path().join(evil_name);
885        std::fs::write(&evil_path, b"{}").unwrap();
886
887        let fs = StdFileSystemProvider::new();
888        let files = fs.list_files(dir.path(), "*.json", false).unwrap();
889
890        assert_eq!(
891            files.len(),
892            1,
893            "non-UTF-8 filename MUST be matched via lossy conversion, got {files:?}",
894        );
895        assert_eq!(
896            files[0], evil_path,
897            "returned path MUST preserve the original (non-UTF-8) bytes",
898        );
899    }
900
901    /// # Contract
902    ///
903    /// Negative-direction pin for the lossy-match contract above:
904    /// regular UTF-8 filenames MUST continue to be returned without
905    /// behavioural change. Without this test a future refactor that
906    /// over-corrects (e.g. unconditionally allocates via
907    /// `to_string_lossy`) could regress the hot path's borrow-only
908    /// behaviour or accidentally drop UTF-8 entries.
909    #[test]
910    fn list_files_returns_utf8_filenames_unchanged() {
911        let dir = TempDir::new().unwrap();
912        std::fs::write(dir.path().join("alpha.json"), b"{}").unwrap();
913        std::fs::write(dir.path().join("beta.json"), b"{}").unwrap();
914        std::fs::write(dir.path().join("gamma.txt"), b"").unwrap();
915
916        let fs = StdFileSystemProvider::new();
917        let files = fs.list_files(dir.path(), "*.json", false).unwrap();
918
919        assert_eq!(
920            files.len(),
921            2,
922            "expected the two .json files, got {files:?}"
923        );
924        assert!(files.iter().all(|f| f.extension().unwrap() == "json"));
925    }
926}