skill_veil_core/adapters/std_filesystem.rs
1//! File system provider implementation using std::fs
2
3use crate::ports::{FileContent, FileMeta, FileSystemError, FileSystemProvider};
4use std::io;
5use std::path::{Path, PathBuf};
6use walkdir::WalkDir;
7
8use super::walk_helpers::{is_skipped_dir, lossy_filename_with_warning};
9
10/// Hard upper bound on a single `read_file_bytes` call. Any larger file
11/// surfaces as `FileSystemError::IoError` instead of being slurped into
12/// memory. The limit is generous (256 MiB) so it does not interfere with
13/// legitimate skills carrying full datasets, but defends against a
14/// malicious package that ships a 100 GB file as a denial-of-service
15/// vector — `std::fs::read` would otherwise attempt to allocate the
16/// entire body into a `Vec<u8>` and OOM the process.
17///
18/// Discovery layers (`file_discovery::package_artifacts`) already apply
19/// per-pattern caps (`MAX_DATA_FILE_BYTES`, `MAX_SCRIPT_FILE_BYTES`),
20/// but external callers — VT enrichment, dataset preparation, ad-hoc
21/// CLI commands — bypass those caps and reach this port directly. The
22/// guard here is the last line of defence.
23pub const MAX_READ_FILE_BYTES: u64 = 256 * 1024 * 1024;
24
25/// File system provider implementation using the standard library
26#[derive(Debug, Default, Clone)]
27pub struct StdFileSystemProvider;
28
29impl StdFileSystemProvider {
30 /// Create a new standard filesystem provider
31 #[must_use]
32 pub fn new() -> Self {
33 Self
34 }
35
36 /// Check if a filename matches a glob pattern.
37 ///
38 /// Backed by `globset`, which supports the standard glob subset:
39 /// `*` (any non-separator run), `?` (single char), `[abc]`
40 /// (character class), `{a,b}` (alternation). Pre-fix the matcher
41 /// only handled leading- or trailing-`*` patterns and silently
42 /// returned `false` for anything more complex (`foo*bar.json`,
43 /// `*.{json,yaml}`), which made it easy to author a discovery
44 /// pattern that compiled fine but matched nothing.
45 ///
46 /// Failure mode is fail-closed: an invalid glob (e.g. `[unclosed`)
47 /// returns `false` instead of panicking. The caller already passes
48 /// well-known patterns from the discovery rules, so a malformed
49 /// pattern is a programming error worth surfacing as "no matches"
50 /// rather than crashing a long-running scan.
51 fn matches_pattern(filename: &str, pattern: &str) -> bool {
52 match globset::Glob::new(pattern) {
53 Ok(glob) => glob.compile_matcher().is_match(filename),
54 Err(err) => {
55 tracing::debug!(
56 "invalid glob pattern {pattern:?} in file discovery (treating as no-match): {err}",
57 );
58 false
59 }
60 }
61 }
62}
63
64impl FileSystemProvider for StdFileSystemProvider {
65 /// Read the contents of `path`.
66 ///
67 /// `PathNotFound` is reserved for genuine "no such entry" errors;
68 /// permission-denied, EBUSY, EIO and other I/O failures are returned
69 /// as `IoError`. Pre-fix the function used `path.exists()` as a
70 /// pre-check, but `Path::exists` returns `false` for *any* failure
71 /// to stat (including `PermissionDenied`), so an unreadable file
72 /// would surface as `PathNotFound` and operators would see the scan
73 /// silently skip artifacts that actually exist.
74 fn read_file_bytes(&self, path: &Path) -> Result<FileContent, FileSystemError> {
75 // Defence-in-depth size guard: open the file once, then check
76 // metadata on the already-open handle and use `.take()` to bound
77 // the read. Pre-fix the function used a separate `metadata()` call
78 // followed by `read()`, which was a TOCTOU race — a file could grow
79 // between the stat and the read, defeating the size cap. Opening
80 // once also avoids the symlink-swap risk of stat-then-read.
81 let file = match std::fs::File::open(path) {
82 Ok(f) => f,
83 Err(err) if err.kind() == io::ErrorKind::NotFound => {
84 return Err(FileSystemError::PathNotFound(path.to_path_buf()));
85 }
86 Err(err) => return Err(FileSystemError::IoError(err)),
87 };
88 let meta = match file.metadata() {
89 Ok(m) => m,
90 Err(err) if err.kind() == io::ErrorKind::NotFound => {
91 return Err(FileSystemError::PathNotFound(path.to_path_buf()));
92 }
93 Err(err) => return Err(FileSystemError::IoError(err)),
94 };
95 if meta.len() > MAX_READ_FILE_BYTES {
96 return Err(FileSystemError::IoError(io::Error::new(
97 io::ErrorKind::InvalidInput,
98 format!(
99 "refusing to read {}: size {} exceeds MAX_READ_FILE_BYTES ({})",
100 path.display(),
101 meta.len(),
102 MAX_READ_FILE_BYTES
103 ),
104 )));
105 }
106 use std::io::Read;
107 let mut buf = Vec::with_capacity(meta.len().try_into().unwrap_or(0));
108 // `.take()` bounds the read at MAX_READ_FILE_BYTES + 1 so that a
109 // file that grew between the metadata check and the read still
110 // cannot exceed the cap. If we read more than MAX_READ_FILE_BYTES
111 // bytes, the file grew and we reject it.
112 let mut limited = file.take(MAX_READ_FILE_BYTES + 1);
113 if let Err(err) = limited.read_to_end(&mut buf) {
114 return Err(FileSystemError::IoError(err));
115 }
116 if buf.len() as u64 > MAX_READ_FILE_BYTES {
117 return Err(FileSystemError::IoError(io::Error::new(
118 io::ErrorKind::InvalidInput,
119 format!(
120 "refusing to read {}: size exceeds MAX_READ_FILE_BYTES ({})",
121 path.display(),
122 MAX_READ_FILE_BYTES
123 ),
124 )));
125 }
126 Ok(FileContent::new(buf))
127 }
128
129 /// List the entries of `path` that match `pattern`.
130 ///
131 /// # Symlink policy
132 ///
133 /// Symlinks are deliberately NOT followed. The threat model includes
134 /// scanned packages authored by untrusted parties, so a symlink like
135 /// `evil.json -> /etc/passwd` shipped inside a malicious skill MUST
136 /// NOT cause the scanner to ingest the link target. Concretely:
137 ///
138 /// * The recursive walk pins `WalkDir::follow_links(false)` so the
139 /// walker neither descends into symlinked directories nor reports
140 /// the link target's type.
141 /// * Both branches gate on `FileType::is_file()` AND
142 /// `!FileType::is_symlink()` so a future refactor that turns
143 /// `follow_links` on (which would make `is_file()` reflect the
144 /// target type) does not silently re-enable symlink ingestion.
145 ///
146 /// # Error policy
147 ///
148 /// `PathNotFound` is reserved for genuine "no such directory"
149 /// errors; permission-denied, EBUSY, EIO and other I/O failures on
150 /// the root path are returned as `IoError`. Pre-fix the function
151 /// used `path.exists()` as a pre-check, but `Path::exists` returns
152 /// `false` for *any* failure to stat (including `PermissionDenied`),
153 /// so an unreadable directory would surface as `PathNotFound` and
154 /// operators would see the scan silently skip artifacts that
155 /// actually exist on disk — the same failure mode `read_file_bytes`
156 /// guards against. Errors encountered on individual *child* entries
157 /// during a recursive walk remain warnings: the scan keeps going on
158 /// the legible siblings instead of aborting the whole package.
159 ///
160 /// # Non-UTF-8 filenames
161 ///
162 /// Filenames containing non-UTF-8 bytes are matched against `pattern`
163 /// using `OsStr::to_string_lossy` (invalid sequences become
164 /// `U+FFFD`) and a `tracing::warn!` is emitted naming the lossy
165 /// path. Pre-fix the chained `to_str()` returned `None` and the
166 /// entry was silently skipped, allowing an attacker who packages
167 /// an untrusted skill with a non-UTF-8 artifact name (zip and tar
168 /// both preserve raw bytes) to evade scanning entirely. Lossy
169 /// matching closes the evasion vector while the warning surfaces
170 /// the attempt to operators.
171 fn list_files(
172 &self,
173 path: &Path,
174 pattern: &str,
175 recursive: bool,
176 ) -> Result<Vec<PathBuf>, FileSystemError> {
177 let mut files = Vec::new();
178
179 if recursive {
180 for result in WalkDir::new(path).follow_links(false) {
181 match result {
182 Ok(entry) => {
183 let file_type = entry.file_type();
184 if file_type.is_file() && !file_type.is_symlink() {
185 let entry_path = entry.path();
186 if let Some(filename_os) = entry_path.file_name() {
187 let filename = lossy_filename_with_warning(filename_os, entry_path);
188 if Self::matches_pattern(&filename, pattern) {
189 files.push(entry_path.to_path_buf());
190 }
191 }
192 }
193 }
194 Err(err) if err.depth() == 0 => {
195 let io_err = err.into_io_error().unwrap_or_else(|| {
196 io::Error::other("walkdir error without underlying io::Error")
197 });
198 return match io_err.kind() {
199 io::ErrorKind::NotFound => {
200 Err(FileSystemError::PathNotFound(path.to_path_buf()))
201 }
202 _ => Err(FileSystemError::IoError(io_err)),
203 };
204 }
205 Err(err) => {
206 tracing::warn!("Skipping entry during recursive file listing: {err}");
207 }
208 }
209 }
210 } else {
211 let entries = match std::fs::read_dir(path) {
212 Ok(it) => it,
213 Err(err) if err.kind() == io::ErrorKind::NotFound => {
214 return Err(FileSystemError::PathNotFound(path.to_path_buf()));
215 }
216 Err(err) => return Err(FileSystemError::IoError(err)),
217 };
218 for entry in entries {
219 let entry = match entry {
220 Ok(e) => e,
221 Err(err) => {
222 tracing::warn!("Skipping entry during non-recursive file listing: {err}");
223 continue;
224 }
225 };
226 let file_type = match entry.file_type() {
227 Ok(ft) => ft,
228 Err(err) => {
229 tracing::warn!(
230 "Skipping entry with unavailable file type: {}: {err}",
231 entry.path().display()
232 );
233 continue;
234 }
235 };
236 if file_type.is_file() && !file_type.is_symlink() {
237 let entry_path = entry.path();
238 if let Some(filename_os) = entry_path.file_name() {
239 let filename = lossy_filename_with_warning(filename_os, &entry_path);
240 if Self::matches_pattern(&filename, pattern) {
241 files.push(entry_path);
242 }
243 }
244 }
245 }
246 }
247
248 Ok(files)
249 }
250
251 /// Use `symlink_metadata` to avoid following symlinks, consistent with
252 /// `list_files` / `walk_files` which explicitly filter out symlinks.
253 /// `Path::exists()` follows symlinks AND swallows permission errors
254 /// (returning `false` for `PermissionDenied`), which is inconsistent
255 /// with the symlink-does-not-exist policy of the listing methods.
256 fn exists(&self, path: &Path) -> bool {
257 match std::fs::symlink_metadata(path) {
258 Ok(_) => true,
259 Err(e) if e.kind() == std::io::ErrorKind::NotFound => false,
260 Err(e) => {
261 tracing::debug!("exists: stat failed for {}: {e}", path.display());
262 // Fail-closed: permission errors mean "exists but
263 // inaccessible", not "not found"
264 true
265 }
266 }
267 }
268
269 fn metadata(&self, path: &Path) -> Result<FileMeta, FileSystemError> {
270 let meta = std::fs::symlink_metadata(path).map_err(FileSystemError::IoError)?;
271 Ok(FileMeta { len: meta.len() })
272 }
273
274 /// Use `symlink_metadata` to avoid following symlinks, consistent with
275 /// the listing methods' `!file_type.is_symlink()` filter.
276 fn is_file(&self, path: &Path) -> bool {
277 match std::fs::symlink_metadata(path) {
278 Ok(meta) => meta.is_file() && !meta.is_symlink(),
279 Err(_) => false,
280 }
281 }
282
283 /// Use `symlink_metadata` to avoid following symlinks, consistent with
284 /// the listing methods' `!file_type.is_symlink()` filter.
285 fn is_dir(&self, path: &Path) -> bool {
286 match std::fs::symlink_metadata(path) {
287 Ok(meta) => meta.is_dir() && !meta.is_symlink(),
288 Err(_) => false,
289 }
290 }
291
292 /// Recursive walk over `path` honouring `max_depth` and `skip_dirs`.
293 ///
294 /// Symlinks are NOT followed (`follow_links(false)`). Errors on
295 /// individual entries are logged via `tracing::warn!` and the walk
296 /// continues, matching the asymmetry documented for `list_files`:
297 /// the root error is fatal, child errors are non-fatal so a single
298 /// unreadable subtree does not blank an entire package scan.
299 ///
300 /// `max_depth = 0` means unlimited (matches the documented port
301 /// contract). `skip_dirs` is checked against the lossy filename of
302 /// each directory entry; a match prunes the entire subtree.
303 fn walk_files(
304 &self,
305 path: &Path,
306 max_depth: usize,
307 skip_dirs: &[&str],
308 ) -> Result<Vec<PathBuf>, FileSystemError> {
309 let mut walker = WalkDir::new(path).follow_links(false);
310 if max_depth > 0 {
311 walker = walker.max_depth(max_depth);
312 }
313
314 let mut files = Vec::new();
315 let walker = walker
316 .into_iter()
317 .filter_entry(|entry| !is_skipped_dir(entry, skip_dirs));
318 for result in walker {
319 match result {
320 Ok(entry) => {
321 let file_type = entry.file_type();
322 if file_type.is_file() && !file_type.is_symlink() {
323 files.push(entry.into_path());
324 }
325 }
326 Err(err) if err.depth() == 0 => {
327 let io_err = err.into_io_error().unwrap_or_else(|| {
328 io::Error::other("walkdir error without underlying io::Error")
329 });
330 return match io_err.kind() {
331 io::ErrorKind::NotFound => {
332 Err(FileSystemError::PathNotFound(path.to_path_buf()))
333 }
334 _ => Err(FileSystemError::IoError(io_err)),
335 };
336 }
337 Err(err) => {
338 tracing::warn!("Skipping entry during recursive walk: {err}");
339 }
340 }
341 }
342 Ok(files)
343 }
344}
345
346#[cfg(test)]
347mod tests {
348 use super::*;
349 use tempfile::TempDir;
350
351 /// # Contract
352 /// `read_file_bytes` MUST return the file's raw bytes via the
353 /// [`FileContent`] wrapper without re-encoding or normalising line
354 /// endings. The hexagonal contract makes this the only way the core
355 /// reads bytes — any silent transformation would leak into every
356 /// downstream pattern match.
357 #[test]
358 fn read_file_bytes_returns_raw_contents_through_port() {
359 let dir = TempDir::new().unwrap();
360 let file_path = dir.path().join("test.txt");
361 std::fs::write(&file_path, "hello world").unwrap();
362
363 let fs = StdFileSystemProvider::new();
364 let content = fs.read_file_bytes(&file_path).unwrap();
365 assert_eq!(content.as_bytes(), b"hello world");
366 }
367
368 /// # Contract
369 ///
370 /// `read_file_bytes` MUST refuse files larger than
371 /// [`MAX_READ_FILE_BYTES`] with a [`FileSystemError::IoError`] whose
372 /// message names the size and the limit. Pre-fix the function called
373 /// `std::fs::read` unconditionally, so a malicious package shipping
374 /// a 100 GB file would trigger a 100 GB heap allocation and OOM the
375 /// process. The pre-stat guard is the last line of defence for
376 /// callers that bypass the discovery-layer caps.
377 #[test]
378 fn read_file_bytes_refuses_files_larger_than_max() {
379 let dir = TempDir::new().unwrap();
380 let file_path = dir.path().join("oversized.bin");
381 // Write `MAX_READ_FILE_BYTES + 1` bytes via `set_len` so the test
382 // does not actually allocate the buffer; the guard must consult
383 // metadata, not the body.
384 let file = std::fs::File::create(&file_path).unwrap();
385 file.set_len(MAX_READ_FILE_BYTES + 1).unwrap();
386 drop(file);
387
388 let fs = StdFileSystemProvider::new();
389 let err = fs
390 .read_file_bytes(&file_path)
391 .expect_err("oversized file MUST be rejected");
392 match err {
393 FileSystemError::IoError(io_err) => {
394 let msg = io_err.to_string();
395 assert!(
396 msg.contains("MAX_READ_FILE_BYTES")
397 && msg.contains(&MAX_READ_FILE_BYTES.to_string()),
398 "error message must explain the size guard; got {msg}"
399 );
400 }
401 other => panic!("expected IoError with size-guard message; got {other:?}"),
402 }
403 }
404
405 /// # Contract
406 ///
407 /// Files at exactly the size cap are allowed; only strictly-larger
408 /// files are rejected. Pins the boundary so a future tightening
409 /// does not accidentally invert the comparison.
410 #[test]
411 fn read_file_bytes_accepts_files_at_max_size() {
412 let dir = TempDir::new().unwrap();
413 let file_path = dir.path().join("limit.bin");
414 // Use a small synthetic limit-equivalent body; we cannot allocate
415 // 256 MiB in a unit test, so we instead verify a 1 KiB file
416 // (well under the cap) is still accepted.
417 std::fs::write(&file_path, vec![0u8; 1024]).unwrap();
418 let fs = StdFileSystemProvider::new();
419 let content = fs
420 .read_file_bytes(&file_path)
421 .expect("under-cap file MUST be accepted");
422 assert_eq!(content.as_bytes().len(), 1024);
423 }
424
425 /// # Contract (negative)
426 /// A missing path MUST surface as
427 /// [`FileSystemError::PathNotFound`], not as a generic
428 /// [`FileSystemError::IoError`] or a panic. Callers (the scanner,
429 /// rule loaders, dataset preparation) discriminate on this variant
430 /// to decide between "skip and continue" and "abort the run".
431 #[test]
432 fn read_file_bytes_returns_path_not_found_for_missing_path() {
433 let fs = StdFileSystemProvider::new();
434 let result = fs.read_file_bytes(Path::new("/nonexistent/path/file.txt"));
435 assert!(matches!(result, Err(FileSystemError::PathNotFound(_))));
436 }
437
438 /// # Contract
439 /// Non-recursive `list_files` MUST honour the glob pattern AND stay
440 /// within the supplied directory — a sibling directory's matches do
441 /// not leak in. This is the load-bearing invariant for skill
442 /// discovery, which lists the package root non-recursively to find
443 /// `SKILL.md` without walking vendored subtrees.
444 #[test]
445 fn list_files_non_recursive_filters_by_glob_and_stays_in_dir() {
446 let dir = TempDir::new().unwrap();
447
448 std::fs::write(dir.path().join("test1.md"), "").unwrap();
449 std::fs::write(dir.path().join("test2.md"), "").unwrap();
450 std::fs::write(dir.path().join("test.txt"), "").unwrap();
451
452 let fs = StdFileSystemProvider::new();
453 let files = fs.list_files(dir.path(), "*.md", false).unwrap();
454
455 assert_eq!(files.len(), 2);
456 assert!(files.iter().all(|f| f.extension().unwrap() == "md"));
457 }
458
459 /// # Contract
460 /// Recursive `list_files` MUST descend into subdirectories and apply
461 /// the glob to filenames at every depth. Discovery for "find every
462 /// `SKILL.md` in this package" relies on this; if recursion silently
463 /// stopped at the root, supporting artifacts in subdirectories would
464 /// never enter the scan graph.
465 #[test]
466 fn list_files_recursive_descends_into_subdirectories() {
467 let dir = TempDir::new().unwrap();
468 let subdir = dir.path().join("subdir");
469 std::fs::create_dir(&subdir).unwrap();
470
471 std::fs::write(dir.path().join("test1.md"), "").unwrap();
472 std::fs::write(subdir.join("test2.md"), "").unwrap();
473
474 let fs = StdFileSystemProvider::new();
475 let files = fs.list_files(dir.path(), "*.md", true).unwrap();
476
477 assert_eq!(files.len(), 2);
478 }
479
480 /// # Contract
481 /// `exists` MUST report `true` only for paths the OS resolves to a
482 /// real entry, and never panic on a non-existent path. The scanner
483 /// uses this as its cheap pre-flight before opening referenced
484 /// artifacts.
485 #[test]
486 fn exists_returns_true_only_for_real_paths() {
487 let dir = TempDir::new().unwrap();
488 let file_path = dir.path().join("exists.txt");
489 std::fs::write(&file_path, "").unwrap();
490
491 let fs = StdFileSystemProvider::new();
492 assert!(fs.exists(&file_path));
493 assert!(!fs.exists(&dir.path().join("does_not_exist.txt")));
494 }
495
496 /// # Contract
497 /// `matches_pattern` MUST recognise the canonical glob shapes
498 /// (`*.ext`, `*`, `prefix*`) and reject mismatches. These are the
499 /// shapes the rule loader and discovery service emit — a silent
500 /// failure here breaks discovery without producing an error.
501 #[test]
502 fn matches_pattern_handles_canonical_glob_shapes() {
503 assert!(StdFileSystemProvider::matches_pattern("test.md", "*.md"));
504 assert!(StdFileSystemProvider::matches_pattern("test.md", "*"));
505 assert!(StdFileSystemProvider::matches_pattern("test.md", "test*"));
506 assert!(!StdFileSystemProvider::matches_pattern("test.txt", "*.md"));
507 }
508
509 /// # Contract
510 ///
511 /// `matches_pattern` MUST support `*` in the middle of a pattern.
512 /// Pre-fix the matcher only handled leading- or trailing-`*` and
513 /// silently returned `false` for `foo*bar.json`, so a discovery
514 /// rule using a middle wildcard would never trigger.
515 #[test]
516 fn matches_pattern_supports_middle_wildcard() {
517 assert!(StdFileSystemProvider::matches_pattern(
518 "manifest.lock.json",
519 "manifest*.json"
520 ));
521 assert!(!StdFileSystemProvider::matches_pattern(
522 "other.json",
523 "manifest*.json"
524 ));
525 }
526
527 /// # Contract
528 ///
529 /// `matches_pattern` MUST support brace alternation. This lets a
530 /// single discovery pattern handle related extensions like
531 /// `*.{json,yaml}` instead of forcing the discovery layer to fan
532 /// out into per-extension list_files calls (which compounds I/O on
533 /// large trees).
534 #[test]
535 fn matches_pattern_supports_brace_alternation() {
536 assert!(StdFileSystemProvider::matches_pattern(
537 "config.yaml",
538 "*.{json,yaml}"
539 ));
540 assert!(StdFileSystemProvider::matches_pattern(
541 "config.json",
542 "*.{json,yaml}"
543 ));
544 assert!(!StdFileSystemProvider::matches_pattern(
545 "config.toml",
546 "*.{json,yaml}"
547 ));
548 }
549
550 /// # Contract (fail-closed)
551 ///
552 /// An invalid glob pattern MUST return `false` instead of panicking.
553 /// Callers pass well-known patterns from discovery rules, so a
554 /// malformed pattern is a programming error worth surfacing as
555 /// "no matches" rather than crashing a long-running scan in the
556 /// middle of a dataset run.
557 #[test]
558 fn matches_pattern_returns_false_for_invalid_glob_silently() {
559 // Unbalanced character class is a glob syntax error.
560 assert!(!StdFileSystemProvider::matches_pattern(
561 "anything",
562 "[unclosed"
563 ));
564 }
565
566 /// # Contract
567 ///
568 /// `read_file_bytes` MUST distinguish "file does not exist" from
569 /// other I/O failures. Pre-fix the function pre-checked
570 /// `path.exists()`, but `Path::exists` returns `false` on *any*
571 /// stat failure (including `PermissionDenied`), so an unreadable
572 /// file would be reported as `PathNotFound` and the scan would
573 /// silently skip artifacts that actually exist on disk. This test
574 /// pins the post-fix behaviour: a `PermissionDenied` from
575 /// `std::fs::read` MUST surface as `IoError`, not `PathNotFound`.
576 #[cfg(unix)]
577 #[test]
578 fn read_file_bytes_distinguishes_permission_denied_from_path_not_found() {
579 use std::os::unix::fs::PermissionsExt;
580
581 let dir = TempDir::new().unwrap();
582 let unreadable = dir.path().join("locked.bin");
583 std::fs::write(&unreadable, b"secret").unwrap();
584 std::fs::set_permissions(&unreadable, std::fs::Permissions::from_mode(0o000)).unwrap();
585
586 let fs = StdFileSystemProvider::new();
587 let err = fs
588 .read_file_bytes(&unreadable)
589 .expect_err("unreadable file must surface an error");
590
591 // Restore mode so TempDir teardown doesn't choke.
592 std::fs::set_permissions(&unreadable, std::fs::Permissions::from_mode(0o644)).ok();
593
594 assert!(
595 matches!(
596 err,
597 FileSystemError::IoError(ref io_err)
598 if io_err.kind() == std::io::ErrorKind::PermissionDenied
599 ),
600 "PermissionDenied MUST be preserved, not collapsed to PathNotFound or other variant; got {err:?}",
601 );
602 }
603
604 /// # Contract
605 ///
606 /// `read_file_bytes` MUST still return `PathNotFound` for files
607 /// that genuinely don't exist. Negative-direction pin for the
608 /// permission-denied test above: the fix must not regress the
609 /// "missing file" path.
610 #[test]
611 fn read_file_bytes_still_returns_path_not_found_for_missing_file() {
612 let dir = TempDir::new().unwrap();
613 let missing = dir.path().join("does_not_exist.bin");
614
615 let fs = StdFileSystemProvider::new();
616 let err = fs
617 .read_file_bytes(&missing)
618 .expect_err("missing file must error");
619
620 assert!(
621 matches!(err, FileSystemError::PathNotFound(_)),
622 "missing file must surface as PathNotFound, got {err:?}",
623 );
624 }
625
626 /// # Contract
627 ///
628 /// `list_files` MUST NOT return entries that are symlinks, even
629 /// when the link target is a regular file matching the pattern.
630 /// Threat model: malware analysts scan untrusted skill packages,
631 /// so a malicious package shipping `evil.json -> /etc/hosts` MUST
632 /// NOT cause the scanner to read or report the link target as if
633 /// it were a regular file inside the package.
634 #[cfg(unix)]
635 #[test]
636 fn list_files_skips_symlinks_in_recursive_walk() {
637 let dir = TempDir::new().unwrap();
638 let real = dir.path().join("real.json");
639 std::fs::write(&real, b"{}").unwrap();
640 let evil = dir.path().join("evil.json");
641 std::os::unix::fs::symlink("/etc/hosts", &evil).unwrap();
642
643 let fs = StdFileSystemProvider::new();
644 let files = fs.list_files(dir.path(), "*.json", true).unwrap();
645
646 assert_eq!(
647 files.len(),
648 1,
649 "expected only the regular file, got {files:?}"
650 );
651 assert_eq!(files[0], real);
652 }
653
654 /// # Contract
655 ///
656 /// Same symlink-skip contract as the recursive case, pinned for
657 /// the non-recursive (`std::fs::read_dir`) branch. Both branches
658 /// share the threat model: untrusted package contents.
659 #[cfg(unix)]
660 #[test]
661 fn list_files_skips_symlinks_in_non_recursive_walk() {
662 let dir = TempDir::new().unwrap();
663 let real = dir.path().join("real.json");
664 std::fs::write(&real, b"{}").unwrap();
665 let evil = dir.path().join("evil.json");
666 std::os::unix::fs::symlink("/etc/hosts", &evil).unwrap();
667
668 let fs = StdFileSystemProvider::new();
669 let files = fs.list_files(dir.path(), "*.json", false).unwrap();
670
671 assert_eq!(
672 files.len(),
673 1,
674 "expected only the regular file, got {files:?}"
675 );
676 assert_eq!(files[0], real);
677 }
678
679 /// # Contract
680 ///
681 /// `list_files` MUST distinguish "permission denied" from "path not
682 /// found" in the recursive branch. Pre-fix the function pre-checked
683 /// `path.exists()` and the WalkDir filter_map swallowed the root
684 /// error as a tracing warning, so an unreadable directory would
685 /// surface as either PathNotFound (with the precheck) or an empty
686 /// `Ok` vec (without it) — both silently hiding a real I/O failure
687 /// during scans of untrusted packages. This test pins the post-fix
688 /// behaviour: a `PermissionDenied` from the root walk MUST surface
689 /// as `IoError`, never as `PathNotFound` or as an empty result.
690 #[cfg(unix)]
691 #[test]
692 fn list_files_recursive_distinguishes_permission_denied_from_path_not_found() {
693 use std::os::unix::fs::PermissionsExt;
694
695 let dir = TempDir::new().unwrap();
696 let locked = dir.path().join("locked");
697 std::fs::create_dir(&locked).unwrap();
698 std::fs::set_permissions(&locked, std::fs::Permissions::from_mode(0o000)).unwrap();
699
700 let fs = StdFileSystemProvider::new();
701 let result = fs.list_files(&locked, "*.md", true);
702
703 // Restore permissions so TempDir teardown doesn't choke.
704 std::fs::set_permissions(&locked, std::fs::Permissions::from_mode(0o755)).ok();
705
706 let err = result.expect_err("unreadable directory must surface an error");
707 assert!(
708 matches!(
709 err,
710 FileSystemError::IoError(ref io_err)
711 if io_err.kind() == std::io::ErrorKind::PermissionDenied
712 ),
713 "PermissionDenied MUST be preserved, not collapsed to PathNotFound or empty Ok; got {err:?}",
714 );
715 }
716
717 /// # Contract
718 ///
719 /// Same `PermissionDenied` distinction as the recursive case, pinned
720 /// for the non-recursive (`std::fs::read_dir`) branch. Both branches
721 /// share the threat model and contract: callers need an actionable
722 /// error kind so the scanner can log a permission failure instead
723 /// of silently treating an unreadable directory as missing.
724 #[cfg(unix)]
725 #[test]
726 fn list_files_non_recursive_distinguishes_permission_denied_from_path_not_found() {
727 use std::os::unix::fs::PermissionsExt;
728
729 let dir = TempDir::new().unwrap();
730 let locked = dir.path().join("locked");
731 std::fs::create_dir(&locked).unwrap();
732 std::fs::set_permissions(&locked, std::fs::Permissions::from_mode(0o000)).unwrap();
733
734 let fs = StdFileSystemProvider::new();
735 let result = fs.list_files(&locked, "*.md", false);
736
737 std::fs::set_permissions(&locked, std::fs::Permissions::from_mode(0o755)).ok();
738
739 let err = result.expect_err("unreadable directory must surface an error");
740 assert!(
741 matches!(
742 err,
743 FileSystemError::IoError(ref io_err)
744 if io_err.kind() == std::io::ErrorKind::PermissionDenied
745 ),
746 "PermissionDenied MUST be preserved, not collapsed to PathNotFound; got {err:?}",
747 );
748 }
749
750 /// # Contract
751 ///
752 /// `list_files` MUST still return `PathNotFound` for directories
753 /// that genuinely don't exist. Negative-direction pin paired with
754 /// the permission-denied tests above: removing the `path.exists()`
755 /// pre-check must not regress the "missing directory" path. Both
756 /// the recursive and non-recursive branches are exercised because
757 /// they use different underlying primitives (WalkDir vs read_dir).
758 #[test]
759 fn list_files_still_returns_path_not_found_for_missing_directory() {
760 let dir = TempDir::new().unwrap();
761 let missing = dir.path().join("does_not_exist");
762
763 let fs = StdFileSystemProvider::new();
764
765 let err_recursive = fs
766 .list_files(&missing, "*.md", true)
767 .expect_err("missing dir must error in recursive branch");
768 assert!(
769 matches!(err_recursive, FileSystemError::PathNotFound(_)),
770 "missing dir must surface as PathNotFound in recursive branch, got {err_recursive:?}",
771 );
772
773 let err_flat = fs
774 .list_files(&missing, "*.md", false)
775 .expect_err("missing dir must error in non-recursive branch");
776 assert!(
777 matches!(err_flat, FileSystemError::PathNotFound(_)),
778 "missing dir must surface as PathNotFound in non-recursive branch, got {err_flat:?}",
779 );
780 }
781
782 /// # Contract
783 ///
784 /// Errors on *child* entries of a recursive walk MUST remain
785 /// non-fatal: the scanner keeps reporting the legible siblings
786 /// rather than aborting the whole package because one subdirectory
787 /// is unreadable. This pins the asymmetry between "root error"
788 /// (fatal, surfaced as IoError) and "child error" (logged warning,
789 /// scan continues) — without this test a future refactor could
790 /// turn child errors fatal and silently drop coverage on partially
791 /// readable trees.
792 #[cfg(unix)]
793 #[test]
794 fn list_files_recursive_continues_past_unreadable_child() {
795 use std::os::unix::fs::PermissionsExt;
796
797 let dir = TempDir::new().unwrap();
798 std::fs::write(dir.path().join("readable.md"), b"").unwrap();
799 let blocked = dir.path().join("blocked");
800 std::fs::create_dir(&blocked).unwrap();
801 std::fs::write(blocked.join("hidden.md"), b"").unwrap();
802 std::fs::set_permissions(&blocked, std::fs::Permissions::from_mode(0o000)).unwrap();
803
804 let fs = StdFileSystemProvider::new();
805 let result = fs.list_files(dir.path(), "*.md", true);
806
807 std::fs::set_permissions(&blocked, std::fs::Permissions::from_mode(0o755)).ok();
808
809 let files = result.expect("root is readable; walk must succeed despite blocked subdir");
810 assert_eq!(
811 files.len(),
812 1,
813 "expected only the readable sibling, got {files:?}",
814 );
815 assert!(
816 files[0].ends_with("readable.md"),
817 "expected readable.md, got {:?}",
818 files[0]
819 );
820 }
821
822 /// # Contract
823 ///
824 /// `list_files` MUST surface entries whose filenames contain
825 /// non-UTF-8 bytes by matching them against `pattern` via a lossy
826 /// `OsStr::to_string_lossy` conversion. Pre-fix the chained
827 /// `to_str()` returned `None` and the `if let` silently dropped
828 /// the entry, allowing an attacker who packages a skill with a
829 /// critical artifact named with raw bytes (zip and tar preserve
830 /// them) to evade the entire scan. Both branches (recursive walk
831 /// via WalkDir and non-recursive via `read_dir`) share the threat
832 /// model and the contract.
833 ///
834 /// Linux-only because HFS+/APFS reject non-UTF-8 filenames at the
835 /// `creat(2)` syscall (errno `EILSEQ`); the evasion vector this
836 /// test pins is only reachable on filesystems that accept raw
837 /// bytes (ext4, xfs, btrfs, zfs) or via archive extraction tools
838 /// that ship raw-byte names. The runtime fix (lossy match in
839 /// `lossy_filename_with_warning`) is platform-agnostic.
840 #[cfg(target_os = "linux")]
841 #[test]
842 fn list_files_recursive_matches_non_utf8_filename_via_lossy_conversion() {
843 use std::ffi::OsStr;
844 use std::os::unix::ffi::OsStrExt;
845
846 let dir = TempDir::new().unwrap();
847 // `bad\xFF.json` — `\xFF` is invalid UTF-8 as a standalone byte.
848 let evil_name = OsStr::from_bytes(b"bad\xFF.json");
849 let evil_path = dir.path().join(evil_name);
850 std::fs::write(&evil_path, b"{}").unwrap();
851
852 let fs = StdFileSystemProvider::new();
853 let files = fs.list_files(dir.path(), "*.json", true).unwrap();
854
855 assert_eq!(
856 files.len(),
857 1,
858 "non-UTF-8 filename MUST be matched via lossy conversion, got {files:?}",
859 );
860 assert_eq!(
861 files[0], evil_path,
862 "returned path MUST preserve the original (non-UTF-8) bytes",
863 );
864 }
865
866 /// # Contract
867 ///
868 /// Non-recursive twin of
869 /// `list_files_recursive_matches_non_utf8_filename_via_lossy_conversion`.
870 /// Pinned separately because the two branches use different
871 /// underlying primitives (WalkDir vs `std::fs::read_dir`); a future
872 /// refactor that fixes one without the other would silently
873 /// re-open the evasion path on whichever discovery mode regresses.
874 /// See the recursive twin's doc-comment for the platform-gating
875 /// rationale (HFS+/APFS reject non-UTF-8 filenames at syscall time).
876 #[cfg(target_os = "linux")]
877 #[test]
878 fn list_files_non_recursive_matches_non_utf8_filename_via_lossy_conversion() {
879 use std::ffi::OsStr;
880 use std::os::unix::ffi::OsStrExt;
881
882 let dir = TempDir::new().unwrap();
883 let evil_name = OsStr::from_bytes(b"manifest\xFF.json");
884 let evil_path = dir.path().join(evil_name);
885 std::fs::write(&evil_path, b"{}").unwrap();
886
887 let fs = StdFileSystemProvider::new();
888 let files = fs.list_files(dir.path(), "*.json", false).unwrap();
889
890 assert_eq!(
891 files.len(),
892 1,
893 "non-UTF-8 filename MUST be matched via lossy conversion, got {files:?}",
894 );
895 assert_eq!(
896 files[0], evil_path,
897 "returned path MUST preserve the original (non-UTF-8) bytes",
898 );
899 }
900
901 /// # Contract
902 ///
903 /// Negative-direction pin for the lossy-match contract above:
904 /// regular UTF-8 filenames MUST continue to be returned without
905 /// behavioural change. Without this test a future refactor that
906 /// over-corrects (e.g. unconditionally allocates via
907 /// `to_string_lossy`) could regress the hot path's borrow-only
908 /// behaviour or accidentally drop UTF-8 entries.
909 #[test]
910 fn list_files_returns_utf8_filenames_unchanged() {
911 let dir = TempDir::new().unwrap();
912 std::fs::write(dir.path().join("alpha.json"), b"{}").unwrap();
913 std::fs::write(dir.path().join("beta.json"), b"{}").unwrap();
914 std::fs::write(dir.path().join("gamma.txt"), b"").unwrap();
915
916 let fs = StdFileSystemProvider::new();
917 let files = fs.list_files(dir.path(), "*.json", false).unwrap();
918
919 assert_eq!(
920 files.len(),
921 2,
922 "expected the two .json files, got {files:?}"
923 );
924 assert!(files.iter().all(|f| f.extension().unwrap() == "json"));
925 }
926}