Skip to main content

socket_patch_core/utils/
fs.rs

1//! Filesystem helpers shared by the ecosystem crawlers.
2//!
3//! Each crawler walks one or more package directories and decides
4//! whether each entry is a candidate package. The two operations that
5//! all eight crawlers repeat are:
6//!
7//! - listing entries in a directory while tolerating permission /
8//!   I/O errors (we treat an unreadable directory as "no entries");
//! - asking whether an entry is a directory while tolerating stat
//!   failures (we treat a stat error as "not a dir").
11//!
12//! Centralizing both keeps each crawler free of the
13//! `match read_dir { Ok(rd) => rd, Err(_) => return … }` boilerplate
14//! and gives integration tests a single function to drive when they
15//! want to exercise the read_dir Err arm via `chmod 000`.
16//!
//! All helpers here are async because the rest of the crawler code
//! is — they delegate to `tokio::fs`.
19//!
20//! # Symlinks
21//!
//! `entry_is_dir` follows symlinks (it stats the entry's path with
//! `tokio::fs::metadata()`, not `symlink_metadata()`), matching the
//! historical behavior of the crawlers (pnpm's content-addressed store
//! relies on resolving symlinks into `node_modules/.pnpm/*`). Use
//! `entry_file_type` when the link itself must be told apart from its
//! target.
26
27use std::path::Path;
28
29use std::fs::FileType;
30use tokio::fs::DirEntry;
31
32/// List the immediate children of `path`.
33///
34/// Returns an empty vector if the directory cannot be read (does not
35/// exist, permission denied, etc.). If a later `next_entry` call
36/// fails mid-iteration, the entries gathered so far are returned and
37/// iteration stops. The crawlers treat all of these the same way:
38/// surface whatever the readable portion of the subtree yields, but
39/// don't abort the whole crawl.
40pub async fn list_dir_entries(path: &Path) -> Vec<DirEntry> {
41    let mut entries = match tokio::fs::read_dir(path).await {
42        Ok(rd) => rd,
43        Err(_) => return Vec::new(),
44    };
45
46    let mut out = Vec::new();
47    while let Ok(Some(entry)) = entries.next_entry().await {
48        out.push(entry);
49    }
50    out
51}
52
53/// Resolve whether `entry` is a directory, following symlinks.
54///
55/// Returns `false` if the stat fails (broken symlink, permission
56/// error, etc.) — the caller then skips the entry rather than
57/// aborting the walk.
58///
59/// `DirEntry::metadata()` does **not** traverse symlinks (it behaves
60/// like `symlink_metadata`), so a symlink pointing at a directory
61/// would wrongly report `false`. To honor the documented
62/// symlink-following contract — which crawlers like deno/python/ruby
63/// rely on for symlinked package directories — we stat the resolved
64/// `entry.path()` via `tokio::fs::metadata`, which does follow links.
65pub async fn entry_is_dir(entry: &DirEntry) -> bool {
66    tokio::fs::metadata(entry.path())
67        .await
68        .map(|m| m.is_dir())
69        .unwrap_or(false)
70}
71
72/// Return the raw `FileType` for `entry`, swallowing stat errors.
73///
74/// Use this instead of `entry_is_dir` when the caller needs to
75/// distinguish real directories from symlinks (e.g. npm's pnpm
76/// support: symlinks point into the content-addressed store and must
77/// be treated as scannable-but-non-recurseable). The returned
78/// `FileType` is the symlink-aware kind from `entry.file_type()`,
79/// not the resolved-target kind from `metadata()`.
80pub async fn entry_file_type(entry: &DirEntry) -> Option<FileType> {
81    entry.file_type().await.ok()
82}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Look up the child of `dir` whose file name is exactly `name`.
    ///
    /// Only the symlink tests need this, hence the `unix` gate (it
    /// would otherwise be dead code on other targets).
    #[cfg(unix)]
    async fn find_entry(dir: &Path, name: &str) -> Option<DirEntry> {
        list_dir_entries(dir)
            .await
            .into_iter()
            .find(|e| e.file_name().to_string_lossy() == name)
    }

    /// An existing but empty directory yields no entries.
    #[tokio::test]
    async fn list_dir_entries_empty_dir() {
        let tmp = tempfile::tempdir().unwrap();
        let entries = list_dir_entries(tmp.path()).await;
        assert!(entries.is_empty());
    }

    /// A path that does not exist is treated as "no entries" rather
    /// than an error.
    #[tokio::test]
    async fn list_dir_entries_missing_path_returns_empty() {
        let tmp = tempfile::tempdir().unwrap();
        let entries = list_dir_entries(&tmp.path().join("does-not-exist")).await;
        assert!(entries.is_empty());
    }

    /// Both directories and files are returned as children.
    #[tokio::test]
    async fn list_dir_entries_returns_children() {
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::create_dir(tmp.path().join("a")).await.unwrap();
        tokio::fs::create_dir(tmp.path().join("b")).await.unwrap();
        tokio::fs::write(tmp.path().join("c.txt"), b"")
            .await
            .unwrap();
        let mut names: Vec<String> = list_dir_entries(tmp.path())
            .await
            .into_iter()
            .map(|e| e.file_name().to_string_lossy().to_string())
            .collect();
        // Directory iteration order is unspecified, so sort first.
        names.sort();
        assert_eq!(names, vec!["a", "b", "c.txt"]);
    }

    /// A plain directory is a dir; a plain file is not.
    #[tokio::test]
    async fn entry_is_dir_distinguishes_dir_and_file() {
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::create_dir(tmp.path().join("d")).await.unwrap();
        tokio::fs::write(tmp.path().join("f"), b"x").await.unwrap();
        let entries = list_dir_entries(tmp.path()).await;
        // Without this the loop below would pass vacuously if the
        // listing came back empty or partial. Names are unique within
        // a directory and unknown names panic, so a count of two means
        // both `d` and `f` were checked.
        assert_eq!(entries.len(), 2, "expected exactly `d` and `f`");
        for entry in entries {
            let name = entry.file_name().to_string_lossy().to_string();
            let is_dir = entry_is_dir(&entry).await;
            match name.as_str() {
                "d" => assert!(is_dir),
                "f" => assert!(!is_dir),
                other => panic!("unexpected entry: {other}"),
            }
        }
    }

    /// Regression: `entry_is_dir` must follow symlinks. A symlink that
    /// points at a directory has to report `true`, otherwise crawlers
    /// silently skip symlinked package directories (pnpm stores,
    /// virtualenvs, vendored gems, etc.). `DirEntry::metadata()` does
    /// NOT traverse symlinks, so this guards against regressing back to
    /// it.
    #[cfg(unix)]
    #[tokio::test]
    async fn entry_is_dir_follows_symlink_to_dir() {
        let tmp = tempfile::tempdir().unwrap();
        let target = tmp.path().join("real_dir");
        tokio::fs::create_dir(&target).await.unwrap();
        tokio::fs::symlink(&target, tmp.path().join("link_to_dir"))
            .await
            .unwrap();

        let link = find_entry(tmp.path(), "link_to_dir")
            .await
            .expect("symlink entry present");
        assert!(
            entry_is_dir(&link).await,
            "symlink pointing at a directory must resolve to is_dir = true"
        );
    }

    /// A symlink pointing at a regular file must report `false`, and a
    /// broken/dangling symlink must report `false` rather than panic.
    #[cfg(unix)]
    #[tokio::test]
    async fn entry_is_dir_symlink_to_file_and_broken_link() {
        let tmp = tempfile::tempdir().unwrap();
        let file_target = tmp.path().join("real_file");
        tokio::fs::write(&file_target, b"x").await.unwrap();
        tokio::fs::symlink(&file_target, tmp.path().join("link_to_file"))
            .await
            .unwrap();
        tokio::fs::symlink(
            tmp.path().join("missing_target"),
            tmp.path().join("dangling"),
        )
        .await
        .unwrap();

        let entries = list_dir_entries(tmp.path()).await;
        // Make sure all three entries are actually exercised; an empty
        // or truncated listing must fail the test, not skip the checks.
        assert_eq!(
            entries.len(),
            3,
            "expected `real_file`, `link_to_file` and `dangling`"
        );
        for entry in entries {
            let name = entry.file_name().to_string_lossy().to_string();
            let is_dir = entry_is_dir(&entry).await;
            match name.as_str() {
                "real_file" | "link_to_file" | "dangling" => {
                    assert!(!is_dir, "{name} should not be a dir");
                }
                other => panic!("unexpected entry: {other}"),
            }
        }
    }

    /// `entry_file_type` is the symlink-aware counterpart: it reports
    /// the link itself (`is_symlink`), never the resolved target.
    #[cfg(unix)]
    #[tokio::test]
    async fn entry_file_type_does_not_follow_symlinks() {
        let tmp = tempfile::tempdir().unwrap();
        let target = tmp.path().join("real_dir");
        tokio::fs::create_dir(&target).await.unwrap();
        tokio::fs::symlink(&target, tmp.path().join("link_to_dir"))
            .await
            .unwrap();

        let link = find_entry(tmp.path(), "link_to_dir")
            .await
            .expect("symlink entry present");
        let ft = entry_file_type(&link).await.expect("file_type available");
        assert!(
            ft.is_symlink(),
            "entry_file_type must surface the link kind"
        );
        assert!(!ft.is_dir(), "entry_file_type must not resolve the target");
    }
}