Skip to main content

tokmd_walk/
lib.rs

1//! # tokmd-walk
2//!
3//! **Tier 2 (Utilities)**
4//!
5//! File listing and asset discovery utilities. Provides filesystem traversal
6//! with gitignore support for analysis workflows.
7//!
8//! ## What belongs here
9//! * Filesystem traversal respecting gitignore
10//! * License candidate detection
11//! * File size queries
12//!
13//! ## What does NOT belong here
14//! * Content scanning (use tokmd-content)
15//! * Git history analysis (use tokmd-git)
16//! * File modification
17
18use std::path::{Component, Path, PathBuf};
19use std::process::{Command, Stdio};
20
21use anyhow::{Context, Result};
22use ignore::WalkBuilder;
23use tokmd_io_port::MemFs;
24
/// License-related paths picked out of a file listing by [`license_candidates`].
///
/// As produced by [`license_candidates`], both lists are sorted.
#[derive(Debug, Clone)]
pub struct LicenseCandidates {
    /// Paths whose file name looks like a license text (e.g. `LICENSE`, `COPYING`, `NOTICE`).
    pub license_files: Vec<PathBuf>,
    /// Paths of package manifests (`Cargo.toml`, `package.json`, `pyproject.toml`).
    pub metadata_files: Vec<PathBuf>,
}
30
31pub fn list_files(root: &Path, max_files: Option<usize>) -> Result<Vec<PathBuf>> {
32    // Early return for zero-file limit
33    if max_files == Some(0) {
34        return Ok(Vec::new());
35    }
36
37    if let Some(mut files) = git_ls_files(root)? {
38        if let Some(limit) = max_files
39            && files.len() > limit
40        {
41            files.truncate(limit);
42        }
43        return Ok(files);
44    }
45
46    let mut files: Vec<PathBuf> = Vec::new();
47    let mut builder = WalkBuilder::new(root);
48    builder.hidden(false);
49    builder.git_ignore(true);
50    builder.git_exclude(true);
51    builder.git_global(true);
52    builder.follow_links(false);
53
54    for entry in builder.build() {
55        let entry = entry?;
56        if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
57            continue;
58        }
59        let path = entry.path().to_path_buf();
60        let rel = path.strip_prefix(root).unwrap_or(&path).to_path_buf();
61        files.push(rel);
62        if let Some(limit) = max_files
63            && files.len() >= limit
64        {
65            break;
66        }
67    }
68
69    files.sort();
70    Ok(files)
71}
72
73/// List files from an in-memory filesystem backend.
74///
75/// Returned paths are relative to `root` and sorted for deterministic output.
76pub fn list_files_from_memfs(
77    fs: &MemFs,
78    root: &Path,
79    max_files: Option<usize>,
80) -> Result<Vec<PathBuf>> {
81    if max_files == Some(0) {
82        return Ok(Vec::new());
83    }
84
85    let normalized_root = normalize_memfs_path(root);
86    let mut files: Vec<PathBuf> = fs
87        .file_paths()
88        .filter_map(|path| memfs_relative_path(path, &normalized_root))
89        .collect();
90
91    files.sort();
92
93    if let Some(limit) = max_files
94        && files.len() > limit
95    {
96        files.truncate(limit);
97    }
98
99    Ok(files)
100}
101
102pub fn license_candidates(files: &[PathBuf]) -> LicenseCandidates {
103    let mut license_files = Vec::new();
104    let mut metadata_files = Vec::new();
105
106    for rel in files {
107        let name = rel
108            .file_name()
109            .and_then(|n| n.to_str())
110            .unwrap_or("")
111            .to_lowercase();
112        if name == "cargo.toml" || name == "package.json" || name == "pyproject.toml" {
113            metadata_files.push(rel.clone());
114            continue;
115        }
116        if name.starts_with("license") || name.starts_with("copying") || name.starts_with("notice")
117        {
118            license_files.push(rel.clone());
119        }
120    }
121
122    license_files.sort();
123    metadata_files.sort();
124
125    LicenseCandidates {
126        license_files,
127        metadata_files,
128    }
129}
130
131fn git_ls_files(root: &Path) -> Result<Option<Vec<PathBuf>>> {
132    let output = Command::new("git")
133        .arg("-C")
134        .arg(root)
135        .arg("ls-files")
136        .arg("-z")
137        .stdout(Stdio::piped())
138        .stderr(Stdio::null())
139        .output();
140
141    let output = match output {
142        Ok(out) => out,
143        Err(_) => return Ok(None),
144    };
145    if !output.status.success() {
146        return Ok(None);
147    }
148
149    let mut files = Vec::new();
150    let bytes = output.stdout;
151    for part in bytes.split(|b| *b == 0) {
152        if part.is_empty() {
153            continue;
154        }
155        let s = String::from_utf8_lossy(part).to_string();
156        files.push(PathBuf::from(s));
157    }
158
159    if files.is_empty() {
160        return Ok(None);
161    }
162
163    Ok(Some(files))
164}
165
166pub fn file_size(root: &Path, relative: &Path) -> Result<u64> {
167    let path = root.join(relative);
168    let meta =
169        std::fs::metadata(&path).with_context(|| format!("Failed to stat {}", path.display()))?;
170    Ok(meta.len())
171}
172
173/// Query a file size from an in-memory filesystem backend.
174pub fn file_size_from_memfs(fs: &MemFs, root: &Path, relative: &Path) -> Result<u64> {
175    let normalized_root = normalize_memfs_path(root);
176    let path = if normalized_root.as_os_str().is_empty() {
177        normalize_memfs_path(relative)
178    } else {
179        normalize_memfs_path(&normalized_root.join(relative))
180    };
181    fs.file_size(&path)
182        .with_context(|| format!("Failed to stat {}", path.display()))
183}
184
/// Rebuild `path` without `.` and root (`/`) components.
///
/// Normal components, `..` and any platform prefix are kept as they are; in
/// particular `..` is preserved rather than resolved against its parent.
fn normalize_memfs_path(path: &Path) -> PathBuf {
    path.components()
        .filter(|component| !matches!(component, Component::CurDir | Component::RootDir))
        .map(|component| component.as_os_str())
        .collect()
}
197
/// Express `path` relative to `root`.
///
/// An empty `root` leaves `path` untouched. Otherwise `None` is returned when
/// `path` does not start with `root`.
fn memfs_relative_path(path: &Path, root: &Path) -> Option<PathBuf> {
    let relative = if root.as_os_str().is_empty() {
        path
    } else {
        path.strip_prefix(root).ok()?
    };
    Some(relative.to_path_buf())
}
204
// Unit tests for the public helpers. The filesystem-backed tests work inside
// temporary directories created with `tempfile`.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    // ---- license_candidates tests ----

    // Names starting with LICENSE / COPYING / NOTICE count as license files;
    // ordinary source files do not.
    #[test]
    fn test_license_candidates_detects_license_files() {
        let files = vec![
            PathBuf::from("LICENSE"),
            PathBuf::from("LICENSE.md"),
            PathBuf::from("LICENSE-MIT"),
            PathBuf::from("COPYING"),
            PathBuf::from("NOTICE"),
            PathBuf::from("src/main.rs"),
        ];
        let result = license_candidates(&files);
        assert_eq!(result.license_files.len(), 5);
        assert!(result.metadata_files.is_empty());
    }

    // The three known package manifests are reported as metadata files.
    #[test]
    fn test_license_candidates_detects_metadata_files() {
        let files = vec![
            PathBuf::from("Cargo.toml"),
            PathBuf::from("package.json"),
            PathBuf::from("pyproject.toml"),
            PathBuf::from("src/lib.rs"),
        ];
        let result = license_candidates(&files);
        assert!(result.license_files.is_empty());
        assert_eq!(result.metadata_files.len(), 3);
    }

    // License files and manifests in the same listing land in separate lists.
    #[test]
    fn test_license_candidates_mixed() {
        let files = vec![
            PathBuf::from("LICENSE"),
            PathBuf::from("Cargo.toml"),
            PathBuf::from("src/main.rs"),
        ];
        let result = license_candidates(&files);
        assert_eq!(result.license_files.len(), 1);
        assert_eq!(result.metadata_files.len(), 1);
    }

    #[test]
    fn test_license_candidates_empty_input() {
        let result = license_candidates(&[]);
        assert!(result.license_files.is_empty());
        assert!(result.metadata_files.is_empty());
    }

    // Matching is done on the lowercased file name.
    #[test]
    fn test_license_candidates_case_insensitive() {
        let files = vec![PathBuf::from("license"), PathBuf::from("License.txt")];
        let result = license_candidates(&files);
        assert_eq!(result.license_files.len(), 2);
    }

    // Both result lists come back sorted regardless of input order.
    #[test]
    fn test_license_candidates_sorted_output() {
        let files = vec![
            PathBuf::from("z/Cargo.toml"),
            PathBuf::from("a/Cargo.toml"),
            PathBuf::from("z/LICENSE"),
            PathBuf::from("a/LICENSE"),
        ];
        let result = license_candidates(&files);
        assert_eq!(result.license_files[0], PathBuf::from("a/LICENSE"));
        assert_eq!(result.license_files[1], PathBuf::from("z/LICENSE"));
        assert_eq!(result.metadata_files[0], PathBuf::from("a/Cargo.toml"));
        assert_eq!(result.metadata_files[1], PathBuf::from("z/Cargo.toml"));
    }

    // ---- file_size tests ----

    #[test]
    fn test_file_size_returns_correct_bytes() {
        let dir = tempfile::tempdir().unwrap();
        let content = "hello world";
        fs::write(dir.path().join("test.txt"), content).unwrap();
        let size = file_size(dir.path(), Path::new("test.txt")).unwrap();
        assert_eq!(size, content.len() as u64);
    }

    #[test]
    fn test_file_size_missing_file_errors() {
        let dir = tempfile::tempdir().unwrap();
        let result = file_size(dir.path(), Path::new("nonexistent.txt"));
        assert!(result.is_err());
    }

    #[test]
    fn test_file_size_empty_file() {
        let dir = tempfile::tempdir().unwrap();
        fs::write(dir.path().join("empty.txt"), "").unwrap();
        let size = file_size(dir.path(), Path::new("empty.txt")).unwrap();
        assert_eq!(size, 0);
    }

    // ---- list_files tests ----

    // A zero limit short-circuits before any listing is attempted.
    #[test]
    fn test_list_files_max_zero_returns_empty() {
        let dir = tempfile::tempdir().unwrap();
        fs::write(dir.path().join("a.rs"), "content").unwrap();
        let files = list_files(dir.path(), Some(0)).unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn test_list_files_respects_max_limit() {
        let dir = tempfile::tempdir().unwrap();
        // Create an empty .git dir so the directory looks like a repository root.
        // NOTE(review): an empty .git is not a valid repository, so `git ls-files`
        // presumably fails here and the walker fallback is what gets exercised — confirm.
        fs::create_dir_all(dir.path().join(".git")).unwrap();
        for i in 0..10 {
            fs::write(dir.path().join(format!("file{i}.txt")), "x").unwrap();
        }
        let files = list_files(dir.path(), Some(3)).unwrap();
        assert!(files.len() <= 3);
    }

    #[test]
    fn test_list_files_deterministic_sort() {
        let dir = tempfile::tempdir().unwrap();
        // Create an empty .git dir so the directory looks like a repository root.
        // NOTE(review): an empty .git is not a valid repository, so `git ls-files`
        // presumably fails here and the walker fallback is what gets exercised — confirm.
        fs::create_dir_all(dir.path().join(".git")).unwrap();
        fs::create_dir_all(dir.path().join("foo")).unwrap();
        fs::write(dir.path().join("foo/bar"), "content").unwrap();
        fs::write(dir.path().join("foo/bar.rs"), "content").unwrap();
        fs::write(dir.path().join("foo.rs"), "content").unwrap();

        let files = list_files(dir.path(), None).unwrap();
        // The resulting paths are relative to root.
        // Expected sort: foo/bar, foo/bar.rs, foo.rs — paths compare component by
        // component, so everything under `foo/` sorts before `foo.rs`. A plain
        // (lossy) string sort would instead put foo.rs first, because '.' < '/'.
        let expected = vec![
            PathBuf::from("foo/bar"),
            PathBuf::from("foo/bar.rs"),
            PathBuf::from("foo.rs"),
        ];
        // Only check the fixture files created above: keep paths starting with
        // `foo` so unrelated entries (e.g. anything under .git) are ignored.
        let actual: Vec<PathBuf> = files
            .into_iter()
            .filter(|p| {
                let s = p.to_string_lossy();
                s.starts_with("foo")
            })
            .collect();
        // The listing must already be in the expected order; nothing is re-sorted here.
        assert_eq!(actual, expected);
    }
}
361
// Compiled only under `cfg(doctest)`: the crate README is attached as this
// module's documentation so that the code examples in the README are built
// and run as doctests.
#[cfg(doctest)]
pub mod readme_doctests {
    #![doc = include_str!("../README.md")]
}