Skip to main content

void_core/repo/
mod.rs

1//! Repository analysis utilities.
2//!
3//! This module provides tools for analyzing repository contents,
4//! including parallel file statistics collection.
5
6use std::fs::{self, File};
7use std::io::Read;
8use std::path::{Path, PathBuf};
9
10use ignore::WalkBuilder;
11use memmap2::Mmap;
12use rayon::prelude::*;
13
14use crate::support::{configure_walker, count_lines};
15use crate::Result;
16
/// Options for collecting file statistics.
///
/// Construct with struct-literal syntax or start from [`Default::default`]
/// and override individual fields.
#[derive(Debug, Clone)]
pub struct FileStatsOptions {
    /// Root directory to scan.
    pub root: PathBuf,
    /// Name of the void directory to skip (e.g., ".void").
    pub void_dir_name: String,
    /// Whether to include hidden files (dotfiles).
    pub include_hidden: bool,
    /// Size threshold in bytes: files at least this large are memory-mapped.
    /// Set to 0 to disable mmap (always read into a `Vec`).
    pub mmap_threshold: u64,
}
29
30impl Default for FileStatsOptions {
31    fn default() -> Self {
32        Self {
33            root: PathBuf::from("."),
34            void_dir_name: ".void".to_string(),
35            include_hidden: false,
36            mmap_threshold: 64 * 1024, // 64KB
37        }
38    }
39}
40
/// Statistics for a single file.
///
/// Entries are plain data: they can be cloned, compared for equality, and
/// printed with `{:?}`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileStatEntry {
    /// Relative path from root (normalized, forward slashes).
    pub path: String,
    /// Lowercase file extension, or "(no ext)" if none.
    pub extension: String,
    /// File size in bytes.
    pub size: u64,
    /// Number of lines in the file.
    pub lines: u32,
}
53
/// Unit of work handed from the directory walk to the parallel workers.
///
/// Carries both spellings of one file's location so workers do not have to
/// recompute the relative form.
struct PathJob {
    /// Path used to open the file on disk.
    abs_path: PathBuf,
    /// The same location relative to the scan root, with `/` separators.
    rel_path: String,
}
61
62/// Collect file statistics from a directory in parallel.
63///
64/// Walks the directory tree, skipping the void directory and optionally hidden files.
65/// Uses rayon parallel iterators to read files and count lines in parallel.
66pub fn collect_file_stats(opts: FileStatsOptions) -> Result<Vec<FileStatEntry>> {
67    let root = &opts.root;
68    let void_dir_name = opts.void_dir_name.clone();
69
70    // Phase 1: Collect all file paths (walk is fast, single-threaded)
71    let mut builder = WalkBuilder::new(root);
72    let jobs: Vec<PathJob> = configure_walker(&mut builder)
73        .hidden(!opts.include_hidden)
74        .filter_entry(move |entry| {
75            let name = entry.file_name().to_string_lossy();
76            name != void_dir_name
77                && name != "node_modules"
78                && name != ".git"
79                && name != ".DS_Store"
80                && name != "target"
81        })
82        .build()
83        .flatten()
84        .filter(|entry| entry.file_type().is_some_and(|t| t.is_file()))
85        .filter_map(|entry| {
86            let abs_path = entry.path().to_path_buf();
87            let rel_path = abs_path
88                .strip_prefix(root)
89                .ok()?
90                .to_string_lossy()
91                .replace('\\', "/");
92            Some(PathJob { abs_path, rel_path })
93        })
94        .collect();
95
96    // Phase 2: Process files in parallel (heavy I/O)
97    let mmap_threshold = opts.mmap_threshold;
98    let stats: Vec<FileStatEntry> = jobs
99        .into_par_iter()
100        .filter_map(|job| process_file(&job.abs_path, &job.rel_path, mmap_threshold))
101        .collect();
102
103    Ok(stats)
104}
105
106/// Process a single file and return its statistics.
107fn process_file(abs_path: &Path, rel_path: &str, mmap_threshold: u64) -> Option<FileStatEntry> {
108    let metadata = fs::metadata(abs_path).ok()?;
109    let size = metadata.len();
110
111    // Read file content
112    let content = load_file_content(abs_path, size, mmap_threshold)?;
113
114    // Count lines
115    let lines = count_lines(content.as_ref());
116
117    // Extract extension
118    let extension = abs_path
119        .extension()
120        .and_then(|ext| ext.to_str())
121        .map(|ext| ext.to_lowercase())
122        .unwrap_or_else(|| "(no ext)".to_string());
123
124    Some(FileStatEntry {
125        path: rel_path.to_string(),
126        extension,
127        size,
128        lines,
129    })
130}
131
132/// File content storage (either mmap or in-memory).
133enum FileContentRef {
134    Mmap(Mmap),
135    Bytes(Vec<u8>),
136}
137
138impl AsRef<[u8]> for FileContentRef {
139    fn as_ref(&self) -> &[u8] {
140        match self {
141            FileContentRef::Mmap(mmap) => mmap.as_ref(),
142            FileContentRef::Bytes(bytes) => bytes.as_slice(),
143        }
144    }
145}
146
147/// Load file content, using mmap for files above threshold.
148fn load_file_content(path: &Path, size: u64, mmap_threshold: u64) -> Option<FileContentRef> {
149    let use_mmap = mmap_threshold > 0 && size >= mmap_threshold;
150
151    if use_mmap {
152        if let Ok(file) = File::open(path) {
153            if let Ok(mmap) = unsafe { Mmap::map(&file) } {
154                return Some(FileContentRef::Mmap(mmap));
155            }
156        }
157    }
158
159    // Fall back to reading into memory
160    let mut file = File::open(path).ok()?;
161    let mut bytes = Vec::with_capacity(size as usize);
162    file.read_to_end(&mut bytes).ok()?;
163    Some(FileContentRef::Bytes(bytes))
164}
165
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Run a scan over `dir` with mmap disabled and ".void" as the void directory.
    fn scan(dir: &TempDir, include_hidden: bool) -> Vec<FileStatEntry> {
        let opts = FileStatsOptions {
            root: dir.path().to_path_buf(),
            void_dir_name: ".void".to_string(),
            include_hidden,
            mmap_threshold: 0,
        };
        collect_file_stats(opts).unwrap()
    }

    /// Create a file at `rel` (relative to `dir`) holding `contents`.
    fn write_file(dir: &TempDir, rel: &str, contents: &str) {
        fs::write(dir.path().join(rel), contents).unwrap();
    }

    #[test]
    fn collect_basic() {
        let temp = TempDir::new().unwrap();
        write_file(&temp, "file1.txt", "hello\nworld");
        write_file(&temp, "file2.rs", "fn main() {}");
        fs::create_dir(temp.path().join("subdir")).unwrap();
        write_file(&temp, "subdir/nested.txt", "line1\nline2\nline3");

        let stats = scan(&temp, false);
        assert_eq!(stats.len(), 3);

        // Membership only: no particular ordering of results is asserted.
        for expected in ["file1.txt", "file2.rs", "subdir/nested.txt"].iter() {
            assert!(stats.iter().any(|s| s.path == *expected));
        }
    }

    #[test]
    fn collect_skips_void_dir() {
        let temp = TempDir::new().unwrap();
        write_file(&temp, "file.txt", "hello");
        fs::create_dir(temp.path().join(".void")).unwrap();
        write_file(&temp, ".void/secret", "should be skipped");

        let stats = scan(&temp, false);
        assert_eq!(stats.len(), 1);
        assert_eq!(stats[0].path, "file.txt");
    }

    #[test]
    fn collect_respects_hidden_option() {
        let temp = TempDir::new().unwrap();
        write_file(&temp, "visible.txt", "visible");
        write_file(&temp, ".hidden", "hidden");

        // Hidden files excluded.
        let visible_only = scan(&temp, false);
        assert_eq!(visible_only.len(), 1);
        assert_eq!(visible_only[0].path, "visible.txt");

        // Hidden files included.
        let everything = scan(&temp, true);
        assert_eq!(everything.len(), 2);
    }

    #[test]
    fn extract_extension() {
        let temp = TempDir::new().unwrap();
        write_file(&temp, "file.TXT", "text");
        write_file(&temp, "noext", "no extension");

        let stats = scan(&temp, false);
        let extension_of = |path: &str| {
            stats
                .iter()
                .find(|s| s.path == path)
                .map(|s| s.extension.clone())
                .unwrap()
        };

        assert_eq!(extension_of("file.TXT"), "txt"); // lowercased
        assert_eq!(extension_of("noext"), "(no ext)");
    }
}