Skip to main content

tokmd_scan/
lib.rs

//! # tokmd-scan
//!
//! **Tier 1 (Adapter)**
//!
//! This crate adapts the `tokei` library for use within `tokmd`.
//! It isolates the dependency on `tokei` to a single location.
//!
//! ## What belongs here
//! * Tokei configuration and invocation
//! * Mapping `tokmd` args to `tokei` config
//!
//! ## What does NOT belong here
//! * Business logic (filtering, sorting, aggregation)
//! * Output formatting
//! * Receipt construction
16
17use anyhow::Result;
18use std::collections::BTreeSet;
19use std::fs;
20use std::path::{Component, Path, PathBuf};
21use tokei::{Config, Languages};
22
23use tokmd_settings::ScanOptions;
24use tokmd_types::ConfigMode;
25
/// One logical source file whose contents are provided in memory instead of being
/// read from the host filesystem.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InMemoryFile {
    /// Logical path of the file, stored exactly as supplied by the caller.
    pub path: PathBuf,
    /// Raw contents of the file.
    pub bytes: Vec<u8>,
}

impl InMemoryFile {
    /// Creates an in-memory file from anything convertible into a path and a byte buffer.
    #[must_use]
    pub fn new(path: impl Into<PathBuf>, bytes: impl Into<Vec<u8>>) -> Self {
        let path = path.into();
        let bytes = bytes.into();
        Self { path, bytes }
    }
}
42
43/// A scan result that keeps its backing temp root alive for downstream row modeling.
44///
45/// Keep this wrapper alive while any downstream code needs to read file metadata from
46/// the scanned paths. `tokmd-model` uses the underlying paths to compute byte and token
47/// counts after the scan phase.
48///
49/// When converting these scan results into `FileRow`s, pass [`Self::strip_prefix`] as the
50/// `strip_prefix` argument so receipts keep the logical in-memory paths rather than the
51/// temp backing root.
52#[derive(Debug)]
53pub struct MaterializedScan {
54    languages: Languages,
55    logical_paths: Vec<PathBuf>,
56    root: tempfile::TempDir,
57}
58
59impl MaterializedScan {
60    #[must_use]
61    pub fn languages(&self) -> &Languages {
62        &self.languages
63    }
64
65    #[must_use]
66    pub fn logical_paths(&self) -> &[PathBuf] {
67        &self.logical_paths
68    }
69
70    #[must_use]
71    pub fn strip_prefix(&self) -> &Path {
72        self.root.path()
73    }
74}
75
76/// Scans a set of paths and computes line counts for each language found.
77///
78/// # Examples
79///
80/// ```
81/// use std::fs;
82/// use std::path::PathBuf;
83/// use tokmd_settings::ScanOptions;
84/// use tokmd_types::ConfigMode;
85/// use tokmd_scan::scan;
86///
87/// # fn main() -> anyhow::Result<()> {
88/// let dir = tempfile::tempdir()?;
89/// let file_path = dir.path().join("main.rs");
90/// fs::write(&file_path, "fn main() { println!(\"hello\"); }\n")?;
91///
92/// let paths = vec![file_path];
93/// let options = ScanOptions {
94///     config: ConfigMode::None,
95///     ..Default::default()
96/// };
97///
98/// let languages = scan(&paths, &options)?;
99/// let rust_stats = languages.get(&tokei::LanguageType::Rust).unwrap();
100///
101/// assert_eq!(rust_stats.code, 1);
102/// # Ok(())
103/// # }
104/// ```
105pub fn scan(paths: &[PathBuf], args: &ScanOptions) -> Result<Languages> {
106    let cfg = config_from_scan_options(args);
107    let ignores = ignored_patterns(args);
108    for path in paths {
109        if !path.exists() {
110            anyhow::bail!("Path not found: {}", path.display());
111        }
112    }
113
114    let mut languages = Languages::new();
115    languages.get_statistics(paths, &ignores, &cfg);
116
117    Ok(languages)
118}
119
120/// Build the `tokei` config used for a scan from clap-free `ScanOptions`.
121#[must_use]
122pub fn config_from_scan_options(args: &ScanOptions) -> Config {
123    build_config(args)
124}
125
126/// Normalize ordered in-memory inputs into deterministic logical paths.
127///
128/// This rejects empty, absolute, escaping, and case-only-colliding paths so
129/// native and browser callers see the same contract.
130pub fn normalize_in_memory_paths(inputs: &[InMemoryFile]) -> Result<Vec<PathBuf>> {
131    normalize_logical_paths(inputs, true)
132}
133
134pub fn scan_in_memory(inputs: &[InMemoryFile], args: &ScanOptions) -> Result<MaterializedScan> {
135    let root = tempfile::tempdir()?;
136    let logical_paths = normalize_in_memory_paths(inputs)?;
137
138    for (logical_path, input) in logical_paths.iter().zip(inputs) {
139        let full_path = root.path().join(logical_path);
140        if let Some(parent) = full_path.parent() {
141            fs::create_dir_all(parent)?;
142        }
143        fs::write(full_path, &input.bytes)?;
144    }
145
146    let scan_root = vec![root.path().to_path_buf()];
147    let languages = scan(&scan_root, args)?;
148
149    Ok(MaterializedScan {
150        languages,
151        logical_paths,
152        root,
153    })
154}
155
156fn build_config(args: &ScanOptions) -> Config {
157    let mut cfg = match args.config {
158        ConfigMode::Auto => Config::from_config_files(),
159        ConfigMode::None => Config::default(),
160    };
161
162    // Only override config file settings when the user explicitly asked for it.
163    if args.hidden {
164        cfg.hidden = Some(true);
165    }
166    if args.no_ignore {
167        cfg.no_ignore = Some(true);
168        cfg.no_ignore_dot = Some(true);
169        cfg.no_ignore_parent = Some(true);
170        cfg.no_ignore_vcs = Some(true);
171    }
172    if args.no_ignore_dot {
173        cfg.no_ignore_dot = Some(true);
174    }
175    if args.no_ignore_parent {
176        cfg.no_ignore_parent = Some(true);
177    }
178    if args.no_ignore_vcs {
179        cfg.no_ignore_vcs = Some(true);
180    }
181    if args.treat_doc_strings_as_comments {
182        cfg.treat_doc_strings_as_comments = Some(true);
183    }
184
185    cfg
186}
187
188fn ignored_patterns(args: &ScanOptions) -> Vec<&str> {
189    args.excluded.iter().map(|s| s.as_str()).collect()
190}
191
192fn normalize_logical_paths(
193    inputs: &[InMemoryFile],
194    case_insensitive: bool,
195) -> Result<Vec<PathBuf>> {
196    let mut seen = BTreeSet::new();
197    let mut normalized = Vec::with_capacity(inputs.len());
198
199    for input in inputs {
200        let logical_path = normalize_logical_path(&input.path)?;
201        if !seen.insert(logical_path_key(&logical_path, case_insensitive)) {
202            anyhow::bail!("Duplicate in-memory path: {}", logical_path.display());
203        }
204        normalized.push(logical_path);
205    }
206
207    Ok(normalized)
208}
209
/// Renders `path` as the string key used for duplicate detection.
///
/// The path is converted lossily to text; when `case_insensitive` is set the text is
/// additionally lowercased so that paths differing only in letter case share a key.
fn logical_path_key(path: &Path, case_insensitive: bool) -> String {
    let text = path.to_string_lossy();
    if !case_insensitive {
        return text.into_owned();
    }
    text.to_lowercase()
}
218
219fn normalize_logical_path(path: &Path) -> Result<PathBuf> {
220    if path.as_os_str().is_empty() {
221        anyhow::bail!("In-memory path must not be empty");
222    }
223
224    let mut normalized = PathBuf::new();
225    for component in path.components() {
226        match component {
227            Component::Normal(segment) => normalized.push(segment),
228            Component::CurDir => {}
229            Component::ParentDir => {
230                anyhow::bail!(
231                    "In-memory path must not contain parent traversal: {}",
232                    path.display()
233                );
234            }
235            Component::RootDir | Component::Prefix(_) => {
236                anyhow::bail!("In-memory path must be relative: {}", path.display());
237            }
238        }
239    }
240
241    if normalized.as_os_str().is_empty() {
242        anyhow::bail!("In-memory path must resolve to a file: {}", path.display());
243    }
244
245    Ok(normalized)
246}
247
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline options with every flag off and config-file discovery enabled.
    fn default_scan_options() -> ScanOptions {
        ScanOptions {
            excluded: vec![],
            config: ConfigMode::Auto,
            hidden: false,
            no_ignore: false,
            no_ignore_parent: false,
            no_ignore_dot: false,
            no_ignore_vcs: false,
            treat_doc_strings_as_comments: false,
        }
    }

    /// A path that is guaranteed to exist: this crate's own source directory.
    fn test_path() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src")
    }

    // ========================
    // Basic Scan Tests
    // ========================

    #[test]
    fn scan_finds_rust_files() -> Result<()> {
        let args = default_scan_options();
        let paths = vec![test_path()];
        let result = scan(&paths, &args)?;
        // Should find at least this lib.rs file
        assert!(!result.is_empty());
        assert!(result.get(&tokei::LanguageType::Rust).is_some());
        Ok(())
    }

    #[test]
    fn scan_with_nonexistent_path_returns_error() -> Result<()> {
        let args = default_scan_options();
        let dir = tempfile::tempdir()?;
        let nonexistent = dir.path().join("definitely-not-created");
        let paths = vec![nonexistent];
        // `expect_err` already fails the test when the scan unexpectedly succeeds.
        let err = scan(&paths, &args).expect_err("should have failed");
        assert!(err.to_string().contains("Path not found"));
        Ok(())
    }

    // ========================
    // Config Flag Tests
    // ========================
    //
    // These tests propagate scan errors with `?` so that a failure reports the
    // underlying error instead of a bare "assertion failed".

    #[test]
    fn scan_with_hidden_flag() -> Result<()> {
        let mut args = default_scan_options();
        args.hidden = true;
        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_no_ignore_flag() -> Result<()> {
        let mut args = default_scan_options();
        args.no_ignore = true;
        let paths = vec![test_path()];
        // no_ignore should imply all other no_ignore_* flags
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_individual_no_ignore_flags() -> Result<()> {
        let mut args = default_scan_options();
        args.no_ignore_parent = true;
        args.no_ignore_dot = true;
        args.no_ignore_vcs = true;
        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_treat_doc_strings_as_comments() -> Result<()> {
        let mut args = default_scan_options();
        args.treat_doc_strings_as_comments = true;
        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_config_mode_none() -> Result<()> {
        let mut args = default_scan_options();
        args.config = ConfigMode::None;
        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_excluded_patterns() -> Result<()> {
        let mut args = default_scan_options();
        args.excluded = vec!["target".to_string(), "*.min.js".to_string()];
        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_all_flags_combined() -> Result<()> {
        let args = ScanOptions {
            excluded: vec!["node_modules".to_string()],
            config: ConfigMode::None,
            hidden: true,
            no_ignore: true,
            no_ignore_parent: true,
            no_ignore_dot: true,
            no_ignore_vcs: true,
            treat_doc_strings_as_comments: true,
        };
        let paths = vec![test_path()];
        // Should handle all flags without panicking or failing
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_returns_code_stats() -> Result<()> {
        let args = default_scan_options();
        let paths = vec![test_path()];
        let result = scan(&paths, &args)?;

        let rust = result
            .get(&tokei::LanguageType::Rust)
            .expect("should find rust in src/lib.rs");
        // The lib.rs file should have some code
        assert!(rust.code > 0);
        assert!(rust.lines() > 0);
        Ok(())
    }

    // ========================
    // Logical Path Tests
    // ========================

    #[test]
    fn normalize_logical_path_strips_dot_segments() -> Result<()> {
        let normalized = normalize_logical_path(Path::new("./src/./lib.rs"))?;
        assert_eq!(normalized, PathBuf::from("src/lib.rs"));
        Ok(())
    }

    #[test]
    fn normalize_logical_path_rejects_empty_path() {
        let err = normalize_logical_path(Path::new("")).unwrap_err();
        assert!(err.to_string().contains("must not be empty"));
    }

    #[test]
    fn normalize_logical_path_rejects_dot_only_path() {
        let err = normalize_logical_path(Path::new(".")).unwrap_err();
        assert!(err.to_string().contains("must resolve to a file"));
    }

    #[test]
    fn normalize_logical_path_rejects_absolute_paths() {
        let err = normalize_logical_path(Path::new("/src/lib.rs")).unwrap_err();
        assert!(err.to_string().contains("must be relative"));
    }

    #[test]
    fn normalize_logical_path_rejects_parent_traversal() {
        let err = normalize_logical_path(Path::new("../src/lib.rs")).unwrap_err();
        assert!(err.to_string().contains("parent traversal"));
    }

    #[test]
    fn normalize_logical_paths_rejects_duplicate_after_normalization() {
        let inputs = vec![
            InMemoryFile::new("./src/lib.rs", "fn main() {}\n"),
            InMemoryFile::new("src/lib.rs", "fn main() {}\n"),
        ];

        let err = normalize_logical_paths(&inputs, false).unwrap_err();
        assert!(err.to_string().contains("Duplicate in-memory path"));
    }

    #[test]
    fn normalize_logical_paths_rejects_case_only_collision_on_case_insensitive_fs() {
        let inputs = vec![
            InMemoryFile::new("src/lib.rs", "fn main() {}\n"),
            InMemoryFile::new("SRC/LIB.rs", "fn main() {}\n"),
        ];

        let err = normalize_logical_paths(&inputs, true).unwrap_err();
        assert!(err.to_string().contains("Duplicate in-memory path"));
    }

    #[test]
    fn normalize_logical_paths_allows_case_only_difference_when_case_sensitive() -> Result<()> {
        let inputs = vec![
            InMemoryFile::new("src/lib.rs", "fn main() {}\n"),
            InMemoryFile::new("SRC/LIB.rs", "fn main() {}\n"),
        ];

        let normalized = normalize_logical_paths(&inputs, false)?;
        assert_eq!(
            normalized,
            vec![PathBuf::from("src/lib.rs"), PathBuf::from("SRC/LIB.rs")]
        );
        Ok(())
    }

    #[test]
    fn normalize_in_memory_paths_preserves_input_order() -> Result<()> {
        let inputs = vec![
            InMemoryFile::new("b.rs", "fn b() {}\n"),
            InMemoryFile::new("./a.rs", "fn a() {}\n"),
        ];

        let normalized = normalize_in_memory_paths(&inputs)?;
        assert_eq!(normalized, vec![PathBuf::from("b.rs"), PathBuf::from("a.rs")]);
        Ok(())
    }
}