Skip to main content

tokmd_scan/
lib.rs

1//! # tokmd-scan
2//!
3//! **Tier 1 (Adapter)**
4//!
5//! This crate adapts the `tokei` library for use within `tokmd`.
6//! It isolates the dependency on `tokei` to a single location.
7//!
8//! ## What belongs here
9//! * Tokei configuration and invocation
10//! * Mapping `tokmd` args to `tokei` config
11//!
12//! ## What does NOT belong here
13//! * Business logic (filtering, sorting, aggregation)
14//! * Output formatting
15//! * Receipt construction
16
17use anyhow::Result;
18use std::collections::BTreeSet;
19use std::fs;
20use std::path::{Component, Path, PathBuf};
21use tokei::{Config, Languages};
22
23use crate::path::ValidatedRoot;
24use tokmd_settings::ScanOptions;
25use tokmd_types::ConfigMode;
26
/// One logical file whose contents are held in memory instead of being read
/// from the host filesystem.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InMemoryFile {
    /// Logical path the caller assigns to this file.
    pub path: PathBuf,
    /// Raw contents of the file.
    pub bytes: Vec<u8>,
}

impl InMemoryFile {
    /// Creates an [`InMemoryFile`] from anything convertible into a path and a
    /// byte buffer (for example a `&str` path and `&str` contents).
    #[must_use]
    pub fn new(path: impl Into<PathBuf>, bytes: impl Into<Vec<u8>>) -> Self {
        let path: PathBuf = path.into();
        let bytes: Vec<u8> = bytes.into();
        Self { path, bytes }
    }
}
43
44/// A scan result that keeps its backing temp root alive for downstream row modeling.
45///
46/// Keep this wrapper alive while any downstream code needs to read file metadata from
47/// the scanned paths. `tokmd-model` uses the underlying paths to compute byte and token
48/// counts after the scan phase.
49///
50/// When converting these scan results into `FileRow`s, pass [`Self::strip_prefix`] as the
51/// `strip_prefix` argument so receipts keep the logical in-memory paths rather than the
52/// temp backing root.
53#[derive(Debug)]
54pub struct MaterializedScan {
55    languages: Languages,
56    logical_paths: Vec<PathBuf>,
57    root: tempfile::TempDir,
58}
59
60impl MaterializedScan {
61    #[must_use]
62    pub fn languages(&self) -> &Languages {
63        &self.languages
64    }
65
66    #[must_use]
67    pub fn logical_paths(&self) -> &[PathBuf] {
68        &self.logical_paths
69    }
70
71    #[must_use]
72    pub fn strip_prefix(&self) -> &Path {
73        self.root.path()
74    }
75}
76
77/// Scans a set of paths and computes line counts for each language found.
78///
79/// # Examples
80///
81/// ```
82/// use std::fs;
83/// use std::path::PathBuf;
84/// use tokmd_settings::ScanOptions;
85/// use tokmd_types::ConfigMode;
86/// use tokmd_scan::scan;
87///
88/// # fn main() -> anyhow::Result<()> {
89/// let dir = tempfile::tempdir()?;
90/// let file_path = dir.path().join("main.rs");
91/// fs::write(&file_path, "fn main() { println!(\"hello\"); }\n")?;
92///
93/// let paths = vec![file_path];
94/// let options = ScanOptions {
95///     config: ConfigMode::None,
96///     ..Default::default()
97/// };
98///
99/// let languages = scan(&paths, &options)?;
100/// let rust_stats = languages.get(&tokei::LanguageType::Rust).unwrap();
101///
102/// assert_eq!(rust_stats.code, 1);
103/// # Ok(())
104/// # }
105/// ```
106pub fn scan(paths: &[PathBuf], args: &ScanOptions) -> Result<Languages> {
107    let cfg = config_from_scan_options(args);
108    let ignores = ignored_patterns(args);
109    let roots: Vec<ValidatedRoot> = paths
110        .iter()
111        .map(ValidatedRoot::new)
112        .collect::<std::result::Result<_, _>>()?;
113    let scan_paths: Vec<PathBuf> = roots
114        .iter()
115        .map(|root| root.input().to_path_buf())
116        .collect();
117
118    let mut languages = Languages::new();
119    languages.get_statistics(&scan_paths, &ignores, &cfg);
120
121    Ok(languages)
122}
123
124/// Build the `tokei` config used for a scan from clap-free `ScanOptions`.
125#[must_use]
126pub fn config_from_scan_options(args: &ScanOptions) -> Config {
127    build_config(args)
128}
129
130/// Normalize ordered in-memory inputs into deterministic logical paths.
131///
132/// This rejects empty, absolute, escaping, and case-only-colliding paths so
133/// native and browser callers see the same contract.
134pub fn normalize_in_memory_paths(inputs: &[InMemoryFile]) -> Result<Vec<PathBuf>> {
135    normalize_logical_paths(inputs, true)
136}
137
138pub fn scan_in_memory(inputs: &[InMemoryFile], args: &ScanOptions) -> Result<MaterializedScan> {
139    let root = tempfile::tempdir()?;
140    let logical_paths = normalize_in_memory_paths(inputs)?;
141
142    for (logical_path, input) in logical_paths.iter().zip(inputs) {
143        let full_path = root.path().join(logical_path);
144        if let Some(parent) = full_path.parent() {
145            fs::create_dir_all(parent)?;
146        }
147        fs::write(full_path, &input.bytes)?;
148    }
149
150    let scan_root = vec![root.path().to_path_buf()];
151    let languages = scan(&scan_root, args)?;
152
153    Ok(MaterializedScan {
154        languages,
155        logical_paths,
156        root,
157    })
158}
159
160fn build_config(args: &ScanOptions) -> Config {
161    let mut cfg = match args.config {
162        ConfigMode::Auto => Config::from_config_files(),
163        ConfigMode::None => Config::default(),
164    };
165
166    // Only override config file settings when the user explicitly asked for it.
167    if args.hidden {
168        cfg.hidden = Some(true);
169    }
170    if args.no_ignore {
171        cfg.no_ignore = Some(true);
172        cfg.no_ignore_dot = Some(true);
173        cfg.no_ignore_parent = Some(true);
174        cfg.no_ignore_vcs = Some(true);
175    }
176    if args.no_ignore_dot {
177        cfg.no_ignore_dot = Some(true);
178    }
179    if args.no_ignore_parent {
180        cfg.no_ignore_parent = Some(true);
181    }
182    if args.no_ignore_vcs {
183        cfg.no_ignore_vcs = Some(true);
184    }
185    if args.treat_doc_strings_as_comments {
186        cfg.treat_doc_strings_as_comments = Some(true);
187    }
188
189    cfg
190}
191
192fn ignored_patterns(args: &ScanOptions) -> Vec<&str> {
193    args.excluded.iter().map(|s| s.as_str()).collect()
194}
195
196fn normalize_logical_paths(
197    inputs: &[InMemoryFile],
198    case_insensitive: bool,
199) -> Result<Vec<PathBuf>> {
200    let mut seen = BTreeSet::new();
201    let mut normalized = Vec::with_capacity(inputs.len());
202
203    for input in inputs {
204        let logical_path = normalize_logical_path(&input.path)?;
205        if !seen.insert(logical_path_key(&logical_path, case_insensitive)) {
206            anyhow::bail!("Duplicate in-memory path: {}", logical_path.display());
207        }
208        normalized.push(logical_path);
209    }
210
211    Ok(normalized)
212}
213
/// Produces the string key used to detect duplicate logical paths.
///
/// The path is rendered lossily as text; with `case_insensitive` set the text
/// is additionally lowercased so that case-only variants share one key.
fn logical_path_key(path: &Path, case_insensitive: bool) -> String {
    let text = path.to_string_lossy();
    if !case_insensitive {
        return text.into_owned();
    }
    text.to_lowercase()
}
222
223fn normalize_logical_path(path: &Path) -> Result<PathBuf> {
224    if path.as_os_str().is_empty() {
225        anyhow::bail!("In-memory path must not be empty");
226    }
227
228    let mut normalized = PathBuf::new();
229    for component in path.components() {
230        match component {
231            Component::Normal(segment) => normalized.push(segment),
232            Component::CurDir => {}
233            Component::ParentDir => {
234                anyhow::bail!(
235                    "In-memory path must not contain parent traversal: {}",
236                    path.display()
237                );
238            }
239            Component::RootDir | Component::Prefix(_) => {
240                anyhow::bail!("In-memory path must be relative: {}", path.display());
241            }
242        }
243    }
244
245    if normalized.as_os_str().is_empty() {
246        anyhow::bail!("In-memory path must resolve to a file: {}", path.display());
247    }
248
249    Ok(normalized)
250}
251
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline options: no excludes, every flag off, tokei config discovery enabled.
    fn default_scan_options() -> ScanOptions {
        ScanOptions {
            excluded: vec![],
            config: ConfigMode::Auto,
            hidden: false,
            no_ignore: false,
            no_ignore_parent: false,
            no_ignore_dot: false,
            no_ignore_vcs: false,
            treat_doc_strings_as_comments: false,
        }
    }

    /// Same baseline, but starting from `Config::default()` so assertions about
    /// unset values do not depend on config files present on the host.
    fn hermetic_scan_options() -> ScanOptions {
        ScanOptions {
            config: ConfigMode::None,
            ..default_scan_options()
        }
    }

    // Get a valid test path - the crate's own source directory
    fn test_path() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src")
    }

    // ========================
    // Basic Scan Tests
    // ========================

    #[test]
    fn scan_finds_rust_files() -> Result<()> {
        let args = default_scan_options();
        let paths = vec![test_path()];
        let result = scan(&paths, &args)?;
        // Should find at least this lib.rs file
        assert!(!result.is_empty());
        assert!(result.get(&tokei::LanguageType::Rust).is_some());
        Ok(())
    }

    #[test]
    fn scan_with_nonexistent_path_returns_error() -> Result<()> {
        let args = default_scan_options();
        let dir = tempfile::tempdir()?;
        let nonexistent = dir.path().join("definitely-not-created");
        let paths = vec![nonexistent];

        let err = scan(&paths, &args).expect_err("nonexistent path should be rejected");
        assert!(
            err.to_string().contains("Path not found"),
            "unexpected error: {err}"
        );
        Ok(())
    }

    // ========================
    // Config Flag Tests
    // ========================

    #[test]
    fn config_without_flags_leaves_tokei_settings_unset() {
        let cfg = config_from_scan_options(&hermetic_scan_options());
        assert_eq!(cfg.hidden, None);
        assert_eq!(cfg.no_ignore, None);
        assert_eq!(cfg.no_ignore_dot, None);
        assert_eq!(cfg.no_ignore_parent, None);
        assert_eq!(cfg.no_ignore_vcs, None);
        assert_eq!(cfg.treat_doc_strings_as_comments, None);
    }

    #[test]
    fn scan_with_hidden_flag() -> Result<()> {
        let mut args = default_scan_options();
        args.hidden = true;
        assert_eq!(config_from_scan_options(&args).hidden, Some(true));

        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_no_ignore_flag() -> Result<()> {
        let mut args = default_scan_options();
        args.no_ignore = true;

        // no_ignore should imply all other no_ignore_* flags
        let cfg = config_from_scan_options(&args);
        assert_eq!(cfg.no_ignore, Some(true));
        assert_eq!(cfg.no_ignore_dot, Some(true));
        assert_eq!(cfg.no_ignore_parent, Some(true));
        assert_eq!(cfg.no_ignore_vcs, Some(true));

        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_individual_no_ignore_flags() -> Result<()> {
        let mut args = hermetic_scan_options();
        args.no_ignore_parent = true;
        args.no_ignore_dot = true;
        args.no_ignore_vcs = true;

        let cfg = config_from_scan_options(&args);
        assert_eq!(cfg.no_ignore_parent, Some(true));
        assert_eq!(cfg.no_ignore_dot, Some(true));
        assert_eq!(cfg.no_ignore_vcs, Some(true));
        // The specific flags must not switch on the umbrella flag.
        assert_eq!(cfg.no_ignore, None);

        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_treat_doc_strings_as_comments() -> Result<()> {
        let mut args = default_scan_options();
        args.treat_doc_strings_as_comments = true;
        assert_eq!(
            config_from_scan_options(&args).treat_doc_strings_as_comments,
            Some(true)
        );

        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_config_mode_none() -> Result<()> {
        let mut args = default_scan_options();
        args.config = ConfigMode::None;
        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_excluded_patterns() -> Result<()> {
        let mut args = default_scan_options();
        args.excluded = vec!["target".to_string(), "*.min.js".to_string()];
        assert_eq!(ignored_patterns(&args), vec!["target", "*.min.js"]);

        let paths = vec![test_path()];
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_with_all_flags_combined() -> Result<()> {
        let args = ScanOptions {
            excluded: vec!["node_modules".to_string()],
            config: ConfigMode::None,
            hidden: true,
            no_ignore: true,
            no_ignore_parent: true,
            no_ignore_dot: true,
            no_ignore_vcs: true,
            treat_doc_strings_as_comments: true,
        };

        let cfg = config_from_scan_options(&args);
        assert_eq!(cfg.hidden, Some(true));
        assert_eq!(cfg.no_ignore, Some(true));
        assert_eq!(cfg.no_ignore_dot, Some(true));
        assert_eq!(cfg.no_ignore_parent, Some(true));
        assert_eq!(cfg.no_ignore_vcs, Some(true));
        assert_eq!(cfg.treat_doc_strings_as_comments, Some(true));

        let paths = vec![test_path()];
        // Should handle all flags without panicking
        scan(&paths, &args)?;
        Ok(())
    }

    #[test]
    fn scan_returns_code_stats() -> Result<()> {
        let args = default_scan_options();
        let paths = vec![test_path()];
        let result = scan(&paths, &args)?;

        let rust = result
            .get(&tokei::LanguageType::Rust)
            .expect("should find rust in src/lib.rs");
        // The lib.rs file should have some code
        assert!(rust.code > 0);
        assert!(rust.lines() > 0);
        Ok(())
    }

    // ========================
    // Logical Path Tests
    // ========================

    #[test]
    fn normalize_logical_path_strips_dot_segments() -> Result<()> {
        let normalized = normalize_logical_path(Path::new("./src/./lib.rs"))?;
        assert_eq!(normalized, PathBuf::from("src/lib.rs"));
        Ok(())
    }

    #[test]
    fn normalize_logical_path_rejects_empty_path() {
        let err = normalize_logical_path(Path::new("")).unwrap_err();
        assert!(err.to_string().contains("must not be empty"));
    }

    #[test]
    fn normalize_logical_path_rejects_dot_only_path() {
        let err = normalize_logical_path(Path::new(".")).unwrap_err();
        assert!(err.to_string().contains("must resolve to a file"));
    }

    #[test]
    fn normalize_logical_path_rejects_absolute_paths() {
        let err = normalize_logical_path(Path::new("/src/lib.rs")).unwrap_err();
        assert!(err.to_string().contains("must be relative"));
    }

    #[test]
    fn normalize_logical_path_rejects_parent_traversal() {
        let err = normalize_logical_path(Path::new("../src/lib.rs")).unwrap_err();
        assert!(err.to_string().contains("parent traversal"));
    }

    #[test]
    fn normalize_logical_paths_rejects_duplicate_after_normalization() {
        let inputs = vec![
            InMemoryFile::new("./src/lib.rs", "fn main() {}\n"),
            InMemoryFile::new("src/lib.rs", "fn main() {}\n"),
        ];

        let err = normalize_logical_paths(&inputs, false).unwrap_err();
        assert!(err.to_string().contains("Duplicate in-memory path"));
    }

    #[test]
    fn normalize_logical_paths_rejects_case_only_collision_on_case_insensitive_fs() {
        let inputs = vec![
            InMemoryFile::new("src/lib.rs", "fn main() {}\n"),
            InMemoryFile::new("SRC/LIB.rs", "fn main() {}\n"),
        ];

        let err = normalize_logical_paths(&inputs, true).unwrap_err();
        assert!(err.to_string().contains("Duplicate in-memory path"));
    }

    #[test]
    fn normalize_logical_paths_allows_case_variants_when_case_sensitive() -> Result<()> {
        let inputs = vec![
            InMemoryFile::new("src/lib.rs", "fn main() {}\n"),
            InMemoryFile::new("SRC/LIB.rs", "fn main() {}\n"),
        ];

        let normalized = normalize_logical_paths(&inputs, false)?;
        assert_eq!(
            normalized,
            vec![PathBuf::from("src/lib.rs"), PathBuf::from("SRC/LIB.rs")]
        );
        Ok(())
    }

    #[test]
    fn normalize_in_memory_paths_preserves_input_order() -> Result<()> {
        let inputs = vec![
            InMemoryFile::new("z/last.rs", ""),
            InMemoryFile::new("./a/first.rs", ""),
        ];

        let normalized = normalize_in_memory_paths(&inputs)?;
        assert_eq!(
            normalized,
            vec![PathBuf::from("z/last.rs"), PathBuf::from("a/first.rs")]
        );
        Ok(())
    }
}
449
450pub mod exclude;
451pub mod math;
452pub mod path;
453pub mod tokeignore;
454pub mod walk;
455
456pub use exclude::{add_exclude_pattern, has_exclude_pattern, normalize_exclude_pattern};
457pub use math::{gini_coefficient, percentile, round_f64, safe_ratio};
458pub use path::{normalize_rel_path, normalize_slashes};
459pub use tokeignore::{InitArgs, InitProfile, init_tokeignore};