Skip to main content

tokmd_sensor/
substrate_builder.rs

1//! Substrate builder: runs a tokei scan once and builds a `RepoSubstrate`.
2
3use std::collections::BTreeMap;
4use std::path::PathBuf;
5
6use anyhow::Result;
7use tokmd_settings::ScanOptions;
8use tokmd_substrate::{DiffRange, LangSummary, RepoSubstrate, SubstrateFile};
9use tokmd_types::ChildIncludeMode;
10
11/// Build a `RepoSubstrate` from a scan of the given repo root.
12///
13/// This function runs tokei once, aggregates the results, and optionally
14/// marks files that appear in the given diff range.
15pub fn build_substrate(
16    repo_root: &str,
17    scan_options: &ScanOptions,
18    module_roots: &[String],
19    module_depth: usize,
20    diff_range: Option<DiffRange>,
21) -> Result<RepoSubstrate> {
22    let paths = vec![PathBuf::from(repo_root)];
23
24    // Run tokei scan
25    let languages = tokmd_scan::scan(&paths, scan_options)?;
26
27    // Build file rows using the model layer
28    let file_rows = tokmd_model::collect_file_rows(
29        &languages,
30        module_roots,
31        module_depth,
32        ChildIncludeMode::ParentsOnly,
33        Some(std::path::Path::new(repo_root)),
34    );
35
36    // Normalize changed_files through the same path normalization used for file rows,
37    // so both sides use identical path representation regardless of scan/git root differences.
38    let strip_prefix = std::path::Path::new(repo_root);
39    let normalized_changed: Vec<String> = diff_range
40        .as_ref()
41        .map(|dr| {
42            dr.changed_files
43                .iter()
44                .map(|s| tokmd_model::normalize_path(std::path::Path::new(s), Some(strip_prefix)))
45                .collect()
46        })
47        .unwrap_or_default();
48    let changed_set: std::collections::BTreeSet<&str> =
49        normalized_changed.iter().map(|s| s.as_str()).collect();
50
51    // Convert file rows to substrate files
52    let files: Vec<SubstrateFile> = file_rows
53        .iter()
54        .map(|row| SubstrateFile {
55            path: row.path.clone(),
56            lang: row.lang.clone(),
57            code: row.code,
58            lines: row.lines,
59            bytes: row.bytes,
60            tokens: row.tokens,
61            module: row.module.clone(),
62            in_diff: changed_set.contains(row.path.as_str()),
63        })
64        .collect();
65
66    // Aggregate per-language summary
67    let mut lang_summary: BTreeMap<String, LangSummary> = BTreeMap::new();
68    for f in &files {
69        let entry = lang_summary.entry(f.lang.clone()).or_insert(LangSummary {
70            files: 0,
71            code: 0,
72            lines: 0,
73            bytes: 0,
74            tokens: 0,
75        });
76        entry.files += 1;
77        entry.code += f.code;
78        entry.lines += f.lines;
79        entry.bytes += f.bytes;
80        entry.tokens += f.tokens;
81    }
82
83    // Compute totals
84    let total_tokens: usize = files.iter().map(|f| f.tokens).sum();
85    let total_bytes: usize = files.iter().map(|f| f.bytes).sum();
86    let total_code_lines: usize = files.iter().map(|f| f.code).sum();
87
88    Ok(RepoSubstrate {
89        repo_root: repo_root.to_string(),
90        files,
91        lang_summary,
92        diff_range,
93        total_tokens,
94        total_bytes,
95        total_code_lines,
96    })
97}
98
#[cfg(test)]
mod tests {
    use super::*;
    use tokmd_settings::ScanOptions;

    /// Scanning this crate's own `src/` directory with no diff range yields
    /// Rust files, non-zero code lines, and no stored diff range.
    #[test]
    fn build_substrate_scans_self() {
        let src_dir = format!("{}/src", env!("CARGO_MANIFEST_DIR"));
        let options = ScanOptions::default();

        let substrate = build_substrate(&src_dir, &options, &[], 2, None).unwrap();

        assert!(!substrate.files.is_empty());
        assert!(substrate.lang_summary.contains_key("Rust"));
        assert!(substrate.total_code_lines > 0);
        assert!(substrate.diff_range.is_none());
    }

    /// With a diff range naming `src/lib.rs`, that file is flagged `in_diff`
    /// while an unlisted file stays unflagged.
    #[test]
    fn build_substrate_with_diff_range() {
        // The crate root (not `src/`) is the repo root here, so file rows carry
        // paths such as "src/lib.rs"; `changed_files` is given in the same
        // repo-relative form that `git diff --numstat` prints.
        let crate_root = env!("CARGO_MANIFEST_DIR");
        let range = DiffRange {
            base: "main".to_string(),
            head: "HEAD".to_string(),
            changed_files: vec!["src/lib.rs".to_string()],
            commit_count: 1,
            insertions: 5,
            deletions: 2,
        };
        let options = ScanOptions::default();

        let substrate = build_substrate(crate_root, &options, &[], 2, Some(range)).unwrap();

        assert!(substrate.diff_range.is_some());

        // At least one file is flagged, and the listed file is among them.
        assert!(substrate.files.iter().any(|f| f.in_diff));
        assert!(substrate
            .files
            .iter()
            .any(|f| f.in_diff && f.path.as_str() == "src/lib.rs"));

        // Selectivity: a file absent from `changed_files` must not be flagged.
        assert!(substrate
            .files
            .iter()
            .any(|f| !f.in_diff && f.path.contains("substrate_builder")));
    }

    /// Pointing the builder at a path that was never created is an error.
    #[test]
    fn build_substrate_errors_on_missing_root() {
        let dir = tempfile::tempdir().expect("temp dir");
        let missing = dir.path().join("definitely-not-created");
        let missing_root = missing.to_string_lossy();
        let options = ScanOptions::default();

        let outcome = build_substrate(&missing_root, &options, &[], 2, None);

        assert!(outcome.is_err());
    }
}