Skip to main content

tokmd_scan/
in_memory.rs

//! In-memory scan input materialization.
//!
//! This module owns the browser/native contract for logical in-memory file
//! paths before they are materialized into a temporary tokei scan root.
5
6use anyhow::Result;
7use std::collections::BTreeSet;
8use std::fs;
9use std::path::{Component, Path, PathBuf};
10use tokei::Languages;
11
12use tokmd_settings::ScanOptions;
13
/// One logical file whose contents are supplied from memory instead of being
/// read from the host filesystem.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InMemoryFile {
    /// Logical path of the file, exactly as the caller supplied it.
    pub path: PathBuf,
    /// Raw contents of the file.
    pub bytes: Vec<u8>,
}

impl InMemoryFile {
    /// Builds an [`InMemoryFile`] from anything convertible into a path and a
    /// byte buffer.
    ///
    /// The constructor only converts its arguments; it performs no validation
    /// of `path`.
    #[must_use]
    pub fn new(path: impl Into<PathBuf>, bytes: impl Into<Vec<u8>>) -> Self {
        let path = path.into();
        let bytes = bytes.into();
        Self { path, bytes }
    }
}
30
31/// A scan result that keeps its backing temp root alive for downstream row modeling.
32///
33/// Keep this wrapper alive while any downstream code needs to read file metadata from
34/// the scanned paths. `tokmd-model` uses the underlying paths to compute byte and token
35/// counts after the scan phase.
36///
37/// When converting these scan results into `FileRow`s, pass [`Self::strip_prefix`] as the
38/// `strip_prefix` argument so receipts keep the logical in-memory paths rather than the
39/// temp backing root.
40#[derive(Debug)]
41pub struct MaterializedScan {
42    languages: Languages,
43    logical_paths: Vec<PathBuf>,
44    root: tempfile::TempDir,
45}
46
47impl MaterializedScan {
48    #[must_use]
49    pub fn languages(&self) -> &Languages {
50        &self.languages
51    }
52
53    #[must_use]
54    pub fn logical_paths(&self) -> &[PathBuf] {
55        &self.logical_paths
56    }
57
58    #[must_use]
59    pub fn strip_prefix(&self) -> &Path {
60        self.root.path()
61    }
62}
63
64/// Normalize ordered in-memory inputs into deterministic logical paths.
65///
66/// This rejects empty, absolute, escaping, and case-only-colliding paths so
67/// native and browser callers see the same contract.
68pub fn normalize_in_memory_paths(inputs: &[InMemoryFile]) -> Result<Vec<PathBuf>> {
69    normalize_logical_paths(inputs, true)
70}
71
72pub fn scan_in_memory(inputs: &[InMemoryFile], args: &ScanOptions) -> Result<MaterializedScan> {
73    let root = tempfile::tempdir()?;
74    let logical_paths = normalize_in_memory_paths(inputs)?;
75
76    for (logical_path, input) in logical_paths.iter().zip(inputs) {
77        let full_path = root.path().join(logical_path);
78        if let Some(parent) = full_path.parent() {
79            fs::create_dir_all(parent)?;
80        }
81        fs::write(full_path, &input.bytes)?;
82    }
83
84    let scan_root = vec![root.path().to_path_buf()];
85    let languages = crate::scan(&scan_root, args)?;
86
87    Ok(MaterializedScan {
88        languages,
89        logical_paths,
90        root,
91    })
92}
93
94fn normalize_logical_paths(
95    inputs: &[InMemoryFile],
96    case_insensitive: bool,
97) -> Result<Vec<PathBuf>> {
98    let mut seen = BTreeSet::new();
99    let mut normalized = Vec::with_capacity(inputs.len());
100
101    for input in inputs {
102        let logical_path = normalize_logical_path(&input.path)?;
103        if !seen.insert(logical_path_key(&logical_path, case_insensitive)) {
104            anyhow::bail!("Duplicate in-memory path: {}", logical_path.display());
105        }
106        normalized.push(logical_path);
107    }
108
109    Ok(normalized)
110}
111
/// Renders `path` as the string key used for duplicate detection.
///
/// The path is converted lossily to text; when `case_insensitive` is set the
/// text is lowercased so paths differing only by case share a key.
fn logical_path_key(path: &Path, case_insensitive: bool) -> String {
    match path.to_string_lossy() {
        text if case_insensitive => text.to_lowercase(),
        text => text.into_owned(),
    }
}
120
121fn normalize_logical_path(path: &Path) -> Result<PathBuf> {
122    if path.as_os_str().is_empty() {
123        anyhow::bail!("In-memory path must not be empty");
124    }
125
126    let mut normalized = PathBuf::new();
127    for component in path.components() {
128        match component {
129            Component::Normal(segment) => normalized.push(segment),
130            Component::CurDir => {}
131            Component::ParentDir => {
132                anyhow::bail!(
133                    "In-memory path must not contain parent traversal: {}",
134                    path.display()
135                );
136            }
137            Component::RootDir | Component::Prefix(_) => {
138                anyhow::bail!("In-memory path must be relative: {}", path.display());
139            }
140        }
141    }
142
143    if normalized.as_os_str().is_empty() {
144        anyhow::bail!("In-memory path must resolve to a file: {}", path.display());
145    }
146
147    Ok(normalized)
148}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds two inputs with identical contents at the given logical paths.
    fn two_inputs(first: &str, second: &str) -> Vec<InMemoryFile> {
        [first, second]
            .into_iter()
            .map(|path| InMemoryFile::new(path, "fn main() {}\n"))
            .collect()
    }

    #[test]
    fn normalize_logical_path_strips_dot_segments() -> Result<()> {
        let expected = PathBuf::from("src/lib.rs");
        let actual = normalize_logical_path(Path::new("./src/./lib.rs"))?;
        assert_eq!(actual, expected);
        Ok(())
    }

    #[test]
    fn normalize_logical_path_rejects_absolute_paths() {
        let message = normalize_logical_path(Path::new("/src/lib.rs"))
            .expect_err("absolute path should be rejected")
            .to_string();
        assert!(message.contains("must be relative"));
    }

    #[test]
    fn normalize_logical_path_rejects_parent_traversal() {
        let message = normalize_logical_path(Path::new("../src/lib.rs"))
            .expect_err("parent traversal should be rejected")
            .to_string();
        assert!(message.contains("parent traversal"));
    }

    #[test]
    fn normalize_logical_paths_rejects_duplicate_after_normalization() {
        // Both inputs normalize to `src/lib.rs`, so even a case-sensitive
        // comparison must flag the second one.
        let inputs = two_inputs("./src/lib.rs", "src/lib.rs");

        let message = normalize_logical_paths(&inputs, false)
            .expect_err("normalized duplicate should be rejected")
            .to_string();
        assert!(message.contains("Duplicate in-memory path"));
    }

    #[test]
    fn normalize_logical_paths_rejects_case_only_collision_on_case_insensitive_fs() {
        let inputs = two_inputs("src/lib.rs", "SRC/LIB.rs");

        let message = normalize_logical_paths(&inputs, true)
            .expect_err("case-only collision should be rejected")
            .to_string();
        assert!(message.contains("Duplicate in-memory path"));
    }
}