Skip to main content

roder_code_index/
merkle.rs

1use std::collections::{BTreeMap, BTreeSet};
2use std::fs;
3use std::path::{Path, PathBuf};
4
5use anyhow::{Context, bail};
6use ignore::{DirEntry, WalkBuilder};
7use roder_api::code_index::{
8    CodeIndexNodeKind, ContentHash, MerkleHash, PathHash, WorkspaceMerkleNode, WorkspaceMerkleTree,
9    WorkspaceSimilarityHash,
10};
11
12use crate::hex_sha256;
13
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct MerkleBuildOptions {
16    pub scopes: Vec<PathBuf>,
17}
18
19impl Default for MerkleBuildOptions {
20    fn default() -> Self {
21        Self {
22            scopes: vec![PathBuf::from(".")],
23        }
24    }
25}
26
27#[derive(Debug, Clone, PartialEq, Eq)]
28pub struct FileManifestEntry {
29    pub path: PathBuf,
30    pub path_hash: PathHash,
31    pub content_hash: ContentHash,
32    pub size: u64,
33}
34
35#[derive(Debug, Clone, PartialEq, Eq)]
36pub struct MerkleBuild {
37    pub tree: WorkspaceMerkleTree,
38    pub files: Vec<FileManifestEntry>,
39}
40
41#[derive(Debug, Clone, PartialEq, Eq)]
42pub struct MerkleDiff {
43    pub changed_files: Vec<PathBuf>,
44    pub deleted_files: Vec<PathBuf>,
45    pub unchanged_files: Vec<PathBuf>,
46}
47
48pub fn build_workspace_merkle(root: impl AsRef<Path>) -> anyhow::Result<MerkleBuild> {
49    build_workspace_merkle_with_options(root, &MerkleBuildOptions::default())
50}
51
52pub fn build_workspace_merkle_with_options(
53    root: impl AsRef<Path>,
54    options: &MerkleBuildOptions,
55) -> anyhow::Result<MerkleBuild> {
56    let workspace_root = fs::canonicalize(root.as_ref())
57        .with_context(|| format!("canonicalize workspace {}", root.as_ref().display()))?;
58    if !workspace_root.is_dir() {
59        bail!(
60            "workspace root is not a directory: {}",
61            workspace_root.display()
62        );
63    }
64
65    let scopes = canonical_scopes(&workspace_root, &options.scopes)?;
66    let mut files = Vec::new();
67    let mut directory_children: BTreeMap<PathBuf, BTreeSet<MerkleHash>> = BTreeMap::new();
68
69    let mut walk = WalkBuilder::new(&workspace_root);
70    walk.standard_filters(true).hidden(false);
71    for result in walk.build() {
72        let entry = result?;
73        if should_skip(&workspace_root, &entry) {
74            continue;
75        }
76        let path = entry.path();
77        if path == workspace_root {
78            continue;
79        }
80        if !in_scope(path, &scopes) {
81            continue;
82        }
83        if !path.is_file() {
84            continue;
85        }
86
87        let rel = path
88            .strip_prefix(&workspace_root)
89            .with_context(|| format!("strip workspace root from {}", path.display()))?
90            .to_path_buf();
91        let bytes = fs::read(path).with_context(|| format!("read {}", path.display()))?;
92        let content_hash = hex_sha256(&bytes);
93        let path_hash = hash_path(&rel);
94        let file_hash = hash_node("file", &rel, &[content_hash.as_str()]);
95        files.push(FileManifestEntry {
96            path: rel.clone(),
97            path_hash: path_hash.clone(),
98            content_hash,
99            size: bytes.len() as u64,
100        });
101        for ancestor in rel.ancestors().skip(1) {
102            directory_children
103                .entry(ancestor.to_path_buf())
104                .or_default()
105                .insert(file_hash.clone());
106        }
107    }
108
109    files.sort_by(|a, b| a.path.cmp(&b.path));
110
111    let mut nodes = Vec::new();
112    for file in &files {
113        nodes.push(WorkspaceMerkleNode {
114            path: file.path.clone(),
115            path_hash: file.path_hash.clone(),
116            content_hash: hash_node("file", &file.path, &[file.content_hash.as_str()]),
117            kind: CodeIndexNodeKind::File,
118            children: Vec::new(),
119        });
120    }
121
122    let mut directory_hashes = BTreeMap::<PathBuf, MerkleHash>::new();
123    let mut directories = directory_children.keys().cloned().collect::<Vec<_>>();
124    directories.sort_by(|a, b| {
125        b.components()
126            .count()
127            .cmp(&a.components().count())
128            .then(a.cmp(b))
129    });
130    for directory in directories {
131        let mut children = directory_children
132            .get(&directory)
133            .cloned()
134            .unwrap_or_default()
135            .into_iter()
136            .collect::<Vec<_>>();
137        for (child_dir, child_hash) in &directory_hashes {
138            if child_dir.parent() == Some(directory.as_path()) {
139                children.push(child_hash.clone());
140            }
141        }
142        children.sort();
143        let child_refs = children.iter().map(String::as_str).collect::<Vec<_>>();
144        let content_hash = hash_node("dir", &directory, &child_refs);
145        directory_hashes.insert(directory.clone(), content_hash.clone());
146        nodes.push(WorkspaceMerkleNode {
147            path: directory.clone(),
148            path_hash: hash_path(&directory),
149            content_hash,
150            kind: CodeIndexNodeKind::Directory,
151            children,
152        });
153    }
154
155    let root_hash = directory_hashes
156        .get(Path::new(""))
157        .cloned()
158        .unwrap_or_else(|| hash_node("dir", Path::new(""), &[]));
159    nodes.sort_by(|a, b| {
160        a.path
161            .cmp(&b.path)
162            .then_with(|| node_kind_rank(&a.kind).cmp(&node_kind_rank(&b.kind)))
163    });
164
165    Ok(MerkleBuild {
166        tree: WorkspaceMerkleTree {
167            workspace_root,
168            root_hash: root_hash.clone(),
169            similarity_hash: WorkspaceSimilarityHash {
170                algorithm: "roder-simhash-v1".to_string(),
171                value: similarity_hash(&files),
172            },
173            nodes,
174        },
175        files,
176    })
177}
178
179pub fn diff_file_manifests(
180    previous: &[FileManifestEntry],
181    current: &[FileManifestEntry],
182) -> MerkleDiff {
183    let previous_by_path = previous
184        .iter()
185        .map(|entry| (entry.path.clone(), entry.content_hash.clone()))
186        .collect::<BTreeMap<_, _>>();
187    let current_by_path = current
188        .iter()
189        .map(|entry| (entry.path.clone(), entry.content_hash.clone()))
190        .collect::<BTreeMap<_, _>>();
191
192    let mut changed_files = Vec::new();
193    let mut deleted_files = Vec::new();
194    let mut unchanged_files = Vec::new();
195
196    for (path, content_hash) in &current_by_path {
197        match previous_by_path.get(path) {
198            Some(previous_hash) if previous_hash == content_hash => {
199                unchanged_files.push(path.clone())
200            }
201            _ => changed_files.push(path.clone()),
202        }
203    }
204    for path in previous_by_path.keys() {
205        if !current_by_path.contains_key(path) {
206            deleted_files.push(path.clone());
207        }
208    }
209
210    MerkleDiff {
211        changed_files,
212        deleted_files,
213        unchanged_files,
214    }
215}
216
217pub fn hash_path(path: &Path) -> PathHash {
218    hex_sha256(normalize_path(path))
219}
220
221fn canonical_scopes(workspace_root: &Path, scopes: &[PathBuf]) -> anyhow::Result<Vec<PathBuf>> {
222    if scopes.is_empty() {
223        return Ok(vec![workspace_root.to_path_buf()]);
224    }
225    let mut out = Vec::new();
226    for scope in scopes {
227        let joined = if scope.is_absolute() {
228            scope.clone()
229        } else {
230            workspace_root.join(scope)
231        };
232        let canonical = fs::canonicalize(&joined)
233            .with_context(|| format!("canonicalize index scope {}", joined.display()))?;
234        if !canonical.starts_with(workspace_root) {
235            bail!("index scope escapes workspace: {}", canonical.display());
236        }
237        out.push(canonical);
238    }
239    Ok(out)
240}
241
242fn in_scope(path: &Path, scopes: &[PathBuf]) -> bool {
243    scopes.iter().any(|scope| path.starts_with(scope))
244}
245
246fn should_skip(root: &Path, entry: &DirEntry) -> bool {
247    let path = entry.path();
248    let Ok(rel) = path.strip_prefix(root) else {
249        return true;
250    };
251    rel.components().any(|component| {
252        let value = component.as_os_str().to_string_lossy();
253        matches!(
254            value.as_ref(),
255            ".git" | ".roder" | "target" | "node_modules"
256        )
257    })
258}
259
260fn hash_node(kind: &str, path: &Path, parts: &[&str]) -> MerkleHash {
261    let mut material = String::new();
262    material.push_str(kind);
263    material.push('\0');
264    material.push_str(&normalize_path(path));
265    for part in parts {
266        material.push('\0');
267        material.push_str(part);
268    }
269    hex_sha256(material)
270}
271
272fn normalize_path(path: &Path) -> String {
273    path.components()
274        .map(|component| component.as_os_str().to_string_lossy())
275        .collect::<Vec<_>>()
276        .join("/")
277}
278
279fn similarity_hash(files: &[FileManifestEntry]) -> String {
280    let mut material = String::new();
281    for file in files {
282        material.push_str(&normalize_path(&file.path));
283        material.push('\0');
284        material.push_str(&file.content_hash[..16.min(file.content_hash.len())]);
285        material.push('\n');
286    }
287    hex_sha256(material)[..16].to_string()
288}
289
290fn node_kind_rank(kind: &CodeIndexNodeKind) -> u8 {
291    match kind {
292        CodeIndexNodeKind::Directory => 0,
293        CodeIndexNodeKind::File => 1,
294    }
295}
296
297#[cfg(test)]
298mod tests {
299    use super::*;
300
301    #[test]
302    fn merkle_respects_ignore_rules_and_path_scope() {
303        let root = tempdir("merkle_respects_ignore_rules_and_path_scope");
304        write(root.join("src/lib.rs"), "pub fn allowed() {}\n");
305        write(root.join("src/private.rs"), "pub fn private() {}\n");
306        write(root.join(".roder/state.json"), "{}\n");
307        write(root.join("target/debug/output"), "ignored\n");
308        write(root.join(".git/HEAD"), "ignored\n");
309
310        let build = build_workspace_merkle_with_options(
311            &root,
312            &MerkleBuildOptions {
313                scopes: vec![PathBuf::from("src/lib.rs")],
314            },
315        )
316        .unwrap();
317
318        let paths = build
319            .files
320            .iter()
321            .map(|file| file.path.clone())
322            .collect::<Vec<_>>();
323        assert_eq!(paths, vec![PathBuf::from("src/lib.rs")]);
324        assert!(
325            build
326                .tree
327                .nodes
328                .iter()
329                .any(|node| node.path == PathBuf::from("src/lib.rs"))
330        );
331    }
332
333    #[test]
334    fn merkle_detects_changed_and_deleted_files() {
335        let root = tempdir("merkle_detects_changed_and_deleted_files");
336        write(root.join("src/a.rs"), "pub fn a() {}\n");
337        write(root.join("src/b.rs"), "pub fn b() {}\n");
338        let first = build_workspace_merkle(&root).unwrap();
339
340        write(root.join("src/a.rs"), "pub fn a_changed() {}\n");
341        fs::remove_file(root.join("src/b.rs")).unwrap();
342        let second = build_workspace_merkle(&root).unwrap();
343        let diff = diff_file_manifests(&first.files, &second.files);
344
345        assert_eq!(diff.changed_files, vec![PathBuf::from("src/a.rs")]);
346        assert_eq!(diff.deleted_files, vec![PathBuf::from("src/b.rs")]);
347        assert_ne!(first.tree.root_hash, second.tree.root_hash);
348    }
349
350    fn write(path: PathBuf, contents: &str) {
351        fs::create_dir_all(path.parent().unwrap()).unwrap();
352        fs::write(path, contents).unwrap();
353    }
354
355    fn tempdir(name: &str) -> PathBuf {
356        let path = std::env::temp_dir().join(format!(
357            "roder-code-index-{name}-{}-{}",
358            std::process::id(),
359            time::OffsetDateTime::now_utc().unix_timestamp_nanos()
360        ));
361        fs::create_dir_all(&path).unwrap();
362        path
363    }
364}