1use std::collections::{BTreeMap, BTreeSet};
2use std::fs;
3use std::path::{Path, PathBuf};
4
5use anyhow::{Context, bail};
6use ignore::{DirEntry, WalkBuilder};
7use roder_api::code_index::{
8 CodeIndexNodeKind, ContentHash, MerkleHash, PathHash, WorkspaceMerkleNode, WorkspaceMerkleTree,
9 WorkspaceSimilarityHash,
10};
11
12use crate::hex_sha256;
13
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct MerkleBuildOptions {
16 pub scopes: Vec<PathBuf>,
17}
18
19impl Default for MerkleBuildOptions {
20 fn default() -> Self {
21 Self {
22 scopes: vec![PathBuf::from(".")],
23 }
24 }
25}
26
27#[derive(Debug, Clone, PartialEq, Eq)]
28pub struct FileManifestEntry {
29 pub path: PathBuf,
30 pub path_hash: PathHash,
31 pub content_hash: ContentHash,
32 pub size: u64,
33}
34
35#[derive(Debug, Clone, PartialEq, Eq)]
36pub struct MerkleBuild {
37 pub tree: WorkspaceMerkleTree,
38 pub files: Vec<FileManifestEntry>,
39}
40
41#[derive(Debug, Clone, PartialEq, Eq)]
42pub struct MerkleDiff {
43 pub changed_files: Vec<PathBuf>,
44 pub deleted_files: Vec<PathBuf>,
45 pub unchanged_files: Vec<PathBuf>,
46}
47
48pub fn build_workspace_merkle(root: impl AsRef<Path>) -> anyhow::Result<MerkleBuild> {
49 build_workspace_merkle_with_options(root, &MerkleBuildOptions::default())
50}
51
52pub fn build_workspace_merkle_with_options(
53 root: impl AsRef<Path>,
54 options: &MerkleBuildOptions,
55) -> anyhow::Result<MerkleBuild> {
56 let workspace_root = fs::canonicalize(root.as_ref())
57 .with_context(|| format!("canonicalize workspace {}", root.as_ref().display()))?;
58 if !workspace_root.is_dir() {
59 bail!(
60 "workspace root is not a directory: {}",
61 workspace_root.display()
62 );
63 }
64
65 let scopes = canonical_scopes(&workspace_root, &options.scopes)?;
66 let mut files = Vec::new();
67 let mut directory_children: BTreeMap<PathBuf, BTreeSet<MerkleHash>> = BTreeMap::new();
68
69 let mut walk = WalkBuilder::new(&workspace_root);
70 walk.standard_filters(true).hidden(false);
71 for result in walk.build() {
72 let entry = result?;
73 if should_skip(&workspace_root, &entry) {
74 continue;
75 }
76 let path = entry.path();
77 if path == workspace_root {
78 continue;
79 }
80 if !in_scope(path, &scopes) {
81 continue;
82 }
83 if !path.is_file() {
84 continue;
85 }
86
87 let rel = path
88 .strip_prefix(&workspace_root)
89 .with_context(|| format!("strip workspace root from {}", path.display()))?
90 .to_path_buf();
91 let bytes = fs::read(path).with_context(|| format!("read {}", path.display()))?;
92 let content_hash = hex_sha256(&bytes);
93 let path_hash = hash_path(&rel);
94 let file_hash = hash_node("file", &rel, &[content_hash.as_str()]);
95 files.push(FileManifestEntry {
96 path: rel.clone(),
97 path_hash: path_hash.clone(),
98 content_hash,
99 size: bytes.len() as u64,
100 });
101 for ancestor in rel.ancestors().skip(1) {
102 directory_children
103 .entry(ancestor.to_path_buf())
104 .or_default()
105 .insert(file_hash.clone());
106 }
107 }
108
109 files.sort_by(|a, b| a.path.cmp(&b.path));
110
111 let mut nodes = Vec::new();
112 for file in &files {
113 nodes.push(WorkspaceMerkleNode {
114 path: file.path.clone(),
115 path_hash: file.path_hash.clone(),
116 content_hash: hash_node("file", &file.path, &[file.content_hash.as_str()]),
117 kind: CodeIndexNodeKind::File,
118 children: Vec::new(),
119 });
120 }
121
122 let mut directory_hashes = BTreeMap::<PathBuf, MerkleHash>::new();
123 let mut directories = directory_children.keys().cloned().collect::<Vec<_>>();
124 directories.sort_by(|a, b| {
125 b.components()
126 .count()
127 .cmp(&a.components().count())
128 .then(a.cmp(b))
129 });
130 for directory in directories {
131 let mut children = directory_children
132 .get(&directory)
133 .cloned()
134 .unwrap_or_default()
135 .into_iter()
136 .collect::<Vec<_>>();
137 for (child_dir, child_hash) in &directory_hashes {
138 if child_dir.parent() == Some(directory.as_path()) {
139 children.push(child_hash.clone());
140 }
141 }
142 children.sort();
143 let child_refs = children.iter().map(String::as_str).collect::<Vec<_>>();
144 let content_hash = hash_node("dir", &directory, &child_refs);
145 directory_hashes.insert(directory.clone(), content_hash.clone());
146 nodes.push(WorkspaceMerkleNode {
147 path: directory.clone(),
148 path_hash: hash_path(&directory),
149 content_hash,
150 kind: CodeIndexNodeKind::Directory,
151 children,
152 });
153 }
154
155 let root_hash = directory_hashes
156 .get(Path::new(""))
157 .cloned()
158 .unwrap_or_else(|| hash_node("dir", Path::new(""), &[]));
159 nodes.sort_by(|a, b| {
160 a.path
161 .cmp(&b.path)
162 .then_with(|| node_kind_rank(&a.kind).cmp(&node_kind_rank(&b.kind)))
163 });
164
165 Ok(MerkleBuild {
166 tree: WorkspaceMerkleTree {
167 workspace_root,
168 root_hash: root_hash.clone(),
169 similarity_hash: WorkspaceSimilarityHash {
170 algorithm: "roder-simhash-v1".to_string(),
171 value: similarity_hash(&files),
172 },
173 nodes,
174 },
175 files,
176 })
177}
178
179pub fn diff_file_manifests(
180 previous: &[FileManifestEntry],
181 current: &[FileManifestEntry],
182) -> MerkleDiff {
183 let previous_by_path = previous
184 .iter()
185 .map(|entry| (entry.path.clone(), entry.content_hash.clone()))
186 .collect::<BTreeMap<_, _>>();
187 let current_by_path = current
188 .iter()
189 .map(|entry| (entry.path.clone(), entry.content_hash.clone()))
190 .collect::<BTreeMap<_, _>>();
191
192 let mut changed_files = Vec::new();
193 let mut deleted_files = Vec::new();
194 let mut unchanged_files = Vec::new();
195
196 for (path, content_hash) in ¤t_by_path {
197 match previous_by_path.get(path) {
198 Some(previous_hash) if previous_hash == content_hash => {
199 unchanged_files.push(path.clone())
200 }
201 _ => changed_files.push(path.clone()),
202 }
203 }
204 for path in previous_by_path.keys() {
205 if !current_by_path.contains_key(path) {
206 deleted_files.push(path.clone());
207 }
208 }
209
210 MerkleDiff {
211 changed_files,
212 deleted_files,
213 unchanged_files,
214 }
215}
216
217pub fn hash_path(path: &Path) -> PathHash {
218 hex_sha256(normalize_path(path))
219}
220
221fn canonical_scopes(workspace_root: &Path, scopes: &[PathBuf]) -> anyhow::Result<Vec<PathBuf>> {
222 if scopes.is_empty() {
223 return Ok(vec![workspace_root.to_path_buf()]);
224 }
225 let mut out = Vec::new();
226 for scope in scopes {
227 let joined = if scope.is_absolute() {
228 scope.clone()
229 } else {
230 workspace_root.join(scope)
231 };
232 let canonical = fs::canonicalize(&joined)
233 .with_context(|| format!("canonicalize index scope {}", joined.display()))?;
234 if !canonical.starts_with(workspace_root) {
235 bail!("index scope escapes workspace: {}", canonical.display());
236 }
237 out.push(canonical);
238 }
239 Ok(out)
240}
241
242fn in_scope(path: &Path, scopes: &[PathBuf]) -> bool {
243 scopes.iter().any(|scope| path.starts_with(scope))
244}
245
246fn should_skip(root: &Path, entry: &DirEntry) -> bool {
247 let path = entry.path();
248 let Ok(rel) = path.strip_prefix(root) else {
249 return true;
250 };
251 rel.components().any(|component| {
252 let value = component.as_os_str().to_string_lossy();
253 matches!(
254 value.as_ref(),
255 ".git" | ".roder" | "target" | "node_modules"
256 )
257 })
258}
259
260fn hash_node(kind: &str, path: &Path, parts: &[&str]) -> MerkleHash {
261 let mut material = String::new();
262 material.push_str(kind);
263 material.push('\0');
264 material.push_str(&normalize_path(path));
265 for part in parts {
266 material.push('\0');
267 material.push_str(part);
268 }
269 hex_sha256(material)
270}
271
272fn normalize_path(path: &Path) -> String {
273 path.components()
274 .map(|component| component.as_os_str().to_string_lossy())
275 .collect::<Vec<_>>()
276 .join("/")
277}
278
279fn similarity_hash(files: &[FileManifestEntry]) -> String {
280 let mut material = String::new();
281 for file in files {
282 material.push_str(&normalize_path(&file.path));
283 material.push('\0');
284 material.push_str(&file.content_hash[..16.min(file.content_hash.len())]);
285 material.push('\n');
286 }
287 hex_sha256(material)[..16].to_string()
288}
289
290fn node_kind_rank(kind: &CodeIndexNodeKind) -> u8 {
291 match kind {
292 CodeIndexNodeKind::Directory => 0,
293 CodeIndexNodeKind::File => 1,
294 }
295}
296
297#[cfg(test)]
298mod tests {
299 use super::*;
300
301 #[test]
302 fn merkle_respects_ignore_rules_and_path_scope() {
303 let root = tempdir("merkle_respects_ignore_rules_and_path_scope");
304 write(root.join("src/lib.rs"), "pub fn allowed() {}\n");
305 write(root.join("src/private.rs"), "pub fn private() {}\n");
306 write(root.join(".roder/state.json"), "{}\n");
307 write(root.join("target/debug/output"), "ignored\n");
308 write(root.join(".git/HEAD"), "ignored\n");
309
310 let build = build_workspace_merkle_with_options(
311 &root,
312 &MerkleBuildOptions {
313 scopes: vec![PathBuf::from("src/lib.rs")],
314 },
315 )
316 .unwrap();
317
318 let paths = build
319 .files
320 .iter()
321 .map(|file| file.path.clone())
322 .collect::<Vec<_>>();
323 assert_eq!(paths, vec![PathBuf::from("src/lib.rs")]);
324 assert!(
325 build
326 .tree
327 .nodes
328 .iter()
329 .any(|node| node.path == PathBuf::from("src/lib.rs"))
330 );
331 }
332
333 #[test]
334 fn merkle_detects_changed_and_deleted_files() {
335 let root = tempdir("merkle_detects_changed_and_deleted_files");
336 write(root.join("src/a.rs"), "pub fn a() {}\n");
337 write(root.join("src/b.rs"), "pub fn b() {}\n");
338 let first = build_workspace_merkle(&root).unwrap();
339
340 write(root.join("src/a.rs"), "pub fn a_changed() {}\n");
341 fs::remove_file(root.join("src/b.rs")).unwrap();
342 let second = build_workspace_merkle(&root).unwrap();
343 let diff = diff_file_manifests(&first.files, &second.files);
344
345 assert_eq!(diff.changed_files, vec![PathBuf::from("src/a.rs")]);
346 assert_eq!(diff.deleted_files, vec![PathBuf::from("src/b.rs")]);
347 assert_ne!(first.tree.root_hash, second.tree.root_hash);
348 }
349
350 fn write(path: PathBuf, contents: &str) {
351 fs::create_dir_all(path.parent().unwrap()).unwrap();
352 fs::write(path, contents).unwrap();
353 }
354
355 fn tempdir(name: &str) -> PathBuf {
356 let path = std::env::temp_dir().join(format!(
357 "roder-code-index-{name}-{}-{}",
358 std::process::id(),
359 time::OffsetDateTime::now_utc().unix_timestamp_nanos()
360 ));
361 fs::create_dir_all(&path).unwrap();
362 path
363 }
364}