1use std::fs;
4use std::path::{Path, PathBuf};
5use std::time::SystemTime;
6
7use chrono::{DateTime, Utc};
8use ignore::WalkBuilder;
9use serde::Serialize;
10use sha2::{Digest, Sha256};
11
/// File extensions (without the leading dot) that mark a file as documentation.
///
/// Entries are lowercase; candidate extensions are compared case-insensitively.
const DOC_EXTENSIONS: &[&str] = &[
    "md",
    "txt",
    "yml",
    "yaml",
    "json",
    "toml",
    "rst",
    "adoc",
];
16
/// Well-known documentation file names.
///
/// A file is relevant when its full name or its stem equals one of these entries, ignoring
/// ASCII case (so `readme`, `README.md` and `.cursorrules` all qualify).
const SPECIAL_NAMES: &[&str] = &[
    // Conventional repository documents.
    "README", "CHANGELOG", "LICENSE", "CONTRIBUTING",
    // Instruction files read by AI coding assistants.
    "CLAUDE", "AGENTS",
    // Project policy documents.
    "SECURITY", "CODE_OF_CONDUCT",
    // Editor / agent rule files (matched on the whole file name).
    ".cursorrules", ".clinerules",
];
30
/// Top-level directories whose files are always treated as documentation, whatever their name
/// or extension. Entries are compared as prefixes of the project-relative path.
const DOC_DIRECTORIES: &[&str] = &["docs/", "doc/", "documentation/", ".claude/"];
38
39#[derive(Debug, Clone, Serialize)]
41pub struct DiscoveredFile {
42 pub path: PathBuf,
44 pub relative_path: String,
46 pub size: u64,
48 pub modified_at: Option<DateTime<Utc>>,
50 pub extension: Option<String>,
52 pub is_markdown: bool,
54 pub content_hash: String,
56}
57
/// Builder-style configuration for scanning one project tree for documentation files.
///
/// Derives `Debug` and `Clone` so a configured scanner can be logged and reused, matching
/// [`DiscoveredFile`].
#[derive(Debug, Clone)]
pub struct FileDiscovery {
    /// Root directory of the project to scan.
    project_root: PathBuf,
    /// Glob patterns for entries to leave out of the scan.
    excludes: Vec<String>,
    /// Glob patterns for entries to keep.
    includes: Vec<String>,
    /// Whether git ignore rules (repository, global and `.git/info/exclude`) are honoured.
    use_gitignore: bool,
    /// Optional extra ignore file loaded in addition to the ones found while walking.
    custom_ignore_file: Option<PathBuf>,
}
66
67impl FileDiscovery {
68 pub fn new(project_root: impl AsRef<Path>) -> Self {
70 Self {
71 project_root: project_root.as_ref().to_path_buf(),
72 excludes: Vec::new(),
73 includes: Vec::new(),
74 use_gitignore: true,
75 custom_ignore_file: None,
76 }
77 }
78
79 pub fn with_excludes(mut self, excludes: Vec<String>) -> Self {
81 self.excludes = excludes;
82 self
83 }
84
85 pub fn with_includes(mut self, includes: Vec<String>) -> Self {
87 self.includes = includes;
88 self
89 }
90
91 pub fn with_gitignore(mut self, use_gitignore: bool) -> Self {
93 self.use_gitignore = use_gitignore;
94 self
95 }
96
97 pub fn with_custom_ignore_file(mut self, path: impl AsRef<Path>) -> Self {
99 self.custom_ignore_file = Some(path.as_ref().to_path_buf());
100 self
101 }
102
103 pub fn discover(&self) -> Result<Vec<DiscoveredFile>, ScanError> {
106 let project_root = self
107 .project_root
108 .canonicalize()
109 .map_err(|e| ScanError::Io(format!("Cannot resolve project root: {}", e)))?;
110
111 let mut builder = WalkBuilder::new(&project_root);
112 builder
113 .hidden(false)
114 .git_ignore(self.use_gitignore)
115 .git_global(self.use_gitignore)
116 .git_exclude(self.use_gitignore)
117 .follow_links(false);
118
119 builder.add_custom_ignore_filename(".kardo.ignore");
121
122 if let Some(ref ignore_path) = self.custom_ignore_file {
124 if let Some(err) = builder.add_ignore(ignore_path) {
125 return Err(ScanError::Io(format!(
126 "Cannot read ignore file '{}': {}",
127 ignore_path.display(),
128 err
129 )));
130 }
131 }
132
133 if !self.excludes.is_empty() || !self.includes.is_empty() {
135 let mut ob = ignore::overrides::OverrideBuilder::new(&project_root);
136
137 if !self.excludes.is_empty() && !self.includes.is_empty() {
140 ob.add("**").map_err(|e| {
141 ScanError::Io(format!("Failed to add catch-all override: {}", e))
142 })?;
143 }
144
145 for pattern in &self.excludes {
146 let negated = format!("!{}", pattern);
147 ob.add(&negated).map_err(|e| {
148 ScanError::Io(format!("Invalid exclude pattern '{}': {}", pattern, e))
149 })?;
150 }
151 for pattern in &self.includes {
152 ob.add(pattern).map_err(|e| {
153 ScanError::Io(format!("Invalid include pattern '{}': {}", pattern, e))
154 })?;
155 }
156
157 let overrides = ob
158 .build()
159 .map_err(|e| ScanError::Io(format!("Failed to build overrides: {}", e)))?;
160 builder.overrides(overrides);
161 }
162
163 let walker = builder.build();
164
165 let mut files = Vec::new();
166
167 for entry in walker {
168 let entry = match entry {
169 Ok(e) => e,
170 Err(err) => {
171 log::warn!("Skipping entry: {}", err);
172 continue;
173 }
174 };
175
176 if entry.file_type().map_or(true, |ft| !ft.is_file()) {
177 continue;
178 }
179
180 let abs_path = entry.path().to_path_buf();
181 let relative = abs_path
182 .strip_prefix(&project_root)
183 .unwrap_or(&abs_path)
184 .to_string_lossy()
185 .to_string();
186
187 if !Self::is_relevant_file(&abs_path, &relative) {
188 continue;
189 }
190
191 match Self::build_discovered_file(&abs_path, &relative) {
192 Ok(df) => files.push(df),
193 Err(err) => {
194 log::warn!("Skipping {}: {}", relative, err);
195 }
196 }
197 }
198
199 files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
200 Ok(files)
201 }
202
203 fn is_relevant_file(abs_path: &Path, relative: &str) -> bool {
205 if relative.starts_with(".kardo/") || relative.starts_with(".kardo\\") {
207 return false;
208 }
209
210 for dir in DOC_DIRECTORIES {
212 if relative.starts_with(dir) {
213 return true;
214 }
215 }
216
217 let file_name = abs_path
219 .file_name()
220 .and_then(|n| n.to_str())
221 .unwrap_or("");
222 let stem = abs_path
223 .file_stem()
224 .and_then(|s| s.to_str())
225 .unwrap_or("");
226
227 for special in SPECIAL_NAMES {
228 if file_name.eq_ignore_ascii_case(special)
229 || stem.eq_ignore_ascii_case(special)
230 {
231 return true;
232 }
233 }
234
235 if let Some(ext) = abs_path.extension().and_then(|e| e.to_str()) {
237 let ext_lower = ext.to_lowercase();
238 return DOC_EXTENSIONS.contains(&ext_lower.as_str());
239 }
240
241 false
242 }
243
244 fn hash_file(path: &Path) -> Result<String, ScanError> {
246 let content = fs::read(path)
247 .map_err(|e| ScanError::Io(format!("Cannot read {}: {}", path.display(), e)))?;
248 let mut hasher = Sha256::new();
249 hasher.update(&content);
250 Ok(format!("{:x}", hasher.finalize()))
251 }
252
253 fn build_discovered_file(abs_path: &Path, relative: &str) -> Result<DiscoveredFile, ScanError> {
255 let metadata = fs::metadata(abs_path)
256 .map_err(|e| ScanError::Io(format!("Cannot read metadata for {}: {}", relative, e)))?;
257
258 let size = metadata.len();
259
260 let modified_at: Option<DateTime<Utc>> = metadata.modified().ok().and_then(|t| {
261 t.duration_since(SystemTime::UNIX_EPOCH)
262 .ok()
263 .and_then(|d| {
264 DateTime::from_timestamp(d.as_secs() as i64, d.subsec_nanos())
265 })
266 });
267
268 let extension = abs_path
269 .extension()
270 .and_then(|e| e.to_str())
271 .map(|e| e.to_lowercase());
272
273 let is_markdown = extension.as_deref() == Some("md");
274
275 let content_hash = Self::hash_file(abs_path)?;
276
277 Ok(DiscoveredFile {
278 path: abs_path.to_path_buf(),
279 relative_path: relative.to_string(),
280 size,
281 modified_at,
282 extension,
283 is_markdown,
284 content_hash,
285 })
286 }
287}
288
289#[derive(Debug, thiserror::Error)]
291pub enum ScanError {
292 #[error("IO error: {0}")]
293 Io(String),
294}
295
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Lays down a minimal `.git` directory so `root` is treated as a repository.
    fn init_git_repo(root: &Path) {
        let git_dir = root.join(".git");
        for sub in ["objects", "refs"] {
            fs::create_dir_all(git_dir.join(sub)).unwrap();
        }
        fs::write(git_dir.join("HEAD"), "ref: refs/heads/main\n").unwrap();
        fs::write(
            git_dir.join("config"),
            "[core]\n\trepositoryformatversion = 0\n\tfilemode = true\n\tbare = false\n",
        )
        .unwrap();
    }

    /// Writes `contents` to `root/rel`, creating any missing parent directories first.
    fn write_file(root: &Path, rel: &str, contents: &str) {
        let target = root.join(rel);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(target, contents).unwrap();
    }

    /// Runs the scan and returns the relative paths in result order.
    fn scan(discovery: &FileDiscovery) -> Vec<String> {
        discovery
            .discover()
            .unwrap()
            .into_iter()
            .map(|file| file.relative_path)
            .collect()
    }

    /// True when `wanted` is one of the scanned relative paths.
    fn has(paths: &[String], wanted: &str) -> bool {
        paths.iter().any(|p| p == wanted)
    }

    /// Looks up a scanned file by relative path, panicking when it is absent.
    fn find<'a>(files: &'a [DiscoveredFile], rel: &str) -> &'a DiscoveredFile {
        files.iter().find(|f| f.relative_path == rel).unwrap()
    }

    /// Builds a small repository with docs, sources, a `.gitignore` and ignored artifacts.
    fn create_test_project(tmp: &TempDir) {
        let root = tmp.path();
        init_git_repo(root);

        let fixtures = [
            ("README.md", "# Test Project\n"),
            ("CLAUDE.md", "# Claude instructions\n"),
            ("docs/guide.md", "# Guide\nSome content"),
            ("docs/config.yml", "key: value\n"),
            ("docs/data.json", "{}"),
            ("docs/settings.toml", "[settings]\n"),
            ("src/main.rs", "fn main() {}"),
            ("src/lib.rs", "pub mod foo;"),
            (".gitignore", "target/\n*.log\n"),
            ("target/debug.log", "log"),
            ("build.log", "log"),
        ];
        for (rel, contents) in fixtures {
            write_file(root, rel, contents);
        }
    }

    #[test]
    fn test_discover_finds_doc_files() {
        let tmp = TempDir::new().unwrap();
        create_test_project(&tmp);

        let found = scan(&FileDiscovery::new(tmp.path()));

        assert!(has(&found, "README.md"), "Missing README.md, found: {:?}", found);
        assert!(has(&found, "CLAUDE.md"), "Missing CLAUDE.md, found: {:?}", found);
        for doc in [
            "docs/guide.md",
            "docs/config.yml",
            "docs/data.json",
            "docs/settings.toml",
        ] {
            assert!(has(&found, doc));
        }

        for source in ["src/main.rs", "src/lib.rs"] {
            assert!(!has(&found, source));
        }
    }

    #[test]
    fn test_gitignore_respected() {
        let tmp = TempDir::new().unwrap();
        create_test_project(&tmp);

        let found = scan(&FileDiscovery::new(tmp.path()));

        assert!(!has(&found, "target/debug.log"));
        assert!(!has(&found, "build.log"));
    }

    #[test]
    fn test_is_relevant_file_filter() {
        let relevant = [
            // Documentation extensions.
            "guide.md",
            "notes.txt",
            "doc.rst",
            "doc.adoc",
            // Special names, with and without an extension.
            "README",
            "README.md",
            "LICENSE",
            "CLAUDE.md",
            "AGENTS.md",
            ".cursorrules",
            ".clinerules",
        ];
        for name in relevant {
            assert!(
                FileDiscovery::is_relevant_file(Path::new(name), name),
                "{} should be relevant",
                name
            );
        }

        let irrelevant = ["main.rs", "app.py", "style.css", ".kardo/config.json"];
        for name in irrelevant {
            assert!(
                !FileDiscovery::is_relevant_file(Path::new(name), name),
                "{} should not be relevant",
                name
            );
        }
    }

    #[test]
    fn test_content_hashing() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, "a.md", "identical content");
        write_file(root, "b.md", "identical content");
        write_file(root, "c.md", "different content");

        let files = FileDiscovery::new(root).discover().unwrap();
        let (a, b, c) = (
            find(&files, "a.md"),
            find(&files, "b.md"),
            find(&files, "c.md"),
        );

        assert_eq!(a.content_hash, b.content_hash);
        assert_ne!(a.content_hash, c.content_hash);

        assert_eq!(a.content_hash.len(), 64);
        assert!(a.content_hash.chars().all(|ch| ch.is_ascii_hexdigit()));
    }

    #[test]
    fn test_relative_paths_correct() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, "docs/deep/nested/file.md", "deep");
        write_file(root, "README.md", "root");

        let files = FileDiscovery::new(root).discover().unwrap();

        let readme = find(&files, "README.md");
        assert!(readme.path.is_absolute());

        let nested = find(&files, "docs/deep/nested/file.md");
        assert!(nested.path.is_absolute());
        assert!(!nested.relative_path.starts_with('/'));
    }

    #[test]
    fn test_hidden_claude_dir_discovered() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, ".claude/instructions", "AI instructions");
        write_file(root, ".claude/research/notes.md", "notes");

        let found = scan(&FileDiscovery::new(root));

        assert!(
            has(&found, ".claude/instructions"),
            "Missing .claude/instructions, found: {:?}",
            found
        );
        assert!(has(&found, ".claude/research/notes.md"));
    }

    #[test]
    fn test_kardo_dir_excluded() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, ".kardo/kardo.db", "fake db");
        write_file(root, ".kardo/config.json", "{}");
        write_file(root, "README.md", "hi");

        let found = scan(&FileDiscovery::new(root));

        assert!(found.iter().all(|p| !p.starts_with(".kardo/")));
    }

    #[test]
    fn test_file_metadata() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, "README.md", "# Hello\n");

        let files = FileDiscovery::new(root).discover().unwrap();
        let readme = find(&files, "README.md");

        assert!(readme.is_markdown);
        assert_eq!(readme.extension.as_deref(), Some("md"));
        assert_eq!(readme.size, 8);
        assert!(readme.modified_at.is_some());
        assert!(readme.path.is_absolute());
    }

    #[test]
    fn test_sorted_output() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, "z-file.md", "z");
        write_file(root, "a-file.md", "a");
        write_file(root, "m/middle.md", "m");

        let found = scan(&FileDiscovery::new(root));

        let mut expected = found.clone();
        expected.sort();
        assert_eq!(found, expected);
    }

    #[test]
    fn test_custom_kardo_ignore_file() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, "README.md", "# Project");
        write_file(root, ".claude/research/calibration/data.md", "cal");
        write_file(root, ".claude/research/notes.md", "notes");
        write_file(root, ".kardo.ignore", ".claude/research/calibration/\n");

        let found = scan(&FileDiscovery::new(root));

        assert!(has(&found, "README.md"));
        assert!(has(&found, ".claude/research/notes.md"));
        assert!(
            !has(&found, ".claude/research/calibration/data.md"),
            "calibration data should be excluded by .kardo.ignore, found: {:?}",
            found
        );
    }

    #[test]
    fn test_exclude_pattern_via_builder() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, "README.md", "# Project");
        write_file(root, ".claude/research/notes.md", "notes");
        write_file(root, "docs/guide.md", "guide");

        let excludes = vec![String::from(".claude/research/**")];
        let found = scan(&FileDiscovery::new(root).with_excludes(excludes));

        assert!(has(&found, "README.md"));
        assert!(has(&found, "docs/guide.md"));
        assert!(
            !has(&found, ".claude/research/notes.md"),
            "research notes should be excluded, found: {:?}",
            found
        );
    }

    #[test]
    fn test_include_overrides_exclude() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, "README.md", "# Project");
        write_file(root, "docs/guide.md", "guide");
        write_file(root, "docs/important.md", "important");
        write_file(root, "docs/draft.md", "draft");

        let discovery = FileDiscovery::new(root)
            .with_excludes(vec![String::from("docs/**")])
            .with_includes(vec![String::from("docs/important.md")]);
        let found = scan(&discovery);

        assert!(
            has(&found, "README.md"),
            "README.md should be included, found: {:?}",
            found
        );
        assert!(
            has(&found, "docs/important.md"),
            "important.md should override exclude, found: {:?}",
            found
        );
        assert!(!has(&found, "docs/guide.md"));
        assert!(!has(&found, "docs/draft.md"));
    }

    #[test]
    fn test_no_gitignore_flag() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        init_git_repo(root);

        write_file(root, ".gitignore", "ignored.md\n");
        write_file(root, "README.md", "# Project");
        write_file(root, "ignored.md", "this is ignored by git");

        // Default behaviour: git ignore rules apply.
        let respecting = scan(&FileDiscovery::new(root));
        assert!(!has(&respecting, "ignored.md"), "should be gitignored");

        // With git ignore handling off, the file shows up.
        let ignoring = scan(&FileDiscovery::new(root).with_gitignore(false));
        assert!(
            has(&ignoring, "ignored.md"),
            "should appear with gitignore disabled, found: {:?}",
            ignoring
        );
    }

    #[test]
    fn test_builder_backward_compat() {
        let tmp = TempDir::new().unwrap();
        create_test_project(&tmp);

        // A scanner built with `new` alone must keep the default behaviour.
        let found = scan(&FileDiscovery::new(tmp.path()));

        for expected in ["README.md", "CLAUDE.md", "docs/guide.md"] {
            assert!(has(&found, expected));
        }
        for unexpected in ["src/main.rs", "target/debug.log", "build.log"] {
            assert!(!has(&found, unexpected));
        }
    }
}