1use std::collections::{BTreeMap, HashSet};
10use std::path::{Path, PathBuf};
11
12use regex::Regex;
13
14use crate::store::{Layer, Store};
15
16#[derive(Debug, Clone, Default, PartialEq)]
19pub struct Stats {
20 pub total_files: usize,
22 pub files_per_layer: BTreeMap<Layer, usize>,
24 pub total_size_bytes: u64,
26 pub type_distribution: BTreeMap<String, usize>,
28 pub orphan_count: usize,
30 pub broken_link_count: usize,
32 pub top_types: Vec<(String, usize)>,
34}
35
/// Maximum number of `(type, count)` pairs kept in `Stats::top_types`.
const TOP_TYPES_LIMIT: usize = 10;
38
39struct FileFacts {
43 node_id: PathBuf,
46 layer: Layer,
48 size_bytes: u64,
50 type_: Option<String>,
52 raw_targets: Vec<PathBuf>,
56}
57
58impl FileFacts {
59 fn resolvable_targets(&self) -> impl Iterator<Item = &PathBuf> {
65 self.raw_targets.iter().filter(|t| is_full_path(t))
66 }
67}
68
69pub fn compute(store: &Store) -> crate::Result<Stats> {
72 let link_re = wiki_link_regex();
73
74 let mut existing_nodes: HashSet<PathBuf> = HashSet::new();
78 let mut facts: Vec<FileFacts> = Vec::new();
79
80 for layer in Layer::all() {
81 let layer_root = store.root.join(layer_dir_name(layer));
82 for abs in walk_layer_content_files(&layer_root)? {
83 let rel = abs.strip_prefix(&store.root).unwrap_or(&abs).to_path_buf();
84 let node_id = strip_md(&rel);
85 existing_nodes.insert(node_id.clone());
86
87 let size_bytes = std::fs::metadata(&abs).map(|m| m.len()).unwrap_or(0);
88 let text = std::fs::read_to_string(&abs).unwrap_or_default();
89 let type_ = parse_type(&text);
90 let raw_targets = extract_link_targets(&text, &link_re);
91
92 facts.push(FileFacts {
93 node_id,
94 layer,
95 size_bytes,
96 type_,
97 raw_targets,
98 });
99 }
100 }
101
102 let mut stats = Stats::default();
108 let mut linked_to: HashSet<PathBuf> = HashSet::new();
109 for file in &facts {
110 for target in file.resolvable_targets() {
111 if target == &file.node_id {
115 continue;
116 }
117 if existing_nodes.contains(target) {
118 linked_to.insert(target.clone());
119 } else {
120 stats.broken_link_count += 1;
122 }
123 }
124 }
125
126 for file in &facts {
129 stats.total_files += 1;
130 *stats.files_per_layer.entry(file.layer).or_insert(0) += 1;
131 stats.total_size_bytes += file.size_bytes;
132
133 if let Some(t) = &file.type_ {
134 *stats.type_distribution.entry(t.clone()).or_insert(0) += 1;
135 }
136
137 let has_outgoing = file
138 .resolvable_targets()
139 .any(|t| t != &file.node_id && existing_nodes.contains(t));
140 let has_incoming = linked_to.contains(&file.node_id);
141 if !has_outgoing && !has_incoming {
142 stats.orphan_count += 1;
143 }
144 }
145
146 stats.top_types = top_types(&stats.type_distribution, TOP_TYPES_LIMIT);
147
148 Ok(stats)
149}
150
151fn layer_dir_name(layer: Layer) -> &'static str {
154 match layer {
155 Layer::Sources => "sources",
156 Layer::Records => "records",
157 Layer::Wiki => "wiki",
158 }
159}
160
161fn walk_layer_content_files(layer_root: &Path) -> crate::Result<Vec<PathBuf>> {
166 let mut out = Vec::new();
167 if !layer_root.is_dir() {
168 return Ok(out);
169 }
170 let walker = walkdir::WalkDir::new(layer_root)
171 .into_iter()
172 .filter_entry(|e| {
173 let name = e.file_name().to_string_lossy();
175 if name.starts_with('.') {
176 return false;
177 }
178 if e.file_type().is_dir() && name == "log" {
179 return false;
180 }
181 true
182 });
183 for entry in walker {
184 let entry = entry.map_err(|e| {
185 crate::Error::Io(
186 e.into_io_error()
187 .unwrap_or_else(|| std::io::Error::other("walk error")),
188 )
189 })?;
190 if !entry.file_type().is_file() {
191 continue;
192 }
193 let path = entry.path();
194 let name = entry.file_name().to_string_lossy();
195 if !name.ends_with(".md") || name == "index.md" {
198 continue;
199 }
200 out.push(path.to_path_buf());
201 }
202 out.sort();
203 Ok(out)
204}
205
206fn wiki_link_regex() -> Regex {
210 Regex::new(r"\[\[([^\[\]|]+)(?:\|[^\]]*)?\]\]").expect("static wiki-link regex is valid")
212}
213
214fn extract_link_targets(text: &str, re: &Regex) -> Vec<PathBuf> {
224 let mut out = Vec::new();
225 let mut in_fence = false;
226 for line in text.lines() {
227 let trimmed = line.trim_start();
228 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
229 in_fence = !in_fence;
230 continue;
231 }
232 if in_fence {
233 continue;
234 }
235 for cap in re.captures_iter(line) {
236 if let Some(m) = cap.get(1) {
237 let raw = m.as_str().trim();
238 out.push(strip_md(Path::new(raw)));
239 }
240 }
241 }
242 out
243}
244
/// Returns `path` without a trailing `.md`, or an unchanged copy when the path
/// does not end in `.md`.
///
/// The suffix check works on the lossy string form of the path, and only a
/// single `.md` is removed (`a.md.md` becomes `a.md`).
fn strip_md(path: &Path) -> PathBuf {
    path.to_string_lossy()
        .strip_suffix(".md")
        .map_or_else(|| path.to_path_buf(), PathBuf::from)
}
253
/// Reports whether `target` is a layer-rooted path that can be resolved.
///
/// That is the case when its first component is a plain name equal to
/// `sources`, `records`, or `wiki`, and at least one further component follows.
/// Bare names, paths rooted elsewhere, and paths starting with a root, `.`, or
/// `..` component are all rejected.
fn is_full_path(target: &Path) -> bool {
    let mut components = target.components();
    let Some(std::path::Component::Normal(head)) = components.next() else {
        return false;
    };
    let head = head.to_string_lossy();
    let is_layer = matches!(head.as_ref(), "sources" | "records" | "wiki");
    is_layer && components.next().is_some()
}
274
275fn parse_type(text: &str) -> Option<String> {
280 let yaml = frontmatter_block(text)?;
281 let value: serde_norway::Value = serde_norway::from_str(&yaml).ok()?;
282 let mapping = value.as_mapping()?;
283 let type_val = mapping.get(serde_norway::Value::String("type".to_string()))?;
284 let s = type_val.as_str()?.trim();
285 if s.is_empty() {
286 None
287 } else {
288 Some(s.to_string())
289 }
290}
291
/// Returns the text between the opening and closing `---` lines of `text`.
///
/// A leading byte-order mark is ignored. The first line must be `---`
/// (trailing whitespace allowed, leading whitespace not); the block ends at the
/// next such line. Every returned line is terminated with `\n`, whatever line
/// ending the input used. Returns `None` when the text does not start with
/// `---` or the block is never closed.
fn frontmatter_block(text: &str) -> Option<String> {
    let text = text.strip_prefix('\u{feff}').unwrap_or(text);
    let mut lines = text.lines();

    if lines.next()?.trim_end() != "---" {
        return None;
    }

    let mut yaml = String::new();
    loop {
        // Running out of lines before the closing delimiter means "no block".
        let line = lines.next()?;
        if line.trim_end() == "---" {
            return Some(yaml);
        }
        yaml.push_str(line);
        yaml.push('\n');
    }
}
314
/// Returns at most `limit` `(type, count)` pairs from `dist`, ordered by count
/// descending and, for equal counts, by type name ascending.
fn top_types(dist: &BTreeMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    let mut ranked: Vec<(&String, usize)> =
        dist.iter().map(|(name, &count)| (name, count)).collect();
    // Map keys are unique, so this comparator is a total order over the pairs.
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
    ranked
        .into_iter()
        .take(limit)
        .map(|(name, count)| (name.clone(), count))
        .collect()
}
325
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::Config;
    use std::fs;
    use tempfile::TempDir;

    /// Builds a store rooted in a fresh temporary directory that contains only
    /// `DB.md`. The `TempDir` is returned so the caller can keep it bound while
    /// the store is in use.
    fn temp_store() -> (TempDir, Store) {
        let dir = TempDir::new().expect("tempdir");
        let root = dir.path().to_path_buf();
        fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n").expect("write DB.md");
        let store = Store {
            root,
            config: Config::default(),
        };
        (dir, store)
    }

    /// Writes `contents` to the store-relative path `rel`, creating parent
    /// directories as needed.
    fn write_rel(store: &Store, rel: &str, contents: &str) {
        let abs = store.root.join(rel);
        if let Some(parent) = abs.parent() {
            fs::create_dir_all(parent).expect("mkdir parents");
        }
        fs::write(abs, contents).expect("write content file");
    }

    /// Renders a minimal document with `type` and `summary` frontmatter and a
    /// link-free body.
    fn doc(type_: &str, summary: &str) -> String {
        format!("---\ntype: {type_}\nsummary: \"{summary}\"\n---\n\nbody\n")
    }

    /// Runs `compute` on `store`, panicking if it fails.
    fn stats_for(store: &Store) -> Stats {
        compute(store).expect("compute")
    }

    #[test]
    fn empty_store_is_all_zeros() {
        let (_tmp, store) = temp_store();
        let s = stats_for(&store);
        assert_eq!(s.total_files, 0);
        assert_eq!(s.total_size_bytes, 0);
        assert!(s.files_per_layer.is_empty());
        assert!(s.type_distribution.is_empty());
        assert_eq!(s.orphan_count, 0);
        assert_eq!(s.broken_link_count, 0);
        assert!(s.top_types.is_empty());
    }

    #[test]
    fn counts_files_per_layer_and_total() {
        let (_tmp, store) = temp_store();
        let fixtures = [
            ("sources/emails/a.md", "email", "a"),
            ("sources/emails/b.md", "email", "b"),
            ("records/contacts/c.md", "contact", "c"),
            ("wiki/people/p.md", "wiki-page", "p"),
        ];
        for (rel, type_, summary) in fixtures {
            write_rel(&store, rel, &doc(type_, summary));
        }

        let s = stats_for(&store);
        assert_eq!(s.total_files, 4);
        assert_eq!(s.files_per_layer.get(&Layer::Sources), Some(&2));
        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&1));
    }

    #[test]
    fn ignores_meta_files_and_non_md_and_dotdirs_and_log() {
        let (_tmp, store) = temp_store();
        // The single file that should be counted.
        write_rel(&store, "records/contacts/real.md", &doc("contact", "real"));
        // Everything below must be skipped by the walk.
        write_rel(
            &store,
            "records/contacts/index.md",
            "---\ntype: index\nscope: type-folder\n---\n",
        );
        write_rel(&store, "records/contacts/index.jsonl", "{}\n");
        write_rel(&store, "records/notes.txt", "not markdown\n");
        write_rel(&store, "sources/log/2026-04.md", &doc("email", "archived"));
        write_rel(
            &store,
            "wiki/.obsidian/cache.md",
            &doc("wiki-page", "hidden"),
        );

        let s = stats_for(&store);
        assert_eq!(s.total_files, 1, "only the one real content file counts");
        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
        assert_eq!(s.files_per_layer.get(&Layer::Sources), None);
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), None);
    }

    #[test]
    fn total_size_is_sum_of_content_file_bytes() {
        let (_tmp, store) = temp_store();
        let a = doc("email", "a");
        let b = "---\ntype: contact\nsummary: x\n---\n\nlonger body text here\n".to_string();
        write_rel(&store, "sources/emails/a.md", &a);
        write_rel(&store, "records/contacts/b.md", &b);
        // An index file must not contribute to the total.
        write_rel(
            &store,
            "records/contacts/index.md",
            "---\ntype: index\n---\nbig meta file padding padding\n",
        );

        let s = stats_for(&store);
        let expected = a.len() as u64 + b.len() as u64;
        assert_eq!(s.total_size_bytes, expected);
    }

    #[test]
    fn type_distribution_counts_each_type_value() {
        let (_tmp, store) = temp_store();
        let fixtures = [
            ("sources/emails/a.md", "email", "a"),
            ("sources/emails/b.md", "email", "b"),
            ("sources/emails/c.md", "email", "c"),
            ("records/contacts/d.md", "contact", "d"),
            ("records/proposals/e.md", "proposal", "e"),
        ];
        for (rel, type_, summary) in fixtures {
            write_rel(&store, rel, &doc(type_, summary));
        }

        let s = stats_for(&store);
        assert_eq!(s.type_distribution.get("email"), Some(&3));
        assert_eq!(s.type_distribution.get("contact"), Some(&1));
        assert_eq!(s.type_distribution.get("proposal"), Some(&1));
        assert_eq!(s.type_distribution.len(), 3);
    }

    #[test]
    fn file_without_type_is_counted_in_totals_but_not_distribution() {
        let (_tmp, store) = temp_store();
        // Frontmatter present, but no `type` key.
        write_rel(
            &store,
            "wiki/themes/x.md",
            "---\nsummary: no type here\n---\n\nbody\n",
        );
        // No frontmatter at all.
        write_rel(&store, "wiki/themes/y.md", "just a body, no frontmatter\n");

        let s = stats_for(&store);
        assert_eq!(s.total_files, 2, "untyped files still count toward totals");
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&2));
        assert!(
            s.type_distribution.is_empty(),
            "no type key => no distribution entry, not an empty-string bucket"
        );
    }

    #[test]
    fn top_types_orders_by_count_desc_then_name_asc() {
        let (_tmp, store) = temp_store();
        let fixtures = [
            ("records/contacts/c1.md", "contact", "1"),
            ("records/contacts/c2.md", "contact", "2"),
            ("records/contacts/c3.md", "contact", "3"),
            ("sources/emails/e1.md", "email", "1"),
            ("sources/emails/e2.md", "email", "2"),
            ("sources/emails/e3.md", "email", "3"),
            ("records/decisions/d1.md", "decision", "1"),
        ];
        for (rel, type_, summary) in fixtures {
            write_rel(&store, rel, &doc(type_, summary));
        }

        let s = stats_for(&store);
        assert_eq!(
            s.top_types,
            vec![
                ("contact".to_string(), 3),
                ("email".to_string(), 3),
                ("decision".to_string(), 1),
            ],
            "ties (contact, email both 3) break by name ascending; decision trails"
        );
    }

    #[test]
    fn top_types_is_capped_at_ten() {
        let (_tmp, store) = temp_store();
        // Twelve distinct types, one file each.
        for i in 0..12 {
            let t = format!("type{i:02}");
            write_rel(&store, &format!("records/{t}/f.md"), &doc(&t, "x"));
        }

        let s = stats_for(&store);
        assert_eq!(s.top_types.len(), 10, "top_types caps at 10");
        assert_eq!(
            s.type_distribution.len(),
            12,
            "distribution keeps all types"
        );
    }

    #[test]
    fn orphans_are_files_with_no_incoming_and_no_outgoing_links() {
        let (_tmp, store) = temp_store();
        // a -> b; c is untouched.
        write_rel(
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\nSee [[records/contacts/b]].\n",
        );
        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));
        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));

        let s = stats_for(&store);
        assert_eq!(s.orphan_count, 1, "only c is an orphan");
    }

    #[test]
    fn a_file_with_only_a_self_link_is_an_orphan_matching_graph() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "records/contacts/solo.md",
            "---\ntype: contact\nsummary: solo\n---\n\nSee [[records/contacts/solo]].\n",
        );

        let s = stats_for(&store);
        assert_eq!(
            s.orphan_count, 1,
            "a self-only-linking file is an orphan: {s:?}"
        );
    }

    #[test]
    fn a_file_with_only_an_incoming_link_is_not_an_orphan() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = stats_for(&store);
        assert_eq!(s.orphan_count, 0);
    }

    #[test]
    fn frontmatter_wiki_links_count_as_edges_for_orphans() {
        let (_tmp, store) = temp_store();
        // The only link lives inside the frontmatter, not the body.
        write_rel(
            &store,
            "records/contacts/sarah.md",
            "---\ntype: contact\nsummary: s\ncompany: [[records/companies/acme]]\n---\n\nbody\n",
        );
        write_rel(&store, "records/companies/acme.md", &doc("company", "acme"));

        let s = stats_for(&store);
        assert_eq!(
            s.orphan_count, 0,
            "a frontmatter wiki-link is a real edge; neither endpoint is orphaned"
        );
    }

    #[test]
    fn broken_links_count_targets_that_do_not_exist() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]] and [[records/contacts/ghost]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = stats_for(&store);
        assert_eq!(s.broken_link_count, 1, "only the ghost target is broken");
    }

    #[test]
    fn broken_link_resolves_with_md_extension_stripped() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b.md]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = stats_for(&store);
        assert_eq!(
            s.broken_link_count, 0,
            "a `.md`-suffixed target resolves to the same node and is not broken"
        );
    }

    #[test]
    fn short_form_links_are_not_broken_and_do_not_wire_the_graph() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\n[[b]]\n",
        );
        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));

        let s = stats_for(&store);
        assert_eq!(
            s.broken_link_count, 0,
            "short-form links are not counted as broken by stats"
        );
        // The short-form link creates no edge, so both files stay orphans.
        assert_eq!(s.orphan_count, 2);
    }

    #[test]
    fn display_alias_links_resolve_to_the_target_not_the_alias() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nmet [[wiki/people/b|Bob]] today\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = stats_for(&store);
        assert_eq!(s.broken_link_count, 0, "alias target resolves and exists");
        assert_eq!(s.orphan_count, 0, "a links out, b is linked to");
    }

    #[test]
    fn duplicate_links_in_one_file_count_broken_per_occurrence() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[records/contacts/ghost]] [[records/contacts/ghost]]\n",
        );

        let s = stats_for(&store);
        assert_eq!(
            s.broken_link_count, 2,
            "broken links count occurrences, not distinct targets"
        );
    }

    #[test]
    fn markdown_links_are_not_treated_as_wiki_links() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nSee [Acme](https://acme.io/path).\n",
        );

        let s = stats_for(&store);
        assert_eq!(s.broken_link_count, 0, "markdown links aren't graph edges");
        assert_eq!(s.orphan_count, 1, "the file has no wiki-links => orphan");
    }

    #[test]
    fn regression_non_layer_multi_segment_link_is_not_broken() {
        let (_tmp, store) = temp_store();
        // `contacts/...` has several segments but does not start at a layer.
        write_rel(
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\nSee [[contacts/sarah-chen]].\n",
        );

        let s = stats_for(&store);
        assert_eq!(
            s.broken_link_count, 0,
            "a non-layer multi-segment target is a short-form error, not broken"
        );
        assert_eq!(
            s.orphan_count, 1,
            "the non-layer link does not wire `a` out of orphan status"
        );
    }

    #[test]
    fn regression_wiki_links_in_code_fences_are_ignored() {
        let (_tmp, store) = temp_store();
        // One link inside a backtick fence (missing target) and one inside a
        // tilde fence (existing target); neither may count.
        write_rel(
            &store,
            "wiki/pages/howto.md",
            "---\ntype: wiki-page\nsummary: howto\n---\n\
             \nWrite links like this:\n\
             \n```\n[[records/contacts/ghost]]\n```\n\
             \nor this:\n\
             \n~~~\n[[wiki/pages/real]]\n~~~\n",
        );
        write_rel(&store, "wiki/pages/real.md", &doc("wiki-page", "real"));

        let s = stats_for(&store);
        assert_eq!(
            s.broken_link_count, 0,
            "a `[[...]]` inside a code fence is not a real (broken) edge"
        );
        assert_eq!(
            s.orphan_count, 2,
            "fenced wiki-links do not wire files out of orphan status: {s:?}"
        );
    }

    #[test]
    fn a_link_to_an_existing_file_in_another_layer_resolves() {
        let (_tmp, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nfrom [[sources/emails/2026/05/m]]\n",
        );
        write_rel(&store, "sources/emails/2026/05/m.md", &doc("email", "m"));

        let s = stats_for(&store);
        assert_eq!(s.broken_link_count, 0);
        assert_eq!(s.orphan_count, 0, "both endpoints are wired");
    }
}