1use std::collections::{BTreeMap, HashSet};
10use std::path::{Path, PathBuf};
11
12use regex::Regex;
13
14use crate::store::{Layer, Store};
15
/// Aggregate statistics produced by [`compute`] for the content files of a store.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Stats {
    /// Number of content files found across all layers.
    pub total_files: usize,
    /// Content-file count per layer; a layer with no content files has no entry.
    pub files_per_layer: BTreeMap<Layer, usize>,
    /// Sum of the content files' sizes as reported by filesystem metadata
    /// (a file whose metadata cannot be read contributes 0).
    pub total_size_bytes: u64,
    /// Number of files per frontmatter `type` value. Files without a non-empty
    /// string `type` are not represented here.
    pub type_distribution: BTreeMap<String, usize>,
    /// Files that neither link to another existing content file nor are linked
    /// to by another file. Self-links do not count in either direction.
    pub orphan_count: usize,
    /// Number of multi-component wiki-link occurrences whose target is not an
    /// existing content file. Repeated links are counted once per occurrence.
    pub broken_link_count: usize,
    /// The most frequent types as `(type, count)` pairs, highest count first,
    /// capped at `TOP_TYPES_LIMIT` entries.
    pub top_types: Vec<(String, usize)>,
}
35
/// Maximum number of entries [`compute`] keeps in `Stats::top_types`.
const TOP_TYPES_LIMIT: usize = 10;
38
/// Per-file facts gathered in the scanning pass of [`compute`], so each file
/// is read from disk only once.
struct FileFacts {
    /// Path relative to the store root with a trailing `.md` removed; this is
    /// the identity that wiki-link targets are compared against.
    node_id: PathBuf,
    /// Layer whose directory the file was found under.
    layer: Layer,
    /// File size from filesystem metadata, or 0 when metadata was unavailable.
    size_bytes: u64,
    /// Frontmatter `type` value, if the file has a non-empty string one.
    type_: Option<String>,
    /// Every wiki-link target found in the file text, in order of appearance,
    /// with a trailing `.md` removed. Includes single-component targets and
    /// duplicates; see `resolvable_targets` for the filtered view.
    raw_targets: Vec<PathBuf>,
}
57
58impl FileFacts {
59 fn resolvable_targets(&self) -> impl Iterator<Item = &PathBuf> {
65 self.raw_targets.iter().filter(|t| is_full_path(t))
66 }
67}
68
69pub fn compute(store: &Store) -> crate::Result<Stats> {
72 let link_re = wiki_link_regex();
73
74 let mut existing_nodes: HashSet<PathBuf> = HashSet::new();
78 let mut facts: Vec<FileFacts> = Vec::new();
79
80 for layer in Layer::all() {
81 let layer_root = store.root.join(layer_dir_name(layer));
82 for abs in walk_layer_content_files(&layer_root)? {
83 let rel = abs.strip_prefix(&store.root).unwrap_or(&abs).to_path_buf();
84 let node_id = strip_md(&rel);
85 existing_nodes.insert(node_id.clone());
86
87 let size_bytes = std::fs::metadata(&abs).map(|m| m.len()).unwrap_or(0);
88 let text = std::fs::read_to_string(&abs).unwrap_or_default();
89 let type_ = parse_type(&text);
90 let raw_targets = extract_link_targets(&text, &link_re);
91
92 facts.push(FileFacts {
93 node_id,
94 layer,
95 size_bytes,
96 type_,
97 raw_targets,
98 });
99 }
100 }
101
102 let mut stats = Stats::default();
108 let mut linked_to: HashSet<PathBuf> = HashSet::new();
109 for file in &facts {
110 for target in file.resolvable_targets() {
111 if target == &file.node_id {
115 continue;
116 }
117 if existing_nodes.contains(target) {
118 linked_to.insert(target.clone());
119 } else {
120 stats.broken_link_count += 1;
122 }
123 }
124 }
125
126 for file in &facts {
129 stats.total_files += 1;
130 *stats.files_per_layer.entry(file.layer).or_insert(0) += 1;
131 stats.total_size_bytes += file.size_bytes;
132
133 if let Some(t) = &file.type_ {
134 *stats.type_distribution.entry(t.clone()).or_insert(0) += 1;
135 }
136
137 let has_outgoing = file
138 .resolvable_targets()
139 .any(|t| t != &file.node_id && existing_nodes.contains(t));
140 let has_incoming = linked_to.contains(&file.node_id);
141 if !has_outgoing && !has_incoming {
142 stats.orphan_count += 1;
143 }
144 }
145
146 stats.top_types = top_types(&stats.type_distribution, TOP_TYPES_LIMIT);
147
148 Ok(stats)
149}
150
151fn layer_dir_name(layer: Layer) -> &'static str {
154 match layer {
155 Layer::Sources => "sources",
156 Layer::Records => "records",
157 Layer::Wiki => "wiki",
158 }
159}
160
161fn walk_layer_content_files(layer_root: &Path) -> crate::Result<Vec<PathBuf>> {
166 let mut out = Vec::new();
167 if !layer_root.is_dir() {
168 return Ok(out);
169 }
170 let walker = walkdir::WalkDir::new(layer_root)
171 .into_iter()
172 .filter_entry(|e| {
173 let name = e.file_name().to_string_lossy();
175 if name.starts_with('.') {
176 return false;
177 }
178 if e.file_type().is_dir() && name == "log" {
179 return false;
180 }
181 true
182 });
183 for entry in walker {
184 let entry = entry.map_err(|e| {
185 crate::Error::Io(
186 e.into_io_error()
187 .unwrap_or_else(|| std::io::Error::other("walk error")),
188 )
189 })?;
190 if !entry.file_type().is_file() {
191 continue;
192 }
193 let path = entry.path();
194 let name = entry.file_name().to_string_lossy();
195 if !name.ends_with(".md") || name == "index.md" {
198 continue;
199 }
200 out.push(path.to_path_buf());
201 }
202 out.sort();
203 Ok(out)
204}
205
206fn wiki_link_regex() -> Regex {
210 Regex::new(r"\[\[([^\[\]|]+)(?:\|[^\]]*)?\]\]").expect("static wiki-link regex is valid")
212}
213
214fn extract_link_targets(text: &str, re: &Regex) -> Vec<PathBuf> {
217 re.captures_iter(text)
218 .filter_map(|c| c.get(1))
219 .map(|m| {
220 let raw = m.as_str().trim();
221 strip_md(Path::new(raw))
222 })
223 .collect()
224}
225
/// Returns `path` without a single trailing `.md`, or an unchanged copy when
/// the path does not end in `.md`.
///
/// The suffix check runs on the lossy string form of the path, so only one
/// `.md` is removed (`x.md.md` becomes `x.md`) and the match is case-sensitive.
fn strip_md(path: &Path) -> PathBuf {
    let lossy = path.to_string_lossy();
    lossy
        .strip_suffix(".md")
        .map(PathBuf::from)
        .unwrap_or_else(|| path.to_path_buf())
}
234
/// Reports whether `target` has more than one path component.
///
/// `records/contacts/b` qualifies; a bare `b` (or an empty path) does not.
fn is_full_path(target: &Path) -> bool {
    // A second component exists exactly when there is more than one.
    target.components().nth(1).is_some()
}
241
242fn parse_type(text: &str) -> Option<String> {
247 let yaml = frontmatter_block(text)?;
248 let value: serde_norway::Value = serde_norway::from_str(&yaml).ok()?;
249 let mapping = value.as_mapping()?;
250 let type_val = mapping.get(serde_norway::Value::String("type".to_string()))?;
251 let s = type_val.as_str()?.trim();
252 if s.is_empty() {
253 None
254 } else {
255 Some(s.to_string())
256 }
257}
258
/// Returns the text between the opening and closing `---` fences of a
/// document's frontmatter, or `None` if the document has no complete block.
///
/// A leading UTF-8 byte-order mark is ignored. The very first line must be
/// `---` (trailing whitespace allowed); the block ends at the next such line.
/// Each body line is returned with a single `\n` terminator, regardless of the
/// line ending used in the input. An unterminated block yields `None`.
fn frontmatter_block(text: &str) -> Option<String> {
    let text = text.strip_prefix('\u{feff}').unwrap_or(text);
    let is_fence = |line: &str| line.trim_end() == "---";

    let mut lines = text.lines();
    if !is_fence(lines.next()?) {
        return None;
    }

    let mut body = String::new();
    loop {
        // Running out of lines before a closing fence means there is no block.
        let line = lines.next()?;
        if is_fence(line) {
            break Some(body);
        }
        body.push_str(line);
        body.push('\n');
    }
}
281
/// Returns up to `limit` `(type, count)` pairs from `dist`, most frequent first.
///
/// Types with equal counts are ordered by name ascending, so the result is
/// fully deterministic.
fn top_types(dist: &BTreeMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    let mut ranked: Vec<(&String, usize)> = dist.iter().map(|(name, count)| (name, *count)).collect();
    // Count descending, then name ascending as the explicit tie-break.
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
    ranked
        .into_iter()
        .take(limit)
        .map(|(name, count)| (name.clone(), count))
        .collect()
}
292
// End-to-end tests for `compute`: each test builds a small store on disk in a
// temporary directory, runs `compute`, and checks one aspect of the result.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::Config;
    use std::fs;
    use tempfile::TempDir;

    /// Creates a temporary directory containing a root-level `DB.md` and a
    /// `Store` rooted at it. The `TempDir` is returned with the store so the
    /// caller decides how long the directory stays alive.
    fn temp_store() -> (TempDir, Store) {
        let dir = TempDir::new().expect("tempdir");
        fs::write(dir.path().join("DB.md"), "---\ntype: db-md\n---\n").expect("write DB.md");
        let store = Store {
            root: dir.path().to_path_buf(),
            config: Config::default(),
        };
        (dir, store)
    }

    /// Writes `contents` to `rel` (relative to the store root), creating any
    /// missing parent directories first.
    fn write_rel(store: &Store, rel: &str, contents: &str) {
        let abs = store.root.join(rel);
        if let Some(parent) = abs.parent() {
            fs::create_dir_all(parent).expect("mkdir parents");
        }
        fs::write(abs, contents).expect("write content file");
    }

    /// Builds a minimal document: frontmatter with `type` and a quoted
    /// `summary`, followed by a one-line body that contains no links.
    fn doc(type_: &str, summary: &str) -> String {
        format!("---\ntype: {type_}\nsummary: \"{summary}\"\n---\n\nbody\n")
    }

    #[test]
    fn empty_store_is_all_zeros() {
        // The store has only the root `DB.md`, which is outside every layer directory.
        let (_d, store) = temp_store();
        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 0);
        assert_eq!(s.total_size_bytes, 0);
        assert!(s.files_per_layer.is_empty());
        assert!(s.type_distribution.is_empty());
        assert_eq!(s.orphan_count, 0);
        assert_eq!(s.broken_link_count, 0);
        assert!(s.top_types.is_empty());
    }

    #[test]
    fn counts_files_per_layer_and_total() {
        let (_d, store) = temp_store();
        write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
        write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));
        write_rel(&store, "wiki/people/p.md", &doc("wiki-page", "p"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 4);
        assert_eq!(s.files_per_layer.get(&Layer::Sources), Some(&2));
        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&1));
    }

    #[test]
    fn ignores_meta_files_and_non_md_and_dotdirs_and_log() {
        let (_d, store) = temp_store();
        // The only file that should be counted.
        write_rel(&store, "records/contacts/real.md", &doc("contact", "real"));
        // `index.md` is excluded by name.
        write_rel(
            &store,
            "records/contacts/index.md",
            "---\ntype: index\nscope: type-folder\n---\n",
        );
        // Files that do not end in `.md` are excluded.
        write_rel(&store, "records/contacts/index.jsonl", "{}\n");
        write_rel(&store, "records/notes.txt", "not markdown\n");
        // Anything under a `log` directory is excluded.
        write_rel(&store, "sources/log/2026-04.md", &doc("email", "archived"));
        // Anything under a dot-directory is excluded.
        write_rel(
            &store,
            "wiki/.obsidian/cache.md",
            &doc("wiki-page", "hidden"),
        );

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 1, "only the one real content file counts");
        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
        assert_eq!(s.files_per_layer.get(&Layer::Sources), None);
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), None);
    }

    #[test]
    fn total_size_is_sum_of_content_file_bytes() {
        let (_d, store) = temp_store();
        let a = doc("email", "a");
        let b = "---\ntype: contact\nsummary: x\n---\n\nlonger body text here\n".to_string();
        write_rel(&store, "sources/emails/a.md", &a);
        write_rel(&store, "records/contacts/b.md", &b);
        // An excluded `index.md` must not contribute to the size total.
        write_rel(
            &store,
            "records/contacts/index.md",
            "---\ntype: index\n---\nbig meta file padding padding\n",
        );

        let s = compute(&store).expect("compute");
        let expected = a.len() as u64 + b.len() as u64;
        assert_eq!(s.total_size_bytes, expected);
    }

    #[test]
    fn type_distribution_counts_each_type_value() {
        let (_d, store) = temp_store();
        write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
        write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
        write_rel(&store, "sources/emails/c.md", &doc("email", "c"));
        write_rel(&store, "records/contacts/d.md", &doc("contact", "d"));
        write_rel(&store, "records/proposals/e.md", &doc("proposal", "e"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.type_distribution.get("email"), Some(&3));
        assert_eq!(s.type_distribution.get("contact"), Some(&1));
        assert_eq!(s.type_distribution.get("proposal"), Some(&1));
        assert_eq!(s.type_distribution.len(), 3);
    }

    #[test]
    fn file_without_type_is_counted_in_totals_but_not_distribution() {
        let (_d, store) = temp_store();
        // Frontmatter present, but it has no `type` key.
        write_rel(
            &store,
            "wiki/themes/x.md",
            "---\nsummary: no type here\n---\n\nbody\n",
        );
        // No frontmatter at all.
        write_rel(&store, "wiki/themes/y.md", "just a body, no frontmatter\n");

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 2, "untyped files still count toward totals");
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&2));
        assert!(
            s.type_distribution.is_empty(),
            "no type key => no distribution entry, not an empty-string bucket"
        );
    }

    #[test]
    fn top_types_orders_by_count_desc_then_name_asc() {
        let (_d, store) = temp_store();
        // Three contacts and three emails tie on count; one decision trails.
        write_rel(&store, "records/contacts/c1.md", &doc("contact", "1"));
        write_rel(&store, "records/contacts/c2.md", &doc("contact", "2"));
        write_rel(&store, "records/contacts/c3.md", &doc("contact", "3"));
        write_rel(&store, "sources/emails/e1.md", &doc("email", "1"));
        write_rel(&store, "sources/emails/e2.md", &doc("email", "2"));
        write_rel(&store, "sources/emails/e3.md", &doc("email", "3"));
        write_rel(&store, "records/decisions/d1.md", &doc("decision", "1"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.top_types,
            vec![
                ("contact".to_string(), 3),
                ("email".to_string(), 3),
                ("decision".to_string(), 1),
            ],
            "ties (contact, email both 3) break by name ascending; decision trails"
        );
    }

    #[test]
    fn top_types_is_capped_at_ten() {
        let (_d, store) = temp_store();
        for i in 0..12 {
            // Twelve distinct types, one file each.
            let t = format!("type{i:02}");
            write_rel(&store, &format!("records/{t}/f.md"), &doc(&t, "x"));
        }
        let s = compute(&store).expect("compute");
        assert_eq!(s.top_types.len(), 10, "top_types caps at 10");
        assert_eq!(
            s.type_distribution.len(),
            12,
            "distribution keeps all types"
        );
    }

    #[test]
    fn orphans_are_files_with_no_incoming_and_no_outgoing_links() {
        let (_d, store) = temp_store();
        // a -> b; c takes part in no link at all.
        write_rel(
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\nSee [[records/contacts/b]].\n",
        );
        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));
        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.orphan_count, 1, "only c is an orphan");
    }

    #[test]
    fn a_file_with_only_a_self_link_is_an_orphan_matching_graph() {
        let (_d, store) = temp_store();
        // The file's only link points at itself.
        write_rel(
            &store,
            "records/contacts/solo.md",
            "---\ntype: contact\nsummary: solo\n---\n\nSee [[records/contacts/solo]].\n",
        );
        let s = compute(&store).expect("compute");
        assert_eq!(
            s.orphan_count, 1,
            "a self-only-linking file is an orphan: {s:?}"
        );
    }

    #[test]
    fn a_file_with_only_an_incoming_link_is_not_an_orphan() {
        let (_d, store) = temp_store();
        // a -> b; b itself contains no links.
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.orphan_count, 0);
    }

    #[test]
    fn frontmatter_wiki_links_count_as_edges_for_orphans() {
        let (_d, store) = temp_store();
        // The only link lives inside the frontmatter, not the body.
        write_rel(
            &store,
            "records/contacts/sarah.md",
            "---\ntype: contact\nsummary: s\ncompany: [[records/companies/acme]]\n---\n\nbody\n",
        );
        write_rel(&store, "records/companies/acme.md", &doc("company", "acme"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.orphan_count, 0,
            "a frontmatter wiki-link is a real edge; neither endpoint is orphaned"
        );
    }

    #[test]
    fn broken_links_count_targets_that_do_not_exist() {
        let (_d, store) = temp_store();
        // One link resolves (b); the other points at a file that was never written.
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]] and [[records/contacts/ghost]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 1, "only the ghost target is broken");
    }

    #[test]
    fn broken_link_resolves_with_md_extension_stripped() {
        let (_d, store) = temp_store();
        // The link target is written with an explicit `.md` suffix.
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b.md]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 0,
            "a `.md`-suffixed target resolves to the same node and is not broken"
        );
    }

    #[test]
    fn short_form_links_are_not_broken_and_do_not_wire_the_graph() {
        let (_d, store) = temp_store();
        // `[[b]]` has a single path component, so it is neither resolved nor
        // counted as broken.
        write_rel(
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\n[[b]]\n",
        );
        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 0,
            "short-form links are not counted as broken by stats"
        );
        // With no edge created, both a and b are orphans.
        assert_eq!(s.orphan_count, 2);
    }

    #[test]
    fn display_alias_links_resolve_to_the_target_not_the_alias() {
        let (_d, store) = temp_store();
        // `[[target|alias]]`: only the part before `|` is the link target.
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nmet [[wiki/people/b|Bob]] today\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0, "alias target resolves and exists");
        assert_eq!(s.orphan_count, 0, "a links out, b is linked to");
    }

    #[test]
    fn duplicate_links_in_one_file_count_broken_per_occurrence() {
        let (_d, store) = temp_store();
        // The same missing target is linked twice.
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[records/contacts/ghost]] [[records/contacts/ghost]]\n",
        );
        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 2,
            "broken links count occurrences, not distinct targets"
        );
    }

    #[test]
    fn markdown_links_are_not_treated_as_wiki_links() {
        let (_d, store) = temp_store();
        // A standard `[text](url)` markdown link, not a `[[...]]` wiki-link.
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nSee [Acme](https://acme.io/path).\n",
        );
        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0, "markdown links aren't graph edges");
        assert_eq!(s.orphan_count, 1, "the file has no wiki-links => orphan");
    }

    #[test]
    fn a_link_to_an_existing_file_in_another_layer_resolves() {
        let (_d, store) = temp_store();
        // A wiki page links to a deeply nested file in the sources layer.
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nfrom [[sources/emails/2026/05/m]]\n",
        );
        write_rel(&store, "sources/emails/2026/05/m.md", &doc("email", "m"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0);
        assert_eq!(s.orphan_count, 0, "both endpoints are wired");
    }
}