1use std::collections::{BTreeMap, BTreeSet, HashSet};
10use std::path::{Path, PathBuf};
11
12use regex::Regex;
13
14use crate::store::{Layer, Store};
15
/// Aggregate statistics for one store, as produced by [`compute`].
///
/// `Default` yields the all-zero / all-empty value that `compute` starts from.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Stats {
    /// Number of content files found across all layers.
    pub total_files: usize,
    /// Content-file count per layer; a layer with no content files has no entry.
    pub files_per_layer: BTreeMap<Layer, usize>,
    /// Sum of the on-disk sizes of all counted content files, in bytes.
    pub total_size_bytes: u64,
    /// How many files declare each frontmatter `type` value; files without a usable
    /// `type` are not represented here.
    pub type_distribution: BTreeMap<String, usize>,
    /// Files that have no resolving outgoing full-path link and are not the target
    /// of any resolving link.
    pub orphan_count: usize,
    /// Full-path wiki-link occurrences whose target is not an existing content file.
    pub broken_link_count: usize,
    /// The most frequent types with their counts, at most `TOP_TYPES_LIMIT` entries.
    pub top_types: Vec<(String, usize)>,
    /// Types present in the store that appear in `RECOGNIZED_CONTENT_TYPES`.
    pub recognized_types_present: Vec<String>,
    /// Types present in the store that are not in `RECOGNIZED_CONTENT_TYPES`.
    pub custom_types_present: Vec<String>,
}
39
/// Frontmatter `type` values that `split_schema_coverage` reports as recognized.
/// Any other `type` value found in the store is reported as custom.
const RECOGNIZED_CONTENT_TYPES: &[&str] = &[
    "email",
    "transcript",
    "pdf-source",
    "contact",
    "company",
    "expense",
    "meeting",
    "decision",
    "invoice",
    "wiki-page",
];
59
/// Maximum number of entries `compute` keeps in `Stats::top_types`.
const TOP_TYPES_LIMIT: usize = 10;
62
/// Everything `compute` remembers about one content file after reading it once.
struct FileFacts {
    /// Path relative to the store root with a trailing `.md` removed; this is the
    /// form link targets are compared against.
    node_id: PathBuf,
    /// Layer whose directory the file was found under.
    layer: Layer,
    /// File size from filesystem metadata; 0 when the metadata could not be read.
    size_bytes: u64,
    /// Frontmatter `type` value, or `None` when `parse_type` found none.
    type_: Option<String>,
    /// Every wiki-link target found in the file text (with `.md` stripped),
    /// including short-form targets that are later ignored for graph purposes.
    raw_targets: Vec<PathBuf>,
}
81
82impl FileFacts {
83 fn resolvable_targets(&self) -> impl Iterator<Item = &PathBuf> {
89 self.raw_targets.iter().filter(|t| is_full_path(t))
90 }
91}
92
93pub fn compute(store: &Store) -> crate::Result<Stats> {
96 let link_re = wiki_link_regex();
97
98 let mut existing_nodes: HashSet<PathBuf> = HashSet::new();
102 let mut facts: Vec<FileFacts> = Vec::new();
103
104 for layer in Layer::all() {
105 let layer_root = store.root.join(layer_dir_name(layer));
106 for abs in walk_layer_content_files(&layer_root)? {
107 let rel = abs.strip_prefix(&store.root).unwrap_or(&abs).to_path_buf();
108 let node_id = strip_md(&rel);
109 existing_nodes.insert(node_id.clone());
110
111 let size_bytes = std::fs::metadata(&abs).map(|m| m.len()).unwrap_or(0);
112 let text = std::fs::read_to_string(&abs).unwrap_or_default();
113 let type_ = parse_type(&text);
114 let raw_targets = extract_link_targets(&text, &link_re);
115
116 facts.push(FileFacts {
117 node_id,
118 layer,
119 size_bytes,
120 type_,
121 raw_targets,
122 });
123 }
124 }
125
126 let mut stats = Stats::default();
132 let mut linked_to: HashSet<PathBuf> = HashSet::new();
133 for file in &facts {
134 for target in file.resolvable_targets() {
135 if existing_nodes.contains(target) {
136 linked_to.insert(target.clone());
137 } else {
138 stats.broken_link_count += 1;
140 }
141 }
142 }
143
144 for file in &facts {
147 stats.total_files += 1;
148 *stats.files_per_layer.entry(file.layer).or_insert(0) += 1;
149 stats.total_size_bytes += file.size_bytes;
150
151 if let Some(t) = &file.type_ {
152 *stats.type_distribution.entry(t.clone()).or_insert(0) += 1;
153 }
154
155 let has_outgoing = file
156 .resolvable_targets()
157 .any(|t| existing_nodes.contains(t));
158 let has_incoming = linked_to.contains(&file.node_id);
159 if !has_outgoing && !has_incoming {
160 stats.orphan_count += 1;
161 }
162 }
163
164 stats.top_types = top_types(&stats.type_distribution, TOP_TYPES_LIMIT);
165 let (recognized, custom) = split_schema_coverage(&stats.type_distribution);
166 stats.recognized_types_present = recognized;
167 stats.custom_types_present = custom;
168
169 Ok(stats)
170}
171
/// Returns the name of the directory, directly under the store root, that holds
/// the files of `layer`.
fn layer_dir_name(layer: Layer) -> &'static str {
    // No catch-all arm: adding a `Layer` variant must fail to compile here.
    match layer {
        Layer::Sources => "sources",
        Layer::Records => "records",
        Layer::Wiki => "wiki",
    }
}
181
182fn walk_layer_content_files(layer_root: &Path) -> crate::Result<Vec<PathBuf>> {
187 let mut out = Vec::new();
188 if !layer_root.is_dir() {
189 return Ok(out);
190 }
191 let walker = walkdir::WalkDir::new(layer_root)
192 .into_iter()
193 .filter_entry(|e| {
194 let name = e.file_name().to_string_lossy();
196 if name.starts_with('.') {
197 return false;
198 }
199 if e.file_type().is_dir() && name == "log" {
200 return false;
201 }
202 true
203 });
204 for entry in walker {
205 let entry = entry.map_err(|e| {
206 crate::Error::Io(
207 e.into_io_error()
208 .unwrap_or_else(|| std::io::Error::other("walk error")),
209 )
210 })?;
211 if !entry.file_type().is_file() {
212 continue;
213 }
214 let path = entry.path();
215 let name = entry.file_name().to_string_lossy();
216 if !name.ends_with(".md") || name == "index.md" {
219 continue;
220 }
221 out.push(path.to_path_buf());
222 }
223 out.sort();
224 Ok(out)
225}
226
227fn wiki_link_regex() -> Regex {
231 Regex::new(r"\[\[([^\[\]|]+)(?:\|[^\]]*)?\]\]").expect("static wiki-link regex is valid")
233}
234
235fn extract_link_targets(text: &str, re: &Regex) -> Vec<PathBuf> {
238 re.captures_iter(text)
239 .filter_map(|c| c.get(1))
240 .map(|m| {
241 let raw = m.as_str().trim();
242 strip_md(Path::new(raw))
243 })
244 .collect()
245}
246
/// Returns `path` without a trailing `.md`, or an unchanged copy when it has none.
///
/// Only one trailing `.md` is removed. The suffix check works on the lossy
/// string form of the path, so a stripped result is rebuilt from that string.
fn strip_md(path: &Path) -> PathBuf {
    let lossy = path.to_string_lossy();
    if let Some(stem) = lossy.strip_suffix(".md") {
        return PathBuf::from(stem);
    }
    path.to_path_buf()
}
255
/// Reports whether `target` consists of at least two path components.
///
/// A bare name such as `b` (a short-form link) has one component and is rejected.
fn is_full_path(target: &Path) -> bool {
    // A second component exists exactly when the component count exceeds one.
    target.components().nth(1).is_some()
}
262
263fn parse_type(text: &str) -> Option<String> {
268 let yaml = frontmatter_block(text)?;
269 let value: serde_yml::Value = serde_yml::from_str(&yaml).ok()?;
270 let mapping = value.as_mapping()?;
271 let type_val = mapping.get(serde_yml::Value::String("type".to_string()))?;
272 let s = type_val.as_str()?.trim();
273 if s.is_empty() {
274 None
275 } else {
276 Some(s.to_string())
277 }
278}
279
/// Returns the text between the opening and closing `---` fences at the top of `text`.
///
/// A leading byte-order mark is ignored. The first line must be `---` (trailing
/// whitespace allowed), and the block ends at the next such line. Each enclosed
/// line is returned followed by `\n`. Returns `None` when the opening fence is
/// missing or the block is never closed.
fn frontmatter_block(text: &str) -> Option<String> {
    let is_fence = |line: &str| line.trim_end() == "---";

    let unmarked = text.strip_prefix('\u{feff}').unwrap_or(text);
    let mut remaining = unmarked.lines();
    if !is_fence(remaining.next()?) {
        return None;
    }

    let mut yaml = String::new();
    loop {
        // Running out of lines before the closing fence means there is no block.
        let line = remaining.next()?;
        if is_fence(line) {
            return Some(yaml);
        }
        yaml.push_str(line);
        yaml.push('\n');
    }
}
302
/// Returns up to `limit` `(type, count)` pairs from `dist`, most frequent first.
///
/// Types with equal counts are ordered by name ascending.
fn top_types(dist: &BTreeMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    let mut ranked: Vec<(&String, usize)> = dist.iter().map(|(name, count)| (name, *count)).collect();
    // Count descending, then name ascending, spelled out explicitly so the order
    // does not depend on the map's iteration order.
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
    ranked
        .into_iter()
        .take(limit)
        .map(|(name, count)| (name.clone(), count))
        .collect()
}
313
314fn split_schema_coverage(dist: &BTreeMap<String, usize>) -> (Vec<String>, Vec<String>) {
317 let canonical: BTreeSet<&str> = RECOGNIZED_CONTENT_TYPES.iter().copied().collect();
318 let mut recognized = Vec::new();
319 let mut custom = Vec::new();
320 for type_ in dist.keys() {
322 if canonical.contains(type_.as_str()) {
323 recognized.push(type_.clone());
324 } else {
325 custom.push(type_.clone());
326 }
327 }
328 (recognized, custom)
329}
330
// Unit tests for `compute`: each test builds a small store on disk in a
// temporary directory and checks one aspect of the resulting `Stats`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::Config;
    use std::fs;
    use tempfile::TempDir;

    /// Creates a store rooted in a fresh temporary directory.
    ///
    /// The `TempDir` is returned alongside the store so the caller keeps it alive
    /// for the duration of the test. A `DB.md` is written at the root; it lies
    /// outside every layer directory, so `compute` never counts it.
    fn temp_store() -> (TempDir, Store) {
        let dir = TempDir::new().expect("tempdir");
        fs::write(dir.path().join("DB.md"), "---\ntype: db-md\n---\n").expect("write DB.md");
        let store = Store {
            root: dir.path().to_path_buf(),
            config: Config::default(),
        };
        (dir, store)
    }

    /// Writes `contents` to the store-relative path `rel`, creating parent directories.
    fn write_rel(store: &Store, rel: &str, contents: &str) {
        let abs = store.root.join(rel);
        if let Some(parent) = abs.parent() {
            fs::create_dir_all(parent).expect("mkdir parents");
        }
        fs::write(abs, contents).expect("write content file");
    }

    /// Builds a minimal document: frontmatter with `type` and `summary`, then a body
    /// that contains no links.
    fn doc(type_: &str, summary: &str) -> String {
        format!("---\ntype: {type_}\nsummary: \"{summary}\"\n---\n\nbody\n")
    }

    /// A store with no layer directories yields the all-zero, all-empty stats.
    #[test]
    fn empty_store_is_all_zeros() {
        let (_d, store) = temp_store();
        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 0);
        assert_eq!(s.total_size_bytes, 0);
        assert!(s.files_per_layer.is_empty());
        assert!(s.type_distribution.is_empty());
        assert_eq!(s.orphan_count, 0);
        assert_eq!(s.broken_link_count, 0);
        assert!(s.top_types.is_empty());
        assert!(s.recognized_types_present.is_empty());
        assert!(s.custom_types_present.is_empty());
    }

    /// Files are counted in total and under the layer they live in.
    #[test]
    fn counts_files_per_layer_and_total() {
        let (_d, store) = temp_store();
        write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
        write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));
        write_rel(&store, "wiki/people/p.md", &doc("wiki-page", "p"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 4);
        assert_eq!(s.files_per_layer.get(&Layer::Sources), Some(&2));
        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&1));
    }

    /// `index.md`, non-`.md` files, `log` directories and dot-directories are all
    /// excluded from the walk.
    #[test]
    fn ignores_meta_files_and_non_md_and_dotdirs_and_log() {
        let (_d, store) = temp_store();
        // The only file that should be counted.
        write_rel(&store, "records/contacts/real.md", &doc("contact", "real"));
        write_rel(
            // `index.md` is excluded by name.
            &store,
            "records/contacts/index.md",
            "---\ntype: index\nscope: type-folder\n---\n",
        );
        // Not `.md`, so not content.
        write_rel(&store, "records/contacts/index.jsonl", "{}\n");
        write_rel(&store, "records/notes.txt", "not markdown\n");
        // Everything under a `log` directory is skipped.
        write_rel(&store, "sources/log/2026-04.md", &doc("email", "archived"));
        write_rel(
            // Everything under a dot-directory is skipped.
            &store,
            "wiki/.obsidian/cache.md",
            &doc("wiki-page", "hidden"),
        );

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 1, "only the one real content file counts");
        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
        assert_eq!(s.files_per_layer.get(&Layer::Sources), None);
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), None);
    }

    /// The total size is the byte sum of counted files only; `index.md` adds nothing.
    #[test]
    fn total_size_is_sum_of_content_file_bytes() {
        let (_d, store) = temp_store();
        let a = doc("email", "a");
        let b = "---\ntype: contact\nsummary: x\n---\n\nlonger body text here\n".to_string();
        write_rel(&store, "sources/emails/a.md", &a);
        write_rel(&store, "records/contacts/b.md", &b);
        write_rel(
            // Excluded file: its bytes must not show up in the total.
            &store,
            "records/contacts/index.md",
            "---\ntype: index\n---\nbig meta file padding padding\n",
        );

        let s = compute(&store).expect("compute");
        let expected = a.len() as u64 + b.len() as u64;
        assert_eq!(s.total_size_bytes, expected);
    }

    /// Each distinct `type` value gets its own bucket with the number of files using it.
    #[test]
    fn type_distribution_counts_each_type_value() {
        let (_d, store) = temp_store();
        write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
        write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
        write_rel(&store, "sources/emails/c.md", &doc("email", "c"));
        write_rel(&store, "records/contacts/d.md", &doc("contact", "d"));
        write_rel(&store, "records/proposals/e.md", &doc("proposal", "e"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.type_distribution.get("email"), Some(&3));
        assert_eq!(s.type_distribution.get("contact"), Some(&1));
        assert_eq!(s.type_distribution.get("proposal"), Some(&1));
        assert_eq!(s.type_distribution.len(), 3);
    }

    /// Files with no `type` key, or no frontmatter at all, still count toward
    /// totals but add nothing to the distribution.
    #[test]
    fn file_without_type_is_counted_in_totals_but_not_distribution() {
        let (_d, store) = temp_store();
        write_rel(
            // Frontmatter present, but without a `type` key.
            &store,
            "wiki/themes/x.md",
            "---\nsummary: no type here\n---\n\nbody\n",
        );
        // No frontmatter block at all.
        write_rel(&store, "wiki/themes/y.md", "just a body, no frontmatter\n");

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 2, "untyped files still count toward totals");
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&2));
        assert!(
            s.type_distribution.is_empty(),
            "no type key => no distribution entry, not an empty-string bucket"
        );
    }

    /// `top_types` is ordered by count descending, with ties broken by name ascending.
    #[test]
    fn top_types_orders_by_count_desc_then_name_asc() {
        let (_d, store) = temp_store();
        // `contact` and `email` tie at three files each; `decision` has one.
        write_rel(&store, "records/contacts/c1.md", &doc("contact", "1"));
        write_rel(&store, "records/contacts/c2.md", &doc("contact", "2"));
        write_rel(&store, "records/contacts/c3.md", &doc("contact", "3"));
        write_rel(&store, "sources/emails/e1.md", &doc("email", "1"));
        write_rel(&store, "sources/emails/e2.md", &doc("email", "2"));
        write_rel(&store, "sources/emails/e3.md", &doc("email", "3"));
        write_rel(&store, "records/decisions/d1.md", &doc("decision", "1"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.top_types,
            vec![
                ("contact".to_string(), 3),
                ("email".to_string(), 3),
                ("decision".to_string(), 1),
            ],
            "ties (contact, email both 3) break by name ascending; decision trails"
        );
    }

    /// `top_types` is truncated to ten entries while the distribution keeps every type.
    #[test]
    fn top_types_is_capped_at_ten() {
        let (_d, store) = temp_store();
        for i in 0..12 {
            // Twelve distinct types, one file each.
            let t = format!("type{i:02}");
            write_rel(&store, &format!("records/{t}/f.md"), &doc(&t, "x"));
        }
        let s = compute(&store).expect("compute");
        assert_eq!(s.top_types.len(), 10, "top_types caps at 10");
        assert_eq!(
            s.type_distribution.len(),
            12,
            "distribution keeps all types"
        );
    }

    /// Types present in the store are split into recognized and custom lists, each
    /// sorted ascending.
    #[test]
    fn schema_coverage_splits_recognized_from_custom() {
        let (_d, store) = temp_store();
        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));
        write_rel(&store, "sources/emails/e.md", &doc("email", "e"));
        write_rel(&store, "wiki/people/p.md", &doc("wiki-page", "p"));
        write_rel(&store, "records/proposals/x.md", &doc("proposal", "x"));
        write_rel(&store, "records/widgets/w.md", &doc("widget", "w"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.recognized_types_present,
            vec![
                "contact".to_string(),
                "email".to_string(),
                "wiki-page".to_string()
            ],
            "recognized canonical content types, sorted ascending"
        );
        assert_eq!(
            s.custom_types_present,
            vec!["proposal".to_string(), "widget".to_string()],
            "non-canonical types land in custom, sorted ascending"
        );
    }

    /// A content file typed `log` is reported as custom, because `log` is not in
    /// the recognized list.
    #[test]
    fn meta_types_are_not_recognized_content_types() {
        let (_d, store) = temp_store();
        write_rel(&store, "wiki/synthesis/weird.md", &doc("log", "weird"));
        let s = compute(&store).expect("compute");
        assert!(
            s.recognized_types_present.is_empty(),
            "`log` is a meta type, not a recognized content type"
        );
        assert_eq!(s.custom_types_present, vec!["log".to_string()]);
    }

    /// Of three files where `a` links to `b`, only the unlinked `c` is an orphan.
    #[test]
    fn orphans_are_files_with_no_incoming_and_no_outgoing_links() {
        let (_d, store) = temp_store();
        write_rel(
            // `a` has an outgoing link to `b`.
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\nSee [[records/contacts/b]].\n",
        );
        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));
        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.orphan_count, 1, "only c is an orphan");
    }

    /// Being the target of a link is enough to not be an orphan.
    #[test]
    fn a_file_with_only_an_incoming_link_is_not_an_orphan() {
        let (_d, store) = temp_store();
        write_rel(
            // `b` has no links of its own; `a` links to it.
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.orphan_count, 0);
    }

    /// Links are extracted from the whole file text, so a wiki-link inside the
    /// frontmatter also counts as an edge.
    #[test]
    fn frontmatter_wiki_links_count_as_edges_for_orphans() {
        let (_d, store) = temp_store();
        write_rel(
            // The only link is in the frontmatter `company` field.
            &store,
            "records/contacts/sarah.md",
            "---\ntype: contact\nsummary: s\ncompany: [[records/companies/acme]]\n---\n\nbody\n",
        );
        write_rel(&store, "records/companies/acme.md", &doc("company", "acme"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.orphan_count, 0,
            "a frontmatter wiki-link is a real edge; neither endpoint is orphaned"
        );
    }

    /// A link whose target file does not exist is counted as broken.
    #[test]
    fn broken_links_count_targets_that_do_not_exist() {
        let (_d, store) = temp_store();
        write_rel(
            // One link resolves (`b`), one does not (`ghost`).
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]] and [[records/contacts/ghost]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 1, "only the ghost target is broken");
    }

    /// A target written with an explicit `.md` suffix resolves to the same node.
    #[test]
    fn broken_link_resolves_with_md_extension_stripped() {
        let (_d, store) = temp_store();
        write_rel(
            // The link target carries the `.md` extension.
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b.md]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 0,
            "a `.md`-suffixed target resolves to the same node and is not broken"
        );
    }

    /// Short-form links such as `[[b]]` are ignored entirely: not broken, and not edges.
    #[test]
    fn short_form_links_are_not_broken_and_do_not_wire_the_graph() {
        let (_d, store) = temp_store();
        write_rel(
            // `[[b]]` has a single path component, so it is not a resolvable target.
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\n[[b]]\n",
        );
        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 0,
            "short-form links are not counted as broken by stats"
        );
        // With the link ignored, both `a` and `b` are unconnected.
        assert_eq!(s.orphan_count, 2);
    }

    /// For `[[target|alias]]`, the part before `|` is what gets resolved.
    #[test]
    fn display_alias_links_resolve_to_the_target_not_the_alias() {
        let (_d, store) = temp_store();
        write_rel(
            // `Bob` is only display text; the target is `wiki/people/b`.
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nmet [[wiki/people/b|Bob]] today\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0, "alias target resolves and exists");
        assert_eq!(s.orphan_count, 0, "a links out, b is linked to");
    }

    /// Two occurrences of the same missing target count as two broken links.
    #[test]
    fn duplicate_links_in_one_file_count_broken_per_occurrence() {
        let (_d, store) = temp_store();
        write_rel(
            // The same missing target appears twice.
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[records/contacts/ghost]] [[records/contacts/ghost]]\n",
        );
        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 2,
            "broken links count occurrences, not distinct targets"
        );
    }

    /// Ordinary markdown links `[text](url)` are not wiki-links and are ignored.
    #[test]
    fn markdown_links_are_not_treated_as_wiki_links() {
        let (_d, store) = temp_store();
        write_rel(
            // Single-bracket markdown link only; no `[[...]]` anywhere.
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nSee [Acme](https://acme.io/path).\n",
        );
        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0, "markdown links aren't graph edges");
        assert_eq!(s.orphan_count, 1, "the file has no wiki-links => orphan");
    }

    /// Links resolve across layers: a wiki page can point at a file under `sources`.
    #[test]
    fn a_link_to_an_existing_file_in_another_layer_resolves() {
        let (_d, store) = temp_store();
        write_rel(
            // Target lives in a different layer and in nested directories.
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nfrom [[sources/emails/2026/05/m]]\n",
        );
        write_rel(&store, "sources/emails/2026/05/m.md", &doc("email", "m"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0);
        assert_eq!(s.orphan_count, 0, "both endpoints are wired");
    }
}