1use std::collections::{BTreeMap, HashSet};
10use std::path::{Path, PathBuf};
11
12use regex::Regex;
13
14use crate::store::{Layer, Store};
15
/// Aggregate statistics about a store's content files, as produced by `compute`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Stats {
    /// Number of content files found across all layers.
    pub total_files: usize,
    /// Content-file count keyed by the layer whose directory the file was found under.
    pub files_per_layer: BTreeMap<Layer, usize>,
    /// Sum of the sizes, in bytes, of all counted content files.
    pub total_size_bytes: u64,
    /// Number of files declaring each frontmatter `type` value. Files without
    /// a usable `type` are counted in the totals but have no entry here.
    pub type_distribution: BTreeMap<String, usize>,
    /// Files with no resolving outgoing link and no incoming link.
    pub orphan_count: usize,
    /// Occurrences of full-path wiki-links whose target is not a known content file.
    pub broken_link_count: usize,
    /// The most common types as `(type, count)` pairs, highest count first,
    /// capped at `TOP_TYPES_LIMIT` entries.
    pub top_types: Vec<(String, usize)>,
}
35
/// Maximum number of entries `compute` keeps in `Stats::top_types`.
const TOP_TYPES_LIMIT: usize = 10;
38
/// Everything `compute` records about one content file while scanning the store.
struct FileFacts {
    /// Store-relative path with a trailing `.md` removed; this is the identity
    /// that link targets are compared against.
    node_id: PathBuf,
    /// Layer whose directory the file was found under.
    layer: Layer,
    /// File length in bytes from filesystem metadata (0 when metadata is unavailable).
    size_bytes: u64,
    /// Trimmed, non-empty frontmatter `type` value, if one could be parsed.
    type_: Option<String>,
    /// Every wiki-link target found in the file text, `.md` suffix stripped,
    /// including short-form targets that are not full paths.
    raw_targets: Vec<PathBuf>,
}
57
58impl FileFacts {
59 fn resolvable_targets(&self) -> impl Iterator<Item = &PathBuf> {
65 self.raw_targets.iter().filter(|t| is_full_path(t))
66 }
67}
68
69pub fn compute(store: &Store) -> crate::Result<Stats> {
72 let link_re = wiki_link_regex();
73
74 let mut existing_nodes: HashSet<PathBuf> = HashSet::new();
78 let mut facts: Vec<FileFacts> = Vec::new();
79
80 for layer in Layer::all() {
81 let layer_root = store.root.join(layer_dir_name(layer));
82 for abs in walk_layer_content_files(&layer_root)? {
83 let rel = abs.strip_prefix(&store.root).unwrap_or(&abs).to_path_buf();
84 let node_id = strip_md(&rel);
85 existing_nodes.insert(node_id.clone());
86
87 let size_bytes = std::fs::metadata(&abs).map(|m| m.len()).unwrap_or(0);
88 let text = std::fs::read_to_string(&abs).unwrap_or_default();
89 let type_ = parse_type(&text);
90 let raw_targets = extract_link_targets(&text, &link_re);
91
92 facts.push(FileFacts {
93 node_id,
94 layer,
95 size_bytes,
96 type_,
97 raw_targets,
98 });
99 }
100 }
101
102 let mut stats = Stats::default();
108 let mut linked_to: HashSet<PathBuf> = HashSet::new();
109 for file in &facts {
110 for target in file.resolvable_targets() {
111 if existing_nodes.contains(target) {
112 linked_to.insert(target.clone());
113 } else {
114 stats.broken_link_count += 1;
116 }
117 }
118 }
119
120 for file in &facts {
123 stats.total_files += 1;
124 *stats.files_per_layer.entry(file.layer).or_insert(0) += 1;
125 stats.total_size_bytes += file.size_bytes;
126
127 if let Some(t) = &file.type_ {
128 *stats.type_distribution.entry(t.clone()).or_insert(0) += 1;
129 }
130
131 let has_outgoing = file
132 .resolvable_targets()
133 .any(|t| existing_nodes.contains(t));
134 let has_incoming = linked_to.contains(&file.node_id);
135 if !has_outgoing && !has_incoming {
136 stats.orphan_count += 1;
137 }
138 }
139
140 stats.top_types = top_types(&stats.type_distribution, TOP_TYPES_LIMIT);
141
142 Ok(stats)
143}
144
145fn layer_dir_name(layer: Layer) -> &'static str {
148 match layer {
149 Layer::Sources => "sources",
150 Layer::Records => "records",
151 Layer::Wiki => "wiki",
152 }
153}
154
155fn walk_layer_content_files(layer_root: &Path) -> crate::Result<Vec<PathBuf>> {
160 let mut out = Vec::new();
161 if !layer_root.is_dir() {
162 return Ok(out);
163 }
164 let walker = walkdir::WalkDir::new(layer_root)
165 .into_iter()
166 .filter_entry(|e| {
167 let name = e.file_name().to_string_lossy();
169 if name.starts_with('.') {
170 return false;
171 }
172 if e.file_type().is_dir() && name == "log" {
173 return false;
174 }
175 true
176 });
177 for entry in walker {
178 let entry = entry.map_err(|e| {
179 crate::Error::Io(
180 e.into_io_error()
181 .unwrap_or_else(|| std::io::Error::other("walk error")),
182 )
183 })?;
184 if !entry.file_type().is_file() {
185 continue;
186 }
187 let path = entry.path();
188 let name = entry.file_name().to_string_lossy();
189 if !name.ends_with(".md") || name == "index.md" {
192 continue;
193 }
194 out.push(path.to_path_buf());
195 }
196 out.sort();
197 Ok(out)
198}
199
200fn wiki_link_regex() -> Regex {
204 Regex::new(r"\[\[([^\[\]|]+)(?:\|[^\]]*)?\]\]").expect("static wiki-link regex is valid")
206}
207
208fn extract_link_targets(text: &str, re: &Regex) -> Vec<PathBuf> {
211 re.captures_iter(text)
212 .filter_map(|c| c.get(1))
213 .map(|m| {
214 let raw = m.as_str().trim();
215 strip_md(Path::new(raw))
216 })
217 .collect()
218}
219
/// Returns `path` without a trailing `.md`, or an unchanged copy when the
/// path's (lossily decoded) text does not end in `.md`.
fn strip_md(path: &Path) -> PathBuf {
    let text = path.to_string_lossy();
    text.strip_suffix(".md")
        .map(PathBuf::from)
        .unwrap_or_else(|| path.to_path_buf())
}
228
/// Reports whether `target` has more than one path component, i.e. it is
/// written as a path rather than as a bare short-form name.
fn is_full_path(target: &Path) -> bool {
    // A second component exists exactly when there is more than one.
    target.components().nth(1).is_some()
}
235
236fn parse_type(text: &str) -> Option<String> {
241 let yaml = frontmatter_block(text)?;
242 let value: serde_norway::Value = serde_norway::from_str(&yaml).ok()?;
243 let mapping = value.as_mapping()?;
244 let type_val = mapping.get(serde_norway::Value::String("type".to_string()))?;
245 let s = type_val.as_str()?.trim();
246 if s.is_empty() {
247 None
248 } else {
249 Some(s.to_string())
250 }
251}
252
/// Returns the text between the opening and closing `---` fences of a
/// document's frontmatter, with each line terminated by `\n`.
///
/// A leading byte-order mark is ignored. The first line must be a fence
/// (trailing whitespace allowed). Returns `None` when the document does not
/// open with a fence or the block is never closed.
fn frontmatter_block(text: &str) -> Option<String> {
    let is_fence = |line: &str| line.trim_end() == "---";

    let content = text.strip_prefix('\u{feff}').unwrap_or(text);
    let mut remaining = content.lines();
    if !is_fence(remaining.next()?) {
        return None;
    }

    let mut yaml = String::new();
    loop {
        // Running out of lines before a closing fence means no frontmatter.
        let line = remaining.next()?;
        if is_fence(line) {
            return Some(yaml);
        }
        yaml.push_str(line);
        yaml.push('\n');
    }
}
275
/// Returns up to `limit` `(type, count)` pairs from `dist`, ordered by count
/// descending and, among equal counts, by type name ascending.
fn top_types(dist: &BTreeMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    let mut ranked: Vec<(&String, usize)> =
        dist.iter().map(|(name, count)| (name, *count)).collect();
    // Spell out the tie-break instead of relying on the map's iteration order.
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
    ranked
        .into_iter()
        .take(limit)
        .map(|(name, count)| (name.clone(), count))
        .collect()
}
286
// Unit tests for `compute`, each run against a throwaway store on disk.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::Config;
    use std::fs;
    use tempfile::TempDir;

    /// Builds a store rooted in a fresh temporary directory that contains only
    /// a `DB.md` file. The `TempDir` is returned alongside the store so the
    /// caller can keep it alive for the duration of the test.
    fn temp_store() -> (TempDir, Store) {
        let dir = TempDir::new().expect("tempdir");
        fs::write(dir.path().join("DB.md"), "---\ntype: db-md\n---\n").expect("write DB.md");
        let store = Store {
            root: dir.path().to_path_buf(),
            config: Config::default(),
        };
        (dir, store)
    }

    /// Writes `contents` to the store-relative path `rel`, creating any
    /// missing parent directories first.
    fn write_rel(store: &Store, rel: &str, contents: &str) {
        let abs = store.root.join(rel);
        if let Some(parent) = abs.parent() {
            fs::create_dir_all(parent).expect("mkdir parents");
        }
        fs::write(abs, contents).expect("write content file");
    }

    /// Renders a minimal document with `type` and `summary` frontmatter and a
    /// one-line body that contains no links.
    fn doc(type_: &str, summary: &str) -> String {
        format!("---\ntype: {type_}\nsummary: \"{summary}\"\n---\n\nbody\n")
    }

    /// A store with no layer directories yields all-zero, all-empty stats.
    #[test]
    fn empty_store_is_all_zeros() {
        let (_d, store) = temp_store();
        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 0);
        assert_eq!(s.total_size_bytes, 0);
        assert!(s.files_per_layer.is_empty());
        assert!(s.type_distribution.is_empty());
        assert_eq!(s.orphan_count, 0);
        assert_eq!(s.broken_link_count, 0);
        assert!(s.top_types.is_empty());
    }

    /// Files are attributed to the layer directory they live under.
    #[test]
    fn counts_files_per_layer_and_total() {
        let (_d, store) = temp_store();
        write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
        write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));
        write_rel(&store, "wiki/people/p.md", &doc("wiki-page", "p"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 4);
        assert_eq!(s.files_per_layer.get(&Layer::Sources), Some(&2));
        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&1));
    }

    /// `index.md`, non-`.md` files, dot-directories and `log` directories are
    /// all excluded from the scan.
    #[test]
    fn ignores_meta_files_and_non_md_and_dotdirs_and_log() {
        let (_d, store) = temp_store();
        write_rel(&store, "records/contacts/real.md", &doc("contact", "real"));
        // Skipped by name: `index.md`.
        write_rel(
            &store,
            "records/contacts/index.md",
            "---\ntype: index\nscope: type-folder\n---\n",
        );
        // Skipped by extension: not `.md`.
        write_rel(&store, "records/contacts/index.jsonl", "{}\n");
        write_rel(&store, "records/notes.txt", "not markdown\n");
        // Skipped because it sits under a directory named `log`.
        write_rel(&store, "sources/log/2026-04.md", &doc("email", "archived"));
        // Skipped because it sits under a dot-prefixed directory.
        write_rel(
            &store,
            "wiki/.obsidian/cache.md",
            &doc("wiki-page", "hidden"),
        );

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 1, "only the one real content file counts");
        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
        assert_eq!(s.files_per_layer.get(&Layer::Sources), None);
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), None);
    }

    /// The total size sums content files only; the skipped `index.md` does
    /// not contribute.
    #[test]
    fn total_size_is_sum_of_content_file_bytes() {
        let (_d, store) = temp_store();
        let a = doc("email", "a");
        let b = "---\ntype: contact\nsummary: x\n---\n\nlonger body text here\n".to_string();
        write_rel(&store, "sources/emails/a.md", &a);
        write_rel(&store, "records/contacts/b.md", &b);
        write_rel(
            &store,
            "records/contacts/index.md",
            "---\ntype: index\n---\nbig meta file padding padding\n",
        );

        let s = compute(&store).expect("compute");
        let expected = a.len() as u64 + b.len() as u64;
        assert_eq!(s.total_size_bytes, expected);
    }

    /// Each distinct frontmatter `type` value gets its own count.
    #[test]
    fn type_distribution_counts_each_type_value() {
        let (_d, store) = temp_store();
        write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
        write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
        write_rel(&store, "sources/emails/c.md", &doc("email", "c"));
        write_rel(&store, "records/contacts/d.md", &doc("contact", "d"));
        write_rel(&store, "records/proposals/e.md", &doc("proposal", "e"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.type_distribution.get("email"), Some(&3));
        assert_eq!(s.type_distribution.get("contact"), Some(&1));
        assert_eq!(s.type_distribution.get("proposal"), Some(&1));
        assert_eq!(s.type_distribution.len(), 3);
    }

    /// Files lacking a `type` key, or lacking frontmatter entirely, count
    /// toward totals but add nothing to the type distribution.
    #[test]
    fn file_without_type_is_counted_in_totals_but_not_distribution() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "wiki/themes/x.md",
            "---\nsummary: no type here\n---\n\nbody\n",
        );
        write_rel(&store, "wiki/themes/y.md", "just a body, no frontmatter\n");

        let s = compute(&store).expect("compute");
        assert_eq!(s.total_files, 2, "untyped files still count toward totals");
        assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&2));
        assert!(
            s.type_distribution.is_empty(),
            "no type key => no distribution entry, not an empty-string bucket"
        );
    }

    /// `top_types` is ordered by count descending, with ties broken by name
    /// ascending.
    #[test]
    fn top_types_orders_by_count_desc_then_name_asc() {
        let (_d, store) = temp_store();
        write_rel(&store, "records/contacts/c1.md", &doc("contact", "1"));
        write_rel(&store, "records/contacts/c2.md", &doc("contact", "2"));
        write_rel(&store, "records/contacts/c3.md", &doc("contact", "3"));
        write_rel(&store, "sources/emails/e1.md", &doc("email", "1"));
        write_rel(&store, "sources/emails/e2.md", &doc("email", "2"));
        write_rel(&store, "sources/emails/e3.md", &doc("email", "3"));
        write_rel(&store, "records/decisions/d1.md", &doc("decision", "1"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.top_types,
            vec![
                ("contact".to_string(), 3),
                ("email".to_string(), 3),
                ("decision".to_string(), 1),
            ],
            "ties (contact, email both 3) break by name ascending; decision trails"
        );
    }

    /// With twelve distinct types, `top_types` holds ten entries while the
    /// full distribution keeps all twelve.
    #[test]
    fn top_types_is_capped_at_ten() {
        let (_d, store) = temp_store();
        for i in 0..12 {
            let t = format!("type{i:02}");
            write_rel(&store, &format!("records/{t}/f.md"), &doc(&t, "x"));
        }
        let s = compute(&store).expect("compute");
        assert_eq!(s.top_types.len(), 10, "top_types caps at 10");
        assert_eq!(
            s.type_distribution.len(),
            12,
            "distribution keeps all types"
        );
    }

    /// `a` links to `b`, so both are wired; `c` has no links either way and
    /// is the only orphan.
    #[test]
    fn orphans_are_files_with_no_incoming_and_no_outgoing_links() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\nSee [[records/contacts/b]].\n",
        );
        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));
        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.orphan_count, 1, "only c is an orphan");
    }

    /// Being the target of a link is enough to not be an orphan, even with no
    /// outgoing links.
    #[test]
    fn a_file_with_only_an_incoming_link_is_not_an_orphan() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.orphan_count, 0);
    }

    /// Links are extracted from the whole file text, so a wiki-link inside
    /// the frontmatter wires the graph too.
    #[test]
    fn frontmatter_wiki_links_count_as_edges_for_orphans() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "records/contacts/sarah.md",
            "---\ntype: contact\nsummary: s\ncompany: [[records/companies/acme]]\n---\n\nbody\n",
        );
        write_rel(&store, "records/companies/acme.md", &doc("company", "acme"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.orphan_count, 0,
            "a frontmatter wiki-link is a real edge; neither endpoint is orphaned"
        );
    }

    /// A full-path link to a file that does not exist is counted as broken;
    /// a link to an existing file is not.
    #[test]
    fn broken_links_count_targets_that_do_not_exist() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]] and [[records/contacts/ghost]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 1, "only the ghost target is broken");
    }

    /// A target written with an explicit `.md` suffix resolves to the same
    /// node as the suffix-less form.
    #[test]
    fn broken_link_resolves_with_md_extension_stripped() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b.md]]\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 0,
            "a `.md`-suffixed target resolves to the same node and is not broken"
        );
    }

    /// A single-component target such as `[[b]]` is neither resolved nor
    /// reported as broken, so both files stay orphans.
    #[test]
    fn short_form_links_are_not_broken_and_do_not_wire_the_graph() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "records/contacts/a.md",
            "---\ntype: contact\nsummary: a\n---\n\n[[b]]\n",
        );
        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 0,
            "short-form links are not counted as broken by stats"
        );
        assert_eq!(s.orphan_count, 2);
    }

    /// In `[[target|alias]]` only the part before `|` is used as the target.
    #[test]
    fn display_alias_links_resolve_to_the_target_not_the_alias() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nmet [[wiki/people/b|Bob]] today\n",
        );
        write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0, "alias target resolves and exists");
        assert_eq!(s.orphan_count, 0, "a links out, b is linked to");
    }

    /// The same broken target appearing twice in one file counts twice.
    #[test]
    fn duplicate_links_in_one_file_count_broken_per_occurrence() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\n[[records/contacts/ghost]] [[records/contacts/ghost]]\n",
        );
        let s = compute(&store).expect("compute");
        assert_eq!(
            s.broken_link_count, 2,
            "broken links count occurrences, not distinct targets"
        );
    }

    /// Standard markdown `[text](url)` links are not matched by the wiki-link
    /// pattern and so play no part in the graph.
    #[test]
    fn markdown_links_are_not_treated_as_wiki_links() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nSee [Acme](https://acme.io/path).\n",
        );
        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0, "markdown links aren't graph edges");
        assert_eq!(s.orphan_count, 1, "the file has no wiki-links => orphan");
    }

    /// Link targets are resolved against files from every layer, not only the
    /// linking file's own layer.
    #[test]
    fn a_link_to_an_existing_file_in_another_layer_resolves() {
        let (_d, store) = temp_store();
        write_rel(
            &store,
            "wiki/people/a.md",
            "---\ntype: wiki-page\nsummary: a\n---\n\nfrom [[sources/emails/2026/05/m]]\n",
        );
        write_rel(&store, "sources/emails/2026/05/m.md", &doc("email", "m"));

        let s = compute(&store).expect("compute");
        assert_eq!(s.broken_link_count, 0);
        assert_eq!(s.orphan_count, 0, "both endpoints are wired");
    }
}