use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::path::{Path, PathBuf};
use regex::Regex;
use crate::store::{Layer, Store};
/// Aggregate statistics over a store's content files, as filled in by [`compute`].
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Stats {
    /// Number of content files found across all layers.
    pub total_files: usize,
    /// Content-file count per layer; a layer with no files has no entry.
    pub files_per_layer: BTreeMap<Layer, usize>,
    /// Sum of the on-disk sizes of all content files (a file whose metadata
    /// cannot be read contributes 0).
    pub total_size_bytes: u64,
    /// Number of files per frontmatter `type` value; untyped files are omitted.
    pub type_distribution: BTreeMap<String, usize>,
    /// Files with neither an outgoing link to an existing node nor an
    /// incoming link from another file's full-path wiki-link.
    pub orphan_count: usize,
    /// Occurrences of full-path wiki-links whose target is not an existing node.
    pub broken_link_count: usize,
    /// The most frequent `(type, count)` pairs, highest count first.
    pub top_types: Vec<(String, usize)>,
    /// Types present in the store that are listed in `RECOGNIZED_CONTENT_TYPES`.
    pub recognized_types_present: Vec<String>,
    /// Types present in the store that are not in `RECOGNIZED_CONTENT_TYPES`.
    pub custom_types_present: Vec<String>,
}
/// Frontmatter `type` values treated as recognized content types when
/// splitting the type distribution into recognized vs. custom.
const RECOGNIZED_CONTENT_TYPES: &[&str] = &[
    "email",
    "transcript",
    "pdf-source",
    "contact",
    "company",
    "expense",
    "meeting",
    "decision",
    "invoice",
    "wiki-page",
];
/// Maximum number of entries `compute` keeps in `Stats::top_types`.
const TOP_TYPES_LIMIT: usize = 10;
/// Everything `compute` records about a single content file during its scan.
struct FileFacts {
    /// Path relative to the store root with any trailing `.md` removed.
    node_id: PathBuf,
    /// Layer whose directory the file was found under.
    layer: Layer,
    /// File size from filesystem metadata (0 when metadata is unavailable).
    size_bytes: u64,
    /// Trimmed, non-empty frontmatter `type` value, if one was parsed.
    type_: Option<String>,
    /// Every wiki-link target found in the file text, `.md` suffix stripped;
    /// includes short-form targets that are never resolved.
    raw_targets: Vec<PathBuf>,
}
impl FileFacts {
    /// Yields only the link targets written as a full path (more than one
    /// path component); short-form targets are skipped.
    fn resolvable_targets(&self) -> impl Iterator<Item = &PathBuf> {
        self.raw_targets
            .iter()
            .filter(|candidate| is_full_path(candidate.as_path()))
    }
}
/// Scans every layer of `store` and aggregates the findings into a [`Stats`].
///
/// The scan runs in three passes:
/// 1. read each content file once, recording its node id, size, frontmatter
///    `type` and wiki-link targets;
/// 2. resolve full-path link targets against the set of existing node ids,
///    counting each unresolved occurrence as a broken link;
/// 3. accumulate totals, the per-layer and per-type counts, and orphans.
///
/// # Errors
///
/// Returns an error when walking a layer directory fails. A file whose
/// metadata or contents cannot be read is not an error: it is counted with
/// size 0 and/or empty text.
pub fn compute(store: &Store) -> crate::Result<Stats> {
    let link_re = wiki_link_regex();

    // Pass 1: collect per-file facts and the set of node ids that exist.
    let mut existing_nodes: HashSet<PathBuf> = HashSet::new();
    let mut facts: Vec<FileFacts> = Vec::new();
    for layer in Layer::all() {
        let layer_root = store.root.join(layer_dir_name(layer));
        for abs in walk_layer_content_files(&layer_root)? {
            // Node ids are store-relative and extension-less so they compare
            // directly against wiki-link targets.
            let node_id = strip_md(abs.strip_prefix(&store.root).unwrap_or(&abs));
            let size_bytes = std::fs::metadata(&abs).map(|m| m.len()).unwrap_or(0);
            let text = std::fs::read_to_string(&abs).unwrap_or_default();
            existing_nodes.insert(node_id.clone());
            facts.push(FileFacts {
                type_: parse_type(&text),
                raw_targets: extract_link_targets(&text, &link_re),
                node_id,
                layer,
                size_bytes,
            });
        }
    }

    // Pass 2: resolve links. Broken links are counted per occurrence.
    let mut stats = Stats::default();
    let mut linked_to: HashSet<PathBuf> = HashSet::new();
    for file in &facts {
        for target in file.resolvable_targets() {
            if !existing_nodes.contains(target) {
                stats.broken_link_count += 1;
                continue;
            }
            linked_to.insert(target.clone());
        }
    }

    // Pass 3: totals, distributions and orphans.
    stats.total_files = facts.len();
    for file in &facts {
        *stats.files_per_layer.entry(file.layer).or_default() += 1;
        stats.total_size_bytes += file.size_bytes;
        if let Some(type_name) = &file.type_ {
            *stats.type_distribution.entry(type_name.clone()).or_default() += 1;
        }
        // A file is connected if something links to it or it links to an
        // existing node; anything else is an orphan.
        let connected = linked_to.contains(&file.node_id)
            || file
                .resolvable_targets()
                .any(|target| existing_nodes.contains(target));
        if !connected {
            stats.orphan_count += 1;
        }
    }

    stats.top_types = top_types(&stats.type_distribution, TOP_TYPES_LIMIT);
    let (recognized, custom) = split_schema_coverage(&stats.type_distribution);
    stats.recognized_types_present = recognized;
    stats.custom_types_present = custom;
    Ok(stats)
}
/// Maps a layer to the name of its directory under the store root.
fn layer_dir_name(layer: Layer) -> &'static str {
    let dir = match layer {
        Layer::Sources => "sources",
        Layer::Records => "records",
        Layer::Wiki => "wiki",
    };
    dir
}
fn walk_layer_content_files(layer_root: &Path) -> crate::Result<Vec<PathBuf>> {
let mut out = Vec::new();
if !layer_root.is_dir() {
return Ok(out);
}
let walker = walkdir::WalkDir::new(layer_root)
.into_iter()
.filter_entry(|e| {
let name = e.file_name().to_string_lossy();
if name.starts_with('.') {
return false;
}
if e.file_type().is_dir() && name == "log" {
return false;
}
true
});
for entry in walker {
let entry = entry.map_err(|e| {
crate::Error::Io(
e.into_io_error()
.unwrap_or_else(|| std::io::Error::other("walk error")),
)
})?;
if !entry.file_type().is_file() {
continue;
}
let path = entry.path();
let name = entry.file_name().to_string_lossy();
if !name.ends_with(".md") || name == "index.md" {
continue;
}
out.push(path.to_path_buf());
}
out.sort();
Ok(out)
}
/// Builds the regex that matches `[[target]]` and `[[target|alias]]` links.
///
/// Capture group 1 is the target, which may not contain `[`, `]` or `|`;
/// the optional alias after `|` is matched but not captured.
fn wiki_link_regex() -> Regex {
    const PATTERN: &str = r"\[\[([^\[\]|]+)(?:\|[^\]]*)?\]\]";
    Regex::new(PATTERN).expect("static wiki-link regex is valid")
}
/// Collects the target of every match of `re` in `text`, in document order.
///
/// Each target is taken from capture group 1, trimmed of surrounding
/// whitespace and stripped of a trailing `.md`. Duplicates are kept.
fn extract_link_targets(text: &str, re: &Regex) -> Vec<PathBuf> {
    let mut targets = Vec::new();
    for caps in re.captures_iter(text) {
        if let Some(target) = caps.get(1) {
            targets.push(strip_md(Path::new(target.as_str().trim())));
        }
    }
    targets
}
/// Returns `path` without a trailing `.md`, or an unchanged copy when the
/// path does not end in `.md`.
///
/// The suffix check is done on the lossy string form of the path, and the
/// stripped result is rebuilt from that string.
fn strip_md(path: &Path) -> PathBuf {
    let lossy = path.to_string_lossy();
    if let Some(stem) = lossy.strip_suffix(".md") {
        PathBuf::from(stem)
    } else {
        path.to_path_buf()
    }
}
/// Reports whether `target` has more than one path component, e.g.
/// `wiki/people/b` (true) as opposed to the short form `b` (false).
fn is_full_path(target: &Path) -> bool {
    // A second component exists exactly when the component count exceeds one.
    target.components().nth(1).is_some()
}
/// Extracts the trimmed `type` value from the YAML frontmatter of `text`.
///
/// Returns `None` when there is no frontmatter block, the block is not
/// valid YAML, the document is not a mapping, the `type` key is missing,
/// its value is not a string, or the string is blank after trimming.
fn parse_type(text: &str) -> Option<String> {
    let yaml = frontmatter_block(text)?;
    let parsed: serde_yml::Value = serde_yml::from_str(&yaml).ok()?;
    let declared = parsed
        .as_mapping()?
        .get(serde_yml::Value::String("type".to_string()))?
        .as_str()?
        .trim();
    if declared.is_empty() {
        return None;
    }
    Some(declared.to_string())
}
/// Returns the lines between the opening and closing `---` fences of a
/// leading frontmatter block, each terminated with `\n`.
///
/// A leading UTF-8 byte-order mark is ignored and trailing whitespace on the
/// fence lines is tolerated. Returns `None` when the first line is not a
/// fence or when no closing fence is found.
fn frontmatter_block(text: &str) -> Option<String> {
    let text = text.strip_prefix('\u{feff}').unwrap_or(text);
    let mut lines = text.lines();
    if lines.next()?.trim_end() != "---" {
        return None;
    }

    let mut body = String::new();
    let mut closed = false;
    for line in lines {
        if line.trim_end() == "---" {
            closed = true;
            break;
        }
        body.push_str(line);
        body.push('\n');
    }

    // An unterminated block is treated as having no frontmatter at all.
    if closed {
        Some(body)
    } else {
        None
    }
}
/// Returns up to `limit` `(type, count)` pairs from `dist`, ordered by count
/// descending with ties broken by type name ascending.
fn top_types(dist: &BTreeMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    let mut ranked: Vec<(&String, usize)> =
        dist.iter().map(|(name, count)| (name, *count)).collect();
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
    ranked
        .into_iter()
        .take(limit)
        .map(|(name, count)| (name.clone(), count))
        .collect()
}
/// Splits the type names in `dist` into `(recognized, custom)`.
///
/// A name is recognized when it appears in `RECOGNIZED_CONTENT_TYPES`;
/// every other name is custom. Both lists keep the ascending key order of
/// `dist`.
fn split_schema_coverage(dist: &BTreeMap<String, usize>) -> (Vec<String>, Vec<String>) {
    let canonical: BTreeSet<&str> = RECOGNIZED_CONTENT_TYPES.iter().copied().collect();
    dist.keys()
        .cloned()
        .partition(|name| canonical.contains(name.as_str()))
}
// Integration-style tests for `compute`: each test builds a throwaway store
// on disk, writes a few markdown files into it and checks the resulting stats.
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::Config;
use std::fs;
use tempfile::TempDir;
/// Creates a store rooted in a fresh temporary directory containing only a
/// `DB.md` file at the root. The `TempDir` is returned so the caller keeps
/// the directory alive for the duration of the test.
fn temp_store() -> (TempDir, Store) {
let dir = TempDir::new().expect("tempdir");
fs::write(dir.path().join("DB.md"), "---\ntype: db-md\n---\n").expect("write DB.md");
let store = Store {
root: dir.path().to_path_buf(),
config: Config::default(),
};
(dir, store)
}
/// Writes `contents` to the store-relative path `rel`, creating any missing
/// parent directories first.
fn write_rel(store: &Store, rel: &str, contents: &str) {
let abs = store.root.join(rel);
if let Some(parent) = abs.parent() {
fs::create_dir_all(parent).expect("mkdir parents");
}
fs::write(abs, contents).expect("write content file");
}
/// Builds a minimal document: frontmatter with the given `type` and a quoted
/// `summary`, followed by a one-line body.
fn doc(type_: &str, summary: &str) -> String {
format!("---\ntype: {type_}\nsummary: \"{summary}\"\n---\n\nbody\n")
}
// The root-level DB.md written by `temp_store` is outside every layer
// directory, so an otherwise empty store yields all-zero stats.
#[test]
fn empty_store_is_all_zeros() {
let (_d, store) = temp_store();
let s = compute(&store).expect("compute");
assert_eq!(s.total_files, 0);
assert_eq!(s.total_size_bytes, 0);
assert!(s.files_per_layer.is_empty());
assert!(s.type_distribution.is_empty());
assert_eq!(s.orphan_count, 0);
assert_eq!(s.broken_link_count, 0);
assert!(s.top_types.is_empty());
assert!(s.recognized_types_present.is_empty());
assert!(s.custom_types_present.is_empty());
}
#[test]
fn counts_files_per_layer_and_total() {
let (_d, store) = temp_store();
write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));
write_rel(&store, "wiki/people/p.md", &doc("wiki-page", "p"));
let s = compute(&store).expect("compute");
assert_eq!(s.total_files, 4);
assert_eq!(s.files_per_layer.get(&Layer::Sources), Some(&2));
assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&1));
}
// Covers every exclusion rule of the walker: index.md, non-.md files,
// `log` directories and dot-directories.
#[test]
fn ignores_meta_files_and_non_md_and_dotdirs_and_log() {
let (_d, store) = temp_store();
write_rel(&store, "records/contacts/real.md", &doc("contact", "real"));
write_rel(
&store,
"records/contacts/index.md",
"---\ntype: index\nscope: type-folder\n---\n",
);
write_rel(&store, "records/contacts/index.jsonl", "{}\n");
write_rel(&store, "records/notes.txt", "not markdown\n");
write_rel(&store, "sources/log/2026-04.md", &doc("email", "archived"));
write_rel(
&store,
"wiki/.obsidian/cache.md",
&doc("wiki-page", "hidden"),
);
let s = compute(&store).expect("compute");
assert_eq!(s.total_files, 1, "only the one real content file counts");
assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
assert_eq!(s.files_per_layer.get(&Layer::Sources), None);
assert_eq!(s.files_per_layer.get(&Layer::Wiki), None);
}
// The index.md file is deliberately larger than nothing so that counting it
// by mistake would change the expected total.
#[test]
fn total_size_is_sum_of_content_file_bytes() {
let (_d, store) = temp_store();
let a = doc("email", "a");
let b = "---\ntype: contact\nsummary: x\n---\n\nlonger body text here\n".to_string();
write_rel(&store, "sources/emails/a.md", &a);
write_rel(&store, "records/contacts/b.md", &b);
write_rel(
&store,
"records/contacts/index.md",
"---\ntype: index\n---\nbig meta file padding padding\n",
);
let s = compute(&store).expect("compute");
let expected = a.len() as u64 + b.len() as u64;
assert_eq!(s.total_size_bytes, expected);
}
#[test]
fn type_distribution_counts_each_type_value() {
let (_d, store) = temp_store();
write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
write_rel(&store, "sources/emails/c.md", &doc("email", "c"));
write_rel(&store, "records/contacts/d.md", &doc("contact", "d"));
write_rel(&store, "records/proposals/e.md", &doc("proposal", "e"));
let s = compute(&store).expect("compute");
assert_eq!(s.type_distribution.get("email"), Some(&3));
assert_eq!(s.type_distribution.get("contact"), Some(&1));
assert_eq!(s.type_distribution.get("proposal"), Some(&1));
assert_eq!(s.type_distribution.len(), 3);
}
// One file has frontmatter without a `type` key, the other has no
// frontmatter at all; both must count as files but not as types.
#[test]
fn file_without_type_is_counted_in_totals_but_not_distribution() {
let (_d, store) = temp_store();
write_rel(
&store,
"wiki/themes/x.md",
"---\nsummary: no type here\n---\n\nbody\n",
);
write_rel(&store, "wiki/themes/y.md", "just a body, no frontmatter\n");
let s = compute(&store).expect("compute");
assert_eq!(s.total_files, 2, "untyped files still count toward totals");
assert_eq!(s.files_per_layer.get(&Layer::Wiki), Some(&2));
assert!(
s.type_distribution.is_empty(),
"no type key => no distribution entry, not an empty-string bucket"
);
}
#[test]
fn top_types_orders_by_count_desc_then_name_asc() {
let (_d, store) = temp_store();
write_rel(&store, "records/contacts/c1.md", &doc("contact", "1"));
write_rel(&store, "records/contacts/c2.md", &doc("contact", "2"));
write_rel(&store, "records/contacts/c3.md", &doc("contact", "3"));
write_rel(&store, "sources/emails/e1.md", &doc("email", "1"));
write_rel(&store, "sources/emails/e2.md", &doc("email", "2"));
write_rel(&store, "sources/emails/e3.md", &doc("email", "3"));
write_rel(&store, "records/decisions/d1.md", &doc("decision", "1"));
let s = compute(&store).expect("compute");
assert_eq!(
s.top_types,
vec![
("contact".to_string(), 3),
("email".to_string(), 3),
("decision".to_string(), 1),
],
"ties (contact, email both 3) break by name ascending; decision trails"
);
}
// Twelve distinct types, one file each: the distribution keeps all twelve
// while top_types is truncated.
#[test]
fn top_types_is_capped_at_ten() {
let (_d, store) = temp_store();
for i in 0..12 {
let t = format!("type{i:02}");
write_rel(&store, &format!("records/{t}/f.md"), &doc(&t, "x"));
}
let s = compute(&store).expect("compute");
assert_eq!(s.top_types.len(), 10, "top_types caps at 10");
assert_eq!(
s.type_distribution.len(),
12,
"distribution keeps all types"
);
}
#[test]
fn schema_coverage_splits_recognized_from_custom() {
let (_d, store) = temp_store();
// Three recognized types (contact, email, wiki-page) and two custom ones
// (proposal, widget).
write_rel(&store, "records/contacts/c.md", &doc("contact", "c")); write_rel(&store, "sources/emails/e.md", &doc("email", "e")); write_rel(&store, "wiki/people/p.md", &doc("wiki-page", "p")); write_rel(&store, "records/proposals/x.md", &doc("proposal", "x")); write_rel(&store, "records/widgets/w.md", &doc("widget", "w"));
let s = compute(&store).expect("compute");
assert_eq!(
s.recognized_types_present,
vec![
"contact".to_string(),
"email".to_string(),
"wiki-page".to_string()
],
"recognized canonical content types, sorted ascending"
);
assert_eq!(
s.custom_types_present,
vec!["proposal".to_string(), "widget".to_string()],
"non-canonical types land in custom, sorted ascending"
);
}
#[test]
fn meta_types_are_not_recognized_content_types() {
let (_d, store) = temp_store();
write_rel(&store, "wiki/synthesis/weird.md", &doc("log", "weird"));
let s = compute(&store).expect("compute");
assert!(
s.recognized_types_present.is_empty(),
"`log` is a meta type, not a recognized content type"
);
assert_eq!(s.custom_types_present, vec!["log".to_string()]);
}
// a -> b wires both a (outgoing) and b (incoming); c has no links at all.
#[test]
fn orphans_are_files_with_no_incoming_and_no_outgoing_links() {
let (_d, store) = temp_store();
write_rel(
&store,
"records/contacts/a.md",
"---\ntype: contact\nsummary: a\n---\n\nSee [[records/contacts/b]].\n",
);
write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));
write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));
let s = compute(&store).expect("compute");
assert_eq!(s.orphan_count, 1, "only c is an orphan");
}
#[test]
fn a_file_with_only_an_incoming_link_is_not_an_orphan() {
let (_d, store) = temp_store();
write_rel(
&store,
"wiki/people/a.md",
"---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]]\n",
);
write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));
let s = compute(&store).expect("compute");
assert_eq!(s.orphan_count, 0);
}
// Links are extracted from the whole file text, so a wiki-link inside the
// frontmatter counts just like one in the body.
#[test]
fn frontmatter_wiki_links_count_as_edges_for_orphans() {
let (_d, store) = temp_store();
write_rel(
&store,
"records/contacts/sarah.md",
"---\ntype: contact\nsummary: s\ncompany: [[records/companies/acme]]\n---\n\nbody\n",
);
write_rel(&store, "records/companies/acme.md", &doc("company", "acme"));
let s = compute(&store).expect("compute");
assert_eq!(
s.orphan_count, 0,
"a frontmatter wiki-link is a real edge; neither endpoint is orphaned"
);
}
#[test]
fn broken_links_count_targets_that_do_not_exist() {
let (_d, store) = temp_store();
write_rel(
&store,
"wiki/people/a.md",
"---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b]] and [[records/contacts/ghost]]\n",
);
write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));
let s = compute(&store).expect("compute");
assert_eq!(s.broken_link_count, 1, "only the ghost target is broken");
}
#[test]
fn broken_link_resolves_with_md_extension_stripped() {
let (_d, store) = temp_store();
write_rel(
&store,
"wiki/people/a.md",
"---\ntype: wiki-page\nsummary: a\n---\n\n[[wiki/people/b.md]]\n",
);
write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));
let s = compute(&store).expect("compute");
assert_eq!(
s.broken_link_count, 0,
"a `.md`-suffixed target resolves to the same node and is not broken"
);
}
// A single-component target such as `[[b]]` is ignored entirely: it is
// neither broken nor an edge, so both files remain orphans.
#[test]
fn short_form_links_are_not_broken_and_do_not_wire_the_graph() {
let (_d, store) = temp_store();
write_rel(
&store,
"records/contacts/a.md",
"---\ntype: contact\nsummary: a\n---\n\n[[b]]\n",
);
write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));
let s = compute(&store).expect("compute");
assert_eq!(
s.broken_link_count, 0,
"short-form links are not counted as broken by stats"
);
assert_eq!(s.orphan_count, 2);
}
#[test]
fn display_alias_links_resolve_to_the_target_not_the_alias() {
let (_d, store) = temp_store();
write_rel(
&store,
"wiki/people/a.md",
"---\ntype: wiki-page\nsummary: a\n---\n\nmet [[wiki/people/b|Bob]] today\n",
);
write_rel(&store, "wiki/people/b.md", &doc("wiki-page", "b"));
let s = compute(&store).expect("compute");
assert_eq!(s.broken_link_count, 0, "alias target resolves and exists");
assert_eq!(s.orphan_count, 0, "a links out, b is linked to");
}
#[test]
fn duplicate_links_in_one_file_count_broken_per_occurrence() {
let (_d, store) = temp_store();
write_rel(
&store,
"wiki/people/a.md",
"---\ntype: wiki-page\nsummary: a\n---\n\n[[records/contacts/ghost]] [[records/contacts/ghost]]\n",
);
let s = compute(&store).expect("compute");
assert_eq!(
s.broken_link_count, 2,
"broken links count occurrences, not distinct targets"
);
}
#[test]
fn markdown_links_are_not_treated_as_wiki_links() {
let (_d, store) = temp_store();
write_rel(
&store,
"wiki/people/a.md",
"---\ntype: wiki-page\nsummary: a\n---\n\nSee [Acme](https://acme.io/path).\n",
);
let s = compute(&store).expect("compute");
assert_eq!(s.broken_link_count, 0, "markdown links aren't graph edges");
assert_eq!(s.orphan_count, 1, "the file has no wiki-links => orphan");
}
#[test]
fn a_link_to_an_existing_file_in_another_layer_resolves() {
let (_d, store) = temp_store();
write_rel(
&store,
"wiki/people/a.md",
"---\ntype: wiki-page\nsummary: a\n---\n\nfrom [[sources/emails/2026/05/m]]\n",
);
write_rel(&store, "sources/emails/2026/05/m.md", &doc("email", "m"));
let s = compute(&store).expect("compute");
assert_eq!(s.broken_link_count, 0);
assert_eq!(s.orphan_count, 0, "both endpoints are wired");
}
}